# tests/unit_tests/test_token_counter.py

"""Tests for token counter utility."""

from skyvern.utils.token_counter import count_tokens


class TestCountTokens:
    """Exercise ``count_tokens`` across a variety of input shapes.

    Apart from the empty-string case, the checks use loose bounds (usually
    just "greater than zero") rather than exact token counts.
    """

    def test_empty_string(self):
        """An empty string yields a count of zero."""
        assert count_tokens("") == 0

    def test_single_word(self):
        """A lone word yields a positive integer count."""
        token_total = count_tokens("hello")
        assert token_total > 0
        assert isinstance(token_total, int)

    def test_simple_sentence(self):
        """A short greeting yields a positive count below ten."""
        token_total = count_tokens("Hello, world!")
        # The greeting is expected to land around four tokens; the bounds are
        # deliberately wide so the check does not pin an exact value.
        assert 0 < token_total < 10

    def test_longer_text(self):
        """A full sentence yields more tokens than a two-letter word."""
        brief_count = count_tokens("Hi")
        verbose_count = count_tokens("This is a much longer sentence with many more words in it.")
        assert brief_count < verbose_count

    def test_returns_integer(self):
        """The return value is an ``int``."""
        assert isinstance(count_tokens("test"), int)

    def test_whitespace_only(self):
        """Whitespace-only input is accepted and yields an ``int``."""
        # Only the return type is checked here; no claim is made about the
        # count itself.
        assert isinstance(count_tokens("   "), int)

    def test_special_characters(self):
        """Punctuation-only input yields a positive count."""
        assert count_tokens("!@#$%^&*()") > 0

    def test_numbers(self):
        """Digit-only input yields a positive count."""
        assert count_tokens("12345") > 0

    def test_unicode(self):
        """Non-ASCII (CJK) input yields a positive count."""
        assert count_tokens("你好世界") > 0

    def test_mixed_content(self):
        """Letters, digits and symbols mixed together yield a positive count."""
        assert count_tokens("Hello123!@#World") > 0

    def test_newlines(self):
        """Input containing line breaks yields a positive count."""
        assert count_tokens("Hello\nWorld\nTest") > 0

    def test_code_snippet(self):
        """A small Python function yields more than five tokens."""
        # Same text as a triple-quoted block with a leading and trailing
        # newline, spelled with escapes to keep the literal on one line.
        snippet = '\ndef hello():\n    print("Hello, World!")\n'
        assert count_tokens(snippet) > 5

    def test_json_content(self):
        """A JSON object literal yields a positive count."""
        payload = '{"key": "value", "number": 123}'
        assert count_tokens(payload) > 0

    def test_url(self):
        """A URL with path and query string yields a positive count."""
        assert count_tokens("https://www.example.com/path?query=value") > 0

    def test_consistency(self):
        """Counting the same text twice gives the same answer."""
        sample = "This is a test sentence."
        first_count, second_count = count_tokens(sample), count_tokens(sample)
        assert first_count == second_count

    def test_very_long_text(self):
        """A 1000-word input is handled and yields more than 100 tokens."""
        repeated = "word " * 1000
        assert count_tokens(repeated) > 100

    def test_token_count_approximation(self):
        """Token count lies between 1/10 of the character count and the character count."""
        sample = "This is a sample text for testing token count approximation."
        token_total = count_tokens(sample)
        length = len(sample)
        # The rule of thumb behind these bounds is about one token per four
        # characters of English; both limits are intentionally far looser.
        assert length / 10 < token_total < length
Add 116 unit tests for core utility modules (#4269) Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Shuchang Zheng <wintonzheng0325@gmail.com> 2025-12-15 07:06:56 +08:00			`"""Tests for token counter utility."""`

			`from skyvern.utils.token_counter import count_tokens`


			`class TestCountTokens:`
			`"""Tests for count_tokens function."""`

			`def test_empty_string(self):`
			`"""Empty string should have 0 tokens."""`
			`assert count_tokens("") == 0`

			`def test_single_word(self):`
			`"""Single word should return token count."""`
			`result = count_tokens("hello")`
			`assert result > 0`
			`assert isinstance(result, int)`

			`def test_simple_sentence(self):`
			`"""Simple sentence should have reasonable token count."""`
			`result = count_tokens("Hello, world!")`
			`assert result > 0`
			`# "Hello, world!" typically tokenizes to ~4 tokens`
			`assert result < 10`

			`def test_longer_text(self):`
			`"""Longer text should have more tokens."""`
			`short = count_tokens("Hi")`
			`long = count_tokens("This is a much longer sentence with many more words in it.")`
			`assert long > short`

			`def test_returns_integer(self):`
			`"""Should return an integer."""`
			`result = count_tokens("test")`
			`assert isinstance(result, int)`

			`def test_whitespace_only(self):`
			`"""Whitespace should be tokenized."""`
			`result = count_tokens(" ")`
			`# Whitespace is typically tokenized`
			`assert isinstance(result, int)`

			`def test_special_characters(self):`
			`"""Special characters should be tokenized."""`
			`result = count_tokens("!@#$%^&*()")`
			`assert result > 0`

			`def test_numbers(self):`
			`"""Numbers should be tokenized."""`
			`result = count_tokens("12345")`
			`assert result > 0`

			`def test_unicode(self):`
			`"""Unicode characters should be tokenized."""`
			`result = count_tokens("你好世界")`
			`assert result > 0`

			`def test_mixed_content(self):`
			`"""Mixed content (text, numbers, special chars) should work."""`
			`result = count_tokens("Hello123!@#World")`
			`assert result > 0`

			`def test_newlines(self):`
			`"""Text with newlines should be tokenized."""`
			`result = count_tokens("Hello\nWorld\nTest")`
			`assert result > 0`

			`def test_code_snippet(self):`
			`"""Code snippets should be tokenized."""`
			`code = """`
			`def hello():`
			`print("Hello, World!")`
			`"""`
			`result = count_tokens(code)`
			`assert result > 5 # Code should have multiple tokens`

			`def test_json_content(self):`
			`"""JSON content should be tokenized."""`
			`json_str = '{"key": "value", "number": 123}'`
			`result = count_tokens(json_str)`
			`assert result > 0`

			`def test_url(self):`
			`"""URLs should be tokenized."""`
			`result = count_tokens("https://www.example.com/path?query=value")`
			`assert result > 0`

			`def test_consistency(self):`
			`"""Same input should always produce same output."""`
			`text = "This is a test sentence."`
			`result1 = count_tokens(text)`
			`result2 = count_tokens(text)`
			`assert result1 == result2`

			`def test_very_long_text(self):`
			`"""Very long text should be handled."""`
			`long_text = "word " * 1000`
			`result = count_tokens(long_text)`
			`assert result > 100 # Should have many tokens`

			`def test_token_count_approximation(self):`
			`"""Token count should be roughly 1 token per 4 chars for English."""`
			`text = "This is a sample text for testing token count approximation."`
			`result = count_tokens(text)`
			`# GPT tokenizers typically produce ~1 token per 4 characters`
			`char_count = len(text)`
			`assert result > char_count / 10 # Very loose lower bound`
			`assert result < char_count # Token count should be less than char count`