# Dorod-Sky/tests/unit_tests/test_token_counter.py

"""Tests for token counter utility."""

from skyvern.utils.token_counter import count_tokens


class TestCountTokens:
    """Behavioural checks for ``count_tokens`` across varied kinds of input text."""

    def test_empty_string(self):
        """An empty input must yield a count of exactly zero."""
        token_total = count_tokens("")
        assert token_total == 0

    def test_single_word(self):
        """A lone word yields a positive count that is an ``int``."""
        token_total = count_tokens("hello")
        assert token_total > 0
        assert isinstance(token_total, int)

    def test_simple_sentence(self):
        """A short greeting yields a positive count below a generous ceiling."""
        token_total = count_tokens("Hello, world!")
        assert token_total > 0
        # The ceiling of 10 is intentionally loose so the check does not
        # depend on the exact tokenization of the greeting.
        assert token_total < 10

    def test_longer_text(self):
        """A full sentence must count strictly more tokens than a two-letter word."""
        brief_total = count_tokens("Hi")
        verbose_total = count_tokens("This is a much longer sentence with many more words in it.")
        assert verbose_total > brief_total

    def test_returns_integer(self):
        """The return value is an ``int`` instance."""
        assert isinstance(count_tokens("test"), int)

    def test_whitespace_only(self):
        """Whitespace-only input is accepted and produces an ``int`` result."""
        # Only the return type is checked here; no particular count is required.
        assert isinstance(count_tokens("   "), int)

    def test_special_characters(self):
        """A run of punctuation symbols yields a positive count."""
        assert count_tokens("!@#$%^&*()") > 0

    def test_numbers(self):
        """A string of digits yields a positive count."""
        assert count_tokens("12345") > 0

    def test_unicode(self):
        """Non-ASCII (CJK) text yields a positive count."""
        assert count_tokens("你好世界") > 0

    def test_mixed_content(self):
        """Letters, digits and symbols combined yield a positive count."""
        assert count_tokens("Hello123!@#World") > 0

    def test_newlines(self):
        """Text containing line breaks yields a positive count."""
        assert count_tokens("Hello\nWorld\nTest") > 0

    def test_code_snippet(self):
        """A small multi-line code sample yields more than five tokens."""
        snippet = """
def hello():
    print("Hello, World!")
"""
        assert count_tokens(snippet) > 5  # a function definition spans several tokens

    def test_json_content(self):
        """A JSON-formatted string yields a positive count."""
        payload = '{"key": "value", "number": 123}'
        assert count_tokens(payload) > 0

    def test_url(self):
        """A URL with a path and query string yields a positive count."""
        assert count_tokens("https://www.example.com/path?query=value") > 0

    def test_consistency(self):
        """Counting the same text twice gives equal results."""
        sample = "This is a test sentence."
        assert count_tokens(sample) == count_tokens(sample)

    def test_very_long_text(self):
        """A thousand repeated words is handled and yields more than 100 tokens."""
        repeated = "word " * 1000
        assert count_tokens(repeated) > 100  # lower bound far beneath the word count

    def test_token_count_approximation(self):
        """For an English sentence the count lies between len/10 and len (exclusive)."""
        sample = "This is a sample text for testing token count approximation."
        token_total = count_tokens(sample)
        # The bounds are deliberately wide: more than one token per ten
        # characters, and fewer tokens than characters.
        length = len(sample)
        assert token_total > length / 10
        assert token_total < length