Files
Dorod-Sky/tests/unit_tests/test_token_counter.py

109 lines
3.6 KiB
Python
Raw Permalink Normal View History

"""Tests for token counter utility."""
from skyvern.utils.token_counter import count_tokens
class TestCountTokens:
    """Smoke checks for ``count_tokens`` over a variety of input kinds."""

    def test_empty_string(self):
        """An empty input yields a count of exactly zero."""
        n_tokens = count_tokens("")
        assert n_tokens == 0

    def test_single_word(self):
        """A single word yields a positive integer count."""
        n_tokens = count_tokens("hello")
        assert n_tokens > 0 and isinstance(n_tokens, int)

    def test_simple_sentence(self):
        """A short greeting yields a small, positive count."""
        n_tokens = count_tokens("Hello, world!")
        # The greeting typically comes out at about 4 tokens, so an upper
        # bound of 10 leaves generous headroom.
        assert 0 < n_tokens < 10

    def test_longer_text(self):
        """A full sentence yields more tokens than a two-letter word."""
        brief_count = count_tokens("Hi")
        lengthy_count = count_tokens("This is a much longer sentence with many more words in it.")
        assert brief_count < lengthy_count

    def test_returns_integer(self):
        """The return value is an ``int``."""
        assert isinstance(count_tokens("test"), int)

    def test_whitespace_only(self):
        """Whitespace-only input still produces an integer result."""
        # Only the type is checked here; no particular count is assumed,
        # although whitespace is typically tokenized.
        assert isinstance(count_tokens(" "), int)

    def test_special_characters(self):
        """A run of punctuation symbols yields a positive count."""
        assert count_tokens("!@#$%^&*()") > 0

    def test_numbers(self):
        """A string of digits yields a positive count."""
        assert count_tokens("12345") > 0

    def test_unicode(self):
        """Non-ASCII (CJK) text yields a positive count."""
        assert count_tokens("你好世界") > 0

    def test_mixed_content(self):
        """Letters, digits and symbols combined yield a positive count."""
        assert count_tokens("Hello123!@#World") > 0

    def test_newlines(self):
        """Text containing line breaks yields a positive count."""
        assert count_tokens("Hello\nWorld\nTest") > 0

    def test_code_snippet(self):
        """A small piece of source code yields more than five tokens."""
        # NOTE(review): the snippet's inner indentation is not visible in
        # the reviewed copy; the literal is kept exactly as shown there.
        snippet = (
            "\n"
            "def hello():\n"
            'print("Hello, World!")\n'
        )
        assert count_tokens(snippet) > 5  # code should span multiple tokens

    def test_json_content(self):
        """A JSON document passed as a string yields a positive count."""
        payload = '{"key": "value", "number": 123}'
        assert count_tokens(payload) > 0

    def test_url(self):
        """A URL with path and query string yields a positive count."""
        address = "https://www.example.com/path?query=value"
        assert count_tokens(address) > 0

    def test_consistency(self):
        """Counting the same text twice gives equal results."""
        sentence = "This is a test sentence."
        assert count_tokens(sentence) == count_tokens(sentence)

    def test_very_long_text(self):
        """A thousand repeated words are handled and yield over 100 tokens."""
        repeated_words = "word " * 1000
        assert count_tokens(repeated_words) > 100  # expect many tokens

    def test_token_count_approximation(self):
        """The count for an English sentence falls within loose length-based bounds."""
        sample = "This is a sample text for testing token count approximation."
        n_tokens = count_tokens(sample)
        n_chars = len(sample)
        # Rule of thumb: GPT tokenizers typically emit about one token per
        # four characters. The bounds are deliberately loose: more than a
        # tenth of the character count, and fewer tokens than characters.
        assert n_chars / 10 < n_tokens < n_chars