# Source: Dorod-Sky/tests/unit/test_sanitization.py (Python, 98 lines, 3.8 KiB)

from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
def test_sanitize_postgres_text__normal_text() -> None:
    """Ordinary printable text should come back exactly as it went in."""
    plain = "Hello, World! This is a normal PDF text with numbers 123 and symbols @#$%."
    assert sanitize_postgres_text(plain) == plain
def test_sanitize_postgres_text__with_nul_bytes() -> None:
    """NUL bytes (0x00) embedded between words should be dropped."""
    cleaned = sanitize_postgres_text("Hello\x00World\x00Test")
    # Nothing is substituted for a NUL: the neighbouring words end up joined.
    assert cleaned == "HelloWorldTest"
def test_sanitize_postgres_text__with_control_characters() -> None:
    """A sample of problematic control characters should be stripped.

    The input carries 0x01-0x03, 0x08, 0x0b, 0x0c, 0x0e and 0x1f between
    three plain words; only the words are expected to remain.
    """
    noisy = "Hello\x01\x02\x03World\x08\x0b\x0c\x0e\x1fTest"
    cleaned = sanitize_postgres_text(noisy)
    assert cleaned == "HelloWorldTest"
def test_sanitize_postgres_text__preserve_whitespace() -> None:
    """Tab, line feed and carriage return should pass through untouched."""
    spaced = "Hello\tWorld\nNew Line\rCarriage Return"
    cleaned = sanitize_postgres_text(spaced)

    # The whole string must be unchanged ...
    assert cleaned == spaced
    # ... and each whitespace character must individually still be present.
    for whitespace_char in ("\t", "\n", "\r"):
        assert whitespace_char in cleaned
def test_sanitize_postgres_text__empty_string() -> None:
    """An empty input should produce an empty output."""
    assert sanitize_postgres_text("") == ""
def test_sanitize_postgres_text__mixed_case() -> None:
    """Mixed content: NUL bytes and control characters go, whitespace stays.

    "Mixed" here refers to the mix of character classes in the input, not to
    upper/lower case.
    """
    cleaned = sanitize_postgres_text(
        "PDF Text\x00with NUL\tbytes\nand\x01control\x08chars\rand normal text."
    )
    # \t, \n and \r are valid whitespace and are expected to be kept;
    # \x00, \x01 and \x08 are expected to vanish without a replacement.
    assert cleaned == "PDF Textwith NUL\tbytes\nandcontrolchars\rand normal text."
def test_sanitize_postgres_text__multiple_nul_bytes() -> None:
    """Runs of consecutive NUL bytes should be removed in their entirety."""
    nul_runs = "Start\x00\x00\x00Middle\x00\x00End"
    assert sanitize_postgres_text(nul_runs) == "StartMiddleEnd"
def test_sanitize_postgres_text__unicode_text() -> None:
    """Non-ASCII text (CJK, accents, Greek, emoji) should be left intact."""
    sample = "中文测试 Unicode: café, naïve, Ω, emoji 😀"
    cleaned = sanitize_postgres_text(sample)
    assert cleaned == sample
def test_sanitize_postgres_text__real_world_pdf_scenario() -> None:
    """Invoice-like text with extraction artifacts should be cleaned up.

    The input imitates PDF extraction output: tab-separated fields on separate
    lines, with stray NUL bytes and trailing 0x01/0x02 characters mixed in.
    """
    extracted = "Invoice\x00Number:\t12345\nDate:\t2024-01-01\x00\nTotal:\t$100.00\x01\x02"
    cleaned = sanitize_postgres_text(extracted)
    # Layout (tabs and newlines) is expected to survive; only the junk goes.
    assert cleaned == "InvoiceNumber:\t12345\nDate:\t2024-01-01\nTotal:\t$100.00"
def test_sanitize_postgres_text__only_control_characters() -> None:
    """Input made up solely of problematic characters should become empty."""
    junk_only = "\x00\x01\x02\x03\x08"
    assert sanitize_postgres_text(junk_only) == ""
def test_sanitize_postgres_text__preserves_spaces_and_punctuation() -> None:
    """Regular spaces and punctuation marks should not be altered."""
    sentence = "Hello, World! How are you? I'm fine. Test@example.com"
    assert sanitize_postgres_text(sentence) == sentence
def test_sanitize_postgres_text__newlines_and_paragraphs() -> None:
    """Blank-line paragraph breaks should survive sanitization."""
    paragraphs = "Paragraph 1\n\nParagraph 2\n\nParagraph 3"
    cleaned = sanitize_postgres_text(paragraphs)

    assert cleaned == paragraphs
    # Two paragraph breaks, each made of two "\n" characters.
    assert cleaned.count("\n") == 4