98 lines
3.8 KiB
Python
98 lines
3.8 KiB
Python
|
|
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__normal_text() -> None:
    """Ordinary printable text must come back from the sanitizer untouched."""
    plain = "Hello, World! This is a normal PDF text with numbers 123 and symbols @#$%."
    assert sanitize_postgres_text(plain) == plain
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__with_nul_bytes() -> None:
    """NUL bytes (0x00) embedded between words must be stripped out."""
    cleaned = sanitize_postgres_text("Hello\x00World\x00Test")
    assert cleaned == "HelloWorldTest"
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__with_control_characters() -> None:
    """Problematic control characters must be dropped from the output."""
    # A sample of control characters the sanitizer is expected to remove.
    noisy = "Hello\x01\x02\x03World\x08\x0b\x0c\x0e\x1fTest"
    cleaned = sanitize_postgres_text(noisy)
    assert cleaned == "HelloWorldTest"
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__preserve_whitespace() -> None:
    """Tab, newline and carriage return must survive sanitization."""
    spaced = "Hello\tWorld\nNew Line\rCarriage Return"
    cleaned = sanitize_postgres_text(spaced)
    assert cleaned == spaced
    # Check each whitespace character individually as well.
    for whitespace_char in ("\t", "\n", "\r"):
        assert whitespace_char in cleaned
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__empty_string() -> None:
    """An empty input must yield an empty output."""
    assert sanitize_postgres_text("") == ""
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__mixed_case() -> None:
    """Input mixing normal text, NUL bytes and control characters is cleaned correctly."""
    raw = "PDF Text\x00with NUL\tbytes\nand\x01control\x08chars\rand normal text."
    cleaned = sanitize_postgres_text(raw)
    # The carriage return stays: it is a valid whitespace character.
    assert cleaned == "PDF Textwith NUL\tbytes\nandcontrolchars\rand normal text."
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__multiple_nul_bytes() -> None:
    """Every NUL byte in a consecutive run must be removed, not just the first."""
    cleaned = sanitize_postgres_text("Start\x00\x00\x00Middle\x00\x00End")
    assert cleaned == "StartMiddleEnd"
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__unicode_text() -> None:
    """Non-ASCII text (CJK, accented letters, Greek, emoji) must be left intact."""
    sample = "中文测试 Unicode: café, naïve, Ω, emoji 😀"
    assert sanitize_postgres_text(sample) == sample
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__real_world_pdf_scenario() -> None:
    """An invoice-like string with PDF extraction artifacts is cleaned as expected."""
    # Roughly what a PDF text extractor might hand us: stray NULs and
    # trailing control characters mixed into tab/newline-formatted fields.
    extracted = "Invoice\x00Number:\t12345\nDate:\t2024-01-01\x00\nTotal:\t$100.00\x01\x02"
    cleaned = sanitize_postgres_text(extracted)
    assert cleaned == "InvoiceNumber:\t12345\nDate:\t2024-01-01\nTotal:\t$100.00"
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__only_control_characters() -> None:
    """A string made up solely of problematic characters reduces to the empty string."""
    cleaned = sanitize_postgres_text("\x00\x01\x02\x03\x08")
    assert cleaned == ""
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__preserves_spaces_and_punctuation() -> None:
    """Regular spaces and punctuation marks must pass through unchanged."""
    sentence = "Hello, World! How are you? I'm fine. Test@example.com"
    assert sanitize_postgres_text(sentence) == sentence
|
||
|
|
|
||
|
|
|
||
|
|
def test_sanitize_postgres_text__newlines_and_paragraphs() -> None:
    """Blank-line paragraph breaks in multi-paragraph text must be kept."""
    paragraphs = "Paragraph 1\n\nParagraph 2\n\nParagraph 3"
    cleaned = sanitize_postgres_text(paragraphs)
    assert cleaned == paragraphs
    # Two paragraph breaks, two newlines each.
    assert cleaned.count("\n") == 4
|