Remove setup.sh in favor of skyvern CLI (#4737)
This commit is contained in:
97
tests/unit/test_sanitization.py
Normal file
97
tests/unit/test_sanitization.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__normal_text() -> None:
    """Ordinary printable text must come back from the sanitizer unchanged."""
    original = "Hello, World! This is a normal PDF text with numbers 123 and symbols @#$%."
    assert sanitize_postgres_text(original) == original
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__with_nul_bytes() -> None:
    """NUL bytes (0x00) embedded in the input are expected to be dropped."""
    sanitized = sanitize_postgres_text("Hello\x00World\x00Test")
    assert sanitized == "HelloWorldTest"
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__with_control_characters() -> None:
    """Control characters other than common whitespace are expected to be dropped."""
    # A sample of control characters (0x01-0x03, 0x08, 0x0b, 0x0c, 0x0e, 0x1f)
    # placed between the words; only the words should remain.
    noisy_input = "Hello\x01\x02\x03World\x08\x0b\x0c\x0e\x1fTest"
    sanitized = sanitize_postgres_text(noisy_input)
    assert sanitized == "HelloWorldTest"
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__preserve_whitespace() -> None:
    """Tab, newline and carriage return must survive sanitization."""
    raw = "Hello\tWorld\nNew Line\rCarriage Return"
    sanitized = sanitize_postgres_text(raw)
    assert sanitized == raw
    # Each whitespace character must individually still be present.
    for whitespace_char in ("\t", "\n", "\r"):
        assert whitespace_char in sanitized
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__empty_string() -> None:
    """An empty input is expected to yield an empty output."""
    assert sanitize_postgres_text("") == ""
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__mixed_case() -> None:
    """Input mixing normal text, NUL bytes, control characters and whitespace."""
    raw = "PDF Text\x00with NUL\tbytes\nand\x01control\x08chars\rand normal text."
    sanitized = sanitize_postgres_text(raw)
    # \t, \n and \r are valid whitespace and stay; \x00, \x01 and \x08 are removed.
    assert sanitized == "PDF Textwith NUL\tbytes\nandcontrolchars\rand normal text."
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__multiple_nul_bytes() -> None:
    """Runs of consecutive NUL bytes are expected to be removed entirely."""
    sanitized = sanitize_postgres_text("Start\x00\x00\x00Middle\x00\x00End")
    assert sanitized == "StartMiddleEnd"
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__unicode_text() -> None:
    """Non-ASCII text (CJK, accented Latin, Greek, emoji) must pass through unchanged."""
    original = "中文测试 Unicode: café, naïve, Ω, emoji 😀"
    assert sanitize_postgres_text(original) == original
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__real_world_pdf_scenario() -> None:
    """Invoice-like text carrying the kind of artifacts PDF extraction might leave."""
    # Stray NUL bytes sit inside the fields and two control characters trail the
    # last line; the tab/newline layout is expected to remain intact.
    extracted = "Invoice\x00Number:\t12345\nDate:\t2024-01-01\x00\nTotal:\t$100.00\x01\x02"
    sanitized = sanitize_postgres_text(extracted)
    assert sanitized == "InvoiceNumber:\t12345\nDate:\t2024-01-01\nTotal:\t$100.00"
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__only_control_characters() -> None:
    """A string made up solely of problematic characters should sanitize to empty."""
    sanitized = sanitize_postgres_text("\x00\x01\x02\x03\x08")
    assert sanitized == ""
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__preserves_spaces_and_punctuation() -> None:
    """Regular spaces and punctuation marks must be left untouched."""
    original = "Hello, World! How are you? I'm fine. Test@example.com"
    assert sanitize_postgres_text(original) == original
|
||||
|
||||
|
||||
def test_sanitize_postgres_text__newlines_and_paragraphs() -> None:
    """Blank-line paragraph separators in multi-paragraph text are kept."""
    paragraphs = "Paragraph 1\n\nParagraph 2\n\nParagraph 3"
    sanitized = sanitize_postgres_text(paragraphs)
    assert sanitized == paragraphs
    # Two blank-line separators -> four newline characters in total.
    newline_total = sanitized.count("\n")
    assert newline_total == 4
|
||||
Reference in New Issue
Block a user