from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text


def test_sanitize_postgres_text__normal_text() -> None:
    """Ordinary printable text must come back from the sanitizer untouched."""
    plain = "Hello, World! This is a normal PDF text with numbers 123 and symbols @#$%."
    # Input and expected output are the same string: nothing should be stripped.
    assert sanitize_postgres_text(plain) == plain


def test_sanitize_postgres_text__with_nul_bytes() -> None:
    """Embedded NUL (0x00) characters are dropped and the rest is joined up."""
    cleaned = sanitize_postgres_text("Hello\x00World\x00Test")
    # Each NUL is deleted outright, not replaced with a placeholder.
    assert cleaned == "HelloWorldTest"


def test_sanitize_postgres_text__with_control_characters() -> None:
    """Non-whitespace control characters are stripped from the text."""
    # Covers 0x01-0x03, backspace (0x08), vertical tab (0x0b),
    # form feed (0x0c), 0x0e and 0x1f.
    noisy = "Hello\x01\x02\x03World\x08\x0b\x0c\x0e\x1fTest"
    cleaned = sanitize_postgres_text(noisy)
    assert cleaned == "HelloWorldTest"


def test_sanitize_postgres_text__preserve_whitespace() -> None:
    """Tab, line feed and carriage return survive sanitization."""
    spaced = "Hello\tWorld\nNew Line\rCarriage Return"
    cleaned = sanitize_postgres_text(spaced)
    assert cleaned == spaced
    # Explicit per-character checks make a failure point at the exact
    # whitespace character that went missing.
    for whitespace_char in ("\t", "\n", "\r"):
        assert whitespace_char in cleaned


def test_sanitize_postgres_text__empty_string() -> None:
    """Sanitizing an empty string yields an empty string."""
    empty = ""
    assert sanitize_postgres_text(empty) == ""


def test_sanitize_postgres_text__mixed_case() -> None:
    """Input mixing plain text, NUL bytes, control chars and whitespace."""
    raw = "PDF Text\x00with NUL\tbytes\nand\x01control\x08chars\rand normal text."
    cleaned = sanitize_postgres_text(raw)
    # \x00, \x01 and \x08 disappear; \t, \n and \r are valid whitespace and stay.
    assert cleaned == "PDF Textwith NUL\tbytes\nandcontrolchars\rand normal text."


def test_sanitize_postgres_text__multiple_nul_bytes() -> None:
    """Runs of back-to-back NUL bytes are removed in their entirety."""
    nul_runs = "Start\x00\x00\x00Middle\x00\x00End"
    # A run of three and a run of two: no NUL may be left behind.
    assert sanitize_postgres_text(nul_runs) == "StartMiddleEnd"


def test_sanitize_postgres_text__unicode_text() -> None:
    """Non-ASCII text (CJK, accented Latin, Greek, emoji) is left intact."""
    sample = "中文测试 Unicode: café, naïve, Ω, emoji 😀"
    cleaned = sanitize_postgres_text(sample)
    assert cleaned == sample


def test_sanitize_postgres_text__real_world_pdf_scenario() -> None:
    """Invoice-like text with stray NUL/control bytes, as a PDF extractor might emit."""
    extracted = "Invoice\x00Number:\t12345\nDate:\t2024-01-01\x00\nTotal:\t$100.00\x01\x02"
    # Tabs and newlines that lay out the fields stay; the stray bytes go.
    wanted = "InvoiceNumber:\t12345\nDate:\t2024-01-01\nTotal:\t$100.00"
    assert sanitize_postgres_text(extracted) == wanted


def test_sanitize_postgres_text__only_control_characters() -> None:
    """A string made up solely of removable characters collapses to empty."""
    junk = "\x00\x01\x02\x03\x08"
    cleaned = sanitize_postgres_text(junk)
    assert cleaned == ""


def test_sanitize_postgres_text__preserves_spaces_and_punctuation() -> None:
    """Regular spaces and common punctuation marks pass through unchanged."""
    sentence = "Hello, World! How are you? I'm fine. Test@example.com"
    assert sanitize_postgres_text(sentence) == sentence


def test_sanitize_postgres_text__newlines_and_paragraphs() -> None:
    """Blank-line paragraph separators are kept in multi-paragraph text."""
    paragraphs = "Paragraph 1\n\nParagraph 2\n\nParagraph 3"
    cleaned = sanitize_postgres_text(paragraphs)
    assert cleaned == paragraphs
    # Two blank-line separators, two line feeds each.
    newline_total = cleaned.count("\n")
    assert newline_total == 4