Fix: Sanitize PDF text to prevent PostgreSQL NUL byte errors (#4650)

2026-02-06 12:27:17 +08:00
parent 820b0a6f06
commit b55f0e35d8
2 changed files with 47 additions and 4 deletions
--- a/skyvern/forge/sdk/utils/pdf_parser.py
+++ b/skyvern/forge/sdk/utils/pdf_parser.py
@@ -11,6 +11,7 @@ from pypdf import PdfReader

 from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
 from skyvern.exceptions import PDFParsingError
+from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
 from skyvern.utils.token_counter import count_tokens

 LOG = structlog.get_logger(__name__)
@@ -69,6 +70,9 @@ def extract_pdf_file(
                current_tokens += page_tokens
            extracted_text += page_text + "\n"

+        # Strip NUL bytes (rejected by PostgreSQL) and other non-whitespace control characters
+        extracted_text = sanitize_postgres_text(extracted_text)
+
        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
@@ -109,6 +113,9 @@ def extract_pdf_file(
                            current_tokens += page_tokens
                        extracted_text += page_text + "\n"

+                # Strip NUL bytes (rejected by PostgreSQL) and other non-whitespace control characters
+                extracted_text = sanitize_postgres_text(extracted_text)
+
                LOG.info(
                    "Successfully parsed PDF with pdfplumber",
                    file_identifier=identifier,
--- a/skyvern/forge/sdk/utils/sanitization.py
+++ b/skyvern/forge/sdk/utils/sanitization.py
@@ -2,17 +2,53 @@
 Utility functions for sanitizing content before storing in the database.
 """

+import structlog
+
+LOG = structlog.get_logger(__name__)
+

 def sanitize_postgres_text(text: str) -> str:
    """
-    Sanitize text to be stored in PostgreSQL by removing NUL bytes.
+    Sanitize text to be stored in PostgreSQL by removing problematic characters.

-    PostgreSQL text fields cannot contain NUL (0x00) bytes, so we remove them.
+    PostgreSQL text fields cannot contain NUL bytes (0x00). Other C0 control
+    characters (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F) are accepted by PostgreSQL,
+    but this function strips them as well.
+
+    Tab, newline, and carriage return (0x09, 0x0A, 0x0D) are preserved.

    Args:
        text: The text to sanitize

    Returns:
-        The sanitized text without NUL bytes
+        The sanitized text safe for PostgreSQL storage
    """
-    return text.replace("\0", "")
+    if not text:
+        return text
+
+    original_length = len(text)
+
+    # Remove NUL bytes (0x00) - PostgreSQL cannot store these
+    sanitized = text.replace("\x00", "")
+
+    # Remove other problematic control characters (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F)
+    # Keep common whitespace: \t (0x09), \n (0x0A), \r (0x0D)
+    control_chars = (
+        "".join(chr(i) for i in range(1, 9))
+        + "".join(chr(i) for i in range(11, 13))
+        + "".join(chr(i) for i in range(14, 32))
+    )
+
+    for char in control_chars:
+        sanitized = sanitized.replace(char, "")
+
+    removed_count = original_length - len(sanitized)
+    if removed_count > 0:
+        LOG.debug(
+            "Removed problematic characters from text",
+            original_length=original_length,
+            removed_count=removed_count,
+            sanitized_length=len(sanitized),
+        )
+
+    return sanitized