# Deletion table for str.translate: NUL (0x00) plus every other C0 control
# character (0x01-0x1F) except the common whitespace \t (0x09), \n (0x0A)
# and \r (0x0D). Built once at import time instead of on every call.
_POSTGRES_UNSAFE_CHARS = str.maketrans(
    "", "", "".join(chr(code) for code in range(32) if chr(code) not in "\t\n\r")
)


def sanitize_postgres_text(text: str) -> str:
    """
    Sanitize text to be stored in PostgreSQL by removing problematic characters.

    Removes:
    - NUL bytes (0x00), which PostgreSQL text fields cannot contain
    - The other control characters 0x01-0x08, 0x0B-0x0C and 0x0E-0x1F

    Tab, line feed and carriage return are preserved, as is every character
    at or above 0x20 (including DEL, 0x7F).

    Args:
        text: The text to sanitize. Falsy input (an empty string, or None
            despite the annotation) is returned unchanged.

    Returns:
        The sanitized text safe for PostgreSQL storage
    """
    if not text:
        return text

    # One pass over the text; every mapped code point is dropped.
    sanitized = text.translate(_POSTGRES_UNSAFE_CHARS)

    removed_count = len(text) - len(sanitized)
    if removed_count > 0:
        # LOG is the module-level structlog logger of this file.
        LOG.debug(
            "Removed problematic characters from text",
            original_length=len(text),
            removed_count=removed_count,
            sanitized_length=len(sanitized),
        )

    return sanitized