Fix: Sanitize PDF text to prevent PostgreSQL NUL byte errors (#4650)

This commit is contained in:
LawyZheng
2026-02-06 12:27:17 +08:00
committed by GitHub
parent 820b0a6f06
commit b55f0e35d8
2 changed files with 47 additions and 4 deletions

View File

@@ -11,6 +11,7 @@ from pypdf import PdfReader
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
from skyvern.exceptions import PDFParsingError
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
from skyvern.utils.token_counter import count_tokens
# Module-level structlog logger, named after this module via __name__.
LOG = structlog.get_logger(__name__)
@@ -69,6 +70,9 @@ def extract_pdf_file(
current_tokens += page_tokens
extracted_text += page_text + "\n"
# Sanitize text to remove characters that cannot be stored in PostgreSQL
extracted_text = sanitize_postgres_text(extracted_text)
LOG.info(
"Successfully parsed PDF with pypdf",
file_identifier=identifier,
@@ -109,6 +113,9 @@ def extract_pdf_file(
current_tokens += page_tokens
extracted_text += page_text + "\n"
# Sanitize text to remove characters that cannot be stored in PostgreSQL
extracted_text = sanitize_postgres_text(extracted_text)
LOG.info(
"Successfully parsed PDF with pdfplumber",
file_identifier=identifier,

View File

@@ -2,17 +2,53 @@
Utility functions for sanitizing content before storing in the database.
"""
import structlog
# Module-level structlog logger, named after this module via __name__.
LOG = structlog.get_logger(__name__)
# Translation table that deletes NUL and the other C0 control characters
# (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F). Tab (0x09), newline (0x0A) and carriage
# return (0x0D) are deliberately absent so normal whitespace survives.
# Built once at import time instead of on every call.
_POSTGRES_UNSAFE_CHARS_TABLE = str.maketrans(
    "",
    "",
    "".join(chr(code) for code in range(32) if code not in (0x09, 0x0A, 0x0D)),
)


def sanitize_postgres_text(text: str) -> str:
    """
    Sanitize text to be stored in PostgreSQL by removing problematic characters.

    PostgreSQL text fields cannot contain NUL (0x00) bytes, so they are
    removed. The remaining non-whitespace control characters
    (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F) are stripped as well, while the common
    whitespace characters tab, newline and carriage return are preserved.

    Args:
        text: The text to sanitize

    Returns:
        The sanitized text safe for PostgreSQL storage. Falsy input (such as
        an empty string) is returned unchanged.
    """
    if not text:
        return text

    # Single pass over the text; characters mapped to None in the table are
    # dropped, everything else is copied through untouched.
    sanitized = text.translate(_POSTGRES_UNSAFE_CHARS_TABLE)

    original_length = len(text)
    removed_count = original_length - len(sanitized)
    if removed_count > 0:
        # Only log when something was actually stripped, to keep clean input quiet.
        LOG.debug(
            "Removed problematic characters from text",
            original_length=original_length,
            removed_count=removed_count,
            sanitized_length=len(sanitized),
        )

    return sanitized