Fix: Sanitize PDF text to prevent PostgreSQL NUL byte errors (#4650)
This commit is contained in:
@@ -11,6 +11,7 @@ from pypdf import PdfReader
|
||||
|
||||
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
|
||||
from skyvern.exceptions import PDFParsingError
|
||||
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
|
||||
from skyvern.utils.token_counter import count_tokens
|
||||
|
||||
LOG = structlog.get_logger(__name__)
|
||||
@@ -69,6 +70,9 @@ def extract_pdf_file(
|
||||
current_tokens += page_tokens
|
||||
extracted_text += page_text + "\n"
|
||||
|
||||
# Sanitize text to remove characters that cannot be stored in PostgreSQL
|
||||
extracted_text = sanitize_postgres_text(extracted_text)
|
||||
|
||||
LOG.info(
|
||||
"Successfully parsed PDF with pypdf",
|
||||
file_identifier=identifier,
|
||||
@@ -109,6 +113,9 @@ def extract_pdf_file(
|
||||
current_tokens += page_tokens
|
||||
extracted_text += page_text + "\n"
|
||||
|
||||
# Sanitize text to remove characters that cannot be stored in PostgreSQL
|
||||
extracted_text = sanitize_postgres_text(extracted_text)
|
||||
|
||||
LOG.info(
|
||||
"Successfully parsed PDF with pdfplumber",
|
||||
file_identifier=identifier,
|
||||
|
||||
@@ -2,17 +2,53 @@
|
||||
Utility functions for sanitizing content before storing in the database.
|
||||
"""
|
||||
|
||||
import structlog
|
||||
|
||||
LOG = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# Deletion table for str.translate: NUL (0x00) plus every other C0 control
# character except tab (0x09), line feed (0x0A) and carriage return (0x0D).
# Built once at import time so each call is a single pass over the text.
_POSTGRES_UNSAFE_CHARS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def sanitize_postgres_text(text: str) -> str:
    """
    Sanitize text to be stored in PostgreSQL by removing problematic characters.

    Removes:
    - NUL bytes (0x00), which PostgreSQL text fields cannot contain
    - The other C0 control characters (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F)

    Tab, line feed and carriage return are preserved, as is everything at or
    above 0x20 (including DEL, 0x7F).

    NOTE(review): only NUL is actually rejected by PostgreSQL; the remaining
    control characters are stripped as a policy of this function - confirm
    that callers do not rely on them.

    Args:
        text: The text to sanitize. Falsy input (empty string or None) is
            returned unchanged.

    Returns:
        The sanitized text safe for PostgreSQL storage
    """
    if not text:
        return text

    # One translate() pass replaces the previous ~30 chained replace() calls.
    sanitized = text.translate(_POSTGRES_UNSAFE_CHARS)

    removed_count = len(text) - len(sanitized)
    if removed_count > 0:
        LOG.debug(
            "Removed problematic characters from text",
            original_length=len(text),
            removed_count=removed_count,
            sanitized_length=len(sanitized),
        )

    return sanitized
|
||||
|
||||
Reference in New Issue
Block a user