Fix: Sanitize PDF text to prevent PostgreSQL NUL byte errors (#4650)
This commit is contained in:
@@ -11,6 +11,7 @@ from pypdf import PdfReader
|
|||||||
|
|
||||||
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
|
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
|
||||||
from skyvern.exceptions import PDFParsingError
|
from skyvern.exceptions import PDFParsingError
|
||||||
|
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
|
||||||
from skyvern.utils.token_counter import count_tokens
|
from skyvern.utils.token_counter import count_tokens
|
||||||
|
|
||||||
LOG = structlog.get_logger(__name__)
|
LOG = structlog.get_logger(__name__)
|
||||||
@@ -69,6 +70,9 @@ def extract_pdf_file(
|
|||||||
current_tokens += page_tokens
|
current_tokens += page_tokens
|
||||||
extracted_text += page_text + "\n"
|
extracted_text += page_text + "\n"
|
||||||
|
|
||||||
|
# Sanitize text to remove characters that cannot be stored in PostgreSQL
|
||||||
|
extracted_text = sanitize_postgres_text(extracted_text)
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"Successfully parsed PDF with pypdf",
|
"Successfully parsed PDF with pypdf",
|
||||||
file_identifier=identifier,
|
file_identifier=identifier,
|
||||||
@@ -109,6 +113,9 @@ def extract_pdf_file(
|
|||||||
current_tokens += page_tokens
|
current_tokens += page_tokens
|
||||||
extracted_text += page_text + "\n"
|
extracted_text += page_text + "\n"
|
||||||
|
|
||||||
|
# Sanitize text to remove characters that cannot be stored in PostgreSQL
|
||||||
|
extracted_text = sanitize_postgres_text(extracted_text)
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"Successfully parsed PDF with pdfplumber",
|
"Successfully parsed PDF with pdfplumber",
|
||||||
file_identifier=identifier,
|
file_identifier=identifier,
|
||||||
|
|||||||
@@ -2,17 +2,53 @@
|
|||||||
Utility functions for sanitizing content before storing in the database.
|
Utility functions for sanitizing content before storing in the database.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
LOG = structlog.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Translation table that deletes NUL (0x00) and the control characters
# 0x01-0x08, 0x0B-0x0C and 0x0E-0x1F. Tab (0x09), newline (0x0A) and carriage
# return (0x0D) are deliberately absent so normal whitespace survives.
# Built once at import time so each call is a single pass over the text.
_POSTGRES_UNSAFE_CHARS = dict.fromkeys(
    code for code in range(32) if code not in (0x09, 0x0A, 0x0D)
)


def sanitize_postgres_text(text: str) -> str:
    """
    Sanitize text to be stored in PostgreSQL by removing problematic characters.

    PostgreSQL text fields cannot contain NUL (0x00) bytes. In addition to NUL,
    this function strips the remaining low control characters
    (0x01-0x08, 0x0B-0x0C, 0x0E-0x1F) while preserving common whitespace:
    tab, newline and carriage return.

    Args:
        text: The text to sanitize. Falsy input (e.g. an empty string) is
            returned unchanged.

    Returns:
        The sanitized text safe for PostgreSQL storage
    """
    if not text:
        return text

    # Characters mapped to None in the table are deleted by str.translate.
    sanitized = text.translate(_POSTGRES_UNSAFE_CHARS)

    original_length = len(text)
    removed_count = original_length - len(sanitized)
    if removed_count > 0:
        # Only lengths are logged; the text itself is not included.
        LOG.debug(
            "Removed problematic characters from text",
            original_length=original_length,
            removed_count=removed_count,
            sanitized_length=len(sanitized),
        )

    return sanitized
|
||||||
|
|||||||
Reference in New Issue
Block a user