Fix: Sanitize PDF text to prevent PostgreSQL NUL byte errors (#4650)

This commit is contained in:
LawyZheng
2026-02-06 12:27:17 +08:00
committed by GitHub
parent 820b0a6f06
commit b55f0e35d8
2 changed files with 47 additions and 4 deletions

View File

@@ -11,6 +11,7 @@ from pypdf import PdfReader
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
from skyvern.exceptions import PDFParsingError
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
from skyvern.utils.token_counter import count_tokens
LOG = structlog.get_logger(__name__)
@@ -69,6 +70,9 @@ def extract_pdf_file(
current_tokens += page_tokens
extracted_text += page_text + "\n"
# Sanitize text to remove characters that PostgreSQL text columns cannot store (e.g. NUL bytes, "\x00")
extracted_text = sanitize_postgres_text(extracted_text)
LOG.info(
"Successfully parsed PDF with pypdf",
file_identifier=identifier,
@@ -109,6 +113,9 @@ def extract_pdf_file(
current_tokens += page_tokens
extracted_text += page_text + "\n"
# Sanitize text to remove characters that PostgreSQL text columns cannot store (e.g. NUL bytes, "\x00")
extracted_text = sanitize_postgres_text(extracted_text)
LOG.info(
"Successfully parsed PDF with pdfplumber",
file_identifier=identifier,