210 lines
7.2 KiB
Python
210 lines
7.2 KiB
Python
"""
|
|
Utility functions for PDF parsing with fallback support.
|
|
|
|
This module provides robust PDF parsing that tries pypdf first and falls back
|
|
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
|
|
"""
|
|
|
|
from collections.abc import Iterable

import pdfplumber
import structlog
from pypdf import PdfReader

from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
from skyvern.exceptions import PDFParsingError
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
from skyvern.utils.token_counter import count_tokens
|
|
|
|
# Module-level structlog logger; the functions below log events with key/value context.
LOG = structlog.get_logger(__name__)
|
|
|
|
|
|
def _join_pages_within_token_limit(
    page_texts: Iterable[str | None],
    *,
    identifier: str,
    page_count: int,
    max_tokens: int,
    skip_empty_pages: bool,
) -> str:
    """
    Concatenate per-page text, stopping before the first page that would exceed the token budget.

    Args:
        page_texts: Text of each page in document order. The iterable is consumed
            lazily: once the budget is hit, no further items are pulled from it.
        identifier: Value logged as ``file_identifier`` when truncation occurs.
        page_count: Total number of pages; only used in the truncation log entry.
        max_tokens: Token budget. A falsy value (e.g. 0) disables the limit.
        skip_empty_pages: If True, pages without text contribute nothing. If False,
            they are still counted and contribute a newline.

    Returns:
        The text of every included page, each followed by a newline.
    """
    chunks: list[str] = []
    tokens_used = 0

    for page_index, raw_text in enumerate(page_texts):
        page_text = raw_text or ""
        if skip_empty_pages and not page_text:
            continue

        if max_tokens:
            page_tokens = count_tokens(page_text)
            if tokens_used + page_tokens > max_tokens:
                # Truncate on a page boundary: the offending page is dropped whole.
                LOG.warning(
                    "PDF text exceeds token limit, truncating at page boundary",
                    file_identifier=identifier,
                    pages_included=page_index,
                    total_pages=page_count,
                    max_tokens=max_tokens,
                )
                break
            tokens_used += page_tokens

        chunks.append(page_text + "\n")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(chunks)


def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
    max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS,
) -> str:
    """
    Extract text from a PDF file with fallback support.

    This function attempts to parse the PDF using pypdf first. If that fails
    for any reason, it automatically falls back to pdfplumber.

    Args:
        file_path: Path to the PDF file to parse
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.
        max_tokens: Token budget for the extracted text. Pages are added whole
                        until the next page would exceed the budget; the remaining
                        pages are dropped and a warning is logged. A falsy value
                        (e.g. 0) disables the limit.

    Returns:
        Extracted text of the included pages (possibly truncated at a page
        boundary), each page followed by a newline, after being passed through
        sanitize_postgres_text.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
            The pdfplumber failure is attached as the exception's cause.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)

        # NOTE: on this path a page without text still contributes a newline,
        # whereas the pdfplumber path below skips such pages entirely. That
        # asymmetry is intentional here only in the sense that it is preserved
        # as-is so existing output does not change.
        extracted_text = _join_pages_within_token_limit(
            (page.extract_text() for page in reader.pages),
            identifier=identifier,
            page_count=page_count,
            max_tokens=max_tokens,
            skip_empty_pages=False,
        )

        # Sanitize text to remove characters that cannot be stored in PostgreSQL
        extracted_text = sanitize_postgres_text(extracted_text)

        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text

    except Exception as pypdf_error:
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
            error_type=type(pypdf_error).__name__,
        )

        # Fallback to pdfplumber. This must stay inside the except block:
        # pypdf_error is unbound once the block exits.
        try:
            with pdfplumber.open(file_path) as pdf:
                page_count = len(pdf.pages)
                extracted_text = _join_pages_within_token_limit(
                    (page.extract_text() for page in pdf.pages),
                    identifier=identifier,
                    page_count=page_count,
                    max_tokens=max_tokens,
                    skip_empty_pages=True,
                )

                # Sanitize text to remove characters that cannot be stored in PostgreSQL
                extracted_text = sanitize_postgres_text(extracted_text)

                LOG.info(
                    "Successfully parsed PDF with pdfplumber",
                    file_identifier=identifier,
                    page_count=page_count,
                    text_length=len(extracted_text),
                )
                return extracted_text

        except Exception as pdfplumber_error:
            LOG.error(
                "Failed to parse PDF with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Chain explicitly so the traceback shows the parser failure as the
            # cause rather than as an error raised while handling another error.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error
|
|
|
|
|
|
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Validate that a file is a readable PDF.

    This function attempts to validate the PDF using pypdf first. If that fails,
    it automatically falls back to pdfplumber validation. "Valid" here means a
    parser could open the file and enumerate its pages; page content is not read.

    Args:
        file_path: Path to the PDF file to validate
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.

    Returns:
        True if the PDF can be opened and read by at least one parser. This
        function never returns False; failure is signalled by raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF.
            The pdfplumber failure is attached as the exception's cause.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        # Just check if we can access pages, don't read content yet
        _ = len(reader.pages)
        LOG.debug(
            "PDF validation successful with pypdf",
            file_identifier=identifier,
        )
        return True

    except Exception as pypdf_error:
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
        )

        # Fallback to pdfplumber. This must stay inside the except block:
        # pypdf_error is unbound once the block exits.
        try:
            with pdfplumber.open(file_path) as pdf:
                _ = len(pdf.pages)

            LOG.info(
                "PDF validation: pypdf failed but pdfplumber succeeded",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
            )
            return True

        except Exception as pdfplumber_error:
            LOG.error(
                "PDF validation failed with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Chain explicitly so the traceback shows the parser failure as the
            # cause rather than as an error raised while handling another error.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error
|