Files
Dorod-Sky/skyvern/forge/sdk/utils/pdf_parser.py
2025-12-17 00:59:14 +08:00

174 lines
5.4 KiB
Python

"""
Utility functions for PDF parsing with fallback support.
This module provides robust PDF parsing that tries pypdf first and falls back
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
"""
import pdfplumber
import structlog
from pypdf import PdfReader
from skyvern.exceptions import PDFParsingError
# Module-level structlog logger used by the functions in this file.
LOG = structlog.get_logger(__name__)
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> str:
    """
    Extract text from a PDF file with fallback support.

    This function attempts to parse the PDF using pypdf first. If pypdf raises
    any exception, the same file is parsed again with pdfplumber. This provides
    robust handling of various PDF formats, including those with corrupted
    streams or non-standard formatting that may cause pypdf to fail.

    Args:
        file_path: Path to the PDF file to parse
        file_identifier: Optional identifier for logging (e.g., URL or filename).
            If not provided (or empty), uses file_path.

    Returns:
        Extracted text from all pages of the PDF, with a newline appended after
        each page's text. On the pypdf path a page without text still
        contributes a newline; on the pdfplumber path such pages are skipped.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
            The exception is chained from the pdfplumber failure.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        # extract_text() may yield nothing for a page; every page still
        # contributes its trailing newline on this path.
        extracted_text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text
    except Exception as pypdf_error:
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
            error_type=type(pypdf_error).__name__,
        )

        # Fallback to pdfplumber. This must stay inside the handler: Python
        # unbinds `pypdf_error` as soon as the except clause exits, and the
        # failure path below still needs it.
        try:
            with pdfplumber.open(file_path) as pdf:
                page_count = len(pdf.pages)
                # Pages with no extractable text are skipped entirely here.
                extracted_text = "".join(
                    page_text + "\n"
                    for page in pdf.pages
                    if (page_text := page.extract_text())
                )
            LOG.info(
                "Successfully parsed PDF with pdfplumber",
                file_identifier=identifier,
                page_count=page_count,
                text_length=len(extracted_text),
            )
            return extracted_text
        except Exception as pdfplumber_error:
            LOG.error(
                "Failed to parse PDF with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Chain explicitly so the traceback shows the parser failure that
            # led to this error instead of presenting it as a handler bug.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Validate that a file is a readable PDF.

    This function attempts to validate the PDF using pypdf first. If pypdf
    raises any exception, it falls back to pdfplumber validation. Validation
    only opens the file and counts its pages; no page content is extracted.

    Args:
        file_path: Path to the PDF file to validate
        file_identifier: Optional identifier for logging (e.g., URL or filename).
            If not provided (or empty), uses file_path.

    Returns:
        True if the PDF can be opened and its pages counted by at least one
        parser. The function never returns False; failure is reported by
        raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to validate the
            PDF. The exception is chained from the pdfplumber failure.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        # Just check if we can access pages, don't read content yet
        len(reader.pages)
        LOG.debug(
            "PDF validation successful with pypdf",
            file_identifier=identifier,
        )
        return True
    except Exception as pypdf_error:
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
        )

        # Fallback to pdfplumber. This must stay inside the handler: Python
        # unbinds `pypdf_error` as soon as the except clause exits, and both
        # outcomes below still report it.
        try:
            with pdfplumber.open(file_path) as pdf:
                len(pdf.pages)
            LOG.info(
                "PDF validation: pypdf failed but pdfplumber succeeded",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
            )
            return True
        except Exception as pdfplumber_error:
            LOG.error(
                "PDF validation failed with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Chain explicitly so the traceback shows the parser failure that
            # led to this error instead of presenting it as a handler bug.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error