Pdf parser robustness improvement (#4307)

2025-12-17 00:59:14 +08:00
parent a1ec9cc633
commit 9aa490f475
6 changed files with 266 additions and 31 deletions
--- a/skyvern/forge/sdk/utils/pdf_parser.py
+++ b/skyvern/forge/sdk/utils/pdf_parser.py
@@ -0,0 +1,173 @@
+"""
+Utility functions for PDF parsing with fallback support.
+
+This module provides robust PDF parsing that tries pypdf first and falls back
+to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
+"""
+
+import pdfplumber
+import structlog
+from pypdf import PdfReader
+
+from skyvern.exceptions import PDFParsingError
+
+LOG = structlog.get_logger(__name__)
+
+
+def extract_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> str:
+    """
+    Extract text from a PDF file with fallback support.
+
+    This function attempts to parse the PDF using pypdf first. If that fails,
+    it automatically falls back to pdfplumber. This provides robust handling
+    of various PDF formats, including those with corrupted streams or non-standard
+    formatting that may cause pypdf to fail.
+
+    Args:
+        file_path: Path to the PDF file to parse
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        Extracted text from all pages of the PDF
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF
+
+    Example:
+        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
+        >>> print(f"Extracted {len(text)} characters")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        extracted_text = ""
+        page_count = len(reader.pages)
+
+        for i in range(page_count):
+            page_text = reader.pages[i].extract_text() or ""
+            extracted_text += page_text + "\n"
+
+        LOG.info(
+            "Successfully parsed PDF with pypdf",
+            file_identifier=identifier,
+            page_count=page_count,
+            text_length=len(extracted_text),
+        )
+        return extracted_text
+
+    except Exception as pypdf_error:
+        LOG.warning(
+            "Failed to parse PDF with pypdf, trying pdfplumber",
+            file_identifier=identifier,
+            error=str(pypdf_error),
+            error_type=type(pypdf_error).__name__,
+        )
+
+        # Fallback to pdfplumber
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                extracted_text = ""
+                page_count = len(pdf.pages)
+
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        extracted_text += page_text + "\n"
+
+                LOG.info(
+                    "Successfully parsed PDF with pdfplumber",
+                    file_identifier=identifier,
+                    page_count=page_count,
+                    text_length=len(extracted_text),
+                )
+                return extracted_text
+
+        except Exception as pdfplumber_error:
+            LOG.error(
+                "Failed to parse PDF with both pypdf and pdfplumber",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )
+            raise PDFParsingError(
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )
+
+
+def validate_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> bool:
+    """
+    Validate that a file is a readable PDF.
+
+    This function attempts to validate the PDF using pypdf first. If that fails,
+    it automatically falls back to pdfplumber validation.
+
+    Args:
+        file_path: Path to the PDF file to validate
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        True if the PDF can be opened and read by at least one parser
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF
+
+    Example:
+        >>> if validate_pdf_file("/path/to/file.pdf"):
+        ...     print("Valid PDF file")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        # Just check if we can access pages, don't read content yet
+        _ = len(reader.pages)
+        LOG.debug(
+            "PDF validation successful with pypdf",
+            file_identifier=identifier,
+        )
+        return True
+
+    except Exception as pypdf_error:
+        LOG.debug(
+            "PDF validation with pypdf failed, trying pdfplumber",
+            file_identifier=identifier,
+            error=str(pypdf_error),
+        )
+
+        # Fallback to pdfplumber
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                _ = len(pdf.pages)
+
+            LOG.info(
+                "PDF validation: pypdf failed but pdfplumber succeeded",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+            )
+            return True
+
+        except Exception as pdfplumber_error:
+            LOG.error(
+                "PDF validation failed with both pypdf and pdfplumber",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )
+            raise PDFParsingError(
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )