Pdf parser robustness improvement (#4307)

2025-12-17 00:59:14 +08:00
parent a1ec9cc633
commit 9aa490f475
6 changed files with 266 additions and 31 deletions
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -902,3 +902,15 @@ class PDFEmbedBase64DecodeError(SkyvernException):
            src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src
            message += f". PDF embed src: {src_preview}"
        super().__init__(message)
+
+
+class PDFParsingError(SkyvernException):
+    """Error for a PDF that neither pypdf nor pdfplumber could parse; stores both error texts."""
+
+    def __init__(self, file_identifier: str, pypdf_error: str, pdfplumber_error: str):
+        message = (
+            f"Failed to parse PDF '{file_identifier}'. pypdf error: {pypdf_error}; pdfplumber error: {pdfplumber_error}"
+        )
+        self.file_identifier = file_identifier
+        self.pypdf_error, self.pdfplumber_error = pypdf_error, pdfplumber_error
+        super().__init__(message)
--- a/skyvern/forge/sdk/utils/pdf_parser.py
+++ b/skyvern/forge/sdk/utils/pdf_parser.py
@@ -0,0 +1,173 @@
+"""
+Utility functions for PDF parsing with fallback support.
+
+This module provides robust PDF parsing that tries pypdf first and falls back
+to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
+"""
+
+import pdfplumber
+import structlog
+from pypdf import PdfReader
+
+from skyvern.exceptions import PDFParsingError
+
+LOG = structlog.get_logger(__name__)
+
+
+def extract_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> str:
+    """
+    Extract text from a PDF file with fallback support.
+
+    pypdf is tried first. If it raises for any reason, the same file is parsed
+    with pdfplumber instead, so PDFs that pypdf cannot read (e.g. corrupted
+    streams or non-standard formatting) may still yield text. The call fails
+    only when both parsers raise.
+
+    Args:
+        file_path: Path to the PDF file to parse
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        Extracted text from all pages of the PDF. With pypdf every page
+        contributes its text plus a newline (just the newline when the page has
+        no text); with pdfplumber pages without text are skipped.
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
+            Raised ``from`` the pdfplumber exception so the cause is explicit.
+
+    Example:
+        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
+        >>> print(f"Extracted {len(text)} characters")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        page_count = len(reader.pages)
+        # Build the result with a single join rather than repeated string +=;
+        # `or ""` guards against a page whose extract_text() returns nothing.
+        extracted_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
+
+        LOG.info(
+            "Successfully parsed PDF with pypdf",
+            file_identifier=identifier,
+            page_count=page_count,
+            text_length=len(extracted_text),
+        )
+        return extracted_text
+
+    except Exception as pypdf_error:
+        LOG.warning(
+            "Failed to parse PDF with pypdf, trying pdfplumber",
+            file_identifier=identifier,
+            error=str(pypdf_error),
+            error_type=type(pypdf_error).__name__,
+        )
+
+        # Fallback to pdfplumber
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                page_count = len(pdf.pages)
+                # Pages that yield no text are left out of the result.
+                page_texts = (page.extract_text() for page in pdf.pages)
+                extracted_text = "".join(text + "\n" for text in page_texts if text)
+
+                LOG.info(
+                    "Successfully parsed PDF with pdfplumber",
+                    file_identifier=identifier,
+                    page_count=page_count,
+                    text_length=len(extracted_text),
+                )
+                return extracted_text
+
+        except Exception as pdfplumber_error:
+            LOG.error(
+                "Failed to parse PDF with both pypdf and pdfplumber",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )
+            # Chain explicitly so the pdfplumber failure is recorded as __cause__;
+            # the pypdf failure stays reachable through that exception's __context__.
+            raise PDFParsingError(
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            ) from pdfplumber_error
+
+
+def validate_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> bool:
+    """
+    Validate that a file is a readable PDF.
+
+    A parser accepts the file when it can open it and count its pages; no page
+    text is extracted. pypdf is tried first, with pdfplumber as the fallback.
+
+    Args:
+        file_path: Path to the PDF file to validate
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        True if the PDF can be opened and read by at least one parser. False is
+        never returned; failure is signalled by raising instead.
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF.
+            Raised ``from`` the pdfplumber exception so the cause is explicit.
+
+    Example:
+        >>> if validate_pdf_file("/path/to/file.pdf"):
+        ...     print("Valid PDF file")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        # Just check if we can access pages, don't read content yet
+        _ = len(reader.pages)
+        LOG.debug("PDF validation successful with pypdf", file_identifier=identifier)
+        return True
+
+    except Exception as pypdf_error:
+        LOG.debug(
+            "PDF validation with pypdf failed, trying pdfplumber",
+            file_identifier=identifier,
+            error=str(pypdf_error),
+        )
+
+        # Fallback to pdfplumber
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                _ = len(pdf.pages)
+
+            LOG.info(
+                "PDF validation: pypdf failed but pdfplumber succeeded",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+            )
+            return True
+
+        except Exception as pdfplumber_error:
+            LOG.error(
+                "PDF validation failed with both pypdf and pdfplumber",
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            )
+            # Chain explicitly so the pdfplumber failure is recorded as __cause__.
+            raise PDFParsingError(
+                file_identifier=identifier,
+                pypdf_error=str(pypdf_error),
+                pdfplumber_error=str(pdfplumber_error),
+            ) from pdfplumber_error
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -27,8 +27,6 @@ from jinja2 import StrictUndefined
 from jinja2.sandbox import SandboxedEnvironment
 from playwright.async_api import Page
 from pydantic import BaseModel, Field, model_validator
-from pypdf import PdfReader
-from pypdf.errors import PdfReadError

 from skyvern.config import settings
 from skyvern.constants import (
@@ -41,6 +39,7 @@ from skyvern.exceptions import (
    ContextParameterValueNotFound,
    MissingBrowserState,
    MissingBrowserStatePage,
+    PDFParsingError,
    SkyvernException,
    TaskNotFound,
    UnexpectedTaskStatus,
@@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus
 from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
 from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
 from skyvern.forge.sdk.trace import TraceManager
+from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file
 from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
 from skyvern.forge.sdk.workflow.exceptions import (
    CustomizedCodeException,
@@ -3020,11 +3020,8 @@ class FileParserBlock(Block):
                )
        elif self.file_type == FileType.PDF:
            try:
-                # Try to read the file with PyPDF to validate it's a valid PDF file
-                reader = PdfReader(file_path)
-                # Just check if we can access pages, don't read content yet
-                _ = len(reader.pages)
-            except Exception as e:
+                validate_pdf_file(file_path, file_identifier=file_url_used)
+            except PDFParsingError as e:
                raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))

    async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
@@ -3087,15 +3084,14 @@ class FileParserBlock(Block):
            )

    async def _parse_pdf_file(self, file_path: str) -> str:
-        """Parse PDF file and return extracted text."""
+        """Parse PDF file and return extracted text.
+
+        Uses the shared PDF parsing utility that tries pypdf first,
+        then falls back to pdfplumber if pypdf fails.
+        """
        try:
-            reader = PdfReader(file_path)
-            extracted_text = ""
-            page_count = len(reader.pages)
-            for i in range(page_count):
-                extracted_text += reader.pages[i].extract_text() + "\n"
-            return extracted_text
-        except PdfReadError as e:
+            return extract_pdf_file(file_path, file_identifier=self.file_url)
+        except PDFParsingError as e:
            raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))

    async def _extract_with_ai(
@@ -3314,14 +3310,9 @@ class PDFParserBlock(Block):
        else:
            file_path = await download_file(self.file_url)

-        extracted_text = ""
        try:
-            reader = PdfReader(file_path)
-            page_count = len(reader.pages)
-            for i in range(page_count):
-                extracted_text += reader.pages[i].extract_text() + "\n"
-
-        except PdfReadError:
+            extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url)
+        except PDFParsingError:
            return await self.build_block_result(
                success=False,
                failure_reason="Failed to parse PDF file",
--- a/skyvern/services/pdf_import_service.py
+++ b/skyvern/services/pdf_import_service.py
@@ -5,12 +5,12 @@ from typing import Any

 import structlog
 from fastapi import HTTPException
-from pypdf import PdfReader

 from skyvern.config import settings
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
 from skyvern.forge.sdk.schemas.organizations import Organization
+from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file
 from skyvern.schemas.workflows import WorkflowCreateYAMLRequest

 LOG = structlog.get_logger(__name__)
@@ -133,7 +133,11 @@ class PDFImportService:
        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
-        """Extract text from PDF file contents. Raises HTTPException if invalid."""
+        """Extract text from PDF file contents. Raises HTTPException if invalid.
+
+        Uses the shared PDF parsing utility that tries pypdf first,
+        then falls back to pdfplumber if pypdf fails.
+        """
        LOG.info("Extracting text from PDF", filename=file_name)

        # Save the uploaded file to a temporary location
@@ -142,14 +146,10 @@ class PDFImportService:
            temp_file_path = temp_file.name

        try:
-            reader = PdfReader(temp_file_path)
-            sop_text = ""
-            for page_num, page in enumerate(reader.pages, 1):
-                page_text = page.extract_text() or ""
-                sop_text += page_text + "\n"
-                LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
+            # Use the shared PDF parsing utility
+            sop_text = extract_pdf_file(temp_file_path, file_identifier=file_name)

-            LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
+            LOG.info("PDF text extraction complete", filename=file_name, total_text_length=len(sop_text))

            if not sop_text.strip():
                raise HTTPException(status_code=400, detail="No readable content found in the PDF.")