Pdf parser robustness improvement (#4307)

2025-12-17 00:59:14 +08:00
parent a1ec9cc633
commit 9aa490f475
6 changed files with 266 additions and 31 deletions
--- a/skyvern/services/pdf_import_service.py
+++ b/skyvern/services/pdf_import_service.py
@@ -5,12 +5,12 @@ from typing import Any

 import structlog
 from fastapi import HTTPException
-from pypdf import PdfReader

 from skyvern.config import settings
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
 from skyvern.forge.sdk.schemas.organizations import Organization
+from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file
 from skyvern.schemas.workflows import WorkflowCreateYAMLRequest

 LOG = structlog.get_logger(__name__)
@@ -133,7 +133,11 @@ class PDFImportService:
        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
-        """Extract text from PDF file contents. Raises HTTPException if invalid."""
+        """Extract text from PDF file contents. Raises HTTPException if invalid.
+
+        Uses the shared PDF parsing utility that tries pypdf first,
+        then falls back to pdfplumber if pypdf fails.
+        """
        LOG.info("Extracting text from PDF", filename=file_name)

        # Save the uploaded file to a temporary location
@@ -142,14 +146,10 @@ class PDFImportService:
            temp_file_path = temp_file.name

        try:
-            reader = PdfReader(temp_file_path)
-            sop_text = ""
-            for page_num, page in enumerate(reader.pages, 1):
-                page_text = page.extract_text() or ""
-                sop_text += page_text + "\n"
-                LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
+            # Use the shared PDF parsing utility
+            sop_text = extract_pdf_file(temp_file_path, file_identifier=file_name)

-            LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
+            LOG.info("PDF text extraction complete", filename=file_name, total_text_length=len(sop_text))

            if not sop_text.strip():
                raise HTTPException(status_code=400, detail="No readable content found in the PDF.")