Pdf parser robustness improvement (#4307)
This commit is contained in:
@@ -902,3 +902,15 @@ class PDFEmbedBase64DecodeError(SkyvernException):
|
||||
src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src
|
||||
message += f". PDF embed src: {src_preview}"
|
||||
super().__init__(message)
|
||||
|
||||
|
||||
class PDFParsingError(SkyvernException):
    """Error signalling that no available PDF parser could read a file.

    The identifier of the file and the message from each parser are kept as
    attributes so callers can inspect them individually instead of parsing
    the combined exception message.
    """

    def __init__(self, file_identifier: str, pypdf_error: str, pdfplumber_error: str):
        # Name or URL of the PDF that could not be parsed.
        self.file_identifier = file_identifier
        # Failure message reported for the pypdf attempt.
        self.pypdf_error = pypdf_error
        # Failure message reported for the pdfplumber attempt.
        self.pdfplumber_error = pdfplumber_error

        message = f"Failed to parse PDF '{file_identifier}'. pypdf error: {pypdf_error}; pdfplumber error: {pdfplumber_error}"
        super().__init__(message)
|
||||
|
||||
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
Utility functions for PDF parsing with fallback support.
|
||||
|
||||
This module provides robust PDF parsing that tries pypdf first and falls back
|
||||
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
|
||||
"""
|
||||
|
||||
import pdfplumber
|
||||
import structlog
|
||||
from pypdf import PdfReader
|
||||
|
||||
from skyvern.exceptions import PDFParsingError
|
||||
|
||||
LOG = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> str:
    """
    Extract the text of a PDF file, falling back to a second parser on failure.

    pypdf is tried first. If it raises any exception, the same file is read
    again with pdfplumber. Only when both parsers fail is an error raised.

    Args:
        file_path: Path to the PDF file to parse.
        file_identifier: Optional label used in log records and in the raised
            error (e.g., URL or filename). When omitted or empty, file_path
            is used instead.

    Returns:
        The concatenated text of the pages. On the pypdf path every page
        contributes its text followed by a newline (a page without text
        contributes just the newline). On the pdfplumber path pages without
        text are skipped entirely.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail. It carries the
            string form of both failures and is chained from the pdfplumber
            exception.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        pages = PdfReader(file_path).pages
        page_count = len(pages)
        # Build the result in one pass instead of growing a string with +=.
        extracted_text = "".join((page.extract_text() or "") + "\n" for page in pages)
    except Exception as exc:
        # Keep the message in a plain local: Python unbinds `exc` as soon as
        # this handler ends, and the fallback below still needs the text.
        pypdf_error = str(exc)
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=pypdf_error,
            error_type=type(exc).__name__,
        )
    else:
        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text

    # Fallback to pdfplumber
    try:
        with pdfplumber.open(file_path) as pdf:
            pages = pdf.pages
            page_count = len(pages)
            page_texts = (page.extract_text() for page in pages)
            # Pages for which pdfplumber returns no text are left out.
            extracted_text = "".join(text + "\n" for text in page_texts if text)
    except Exception as exc:
        pdfplumber_error = str(exc)
        LOG.error(
            "Failed to parse PDF with both pypdf and pdfplumber",
            file_identifier=identifier,
            pypdf_error=pypdf_error,
            pdfplumber_error=pdfplumber_error,
        )
        raise PDFParsingError(
            file_identifier=identifier,
            pypdf_error=pypdf_error,
            pdfplumber_error=pdfplumber_error,
        ) from exc

    LOG.info(
        "Successfully parsed PDF with pdfplumber",
        file_identifier=identifier,
        page_count=page_count,
        text_length=len(extracted_text),
    )
    return extracted_text
|
||||
|
||||
|
||||
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Check that a file can be opened as a PDF by at least one parser.

    Only the page list is accessed; no page text is extracted. pypdf is tried
    first, and pdfplumber is tried if pypdf raises any exception.

    Args:
        file_path: Path to the PDF file to validate.
        file_identifier: Optional label used in log records and in the raised
            error (e.g., URL or filename). When omitted or empty, file_path
            is used instead.

    Returns:
        True when either parser opens the file and exposes its pages. The
        function never returns False; failure is reported by raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail. It carries the
            string form of both failures and is chained from the pdfplumber
            exception.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        # Only touch the page list; page content is not read here.
        _ = len(PdfReader(file_path).pages)
    except Exception as exc:
        # Keep the message in a plain local: Python unbinds `exc` as soon as
        # this handler ends, and the fallback below still needs the text.
        pypdf_error = str(exc)
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=pypdf_error,
        )
    else:
        LOG.debug(
            "PDF validation successful with pypdf",
            file_identifier=identifier,
        )
        return True

    # Fallback to pdfplumber
    try:
        with pdfplumber.open(file_path) as pdf:
            _ = len(pdf.pages)
    except Exception as exc:
        pdfplumber_error = str(exc)
        LOG.error(
            "PDF validation failed with both pypdf and pdfplumber",
            file_identifier=identifier,
            pypdf_error=pypdf_error,
            pdfplumber_error=pdfplumber_error,
        )
        raise PDFParsingError(
            file_identifier=identifier,
            pypdf_error=pypdf_error,
            pdfplumber_error=pdfplumber_error,
        ) from exc

    LOG.info(
        "PDF validation: pypdf failed but pdfplumber succeeded",
        file_identifier=identifier,
        pypdf_error=pypdf_error,
    )
    return True
|
||||
@@ -27,8 +27,6 @@ from jinja2 import StrictUndefined
|
||||
from jinja2.sandbox import SandboxedEnvironment
|
||||
from playwright.async_api import Page
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfReadError
|
||||
|
||||
from skyvern.config import settings
|
||||
from skyvern.constants import (
|
||||
@@ -41,6 +39,7 @@ from skyvern.exceptions import (
|
||||
ContextParameterValueNotFound,
|
||||
MissingBrowserState,
|
||||
MissingBrowserStatePage,
|
||||
PDFParsingError,
|
||||
SkyvernException,
|
||||
TaskNotFound,
|
||||
UnexpectedTaskStatus,
|
||||
@@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus
|
||||
from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
|
||||
from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
|
||||
from skyvern.forge.sdk.trace import TraceManager
|
||||
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file
|
||||
from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
|
||||
from skyvern.forge.sdk.workflow.exceptions import (
|
||||
CustomizedCodeException,
|
||||
@@ -3020,11 +3020,8 @@ class FileParserBlock(Block):
|
||||
)
|
||||
elif self.file_type == FileType.PDF:
|
||||
try:
|
||||
# Try to read the file with PyPDF to validate it's a valid PDF file
|
||||
reader = PdfReader(file_path)
|
||||
# Just check if we can access pages, don't read content yet
|
||||
_ = len(reader.pages)
|
||||
except Exception as e:
|
||||
validate_pdf_file(file_path, file_identifier=file_url_used)
|
||||
except PDFParsingError as e:
|
||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||
|
||||
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||
@@ -3087,15 +3084,14 @@ class FileParserBlock(Block):
|
||||
)
|
||||
|
||||
async def _parse_pdf_file(self, file_path: str) -> str:
|
||||
"""Parse PDF file and return extracted text."""
|
||||
"""Parse PDF file and return extracted text.
|
||||
|
||||
Uses the shared PDF parsing utility that tries pypdf first,
|
||||
then falls back to pdfplumber if pypdf fails.
|
||||
"""
|
||||
try:
|
||||
reader = PdfReader(file_path)
|
||||
extracted_text = ""
|
||||
page_count = len(reader.pages)
|
||||
for i in range(page_count):
|
||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
||||
return extracted_text
|
||||
except PdfReadError as e:
|
||||
return extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||
except PDFParsingError as e:
|
||||
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
||||
|
||||
async def _extract_with_ai(
|
||||
@@ -3314,14 +3310,9 @@ class PDFParserBlock(Block):
|
||||
else:
|
||||
file_path = await download_file(self.file_url)
|
||||
|
||||
extracted_text = ""
|
||||
try:
|
||||
reader = PdfReader(file_path)
|
||||
page_count = len(reader.pages)
|
||||
for i in range(page_count):
|
||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
||||
|
||||
except PdfReadError:
|
||||
extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||
except PDFParsingError:
|
||||
return await self.build_block_result(
|
||||
success=False,
|
||||
failure_reason="Failed to parse PDF file",
|
||||
|
||||
@@ -5,12 +5,12 @@ from typing import Any
|
||||
|
||||
import structlog
|
||||
from fastapi import HTTPException
|
||||
from pypdf import PdfReader
|
||||
|
||||
from skyvern.config import settings
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
|
||||
from skyvern.forge.sdk.schemas.organizations import Organization
|
||||
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file
|
||||
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
|
||||
|
||||
LOG = structlog.get_logger(__name__)
|
||||
@@ -133,7 +133,11 @@ class PDFImportService:
|
||||
return raw
|
||||
|
||||
def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
|
||||
"""Extract text from PDF file contents. Raises HTTPException if invalid."""
|
||||
"""Extract text from PDF file contents. Raises HTTPException if invalid.
|
||||
|
||||
Uses the shared PDF parsing utility that tries pypdf first,
|
||||
then falls back to pdfplumber if pypdf fails.
|
||||
"""
|
||||
LOG.info("Extracting text from PDF", filename=file_name)
|
||||
|
||||
# Save the uploaded file to a temporary location
|
||||
@@ -142,14 +146,10 @@ class PDFImportService:
|
||||
temp_file_path = temp_file.name
|
||||
|
||||
try:
|
||||
reader = PdfReader(temp_file_path)
|
||||
sop_text = ""
|
||||
for page_num, page in enumerate(reader.pages, 1):
|
||||
page_text = page.extract_text() or ""
|
||||
sop_text += page_text + "\n"
|
||||
LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
|
||||
# Use the shared PDF parsing utility
|
||||
sop_text = extract_pdf_file(temp_file_path, file_identifier=file_name)
|
||||
|
||||
LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
|
||||
LOG.info("PDF text extraction complete", filename=file_name, total_text_length=len(sop_text))
|
||||
|
||||
if not sop_text.strip():
|
||||
raise HTTPException(status_code=400, detail="No readable content found in the PDF.")
|
||||
|
||||
Reference in New Issue
Block a user