Pdf parser robustness improvement (#4307)

This commit is contained in:
LawyZheng
2025-12-17 00:59:14 +08:00
committed by GitHub
parent a1ec9cc633
commit 9aa490f475
6 changed files with 266 additions and 31 deletions

View File

@@ -0,0 +1,173 @@
"""
Utility functions for PDF parsing with fallback support.
This module provides robust PDF parsing that tries pypdf first and falls back
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
"""
import pdfplumber
import structlog
from pypdf import PdfReader
from skyvern.exceptions import PDFParsingError
LOG = structlog.get_logger(__name__)
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> str:
    """Extract the text of every page of a PDF, falling back between parsers.

    pypdf is tried first. If it raises any exception, the failure is logged
    as a warning and the same file is parsed again with pdfplumber.

    Note that the two parsers assemble their output slightly differently:
    with pypdf every page contributes its text followed by a newline (a page
    without text contributes just the newline), whereas with pdfplumber pages
    that yield no text are skipped entirely.

    Args:
        file_path: Path to the PDF file to parse.
        file_identifier: Optional identifier used in log records and in the
            raised error (e.g. a URL or filename). Falls back to ``file_path``
            when not provided or empty.

    Returns:
        The concatenated text of all pages, each page terminated by ``"\\n"``.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the
            file. The pdfplumber exception is attached as ``__cause__``.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    try:
        # Only the parser calls live inside the ``try`` so that nothing but a
        # genuine pypdf failure can trigger the fallback.
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        extracted_text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    except Exception as pypdf_error:
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
            error_type=type(pypdf_error).__name__,
        )

        # The fallback is nested in this handler on purpose: ``pypdf_error``
        # is only bound while the ``except`` clause is executing.
        try:
            with pdfplumber.open(file_path) as pdf:
                page_count = len(pdf.pages)
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages for which pdfplumber returned nothing are left out.
            extracted_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        except Exception as pdfplumber_error:
            LOG.error(
                "Failed to parse PDF with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error

        LOG.info(
            "Successfully parsed PDF with pdfplumber",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text

    LOG.info(
        "Successfully parsed PDF with pypdf",
        file_identifier=identifier,
        page_count=page_count,
        text_length=len(extracted_text),
    )
    return extracted_text
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """Check that a file can be opened as a PDF by at least one parser.

    pypdf is tried first; if it raises any exception, pdfplumber is tried on
    the same file. The check consists of opening the file and asking for its
    page count; no page text is extracted.

    This function never returns ``False``: an unreadable file is reported by
    raising ``PDFParsingError``.

    Args:
        file_path: Path to the PDF file to validate.
        file_identifier: Optional identifier used in log records and in the
            raised error (e.g. a URL or filename). Falls back to ``file_path``
            when not provided or empty.

    Returns:
        ``True`` if either pypdf or pdfplumber could open the file and
        enumerate its pages.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to open the
            file. The pdfplumber exception is attached as ``__cause__``.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    try:
        # Asking for the page count is the whole check; content is not read.
        len(PdfReader(file_path).pages)
    except Exception as pypdf_error:
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
        )

        # The fallback is nested in this handler on purpose: ``pypdf_error``
        # is only bound while the ``except`` clause is executing.
        try:
            with pdfplumber.open(file_path) as pdf:
                len(pdf.pages)
        except Exception as pdfplumber_error:
            LOG.error(
                "PDF validation failed with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error

        LOG.info(
            "PDF validation: pypdf failed but pdfplumber succeeded",
            file_identifier=identifier,
            pypdf_error=str(pypdf_error),
        )
        return True

    LOG.debug(
        "PDF validation successful with pypdf",
        file_identifier=identifier,
    )
    return True

View File

@@ -27,8 +27,6 @@ from jinja2 import StrictUndefined
from jinja2.sandbox import SandboxedEnvironment
from playwright.async_api import Page
from pydantic import BaseModel, Field, model_validator
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from skyvern.config import settings
from skyvern.constants import (
@@ -41,6 +39,7 @@ from skyvern.exceptions import (
ContextParameterValueNotFound,
MissingBrowserState,
MissingBrowserStatePage,
PDFParsingError,
SkyvernException,
TaskNotFound,
UnexpectedTaskStatus,
@@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus
from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
from skyvern.forge.sdk.trace import TraceManager
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file
from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
from skyvern.forge.sdk.workflow.exceptions import (
CustomizedCodeException,
@@ -3020,11 +3020,8 @@ class FileParserBlock(Block):
)
elif self.file_type == FileType.PDF:
try:
# Try to read the file with PyPDF to validate it's a valid PDF file
reader = PdfReader(file_path)
# Just check if we can access pages, don't read content yet
_ = len(reader.pages)
except Exception as e:
validate_pdf_file(file_path, file_identifier=file_url_used)
except PDFParsingError as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
@@ -3087,15 +3084,14 @@ class FileParserBlock(Block):
)
async def _parse_pdf_file(self, file_path: str) -> str:
"""Parse PDF file and return extracted text."""
"""Parse PDF file and return extracted text.
Uses the shared PDF parsing utility that tries pypdf first,
then falls back to pdfplumber if pypdf fails.
"""
try:
reader = PdfReader(file_path)
extracted_text = ""
page_count = len(reader.pages)
for i in range(page_count):
extracted_text += reader.pages[i].extract_text() + "\n"
return extracted_text
except PdfReadError as e:
return extract_pdf_file(file_path, file_identifier=self.file_url)
except PDFParsingError as e:
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
async def _extract_with_ai(
@@ -3314,14 +3310,9 @@ class PDFParserBlock(Block):
else:
file_path = await download_file(self.file_url)
extracted_text = ""
try:
reader = PdfReader(file_path)
page_count = len(reader.pages)
for i in range(page_count):
extracted_text += reader.pages[i].extract_text() + "\n"
except PdfReadError:
extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url)
except PDFParsingError:
return await self.build_block_result(
success=False,
failure_reason="Failed to parse PDF file",