PDF parser robustness improvement (#4307)
This commit is contained in:
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
Utility functions for PDF parsing with fallback support.
|
||||
|
||||
This module provides robust PDF parsing that tries pypdf first and falls back
|
||||
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
|
||||
"""
|
||||
|
||||
import pdfplumber
|
||||
import structlog
|
||||
from pypdf import PdfReader
|
||||
|
||||
from skyvern.exceptions import PDFParsingError
|
||||
|
||||
LOG = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> str:
    """
    Extract text from a PDF file with fallback support.

    pypdf is tried first. If it raises for any reason, the failure is logged
    as a warning and the same file is parsed again with pdfplumber. Only when
    pdfplumber fails as well is an error raised.

    Args:
        file_path: Path to the PDF file to parse
        file_identifier: Optional identifier for logging (e.g., URL or filename).
            If not provided, uses file_path.

    Returns:
        The extracted text with a newline appended after each page. On the
        pypdf path every page contributes a line (a page without text yields
        a bare newline); on the pdfplumber path pages without text are skipped.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
            It carries both error messages and is chained from the pdfplumber
            exception.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        # Build the result in one join instead of growing a string per page.
        extracted_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text
    except Exception as pypdf_error:
        # Python unbinds `pypdf_error` when this clause ends, so keep its
        # message for the combined error report further down.
        pypdf_failure = str(pypdf_error)
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=pypdf_failure,
            error_type=type(pypdf_error).__name__,
        )

    # Fallback to pdfplumber (only reached when pypdf raised above)
    try:
        with pdfplumber.open(file_path) as pdf:
            page_count = len(pdf.pages)
            # Pages must be read while the document is still open.
            page_texts = (page.extract_text() for page in pdf.pages)
            extracted_text = "".join(text + "\n" for text in page_texts if text)

        LOG.info(
            "Successfully parsed PDF with pdfplumber",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text
    except Exception as pdfplumber_error:
        pdfplumber_failure = str(pdfplumber_error)
        LOG.error(
            "Failed to parse PDF with both pypdf and pdfplumber",
            file_identifier=identifier,
            pypdf_error=pypdf_failure,
            pdfplumber_error=pdfplumber_failure,
        )
        raise PDFParsingError(
            file_identifier=identifier,
            pypdf_error=pypdf_failure,
            pdfplumber_error=pdfplumber_failure,
        ) from pdfplumber_error
|
||||
|
||||
|
||||
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Validate that a file is a readable PDF.

    The file counts as valid when a parser can open it and report its page
    count; page content is not read. pypdf is tried first, and pdfplumber is
    used as a fallback if pypdf raises.

    Args:
        file_path: Path to the PDF file to validate
        file_identifier: Optional identifier for logging (e.g., URL or filename).
            If not provided, uses file_path.

    Returns:
        True if the PDF can be opened and read by at least one parser. The
        function never returns False; failure is reported by raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF.
            It carries both error messages and is chained from the pdfplumber
            exception.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        # Just check if we can access pages, don't read content yet
        _ = len(reader.pages)
        LOG.debug(
            "PDF validation successful with pypdf",
            file_identifier=identifier,
        )
        return True
    except Exception as pypdf_error:
        # Python unbinds `pypdf_error` when this clause ends, so keep its
        # message for the fallback's log entries and the final error.
        pypdf_failure = str(pypdf_error)
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=pypdf_failure,
        )

    # Fallback to pdfplumber (only reached when pypdf raised above)
    try:
        with pdfplumber.open(file_path) as pdf:
            _ = len(pdf.pages)

        LOG.info(
            "PDF validation: pypdf failed but pdfplumber succeeded",
            file_identifier=identifier,
            pypdf_error=pypdf_failure,
        )
        return True
    except Exception as pdfplumber_error:
        pdfplumber_failure = str(pdfplumber_error)
        LOG.error(
            "PDF validation failed with both pypdf and pdfplumber",
            file_identifier=identifier,
            pypdf_error=pypdf_failure,
            pdfplumber_error=pdfplumber_failure,
        )
        raise PDFParsingError(
            file_identifier=identifier,
            pypdf_error=pypdf_failure,
            pdfplumber_error=pdfplumber_failure,
        ) from pdfplumber_error
|
||||
@@ -27,8 +27,6 @@ from jinja2 import StrictUndefined
|
||||
from jinja2.sandbox import SandboxedEnvironment
|
||||
from playwright.async_api import Page
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfReadError
|
||||
|
||||
from skyvern.config import settings
|
||||
from skyvern.constants import (
|
||||
@@ -41,6 +39,7 @@ from skyvern.exceptions import (
|
||||
ContextParameterValueNotFound,
|
||||
MissingBrowserState,
|
||||
MissingBrowserStatePage,
|
||||
PDFParsingError,
|
||||
SkyvernException,
|
||||
TaskNotFound,
|
||||
UnexpectedTaskStatus,
|
||||
@@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus
|
||||
from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
|
||||
from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
|
||||
from skyvern.forge.sdk.trace import TraceManager
|
||||
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file
|
||||
from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
|
||||
from skyvern.forge.sdk.workflow.exceptions import (
|
||||
CustomizedCodeException,
|
||||
@@ -3020,11 +3020,8 @@ class FileParserBlock(Block):
|
||||
)
|
||||
elif self.file_type == FileType.PDF:
|
||||
try:
|
||||
# Try to read the file with PyPDF to validate it's a valid PDF file
|
||||
reader = PdfReader(file_path)
|
||||
# Just check if we can access pages, don't read content yet
|
||||
_ = len(reader.pages)
|
||||
except Exception as e:
|
||||
validate_pdf_file(file_path, file_identifier=file_url_used)
|
||||
except PDFParsingError as e:
|
||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||
|
||||
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||
@@ -3087,15 +3084,14 @@ class FileParserBlock(Block):
|
||||
)
|
||||
|
||||
async def _parse_pdf_file(self, file_path: str) -> str:
|
||||
"""Parse PDF file and return extracted text."""
|
||||
"""Parse PDF file and return extracted text.
|
||||
|
||||
Uses the shared PDF parsing utility that tries pypdf first,
|
||||
then falls back to pdfplumber if pypdf fails.
|
||||
"""
|
||||
try:
|
||||
reader = PdfReader(file_path)
|
||||
extracted_text = ""
|
||||
page_count = len(reader.pages)
|
||||
for i in range(page_count):
|
||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
||||
return extracted_text
|
||||
except PdfReadError as e:
|
||||
return extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||
except PDFParsingError as e:
|
||||
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
||||
|
||||
async def _extract_with_ai(
|
||||
@@ -3314,14 +3310,9 @@ class PDFParserBlock(Block):
|
||||
else:
|
||||
file_path = await download_file(self.file_url)
|
||||
|
||||
extracted_text = ""
|
||||
try:
|
||||
reader = PdfReader(file_path)
|
||||
page_count = len(reader.pages)
|
||||
for i in range(page_count):
|
||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
||||
|
||||
except PdfReadError:
|
||||
extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||
except PDFParsingError:
|
||||
return await self.build_block_result(
|
||||
success=False,
|
||||
failure_reason="Failed to parse PDF file",
|
||||
|
||||
Reference in New Issue
Block a user