""" Utility functions for PDF parsing with fallback support. This module provides robust PDF parsing that tries pypdf first and falls back to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats. """ import pdfplumber import structlog from pypdf import PdfReader from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS from skyvern.exceptions import PDFParsingError from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text from skyvern.utils.token_counter import count_tokens LOG = structlog.get_logger(__name__) def extract_pdf_file( file_path: str, file_identifier: str | None = None, max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS, ) -> str: """ Extract text from a PDF file with fallback support. This function attempts to parse the PDF using pypdf first. If that fails, it automatically falls back to pdfplumber. This provides robust handling of various PDF formats, including those with corrupted streams or non-standard formatting that may cause pypdf to fail. Args: file_path: Path to the PDF file to parse file_identifier: Optional identifier for logging (e.g., URL or filename). If not provided, uses file_path. Returns: Extracted text from all pages of the PDF Raises: PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF Example: >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf") >>> print(f"Extracted {len(text)} characters") """ identifier = file_identifier or file_path # Try pypdf first try: reader = PdfReader(file_path) extracted_text = "" current_tokens = 0 page_count = len(reader.pages) for i in range(page_count): page_text = reader.pages[i].extract_text() or "" if max_tokens: page_tokens = count_tokens(page_text) if current_tokens + page_tokens > max_tokens: LOG.warning( "PDF text exceeds token limit, truncating at page boundary", file_identifier=identifier, pages_included=i, total_pages=page_count, max_tokens=max_tokens, ) break current_tokens += page_tokens extracted_text += page_text + "\n" # Sanitize text to remove characters that cannot be stored in PostgreSQL extracted_text = sanitize_postgres_text(extracted_text) LOG.info( "Successfully parsed PDF with pypdf", file_identifier=identifier, page_count=page_count, text_length=len(extracted_text), ) return extracted_text except Exception as pypdf_error: LOG.warning( "Failed to parse PDF with pypdf, trying pdfplumber", file_identifier=identifier, error=str(pypdf_error), error_type=type(pypdf_error).__name__, ) # Fallback to pdfplumber try: with pdfplumber.open(file_path) as pdf: extracted_text = "" current_tokens = 0 page_count = len(pdf.pages) for i, page in enumerate(pdf.pages): page_text = page.extract_text() if page_text: if max_tokens: page_tokens = count_tokens(page_text) if current_tokens + page_tokens > max_tokens: LOG.warning( "PDF text exceeds token limit, truncating at page boundary", file_identifier=identifier, pages_included=i, total_pages=page_count, max_tokens=max_tokens, ) break current_tokens += page_tokens extracted_text += page_text + "\n" # Sanitize text to remove characters that cannot be stored in PostgreSQL extracted_text = sanitize_postgres_text(extracted_text) LOG.info( "Successfully parsed PDF with pdfplumber", file_identifier=identifier, page_count=page_count, text_length=len(extracted_text), ) return extracted_text except Exception as pdfplumber_error: LOG.error( "Failed to parse PDF with both pypdf and pdfplumber", file_identifier=identifier, pypdf_error=str(pypdf_error), pdfplumber_error=str(pdfplumber_error), ) raise PDFParsingError( file_identifier=identifier, pypdf_error=str(pypdf_error), pdfplumber_error=str(pdfplumber_error), ) def validate_pdf_file( file_path: str, file_identifier: str | None = None, ) -> bool: """ Validate that a file is a readable PDF. This function attempts to validate the PDF using pypdf first. If that fails, it automatically falls back to pdfplumber validation. Args: file_path: Path to the PDF file to validate file_identifier: Optional identifier for logging (e.g., URL or filename). If not provided, uses file_path. Returns: True if the PDF can be opened and read by at least one parser Raises: PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF Example: >>> if validate_pdf_file("/path/to/file.pdf"): ... print("Valid PDF file") """ identifier = file_identifier or file_path # Try pypdf first try: reader = PdfReader(file_path) # Just check if we can access pages, don't read content yet _ = len(reader.pages) LOG.debug( "PDF validation successful with pypdf", file_identifier=identifier, ) return True except Exception as pypdf_error: LOG.debug( "PDF validation with pypdf failed, trying pdfplumber", file_identifier=identifier, error=str(pypdf_error), ) # Fallback to pdfplumber try: with pdfplumber.open(file_path) as pdf: _ = len(pdf.pages) LOG.info( "PDF validation: pypdf failed but pdfplumber succeeded", file_identifier=identifier, pypdf_error=str(pypdf_error), ) return True except Exception as pdfplumber_error: LOG.error( "PDF validation failed with both pypdf and pdfplumber", file_identifier=identifier, pypdf_error=str(pypdf_error), pdfplumber_error=str(pdfplumber_error), ) raise PDFParsingError( file_identifier=identifier, pypdf_error=str(pypdf_error), pdfplumber_error=str(pdfplumber_error), )