# Dorod-Sky/skyvern/forge/sdk/utils/pdf_parser.py

"""
Utility functions for PDF parsing with fallback support.

This module provides robust PDF parsing that tries pypdf first and falls back
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
"""

from collections.abc import Iterable

import pdfplumber
import structlog
from pypdf import PdfReader

from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
from skyvern.exceptions import PDFParsingError
from skyvern.forge.sdk.utils.sanitization import sanitize_postgres_text
from skyvern.utils.token_counter import count_tokens

LOG = structlog.get_logger(__name__)


def _join_pages_within_token_budget(
    page_texts: Iterable[str | None],
    *,
    page_count: int,
    max_tokens: int,
    identifier: str,
    keep_empty_pages: bool,
) -> str:
    """
    Concatenate per-page text, stopping before the first page that would exceed the token budget.

    Args:
        page_texts: Iterable yielding the extracted text of each page in document
                    order (``None`` or ``""`` for a page without text). It is
                    consumed lazily, so pages after the truncation point are
                    never extracted.
        page_count: Total number of pages in the document (used for logging only).
        max_tokens: Token budget for the combined text. A falsy value disables
                    truncation.
        identifier: Identifier included in log messages.
        keep_empty_pages: When True, a page without text still contributes a
                          newline; when False, such pages are skipped entirely.

    Returns:
        The included pages, each followed by a newline, joined together and
        passed through ``sanitize_postgres_text``.
    """
    parts: list[str] = []
    used_tokens = 0

    for page_index, raw_text in enumerate(page_texts):
        page_text = raw_text or ""
        if not page_text and not keep_empty_pages:
            continue

        if max_tokens:
            page_tokens = count_tokens(page_text)
            if used_tokens + page_tokens > max_tokens:
                # Truncate on a page boundary: the offending page is dropped whole
                # rather than being cut in the middle.
                LOG.warning(
                    "PDF text exceeds token limit, truncating at page boundary",
                    file_identifier=identifier,
                    pages_included=page_index,
                    total_pages=page_count,
                    max_tokens=max_tokens,
                )
                break
            used_tokens += page_tokens

        parts.append(page_text + "\n")

    # Sanitize text to remove characters that cannot be stored in PostgreSQL
    return sanitize_postgres_text("".join(parts))


def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
    max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS,
) -> str:
    """
    Extract text from a PDF file with fallback support.

    This function attempts to parse the PDF using pypdf first. If that fails,
    it automatically falls back to pdfplumber. This provides robust handling
    of various PDF formats, including those with corrupted streams or non-standard
    formatting that may cause pypdf to fail.

    Args:
        file_path: Path to the PDF file to parse
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.
        max_tokens: Maximum number of tokens (as measured by ``count_tokens``) to
                    include in the result. Pages are added in order until the next
                    page would exceed this budget; that page and all later pages
                    are dropped. A falsy value (e.g. 0) disables truncation.

    Returns:
        Extracted text of the included pages, each followed by a newline and
        sanitized for PostgreSQL storage. This is the text of all pages unless
        the token budget caused truncation at a page boundary.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        extracted_text = _join_pages_within_token_budget(
            (page.extract_text() for page in reader.pages),
            page_count=page_count,
            max_tokens=max_tokens,
            identifier=identifier,
            # The pypdf path has always emitted a newline for text-less pages.
            keep_empty_pages=True,
        )

        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text

    except Exception as pypdf_error:
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
            error_type=type(pypdf_error).__name__,
        )

        # Fallback to pdfplumber. This stays nested inside the handler because
        # pypdf_error is only bound while the except clause is active.
        try:
            with pdfplumber.open(file_path) as pdf:
                page_count = len(pdf.pages)
                extracted_text = _join_pages_within_token_budget(
                    (page.extract_text() for page in pdf.pages),
                    page_count=page_count,
                    max_tokens=max_tokens,
                    identifier=identifier,
                    # The pdfplumber path has always skipped text-less pages.
                    keep_empty_pages=False,
                )

                LOG.info(
                    "Successfully parsed PDF with pdfplumber",
                    file_identifier=identifier,
                    page_count=page_count,
                    text_length=len(extracted_text),
                )
                return extracted_text

        except Exception as pdfplumber_error:
            LOG.error(
                "Failed to parse PDF with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Chain explicitly so the traceback shows the pdfplumber failure as
            # the direct cause of the PDFParsingError.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error


def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Check that a file can be opened as a PDF by at least one parser.

    pypdf is tried first; if it raises, pdfplumber is tried as a fallback.
    Only the page list is accessed in either case, no page text is extracted.

    Args:
        file_path: Path to the PDF file to validate
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.

    Returns:
        True if pypdf or pdfplumber could open the file and enumerate its pages.
        The function never returns False; failure is reported by raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    label = file_identifier if file_identifier else file_path

    # First attempt: pypdf
    try:
        # Counting the pages is enough to prove the document can be opened;
        # page content is deliberately not read here.
        len(PdfReader(file_path).pages)
        LOG.debug("PDF validation successful with pypdf", file_identifier=label)
        return True

    except Exception as pypdf_error:
        pypdf_message = str(pypdf_error)
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=label,
            error=pypdf_message,
        )

        # Second attempt: pdfplumber
        try:
            with pdfplumber.open(file_path) as document:
                len(document.pages)

            LOG.info(
                "PDF validation: pypdf failed but pdfplumber succeeded",
                file_identifier=label,
                pypdf_error=pypdf_message,
            )
            return True

        except Exception as pdfplumber_error:
            pdfplumber_message = str(pdfplumber_error)
            LOG.error(
                "PDF validation failed with both pypdf and pdfplumber",
                file_identifier=label,
                pypdf_error=pypdf_message,
                pdfplumber_error=pdfplumber_message,
            )
            raise PDFParsingError(
                file_identifier=label,
                pypdf_error=pypdf_message,
                pdfplumber_error=pdfplumber_message,
            )