diff --git a/pyproject.toml b/pyproject.toml
index 584dec1f..6d64856d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     "asyncpg>=0.30.0,<0.31",
     "json-repair>=0.34.0,<0.35",
     "pypdf>=5.1.0,<6",
+    "pdfplumber>=0.11.0,<0.12",
     "fastmcp>=2.10.1,<3",
     "psutil>=7.0.0",
     "tiktoken>=0.9.0",
diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py
index 0193866c..58e5bede 100644
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -902,3 +902,15 @@ class PDFEmbedBase64DecodeError(SkyvernException):
             src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src
             message += f". PDF embed src: {src_preview}"
         super().__init__(message)
+
+
+class PDFParsingError(SkyvernException):
+    """Raised when PDF parsing fails with all available parsers."""
+
+    def __init__(self, file_identifier: str, pypdf_error: str, pdfplumber_error: str):
+        self.file_identifier = file_identifier
+        self.pypdf_error = pypdf_error
+        self.pdfplumber_error = pdfplumber_error
+        super().__init__(
+            f"Failed to parse PDF '{file_identifier}'. pypdf error: {pypdf_error}; pdfplumber error: {pdfplumber_error}"
+        )
diff --git a/skyvern/forge/sdk/utils/pdf_parser.py b/skyvern/forge/sdk/utils/pdf_parser.py
new file mode 100644
index 00000000..301f5e1d
--- /dev/null
+++ b/skyvern/forge/sdk/utils/pdf_parser.py
@@ -0,0 +1,178 @@
+"""
+Utility functions for PDF parsing with fallback support.
+
+This module provides robust PDF parsing that tries pypdf first and falls back
+to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
+"""
+
+from collections.abc import Iterable
+
+import pdfplumber
+import structlog
+from pypdf import PdfReader
+
+from skyvern.exceptions import PDFParsingError
+
+LOG = structlog.get_logger(__name__)
+
+
+def _join_page_texts(page_texts: Iterable[str | None]) -> str:
+    """Concatenate per-page text, ending every page (even an empty one) with a newline."""
+    return "".join(f"{page_text or ''}\n" for page_text in page_texts)
+
+
+def extract_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> str:
+    """
+    Extract text from a PDF file with fallback support.
+
+    This function attempts to parse the PDF using pypdf first. If that fails,
+    it automatically falls back to pdfplumber. This provides robust handling
+    of various PDF formats, including those with corrupted streams or non-standard
+    formatting that may cause pypdf to fail.
+
+    Both parsers return the same layout: the text of each page followed by a
+    newline, so a page without extractable text contributes an empty line.
+
+    Args:
+        file_path: Path to the PDF file to parse
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        Extracted text from all pages of the PDF
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
+            The pdfplumber exception is attached as the cause.
+
+    Example:
+        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
+        >>> print(f"Extracted {len(text)} characters")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        page_count = len(reader.pages)
+        extracted_text = _join_page_texts(page.extract_text() for page in reader.pages)
+    except Exception as exc:
+        # Keep only the message: Python unbinds the "as" target when the except
+        # clause ends, and the fallback below still needs it for logs and the error.
+        pypdf_error = str(exc)
+        LOG.warning(
+            "Failed to parse PDF with pypdf, trying pdfplumber",
+            file_identifier=identifier,
+            error=pypdf_error,
+            error_type=type(exc).__name__,
+        )
+    else:
+        LOG.info(
+            "Successfully parsed PDF with pypdf",
+            file_identifier=identifier,
+            page_count=page_count,
+            text_length=len(extracted_text),
+        )
+        return extracted_text
+
+    # Fallback to pdfplumber
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            page_count = len(pdf.pages)
+            extracted_text = _join_page_texts(page.extract_text() for page in pdf.pages)
+    except Exception as pdfplumber_error:
+        LOG.error(
+            "Failed to parse PDF with both pypdf and pdfplumber",
+            file_identifier=identifier,
+            pypdf_error=pypdf_error,
+            pdfplumber_error=str(pdfplumber_error),
+        )
+        raise PDFParsingError(
+            file_identifier=identifier,
+            pypdf_error=pypdf_error,
+            pdfplumber_error=str(pdfplumber_error),
+        ) from pdfplumber_error
+
+    LOG.info(
+        "Successfully parsed PDF with pdfplumber",
+        file_identifier=identifier,
+        page_count=page_count,
+        text_length=len(extracted_text),
+    )
+    return extracted_text
+
+
+def validate_pdf_file(
+    file_path: str,
+    file_identifier: str | None = None,
+) -> bool:
+    """
+    Validate that a file is a readable PDF.
+
+    This function attempts to validate the PDF using pypdf first. If that fails,
+    it automatically falls back to pdfplumber validation.
+
+    Args:
+        file_path: Path to the PDF file to validate
+        file_identifier: Optional identifier for logging (e.g., URL or filename).
+                        If not provided, uses file_path.
+
+    Returns:
+        True if the PDF can be opened and read by at least one parser
+
+    Raises:
+        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF.
+            The pdfplumber exception is attached as the cause.
+
+    Example:
+        >>> if validate_pdf_file("/path/to/file.pdf"):
+        ...     print("Valid PDF file")
+    """
+    identifier = file_identifier or file_path
+
+    # Try pypdf first
+    try:
+        reader = PdfReader(file_path)
+        # Just check if we can access pages, don't read content yet
+        _ = len(reader.pages)
+    except Exception as exc:
+        # Keep only the message; the "as" target is gone once this clause ends.
+        pypdf_error = str(exc)
+        LOG.debug(
+            "PDF validation with pypdf failed, trying pdfplumber",
+            file_identifier=identifier,
+            error=pypdf_error,
+        )
+    else:
+        LOG.debug(
+            "PDF validation successful with pypdf",
+            file_identifier=identifier,
+        )
+        return True
+
+    # Fallback to pdfplumber
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            _ = len(pdf.pages)
+    except Exception as pdfplumber_error:
+        LOG.error(
+            "PDF validation failed with both pypdf and pdfplumber",
+            file_identifier=identifier,
+            pypdf_error=pypdf_error,
+            pdfplumber_error=str(pdfplumber_error),
+        )
+        raise PDFParsingError(
+            file_identifier=identifier,
+            pypdf_error=pypdf_error,
+            pdfplumber_error=str(pdfplumber_error),
+        ) from pdfplumber_error
+
+    LOG.info(
+        "PDF validation: pypdf failed but pdfplumber succeeded",
+        file_identifier=identifier,
+        pypdf_error=pypdf_error,
+    )
+    return True
diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py
index 
063f7055..75cda342 100644 --- a/skyvern/forge/sdk/workflow/models/block.py +++ b/skyvern/forge/sdk/workflow/models/block.py @@ -27,8 +27,6 @@ from jinja2 import StrictUndefined from jinja2.sandbox import SandboxedEnvironment from playwright.async_api import Page from pydantic import BaseModel, Field, model_validator -from pypdf import PdfReader -from pypdf.errors import PdfReadError from skyvern.config import settings from skyvern.constants import ( @@ -41,6 +39,7 @@ from skyvern.exceptions import ( ContextParameterValueNotFound, MissingBrowserState, MissingBrowserStatePage, + PDFParsingError, SkyvernException, TaskNotFound, UnexpectedTaskStatus, @@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus from skyvern.forge.sdk.services.bitwarden import BitwardenConstants from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants from skyvern.forge.sdk.trace import TraceManager +from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext from skyvern.forge.sdk.workflow.exceptions import ( CustomizedCodeException, @@ -3020,11 +3020,8 @@ class FileParserBlock(Block): ) elif self.file_type == FileType.PDF: try: - # Try to read the file with PyPDF to validate it's a valid PDF file - reader = PdfReader(file_path) - # Just check if we can access pages, don't read content yet - _ = len(reader.pages) - except Exception as e: + validate_pdf_file(file_path, file_identifier=file_url_used) + except PDFParsingError as e: raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]: @@ -3087,15 +3084,14 @@ class FileParserBlock(Block): ) async def _parse_pdf_file(self, file_path: str) -> str: - """Parse PDF file and return extracted text.""" + """Parse PDF file and return extracted text. 
+ + Uses the shared PDF parsing utility that tries pypdf first, + then falls back to pdfplumber if pypdf fails. + """ try: - reader = PdfReader(file_path) - extracted_text = "" - page_count = len(reader.pages) - for i in range(page_count): - extracted_text += reader.pages[i].extract_text() + "\n" - return extracted_text - except PdfReadError as e: + return extract_pdf_file(file_path, file_identifier=self.file_url) + except PDFParsingError as e: raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e)) async def _extract_with_ai( @@ -3314,14 +3310,9 @@ class PDFParserBlock(Block): else: file_path = await download_file(self.file_url) - extracted_text = "" try: - reader = PdfReader(file_path) - page_count = len(reader.pages) - for i in range(page_count): - extracted_text += reader.pages[i].extract_text() + "\n" - - except PdfReadError: + extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url) + except PDFParsingError: return await self.build_block_result( success=False, failure_reason="Failed to parse PDF file", diff --git a/skyvern/services/pdf_import_service.py b/skyvern/services/pdf_import_service.py index 223833db..08cf5acb 100644 --- a/skyvern/services/pdf_import_service.py +++ b/skyvern/services/pdf_import_service.py @@ -5,12 +5,12 @@ from typing import Any import structlog from fastapi import HTTPException -from pypdf import PdfReader from skyvern.config import settings from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory from skyvern.forge.sdk.schemas.organizations import Organization +from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file from skyvern.schemas.workflows import WorkflowCreateYAMLRequest LOG = structlog.get_logger(__name__) @@ -133,7 +133,11 @@ class PDFImportService: return raw def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str: - """Extract text from PDF file contents. 
Raises HTTPException if invalid.""" + """Extract text from PDF file contents. Raises HTTPException if invalid. + + Uses the shared PDF parsing utility that tries pypdf first, + then falls back to pdfplumber if pypdf fails. + """ LOG.info("Extracting text from PDF", filename=file_name) # Save the uploaded file to a temporary location @@ -142,14 +146,10 @@ class PDFImportService: temp_file_path = temp_file.name try: - reader = PdfReader(temp_file_path) - sop_text = "" - for page_num, page in enumerate(reader.pages, 1): - page_text = page.extract_text() or "" - sop_text += page_text + "\n" - LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text)) + # Use the shared PDF parsing utility + sop_text = extract_pdf_file(temp_file_path, file_identifier=file_name) - LOG.info("PDF text extraction complete", total_text_length=len(sop_text)) + LOG.info("PDF text extraction complete", filename=file_name, total_text_length=len(sop_text)) if not sop_text.strip(): raise HTTPException(status_code=400, detail="No readable content found in the PDF.") diff --git a/uv.lock b/uv.lock index c381eebb..1a67c5c3 100644 --- a/uv.lock +++ b/uv.lock @@ -3540,6 +3540,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, ] +[[package]] +name = "pdfminer-six" +version = "20251107" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "charset-normalizer" }, + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/50/5315f381a25dc80a8d2ea7c62d9a28c0137f10ccc263623a0db8b49fcced/pdfminer_six-20251107.tar.gz", hash = "sha256:5fb0c553799c591777f22c0c72b77fc2522d7d10c70654e25f4c5f1fd996e008", size = 7387104, upload-time = "2025-11-07T20:01:10.286Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/64/29/d1d9f6b900191288b77613ddefb73ed35b48fb35e44aaf8b01b0422b759d/pdfminer_six-20251107-py3-none-any.whl", hash = "sha256:c09df33e4cbe6b26b2a79248a4ffcccafaa5c5d39c9fff0e6e81567f165b5401", size = 5620299, upload-time = "2025-11-07T20:01:08.722Z" }, +] + +[[package]] +name = "pdfplumber" +version = "0.11.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pdfminer-six" }, + { name = "pillow" }, + { name = "pypdfium2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/d8/cb9fda4261ce389656bec0bb0bdde905df109ad97f7ae387747ded070e8c/pdfplumber-0.11.8.tar.gz", hash = "sha256:db29b04bc8bb62f39dd444533bcf2e0ba33584bd24f5a54644f3ba30f4f22d31", size = 102724, upload-time = "2025-11-08T20:52:01.955Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/28/3958ed81a9be317610ab73df32f1968076751d651c84dff1bcb45b7c6c0e/pdfplumber-0.11.8-py3-none-any.whl", hash = "sha256:7dda117b8ed21bca9c8e7d7808fee2439f93c8bd6ea45989bfb1aead6dc3cad3", size = 60043, upload-time = "2025-11-08T20:52:00.652Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -4219,6 +4246,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/d9/6cff57c80a6963e7dd183bf09e9f21604a77716644b1e580e97b259f7612/pypdf-5.9.0-py3-none-any.whl", hash = "sha256:be10a4c54202f46d9daceaa8788be07aa8cd5ea8c25c529c50dd509206382c35", size = 313193, upload-time = "2025-07-27T14:04:50.53Z" }, ] +[[package]] +name = "pypdfium2" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/ab/73c7d24e4eac9ba952569403b32b7cca9412fc5b9bef54fdbd669551389f/pypdfium2-5.2.0.tar.gz", hash = "sha256:43863625231ce999c1ebbed6721a88de818b2ab4d909c1de558d413b9a400256", size = 269999, upload-time = "2025-12-12T13:20:15.353Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/0c/9108ae5266ee4cdf495f99205c44d4b5c83b4eb227c2b610d35c9e9fe961/pypdfium2-5.2.0-py3-none-android_23_arm64_v8a.whl", hash = "sha256:1ba4187a45ce4cf08f2a8c7e0f8970c36b9aa1770c8a3412a70781c1d80fb145", size = 2763268, upload-time = "2025-12-12T13:19:37.354Z" }, + { url = "https://files.pythonhosted.org/packages/35/8c/55f5c8a2c6b293f5c020be4aa123eaa891e797c514e5eccd8cb042740d37/pypdfium2-5.2.0-py3-none-android_23_armeabi_v7a.whl", hash = "sha256:80c55e10a8c9242f0901d35a9a306dd09accce8e497507bb23fcec017d45fe2e", size = 2301821, upload-time = "2025-12-12T13:19:39.484Z" }, + { url = "https://files.pythonhosted.org/packages/5e/7d/efa013e3795b41c59dd1e472f7201c241232c3a6553be4917e3a26b9f225/pypdfium2-5.2.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:73523ae69cd95c084c1342096893b2143ea73c36fdde35494780ba431e6a7d6e", size = 2816428, upload-time = "2025-12-12T13:19:41.735Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ae/8c30af6ff2ab41a7cb84753ee79dd1e0a8932c9bda9fe19759d69cbbf115/pypdfium2-5.2.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:19c501d22ef5eb98e42416d22cc3ac66d4808b436e3d06686392f24d8d9f708d", size = 2939486, upload-time = "2025-12-12T13:19:43.176Z" }, + { url = "https://files.pythonhosted.org/packages/64/64/454a73c49a04c2c290917ad86184e4da959e9e5aba94b3b046328c89be93/pypdfium2-5.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ed15a3f58d6ee4905f0d0a731e30b381b457c30689512589c7f57950b0cdcec", size = 2979235, upload-time = "2025-12-12T13:19:44.635Z" }, + { url = "https://files.pythonhosted.org/packages/4e/29/f1cab8e31192dd367dc7b1afa71f45cfcb8ff0b176f1d2a0f528faf04052/pypdfium2-5.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:329cd1e9f068e8729e0d0b79a070d6126f52bc48ff1e40505cb207a5e20ce0ba", size = 2763001, upload-time = "2025-12-12T13:19:47.598Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/5d/e95fad8fdac960854173469c4b6931d5de5e09d05e6ee7d9756f8b95eef0/pypdfium2-5.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:325259759886e66619504df4721fef3b8deabf8a233e4f4a66e0c32ebae60c2f", size = 3057024, upload-time = "2025-12-12T13:19:49.179Z" }, + { url = "https://files.pythonhosted.org/packages/f4/32/468591d017ab67f8142d40f4db8163b6d8bb404fe0d22da75a5c661dc144/pypdfium2-5.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5683e8f08ab38ed05e0e59e611451ec74332803d4e78f8c45658ea1d372a17af", size = 3448598, upload-time = "2025-12-12T13:19:50.979Z" }, + { url = "https://files.pythonhosted.org/packages/f9/a5/57b4e389b77ab5f7e9361dc7fc03b5378e678ba81b21e791e85350fbb235/pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da4815426a5adcf03bf4d2c5f26c0ff8109dbfaf2c3415984689931bc6006ef9", size = 2993946, upload-time = "2025-12-12T13:19:53.154Z" }, + { url = "https://files.pythonhosted.org/packages/84/3a/e03e9978f817632aa56183bb7a4989284086fdd45de3245ead35f147179b/pypdfium2-5.2.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64bf5c039b2c314dab1fd158bfff99db96299a5b5c6d96fc056071166056f1de", size = 3673148, upload-time = "2025-12-12T13:19:54.528Z" }, + { url = "https://files.pythonhosted.org/packages/13/ee/e581506806553afa4b7939d47bf50dca35c1151b8cc960f4542a6eb135ce/pypdfium2-5.2.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:76b42a17748ac7dc04d5ef04d0561c6a0a4b546d113ec1d101d59650c6a340f7", size = 2964757, upload-time = "2025-12-12T13:19:56.406Z" }, + { url = "https://files.pythonhosted.org/packages/00/be/3715c652aff30f12284523dd337843d0efe3e721020f0ec303a99ffffd8d/pypdfium2-5.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:9d4367d471439fae846f0aba91ff9e8d66e524edcf3c8d6e02fe96fa306e13b9", size = 4130319, upload-time = "2025-12-12T13:19:57.889Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/0b/28aa2ede9004dd4192266bbad394df0896787f7c7bcfa4d1a6e091ad9a2c/pypdfium2-5.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:613f6bb2b47d76b66c0bf2ca581c7c33e3dd9dcb29d65d8c34fef4135f933149", size = 3746488, upload-time = "2025-12-12T13:19:59.469Z" }, + { url = "https://files.pythonhosted.org/packages/bc/04/1b791e1219652bbfc51df6498267d8dcec73ad508b99388b2890902ccd9d/pypdfium2-5.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c03fad3f2fa68d358f5dd4deb07e438482fa26fae439c49d127576d969769ca1", size = 4336534, upload-time = "2025-12-12T13:20:01.28Z" }, + { url = "https://files.pythonhosted.org/packages/4f/e3/6f00f963bb702ffd2e3e2d9c7286bc3bb0bebcdfa96ca897d466f66976c6/pypdfium2-5.2.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:f10be1900ae21879d02d9f4d58c2d2db3a2e6da611736a8e9decc22d1fb02909", size = 4375079, upload-time = "2025-12-12T13:20:03.117Z" }, + { url = "https://files.pythonhosted.org/packages/3a/2a/7ec2b191b5e1b7716a0dfc14e6860e89bb355fb3b94ed0c1d46db526858c/pypdfium2-5.2.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:97c1a126d30378726872f94866e38c055740cae80313638dafd1cd448d05e7c0", size = 3928648, upload-time = "2025-12-12T13:20:05.041Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c3/c6d972fa095ff3ace76f9d3a91ceaf8a9dbbe0d9a5a84ac1d6178a46630e/pypdfium2-5.2.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:c369f183a90781b788af9a357a877bc8caddc24801e8346d0bf23f3295f89f3a", size = 4997772, upload-time = "2025-12-12T13:20:06.453Z" }, + { url = "https://files.pythonhosted.org/packages/22/45/2c64584b7a3ca5c4652280a884f4b85b8ed24e27662adeebdc06d991c917/pypdfium2-5.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b391f1cceb454934b612a05b54e90f98aafeffe5e73830d71700b17f0812226b", size = 4180046, upload-time = "2025-12-12T13:20:08.715Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/99/8d1ff87b626649400e62a2840e6e10fe258443ba518798e071fee4cd86f9/pypdfium2-5.2.0-py3-none-win32.whl", hash = "sha256:c68067938f617c37e4d17b18de7cac231fc7ce0eb7b6653b7283ebe8764d4999", size = 2990175, upload-time = "2025-12-12T13:20:10.241Z" }, + { url = "https://files.pythonhosted.org/packages/93/fc/114fff8895b620aac4984808e93d01b6d7b93e342a1635fcfe2a5f39cf39/pypdfium2-5.2.0-py3-none-win_amd64.whl", hash = "sha256:eb0591b720e8aaeab9475c66d653655ec1be0464b946f3f48a53922e843f0f3b", size = 3098615, upload-time = "2025-12-12T13:20:11.795Z" }, + { url = "https://files.pythonhosted.org/packages/08/97/eb738bff5998760d6e0cbcb7dd04cbf1a95a97b997fac6d4e57562a58992/pypdfium2-5.2.0-py3-none-win_arm64.whl", hash = "sha256:5dd1ef579f19fa3719aee4959b28bda44b1072405756708b5e83df8806a19521", size = 2939479, upload-time = "2025-12-12T13:20:13.815Z" }, +] + [[package]] name = "pyperclip" version = "1.11.0" @@ -5063,6 +5119,7 @@ dependencies = [ { name = "openai" }, { name = "orjson" }, { name = "pandas" }, + { name = "pdfplumber" }, { name = "pillow" }, { name = "playwright", version = "1.46.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, { name = "playwright", version = "1.56.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, @@ -5159,6 +5216,7 @@ requires-dist = [ { name = "openai", specifier = ">=1.68.2" }, { name = "orjson", specifier = ">=3.9.10,<4" }, { name = "pandas", specifier = ">=2.3.1,<3" }, + { name = "pdfplumber", specifier = ">=0.11.0,<0.12" }, { name = "pillow", specifier = ">=10.1.0,<11" }, { name = "playwright", marker = "python_full_version == '3.11.*'", specifier = "==1.46.0" }, { name = "playwright", marker = "python_full_version >= '3.12' and python_full_version < '3.14'", specifier = ">1.46.0" },