Pdf parser robustness improvement (#4307)
This commit is contained in:
@@ -51,6 +51,7 @@ dependencies = [
|
|||||||
"asyncpg>=0.30.0,<0.31",
|
"asyncpg>=0.30.0,<0.31",
|
||||||
"json-repair>=0.34.0,<0.35",
|
"json-repair>=0.34.0,<0.35",
|
||||||
"pypdf>=5.1.0,<6",
|
"pypdf>=5.1.0,<6",
|
||||||
|
"pdfplumber>=0.11.0,<0.12",
|
||||||
"fastmcp>=2.10.1,<3",
|
"fastmcp>=2.10.1,<3",
|
||||||
"psutil>=7.0.0",
|
"psutil>=7.0.0",
|
||||||
"tiktoken>=0.9.0",
|
"tiktoken>=0.9.0",
|
||||||
|
|||||||
@@ -902,3 +902,15 @@ class PDFEmbedBase64DecodeError(SkyvernException):
|
|||||||
src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src
|
src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src
|
||||||
message += f". PDF embed src: {src_preview}"
|
message += f". PDF embed src: {src_preview}"
|
||||||
super().__init__(message)
|
super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
class PDFParsingError(SkyvernException):
    """Signals that no available PDF parser could read a file.

    The per-parser error texts are kept as attributes so that callers can
    inspect them individually instead of parsing the combined message.
    """

    def __init__(self, file_identifier: str, pypdf_error: str, pdfplumber_error: str):
        # Identifier of the PDF that could not be parsed, as supplied by the caller.
        self.file_identifier = file_identifier
        # Error text reported for the pypdf attempt.
        self.pypdf_error = pypdf_error
        # Error text reported for the pdfplumber attempt.
        self.pdfplumber_error = pdfplumber_error

        message = (
            f"Failed to parse PDF '{file_identifier}'. pypdf error: {pypdf_error}; pdfplumber error: {pdfplumber_error}"
        )
        super().__init__(message)
|
||||||
|
|||||||
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
173
skyvern/forge/sdk/utils/pdf_parser.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
"""
|
||||||
|
Utility functions for PDF parsing with fallback support.
|
||||||
|
|
||||||
|
This module provides robust PDF parsing that tries pypdf first and falls back
|
||||||
|
to pdfplumber if pypdf fails, ensuring maximum compatibility with various PDF formats.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pdfplumber
|
||||||
|
import structlog
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
from skyvern.exceptions import PDFParsingError
|
||||||
|
|
||||||
|
LOG = structlog.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> str:
    """
    Extract text from a PDF file with fallback support.

    The file is parsed with pypdf first. If pypdf raises any exception, the
    same file is parsed again with pdfplumber. Only when both attempts raise
    is an error propagated to the caller.

    Args:
        file_path: Path to the PDF file to parse
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.

    Returns:
        Text of all pages concatenated in page order, each page followed by a
        newline. On the pypdf path a page without text still contributes a
        newline; on the pdfplumber path pages without text are skipped.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to parse the PDF.
            The pdfplumber exception is attached as ``__cause__``.

    Example:
        >>> text = extract_pdf_file("/path/to/file.pdf", "document.pdf")
        >>> print(f"Extracted {len(text)} characters")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        # extract_text() may yield nothing for a page; fall back to "" so the
        # page still contributes its trailing newline.
        extracted_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        LOG.info(
            "Successfully parsed PDF with pypdf",
            file_identifier=identifier,
            page_count=page_count,
            text_length=len(extracted_text),
        )
        return extracted_text

    except Exception as pypdf_error:
        LOG.warning(
            "Failed to parse PDF with pypdf, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
            error_type=type(pypdf_error).__name__,
        )

        # Fallback to pdfplumber. This stays nested inside the handler because
        # `pypdf_error` is unbound as soon as the enclosing `except` block ends.
        try:
            with pdfplumber.open(file_path) as pdf:
                page_count = len(pdf.pages)
                # Pages for which pdfplumber returns no text are skipped entirely.
                extracted_text = "".join(
                    page_text + "\n" for page in pdf.pages if (page_text := page.extract_text())
                )

            LOG.info(
                "Successfully parsed PDF with pdfplumber",
                file_identifier=identifier,
                page_count=page_count,
                text_length=len(extracted_text),
            )
            return extracted_text

        except Exception as pdfplumber_error:
            LOG.error(
                "Failed to parse PDF with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Explicit cause: the traceback reads as "caused by" the last
            # parser failure rather than as an error raised while handling it.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error
|
||||||
|
|
||||||
|
|
||||||
|
def validate_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
) -> bool:
    """
    Validate that a file is a readable PDF.

    Validation consists of opening the file and asking the parser for its
    page count; no page content is extracted. pypdf is tried first and, if it
    raises any exception, pdfplumber is tried on the same file.

    Args:
        file_path: Path to the PDF file to validate
        file_identifier: Optional identifier for logging (e.g., URL or filename).
                        If not provided, uses file_path.

    Returns:
        True if the PDF can be opened and read by at least one parser. The
        function never returns False; failure is reported by raising.

    Raises:
        PDFParsingError: When both pypdf and pdfplumber fail to validate the PDF.
            The pdfplumber exception is attached as ``__cause__``.

    Example:
        >>> if validate_pdf_file("/path/to/file.pdf"):
        ...     print("Valid PDF file")
    """
    identifier = file_identifier or file_path

    # Try pypdf first
    try:
        reader = PdfReader(file_path)
        # Just check if we can access pages, don't read content yet
        len(reader.pages)
        LOG.debug(
            "PDF validation successful with pypdf",
            file_identifier=identifier,
        )
        return True

    except Exception as pypdf_error:
        LOG.debug(
            "PDF validation with pypdf failed, trying pdfplumber",
            file_identifier=identifier,
            error=str(pypdf_error),
        )

        # Fallback to pdfplumber. This stays nested inside the handler because
        # `pypdf_error` is unbound as soon as the enclosing `except` block ends.
        try:
            with pdfplumber.open(file_path) as pdf:
                # Same check as above: only the page list is touched.
                len(pdf.pages)

            LOG.info(
                "PDF validation: pypdf failed but pdfplumber succeeded",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
            )
            return True

        except Exception as pdfplumber_error:
            LOG.error(
                "PDF validation failed with both pypdf and pdfplumber",
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            )
            # Explicit cause: the traceback reads as "caused by" the last
            # parser failure rather than as an error raised while handling it.
            raise PDFParsingError(
                file_identifier=identifier,
                pypdf_error=str(pypdf_error),
                pdfplumber_error=str(pdfplumber_error),
            ) from pdfplumber_error
|
||||||
@@ -27,8 +27,6 @@ from jinja2 import StrictUndefined
|
|||||||
from jinja2.sandbox import SandboxedEnvironment
|
from jinja2.sandbox import SandboxedEnvironment
|
||||||
from playwright.async_api import Page
|
from playwright.async_api import Page
|
||||||
from pydantic import BaseModel, Field, model_validator
|
from pydantic import BaseModel, Field, model_validator
|
||||||
from pypdf import PdfReader
|
|
||||||
from pypdf.errors import PdfReadError
|
|
||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.constants import (
|
from skyvern.constants import (
|
||||||
@@ -41,6 +39,7 @@ from skyvern.exceptions import (
|
|||||||
ContextParameterValueNotFound,
|
ContextParameterValueNotFound,
|
||||||
MissingBrowserState,
|
MissingBrowserState,
|
||||||
MissingBrowserStatePage,
|
MissingBrowserStatePage,
|
||||||
|
PDFParsingError,
|
||||||
SkyvernException,
|
SkyvernException,
|
||||||
TaskNotFound,
|
TaskNotFound,
|
||||||
UnexpectedTaskStatus,
|
UnexpectedTaskStatus,
|
||||||
@@ -70,6 +69,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskOutput, TaskStatus
|
|||||||
from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
|
from skyvern.forge.sdk.services.bitwarden import BitwardenConstants
|
||||||
from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
|
from skyvern.forge.sdk.services.credentials import AzureVaultConstants, OnePasswordConstants
|
||||||
from skyvern.forge.sdk.trace import TraceManager
|
from skyvern.forge.sdk.trace import TraceManager
|
||||||
|
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file, validate_pdf_file
|
||||||
from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
|
from skyvern.forge.sdk.workflow.context_manager import BlockMetadata, WorkflowRunContext
|
||||||
from skyvern.forge.sdk.workflow.exceptions import (
|
from skyvern.forge.sdk.workflow.exceptions import (
|
||||||
CustomizedCodeException,
|
CustomizedCodeException,
|
||||||
@@ -3020,11 +3020,8 @@ class FileParserBlock(Block):
|
|||||||
)
|
)
|
||||||
elif self.file_type == FileType.PDF:
|
elif self.file_type == FileType.PDF:
|
||||||
try:
|
try:
|
||||||
# Try to read the file with PyPDF to validate it's a valid PDF file
|
validate_pdf_file(file_path, file_identifier=file_url_used)
|
||||||
reader = PdfReader(file_path)
|
except PDFParsingError as e:
|
||||||
# Just check if we can access pages, don't read content yet
|
|
||||||
_ = len(reader.pages)
|
|
||||||
except Exception as e:
|
|
||||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||||
@@ -3087,15 +3084,14 @@ class FileParserBlock(Block):
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def _parse_pdf_file(self, file_path: str) -> str:
|
async def _parse_pdf_file(self, file_path: str) -> str:
|
||||||
"""Parse PDF file and return extracted text."""
|
"""Parse PDF file and return extracted text.
|
||||||
|
|
||||||
|
Uses the shared PDF parsing utility that tries pypdf first,
|
||||||
|
then falls back to pdfplumber if pypdf fails.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
reader = PdfReader(file_path)
|
return extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||||
extracted_text = ""
|
except PDFParsingError as e:
|
||||||
page_count = len(reader.pages)
|
|
||||||
for i in range(page_count):
|
|
||||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
|
||||||
return extracted_text
|
|
||||||
except PdfReadError as e:
|
|
||||||
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
async def _extract_with_ai(
|
async def _extract_with_ai(
|
||||||
@@ -3314,14 +3310,9 @@ class PDFParserBlock(Block):
|
|||||||
else:
|
else:
|
||||||
file_path = await download_file(self.file_url)
|
file_path = await download_file(self.file_url)
|
||||||
|
|
||||||
extracted_text = ""
|
|
||||||
try:
|
try:
|
||||||
reader = PdfReader(file_path)
|
extracted_text = extract_pdf_file(file_path, file_identifier=self.file_url)
|
||||||
page_count = len(reader.pages)
|
except PDFParsingError:
|
||||||
for i in range(page_count):
|
|
||||||
extracted_text += reader.pages[i].extract_text() + "\n"
|
|
||||||
|
|
||||||
except PdfReadError:
|
|
||||||
return await self.build_block_result(
|
return await self.build_block_result(
|
||||||
success=False,
|
success=False,
|
||||||
failure_reason="Failed to parse PDF file",
|
failure_reason="Failed to parse PDF file",
|
||||||
|
|||||||
@@ -5,12 +5,12 @@ from typing import Any
|
|||||||
|
|
||||||
import structlog
|
import structlog
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
from pypdf import PdfReader
|
|
||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.forge.prompts import prompt_engine
|
from skyvern.forge.prompts import prompt_engine
|
||||||
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
|
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
|
||||||
from skyvern.forge.sdk.schemas.organizations import Organization
|
from skyvern.forge.sdk.schemas.organizations import Organization
|
||||||
|
from skyvern.forge.sdk.utils.pdf_parser import extract_pdf_file
|
||||||
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
|
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
|
||||||
|
|
||||||
LOG = structlog.get_logger(__name__)
|
LOG = structlog.get_logger(__name__)
|
||||||
@@ -133,7 +133,11 @@ class PDFImportService:
|
|||||||
return raw
|
return raw
|
||||||
|
|
||||||
def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
|
def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
|
||||||
"""Extract text from PDF file contents. Raises HTTPException if invalid."""
|
"""Extract text from PDF file contents. Raises HTTPException if invalid.
|
||||||
|
|
||||||
|
Uses the shared PDF parsing utility that tries pypdf first,
|
||||||
|
then falls back to pdfplumber if pypdf fails.
|
||||||
|
"""
|
||||||
LOG.info("Extracting text from PDF", filename=file_name)
|
LOG.info("Extracting text from PDF", filename=file_name)
|
||||||
|
|
||||||
# Save the uploaded file to a temporary location
|
# Save the uploaded file to a temporary location
|
||||||
@@ -142,14 +146,10 @@ class PDFImportService:
|
|||||||
temp_file_path = temp_file.name
|
temp_file_path = temp_file.name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
reader = PdfReader(temp_file_path)
|
# Use the shared PDF parsing utility
|
||||||
sop_text = ""
|
sop_text = extract_pdf_file(temp_file_path, file_identifier=file_name)
|
||||||
for page_num, page in enumerate(reader.pages, 1):
|
|
||||||
page_text = page.extract_text() or ""
|
|
||||||
sop_text += page_text + "\n"
|
|
||||||
LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
|
|
||||||
|
|
||||||
LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
|
LOG.info("PDF text extraction complete", filename=file_name, total_text_length=len(sop_text))
|
||||||
|
|
||||||
if not sop_text.strip():
|
if not sop_text.strip():
|
||||||
raise HTTPException(status_code=400, detail="No readable content found in the PDF.")
|
raise HTTPException(status_code=400, detail="No readable content found in the PDF.")
|
||||||
|
|||||||
58
uv.lock
generated
58
uv.lock
generated
@@ -3540,6 +3540,33 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" },
|
{ url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pdfminer-six"
|
||||||
|
version = "20251107"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "charset-normalizer" },
|
||||||
|
{ name = "cryptography" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/1d/50/5315f381a25dc80a8d2ea7c62d9a28c0137f10ccc263623a0db8b49fcced/pdfminer_six-20251107.tar.gz", hash = "sha256:5fb0c553799c591777f22c0c72b77fc2522d7d10c70654e25f4c5f1fd996e008", size = 7387104, upload-time = "2025-11-07T20:01:10.286Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/64/29/d1d9f6b900191288b77613ddefb73ed35b48fb35e44aaf8b01b0422b759d/pdfminer_six-20251107-py3-none-any.whl", hash = "sha256:c09df33e4cbe6b26b2a79248a4ffcccafaa5c5d39c9fff0e6e81567f165b5401", size = 5620299, upload-time = "2025-11-07T20:01:08.722Z" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pdfplumber"
|
||||||
|
version = "0.11.8"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "pdfminer-six" },
|
||||||
|
{ name = "pillow" },
|
||||||
|
{ name = "pypdfium2" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/09/d8/cb9fda4261ce389656bec0bb0bdde905df109ad97f7ae387747ded070e8c/pdfplumber-0.11.8.tar.gz", hash = "sha256:db29b04bc8bb62f39dd444533bcf2e0ba33584bd24f5a54644f3ba30f4f22d31", size = 102724, upload-time = "2025-11-08T20:52:01.955Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/12/28/3958ed81a9be317610ab73df32f1968076751d651c84dff1bcb45b7c6c0e/pdfplumber-0.11.8-py3-none-any.whl", hash = "sha256:7dda117b8ed21bca9c8e7d7808fee2439f93c8bd6ea45989bfb1aead6dc3cad3", size = 60043, upload-time = "2025-11-08T20:52:00.652Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pexpect"
|
name = "pexpect"
|
||||||
version = "4.9.0"
|
version = "4.9.0"
|
||||||
@@ -4219,6 +4246,35 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/48/d9/6cff57c80a6963e7dd183bf09e9f21604a77716644b1e580e97b259f7612/pypdf-5.9.0-py3-none-any.whl", hash = "sha256:be10a4c54202f46d9daceaa8788be07aa8cd5ea8c25c529c50dd509206382c35", size = 313193, upload-time = "2025-07-27T14:04:50.53Z" },
|
{ url = "https://files.pythonhosted.org/packages/48/d9/6cff57c80a6963e7dd183bf09e9f21604a77716644b1e580e97b259f7612/pypdf-5.9.0-py3-none-any.whl", hash = "sha256:be10a4c54202f46d9daceaa8788be07aa8cd5ea8c25c529c50dd509206382c35", size = 313193, upload-time = "2025-07-27T14:04:50.53Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pypdfium2"
|
||||||
|
version = "5.2.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/f6/ab/73c7d24e4eac9ba952569403b32b7cca9412fc5b9bef54fdbd669551389f/pypdfium2-5.2.0.tar.gz", hash = "sha256:43863625231ce999c1ebbed6721a88de818b2ab4d909c1de558d413b9a400256", size = 269999, upload-time = "2025-12-12T13:20:15.353Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/fb/0c/9108ae5266ee4cdf495f99205c44d4b5c83b4eb227c2b610d35c9e9fe961/pypdfium2-5.2.0-py3-none-android_23_arm64_v8a.whl", hash = "sha256:1ba4187a45ce4cf08f2a8c7e0f8970c36b9aa1770c8a3412a70781c1d80fb145", size = 2763268, upload-time = "2025-12-12T13:19:37.354Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/35/8c/55f5c8a2c6b293f5c020be4aa123eaa891e797c514e5eccd8cb042740d37/pypdfium2-5.2.0-py3-none-android_23_armeabi_v7a.whl", hash = "sha256:80c55e10a8c9242f0901d35a9a306dd09accce8e497507bb23fcec017d45fe2e", size = 2301821, upload-time = "2025-12-12T13:19:39.484Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/5e/7d/efa013e3795b41c59dd1e472f7201c241232c3a6553be4917e3a26b9f225/pypdfium2-5.2.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:73523ae69cd95c084c1342096893b2143ea73c36fdde35494780ba431e6a7d6e", size = 2816428, upload-time = "2025-12-12T13:19:41.735Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ec/ae/8c30af6ff2ab41a7cb84753ee79dd1e0a8932c9bda9fe19759d69cbbf115/pypdfium2-5.2.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:19c501d22ef5eb98e42416d22cc3ac66d4808b436e3d06686392f24d8d9f708d", size = 2939486, upload-time = "2025-12-12T13:19:43.176Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/64/64/454a73c49a04c2c290917ad86184e4da959e9e5aba94b3b046328c89be93/pypdfium2-5.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ed15a3f58d6ee4905f0d0a731e30b381b457c30689512589c7f57950b0cdcec", size = 2979235, upload-time = "2025-12-12T13:19:44.635Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/4e/29/f1cab8e31192dd367dc7b1afa71f45cfcb8ff0b176f1d2a0f528faf04052/pypdfium2-5.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:329cd1e9f068e8729e0d0b79a070d6126f52bc48ff1e40505cb207a5e20ce0ba", size = 2763001, upload-time = "2025-12-12T13:19:47.598Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/bc/5d/e95fad8fdac960854173469c4b6931d5de5e09d05e6ee7d9756f8b95eef0/pypdfium2-5.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:325259759886e66619504df4721fef3b8deabf8a233e4f4a66e0c32ebae60c2f", size = 3057024, upload-time = "2025-12-12T13:19:49.179Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f4/32/468591d017ab67f8142d40f4db8163b6d8bb404fe0d22da75a5c661dc144/pypdfium2-5.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5683e8f08ab38ed05e0e59e611451ec74332803d4e78f8c45658ea1d372a17af", size = 3448598, upload-time = "2025-12-12T13:19:50.979Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f9/a5/57b4e389b77ab5f7e9361dc7fc03b5378e678ba81b21e791e85350fbb235/pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da4815426a5adcf03bf4d2c5f26c0ff8109dbfaf2c3415984689931bc6006ef9", size = 2993946, upload-time = "2025-12-12T13:19:53.154Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/84/3a/e03e9978f817632aa56183bb7a4989284086fdd45de3245ead35f147179b/pypdfium2-5.2.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64bf5c039b2c314dab1fd158bfff99db96299a5b5c6d96fc056071166056f1de", size = 3673148, upload-time = "2025-12-12T13:19:54.528Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/13/ee/e581506806553afa4b7939d47bf50dca35c1151b8cc960f4542a6eb135ce/pypdfium2-5.2.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:76b42a17748ac7dc04d5ef04d0561c6a0a4b546d113ec1d101d59650c6a340f7", size = 2964757, upload-time = "2025-12-12T13:19:56.406Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/00/be/3715c652aff30f12284523dd337843d0efe3e721020f0ec303a99ffffd8d/pypdfium2-5.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:9d4367d471439fae846f0aba91ff9e8d66e524edcf3c8d6e02fe96fa306e13b9", size = 4130319, upload-time = "2025-12-12T13:19:57.889Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/b0/0b/28aa2ede9004dd4192266bbad394df0896787f7c7bcfa4d1a6e091ad9a2c/pypdfium2-5.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:613f6bb2b47d76b66c0bf2ca581c7c33e3dd9dcb29d65d8c34fef4135f933149", size = 3746488, upload-time = "2025-12-12T13:19:59.469Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/bc/04/1b791e1219652bbfc51df6498267d8dcec73ad508b99388b2890902ccd9d/pypdfium2-5.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c03fad3f2fa68d358f5dd4deb07e438482fa26fae439c49d127576d969769ca1", size = 4336534, upload-time = "2025-12-12T13:20:01.28Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/4f/e3/6f00f963bb702ffd2e3e2d9c7286bc3bb0bebcdfa96ca897d466f66976c6/pypdfium2-5.2.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:f10be1900ae21879d02d9f4d58c2d2db3a2e6da611736a8e9decc22d1fb02909", size = 4375079, upload-time = "2025-12-12T13:20:03.117Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/3a/2a/7ec2b191b5e1b7716a0dfc14e6860e89bb355fb3b94ed0c1d46db526858c/pypdfium2-5.2.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:97c1a126d30378726872f94866e38c055740cae80313638dafd1cd448d05e7c0", size = 3928648, upload-time = "2025-12-12T13:20:05.041Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/bf/c3/c6d972fa095ff3ace76f9d3a91ceaf8a9dbbe0d9a5a84ac1d6178a46630e/pypdfium2-5.2.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:c369f183a90781b788af9a357a877bc8caddc24801e8346d0bf23f3295f89f3a", size = 4997772, upload-time = "2025-12-12T13:20:06.453Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/22/45/2c64584b7a3ca5c4652280a884f4b85b8ed24e27662adeebdc06d991c917/pypdfium2-5.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b391f1cceb454934b612a05b54e90f98aafeffe5e73830d71700b17f0812226b", size = 4180046, upload-time = "2025-12-12T13:20:08.715Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/d6/99/8d1ff87b626649400e62a2840e6e10fe258443ba518798e071fee4cd86f9/pypdfium2-5.2.0-py3-none-win32.whl", hash = "sha256:c68067938f617c37e4d17b18de7cac231fc7ce0eb7b6653b7283ebe8764d4999", size = 2990175, upload-time = "2025-12-12T13:20:10.241Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/93/fc/114fff8895b620aac4984808e93d01b6d7b93e342a1635fcfe2a5f39cf39/pypdfium2-5.2.0-py3-none-win_amd64.whl", hash = "sha256:eb0591b720e8aaeab9475c66d653655ec1be0464b946f3f48a53922e843f0f3b", size = 3098615, upload-time = "2025-12-12T13:20:11.795Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/08/97/eb738bff5998760d6e0cbcb7dd04cbf1a95a97b997fac6d4e57562a58992/pypdfium2-5.2.0-py3-none-win_arm64.whl", hash = "sha256:5dd1ef579f19fa3719aee4959b28bda44b1072405756708b5e83df8806a19521", size = 2939479, upload-time = "2025-12-12T13:20:13.815Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyperclip"
|
name = "pyperclip"
|
||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
@@ -5063,6 +5119,7 @@ dependencies = [
|
|||||||
{ name = "openai" },
|
{ name = "openai" },
|
||||||
{ name = "orjson" },
|
{ name = "orjson" },
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
|
{ name = "pdfplumber" },
|
||||||
{ name = "pillow" },
|
{ name = "pillow" },
|
||||||
{ name = "playwright", version = "1.46.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
|
{ name = "playwright", version = "1.46.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
|
||||||
{ name = "playwright", version = "1.56.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
|
{ name = "playwright", version = "1.56.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
|
||||||
@@ -5159,6 +5216,7 @@ requires-dist = [
|
|||||||
{ name = "openai", specifier = ">=1.68.2" },
|
{ name = "openai", specifier = ">=1.68.2" },
|
||||||
{ name = "orjson", specifier = ">=3.9.10,<4" },
|
{ name = "orjson", specifier = ">=3.9.10,<4" },
|
||||||
{ name = "pandas", specifier = ">=2.3.1,<3" },
|
{ name = "pandas", specifier = ">=2.3.1,<3" },
|
||||||
|
{ name = "pdfplumber", specifier = ">=0.11.0,<0.12" },
|
||||||
{ name = "pillow", specifier = ">=10.1.0,<11" },
|
{ name = "pillow", specifier = ">=10.1.0,<11" },
|
||||||
{ name = "playwright", marker = "python_full_version == '3.11.*'", specifier = "==1.46.0" },
|
{ name = "playwright", marker = "python_full_version == '3.11.*'", specifier = "==1.46.0" },
|
||||||
{ name = "playwright", marker = "python_full_version >= '3.12' and python_full_version < '3.14'", specifier = ">1.46.0" },
|
{ name = "playwright", marker = "python_full_version >= '3.12' and python_full_version < '3.14'", specifier = ">1.46.0" },
|
||||||
|
|||||||
Reference in New Issue
Block a user