Cap number of tokens sent for PDF parsing requests (avoid total failure) (#4602)

This commit is contained in:
Marc Kelechava
2026-02-02 08:36:28 -08:00
committed by GitHub
parent 4c56ff36d0
commit 2409df7589
2 changed files with 31 additions and 1 deletions

View File

@@ -35,6 +35,7 @@ class ScrapeType(StrEnum):
SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.NORMAL, ScrapeType.RELOAD]
DEFAULT_MAX_TOKENS = 100000
MAX_FILE_PARSE_INPUT_TOKENS = 256_000
MAX_IMAGE_MESSAGES = 10
SCROLL_AMOUNT_MULTIPLIER = 100

View File

@@ -9,7 +9,9 @@ import pdfplumber
import structlog
from pypdf import PdfReader

from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
from skyvern.exceptions import PDFParsingError
from skyvern.utils.token_counter import count_tokens

LOG = structlog.get_logger(__name__)
@@ -17,6 +19,7 @@ LOG = structlog.get_logger(__name__)
def extract_pdf_file(
    file_path: str,
    file_identifier: str | None = None,
    max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS,
) -> str:
    """
    Extract text from a PDF file with fallback support.
@@ -47,10 +50,23 @@ def extract_pdf_file(
    try:
        reader = PdfReader(file_path)
        extracted_text = ""
        current_tokens = 0
        page_count = len(reader.pages)
        for i in range(page_count):
            page_text = reader.pages[i].extract_text() or ""
            if max_tokens:
                page_tokens = count_tokens(page_text)
                if current_tokens + page_tokens > max_tokens:
                    LOG.warning(
                        "PDF text exceeds token limit, truncating at page boundary",
                        file_identifier=identifier,
                        pages_included=i,
                        total_pages=page_count,
                        max_tokens=max_tokens,
                    )
                    break
                current_tokens += page_tokens
            extracted_text += page_text + "\n"
        LOG.info(
LOG.info( LOG.info(
@@ -73,11 +89,24 @@ def extract_pdf_file(
    try:
        with pdfplumber.open(file_path) as pdf:
            extracted_text = ""
            current_tokens = 0
            page_count = len(pdf.pages)
            for i, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                if page_text:
                    if max_tokens:
                        page_tokens = count_tokens(page_text)
                        if current_tokens + page_tokens > max_tokens:
                            LOG.warning(
                                "PDF text exceeds token limit, truncating at page boundary",
                                file_identifier=identifier,
                                pages_included=i,
                                total_pages=page_count,
                                max_tokens=max_tokens,
                            )
                            break
                        current_tokens += page_tokens
                    extracted_text += page_text + "\n"
            LOG.info(