From 2409df75894a7e78e51fa2ba77cd5f84268352ed Mon Sep 17 00:00:00 2001 From: Marc Kelechava Date: Mon, 2 Feb 2026 08:36:28 -0800 Subject: [PATCH] Cap number of tokens sent for PDF parsing requests (avoid total failure) (#4602) --- skyvern/constants.py | 1 + skyvern/forge/sdk/utils/pdf_parser.py | 31 ++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/skyvern/constants.py b/skyvern/constants.py index 3c64afa0..13454af5 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -35,6 +35,7 @@ class ScrapeType(StrEnum): SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.NORMAL, ScrapeType.RELOAD] DEFAULT_MAX_TOKENS = 100000 +MAX_FILE_PARSE_INPUT_TOKENS = 256_000 MAX_IMAGE_MESSAGES = 10 SCROLL_AMOUNT_MULTIPLIER = 100 diff --git a/skyvern/forge/sdk/utils/pdf_parser.py b/skyvern/forge/sdk/utils/pdf_parser.py index 301f5e1d..5843e1e3 100644 --- a/skyvern/forge/sdk/utils/pdf_parser.py +++ b/skyvern/forge/sdk/utils/pdf_parser.py @@ -9,7 +9,9 @@ import pdfplumber import structlog from pypdf import PdfReader +from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS from skyvern.exceptions import PDFParsingError +from skyvern.utils.token_counter import count_tokens LOG = structlog.get_logger(__name__) @@ -17,6 +19,7 @@ LOG = structlog.get_logger(__name__) def extract_pdf_file( file_path: str, file_identifier: str | None = None, + max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS, ) -> str: """ Extract text from a PDF file with fallback support. @@ -47,10 +50,23 @@ def extract_pdf_file( try: reader = PdfReader(file_path) extracted_text = "" + current_tokens = 0 page_count = len(reader.pages) for i in range(page_count): page_text = reader.pages[i].extract_text() or "" + if max_tokens: + page_tokens = count_tokens(page_text) + if current_tokens + page_tokens > max_tokens: + LOG.warning( + "PDF text exceeds token limit, truncating at page boundary", + file_identifier=identifier, + pages_included=i, + total_pages=page_count, + max_tokens=max_tokens, + ) + break + current_tokens += page_tokens extracted_text += page_text + "\n" LOG.info( @@ -73,11 +89,24 @@ def extract_pdf_file( try: with pdfplumber.open(file_path) as pdf: extracted_text = "" + current_tokens = 0 page_count = len(pdf.pages) - for page in pdf.pages: + for i, page in enumerate(pdf.pages): page_text = page.extract_text() if page_text: + if max_tokens: + page_tokens = count_tokens(page_text) + if current_tokens + page_tokens > max_tokens: + LOG.warning( + "PDF text exceeds token limit, truncating at page boundary", + file_identifier=identifier, + pages_included=i, + total_pages=page_count, + max_tokens=max_tokens, + ) + break + current_tokens += page_tokens extracted_text += page_text + "\n" LOG.info(