Cap number of tokens sent for PDF parsing requests (avoid total failure) (#4602)
This commit is contained in:
@@ -35,6 +35,7 @@ class ScrapeType(StrEnum):
|
|||||||
|
|
||||||
SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.NORMAL, ScrapeType.RELOAD]
|
SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.NORMAL, ScrapeType.RELOAD]
|
||||||
DEFAULT_MAX_TOKENS = 100000
|
DEFAULT_MAX_TOKENS = 100000
|
||||||
|
MAX_FILE_PARSE_INPUT_TOKENS = 256_000
|
||||||
MAX_IMAGE_MESSAGES = 10
|
MAX_IMAGE_MESSAGES = 10
|
||||||
SCROLL_AMOUNT_MULTIPLIER = 100
|
SCROLL_AMOUNT_MULTIPLIER = 100
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,9 @@ import pdfplumber
|
|||||||
import structlog
|
import structlog
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
from skyvern.constants import MAX_FILE_PARSE_INPUT_TOKENS
|
||||||
from skyvern.exceptions import PDFParsingError
|
from skyvern.exceptions import PDFParsingError
|
||||||
|
from skyvern.utils.token_counter import count_tokens
|
||||||
|
|
||||||
LOG = structlog.get_logger(__name__)
|
LOG = structlog.get_logger(__name__)
|
||||||
|
|
||||||
@@ -17,6 +19,7 @@ LOG = structlog.get_logger(__name__)
|
|||||||
def extract_pdf_file(
|
def extract_pdf_file(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
file_identifier: str | None = None,
|
file_identifier: str | None = None,
|
||||||
|
max_tokens: int = MAX_FILE_PARSE_INPUT_TOKENS,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Extract text from a PDF file with fallback support.
|
Extract text from a PDF file with fallback support.
|
||||||
@@ -47,10 +50,23 @@ def extract_pdf_file(
|
|||||||
try:
|
try:
|
||||||
reader = PdfReader(file_path)
|
reader = PdfReader(file_path)
|
||||||
extracted_text = ""
|
extracted_text = ""
|
||||||
|
current_tokens = 0
|
||||||
page_count = len(reader.pages)
|
page_count = len(reader.pages)
|
||||||
|
|
||||||
for i in range(page_count):
|
for i in range(page_count):
|
||||||
page_text = reader.pages[i].extract_text() or ""
|
page_text = reader.pages[i].extract_text() or ""
|
||||||
|
if max_tokens:
|
||||||
|
page_tokens = count_tokens(page_text)
|
||||||
|
if current_tokens + page_tokens > max_tokens:
|
||||||
|
LOG.warning(
|
||||||
|
"PDF text exceeds token limit, truncating at page boundary",
|
||||||
|
file_identifier=identifier,
|
||||||
|
pages_included=i,
|
||||||
|
total_pages=page_count,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
break
|
||||||
|
current_tokens += page_tokens
|
||||||
extracted_text += page_text + "\n"
|
extracted_text += page_text + "\n"
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
@@ -73,11 +89,24 @@ def extract_pdf_file(
|
|||||||
try:
|
try:
|
||||||
with pdfplumber.open(file_path) as pdf:
|
with pdfplumber.open(file_path) as pdf:
|
||||||
extracted_text = ""
|
extracted_text = ""
|
||||||
|
current_tokens = 0
|
||||||
page_count = len(pdf.pages)
|
page_count = len(pdf.pages)
|
||||||
|
|
||||||
for page in pdf.pages:
|
for i, page in enumerate(pdf.pages):
|
||||||
page_text = page.extract_text()
|
page_text = page.extract_text()
|
||||||
if page_text:
|
if page_text:
|
||||||
|
if max_tokens:
|
||||||
|
page_tokens = count_tokens(page_text)
|
||||||
|
if current_tokens + page_tokens > max_tokens:
|
||||||
|
LOG.warning(
|
||||||
|
"PDF text exceeds token limit, truncating at page boundary",
|
||||||
|
file_identifier=identifier,
|
||||||
|
pages_included=i,
|
||||||
|
total_pages=page_count,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
break
|
||||||
|
current_tokens += page_tokens
|
||||||
extracted_text += page_text + "\n"
|
extracted_text += page_text + "\n"
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
|
|||||||
Reference in New Issue
Block a user