From ce717146f3b26d003c3f417e9d2067cee103c81e Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Mon, 15 Dec 2025 14:30:32 +0800 Subject: [PATCH] reenbale the download action (#4299) --- skyvern-frontend/src/api/types.ts | 2 + .../routes/tasks/detail/ActionTypePill.tsx | 8 +- .../tasks/detail/ActionTypePillMinimal.tsx | 2 + skyvern/exceptions.py | 24 ++++++ skyvern/forge/agent.py | 81 ++++++++++++++++-- skyvern/webeye/actions/actions.py | 8 +- skyvern/webeye/actions/handler.py | 84 +++++++++++++------ skyvern/webeye/scraper/domUtils.js | 10 +++ skyvern/webeye/scraper/scraped_page.py | 26 ++++++ 9 files changed, 208 insertions(+), 37 deletions(-) diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts index b7f08654..fbf5b7f9 100644 --- a/skyvern-frontend/src/api/types.ts +++ b/skyvern-frontend/src/api/types.ts @@ -267,6 +267,7 @@ export const ActionTypes = { Hover: "hover", SelectOption: "select_option", UploadFile: "upload_file", + DownloadFile: "download_file", complete: "complete", wait: "wait", terminate: "terminate", @@ -294,6 +295,7 @@ export const ReadableActionTypes: { hover: "Hover", select_option: "Select Option", upload_file: "Upload File", + download_file: "Download File", complete: "Complete", wait: "Wait", terminate: "Terminate", diff --git a/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx b/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx index 4174d099..a317eab8 100644 --- a/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx +++ b/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx @@ -1,5 +1,10 @@ import { ActionType, ReadableActionTypes } from "@/api/types"; -import { CursorArrowIcon, HandIcon, InputIcon } from "@radix-ui/react-icons"; +import { + CursorArrowIcon, + HandIcon, + DownloadIcon, + InputIcon, +} from "@radix-ui/react-icons"; type Props = { actionType: ActionType; @@ -9,6 +14,7 @@ const icons: Partial> = { click: , hover: , input_text: , + download_file: , }; function ActionTypePill({ actionType }: Props) { diff --git a/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx b/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx index 359ddcc7..cc96fe1c 100644 --- a/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx +++ b/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx @@ -3,6 +3,7 @@ import { CheckCircledIcon, CursorArrowIcon, HandIcon, + DownloadIcon, InputIcon, QuestionMarkIcon, } from "@radix-ui/react-icons"; @@ -17,6 +18,7 @@ const icons: Partial> = { hover: , complete: , input_text: , + download_file: , }; function ActionTypePillMinimal({ actionType }: Props) { diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index c02faf9d..0193866c 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -104,6 +104,14 @@ class ImaginaryFileUrl(SkyvernException): super().__init__(f"File url {file_url} is imaginary.") +class DownloadedFileNotFound(SkyvernException): + def __init__(self, downloaded_path: str, download_url: str | None = None) -> None: + message = f"Downloaded file does not exist at path: {downloaded_path}. This may indicate the download failed silently or the file was removed." + if download_url: + message += f" Download URL: {download_url}" + super().__init__(message) + + class MissingBrowserState(SkyvernException): def __init__(self, task_id: str | None = None, workflow_run_id: str | None = None) -> None: task_str = f"task_id={task_id}" if task_id else "" @@ -878,3 +886,19 @@ class InvalidSchemaError(SkyvernException): self.message = message self.validation_errors = validation_errors or [] super().__init__(self.message) + + +class PDFEmbedBase64DecodeError(SkyvernException): + """Raised when failed to extract or decode base64 data from PDF embed src attribute.""" + + def __init__(self, pdf_embed_src: str | None = None, reason: str | None = None): + self.pdf_embed_src = pdf_embed_src + self.reason = reason + message = "Failed to extract or decode base64 data from PDF embed src" + if reason: + message += f". Reason: {reason}" + if pdf_embed_src: + # Truncate long base64 strings for logging + src_preview = pdf_embed_src[:100] + "..." if len(pdf_embed_src) > 100 else pdf_embed_src + message += f". PDF embed src: {src_preview}" + super().__init__(message) diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 19e400a5..7aef2181 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -6,6 +6,7 @@ import os import random import re import string +import uuid from asyncio.exceptions import CancelledError from dataclasses import dataclass from datetime import UTC, datetime @@ -51,6 +52,7 @@ from skyvern.exceptions import ( MissingBrowserStatePage, MissingExtractActionsResponse, NoTOTPVerificationCodeFound, + PDFEmbedBase64DecodeError, ScrapingFailed, SkyvernException, StepTerminationError, @@ -110,6 +112,7 @@ from skyvern.webeye.actions.actions import ( CompleteAction, CompleteVerifyResult, DecisiveAction, + DownloadFileAction, ExtractAction, GotoUrlAction, ReloadPageAction, @@ -1035,16 +1038,73 @@ class ForgeAgent: if json_response is None: raise MissingExtractActionsResponse() try: - otp_json_response, otp_actions = await self.handle_potential_OTP_actions( - task, step, scraped_page, browser_state, json_response - ) - if otp_actions: - detailed_agent_step_output.llm_response = otp_json_response - actions = otp_actions + if pdf_embed_src := scraped_page.check_pdf_viewer_embed(): + LOG.info("Generate DownloadFileAction for PDF viewer page", step_id=step.step_id) + pdf_bytes: bytes | None = None + download_url: str | None = None + + # Check if the embed src is a data URI with base64 encoded PDF + # Format: data:application/pdf[;charset=...];base64, + if pdf_embed_src.startswith("data:application/pdf"): + # Use more precise regex to extract base64 data after the base64, prefix + # This pattern matches: data:application/pdf[;optional_params];base64, + m = re.search(r"data:application/pdf[^;]*;base64,(.+)", pdf_embed_src, re.S) + if not m: + raise PDFEmbedBase64DecodeError( + pdf_embed_src=pdf_embed_src, + reason="Failed to extract base64 data from PDF embed src. Expected format: data:application/pdf[;charset=...];base64,", + ) + + base64_data = m.group(1) + LOG.info( + "Found base64 data in PDF embed src", + step_id=step.step_id, + base64_data_length=len(base64_data), + ) + + # Decode base64 data with error handling + try: + pdf_bytes = base64.b64decode(base64_data, validate=True) + except Exception as e: + raise PDFEmbedBase64DecodeError( + pdf_embed_src=pdf_embed_src, + reason=f"Failed to decode base64 data: {str(e)}", + ) from e + else: + # If not a data URI, treat it as a URL + LOG.info( + "Found PDF embed src as URL (not base64 data)", + step_id=step.step_id, + download_url=pdf_embed_src, + ) + download_url = pdf_embed_src + + actions = [ + DownloadFileAction( + reasoning="Downloading the file from the PDF viewer.", + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=0, + file_name=f"{uuid.uuid4()}.pdf", + byte=pdf_bytes, + download_url=download_url, + download=True, + ) + ] else: - actions = parse_actions( - task, step.step_id, step.order, scraped_page, json_response["actions"] + otp_json_response, otp_actions = await self.handle_potential_OTP_actions( + task, step, scraped_page, browser_state, json_response ) + if otp_actions: + detailed_agent_step_output.llm_response = otp_json_response + actions = otp_actions + else: + actions = parse_actions( + task, step.step_id, step.order, scraped_page, json_response["actions"] + ) if context: context.pop_totp_code(task.task_id) @@ -1762,6 +1822,11 @@ class ForgeAgent: persist_artifacts=False, ) + if scraped_page.check_pdf_viewer_embed(): + next_step.is_speculative = False + LOG.info("Skipping speculative extract-actions for PDF viewer page", step_id=current_step.step_id) + return None + llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler( task.llm_key, default=app.LLM_API_HANDLER, diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index b9e066bc..5253ffe7 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -246,13 +246,15 @@ class UploadFileAction(WebAction): return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})" -# this is a deprecated action type -class DownloadFileAction(WebAction): +# This action is deprecated in 'extract-actions' prompt. Only used for the download action triggered by the code. +class DownloadFileAction(Action): action_type: ActionType = ActionType.DOWNLOAD_FILE file_name: str + byte: Annotated[bytes | None, Field(exclude=True)] = None # bytes data + download_url: str | None = None # URL to download file from def __repr__(self) -> str: - return f"DownloadFileAction(element_id={self.element_id}, file_name={self.file_name})" + return f"DownloadFileAction(file_name={self.file_name}, download_url={self.download_url}, has_byte={self.byte is not None})" class NullAction(Action): diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index d4172046..4833b498 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -2,6 +2,7 @@ import asyncio import copy import json import os +import shutil import time import urllib.parse import uuid @@ -21,11 +22,11 @@ from skyvern.constants import ( BROWSER_DOWNLOAD_MAX_WAIT_TIME, BROWSER_DOWNLOAD_TIMEOUT, DROPDOWN_MENU_MAX_DISTANCE, - REPO_ROOT_DIR, SKYVERN_ID_ATTR, ) from skyvern.errors.errors import TOTPExpiredError from skyvern.exceptions import ( + DownloadedFileNotFound, DownloadFileMaxWaitingTime, EmptySelect, ErrEmptyTweakValue, @@ -60,6 +61,7 @@ from skyvern.exceptions import ( from skyvern.experimentation.wait_utils import get_or_create_wait_config, get_wait_time from skyvern.forge import app from skyvern.forge.prompts import prompt_engine +from skyvern.forge.sdk.api.files import download_file as download_file_api from skyvern.forge.sdk.api.files import ( get_download_dir, list_downloading_files_in_directory, @@ -102,6 +104,7 @@ from skyvern.webeye.actions.actions import ( WebAction, ) from skyvern.webeye.actions.responses import ActionAbort, ActionFailure, ActionResult, ActionSuccess +from skyvern.webeye.browser_factory import initialize_download_dir from skyvern.webeye.scraper.scraped_page import ( CleanupElementTreeFunc, ElementTreeBuilder, @@ -1591,7 +1594,8 @@ async def handle_upload_file_action( ) -# This function is deprecated. Downloads are handled by the click action handler now. +# This function is deprecated in 'extract-actions' prompt. Downloads are handled by the click action handler now. +# Currently, it's only used for the download action triggered by the code. @TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) async def handle_download_file_action( action: actions.DownloadFileAction, @@ -1600,42 +1604,72 @@ async def handle_download_file_action( task: Task, step: Step, ) -> list[ActionResult]: - # Get wait config once for this handler - wait_config = await get_or_create_wait_config(task.task_id, task.workflow_run_id, task.organization_id) - - dom = DomUtil(scraped_page=scraped_page, page=page) - skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) - file_name = f"{action.file_name or uuid.uuid4()}" - full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}" + download_folder = initialize_download_dir() + full_file_path = f"{download_folder}/{file_name}" + try: - # Start waiting for the download - async with page.expect_download() as download_info: - await asyncio.sleep(get_wait_time(wait_config, "post_click_delay", default=0.3)) + # Priority 1: If byte data is provided, save it directly + if action.byte is not None: + with open(full_file_path, "wb") as f: + f.write(action.byte) - locator = skyvern_element.locator - await locator.click( - timeout=settings.BROWSER_ACTION_TIMEOUT_MS, - modifiers=["Alt"], + LOG.info( + "DownloadFileAction: Saved file from byte data", + action=action, + full_file_path=full_file_path, + file_size=len(action.byte), ) + return [ActionSuccess(download_triggered=True)] - download = await download_info.value + # Priority 2: If download_url is provided, download from URL + if action.download_url is not None: + downloaded_path = await download_file_api(action.download_url) + # Check if the downloaded file actually exists + if not os.path.exists(downloaded_path): + LOG.error( + "DownloadFileAction: Downloaded file path does not exist", + action=action, + downloaded_path=downloaded_path, + download_url=action.download_url, + full_file_path=full_file_path, + ) + return [ActionFailure(DownloadedFileNotFound(downloaded_path, action.download_url))] + + # Move the downloaded file to the target location + # If the downloaded file has a different name, use it; otherwise use the specified file_name + if os.path.basename(downloaded_path) != file_name: + # Copy to target location with specified file_name + shutil.copy2(downloaded_path, full_file_path) + # Optionally remove the temporary file + try: + os.remove(downloaded_path) + except Exception: + pass # Ignore errors when removing temp file + else: + # Move to target location + shutil.move(downloaded_path, full_file_path) + + LOG.info( + "DownloadFileAction: Downloaded file from URL", + action=action, + full_file_path=full_file_path, + download_url=action.download_url, + ) + return [ActionSuccess(download_triggered=True)] + + return [ActionSuccess(download_triggered=False)] - # Create download folders if they don't exist - download_folder = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}" - os.makedirs(download_folder, exist_ok=True) - # Wait for the download process to complete and save the downloaded file - await download.save_as(full_file_path) except Exception as e: LOG.exception( "DownloadFileAction: Failed to download file", action=action, full_file_path=full_file_path, + download_url=action.download_url, + has_byte=action.byte is not None, ) return [ActionFailure(e)] - return [ActionSuccess(data={"file_path": full_file_path})] - @TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) async def handle_null_action( @@ -2250,7 +2284,7 @@ ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captch ActionHandler.register_action_type(ActionType.CLICK, handle_click_action) ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action) ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action) -# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action) +ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action) ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action) ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action) ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index c43780ac..b20f6367 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -1794,6 +1794,16 @@ async function buildElementTree( elementObj = await buildElementObject(frame, element, interactable); } else if (tagName === "div" && isDOMNodeRepresentDiv(element)) { elementObj = await buildElementObject(frame, element, interactable); + } else if ( + tagName === "embed" && + element.getAttribute("type")?.toLowerCase() === "application/pdf" + ) { + elementObj = await buildElementObject( + frame, + element, + interactable, + true, + ); } else if ( getElementText(element).length > 0 && getElementText(element).length <= 5000 diff --git a/skyvern/webeye/scraper/scraped_page.py b/skyvern/webeye/scraper/scraped_page.py index ff2dda38..0ab48feb 100644 --- a/skyvern/webeye/scraper/scraped_page.py +++ b/skyvern/webeye/scraper/scraped_page.py @@ -178,6 +178,32 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): self._clean_up_func = clean_up_func self._scrape_exclude = scrape_exclude + def check_pdf_viewer_embed(self) -> str | None: + """ + Check if the page contains a PDF viewer embed. + If found, return the src attribute of the embed. + """ + if len(self.elements) != 1: + return None + + element = self.elements[0] + if element.get("tagName", "") != "embed": + return None + + attributes: dict = element.get("attributes", {}) + if not attributes: + return None + + type_attr: str | None = attributes.get("type") + if not type_attr: + return None + + if type_attr.lower() != "application/pdf": + return None + + LOG.info("Found a PDF viewer page", element=element) + return attributes.get("src", "") + def support_economy_elements_tree(self) -> bool: return True