Re-enable the download action (#4299)

This commit is contained in:
LawyZheng
2025-12-15 14:30:32 +08:00
committed by GitHub
parent 6178a20824
commit ce717146f3
9 changed files with 208 additions and 37 deletions

View File

@@ -267,6 +267,7 @@ export const ActionTypes = {
Hover: "hover",
SelectOption: "select_option",
UploadFile: "upload_file",
DownloadFile: "download_file",
complete: "complete",
wait: "wait",
terminate: "terminate",
@@ -294,6 +295,7 @@ export const ReadableActionTypes: {
hover: "Hover",
select_option: "Select Option",
upload_file: "Upload File",
download_file: "Download File",
complete: "Complete",
wait: "Wait",
terminate: "Terminate",

View File

@@ -1,5 +1,10 @@
import { ActionType, ReadableActionTypes } from "@/api/types";
import { CursorArrowIcon, HandIcon, InputIcon } from "@radix-ui/react-icons";
import {
CursorArrowIcon,
HandIcon,
DownloadIcon,
InputIcon,
} from "@radix-ui/react-icons";
type Props = {
actionType: ActionType;
@@ -9,6 +14,7 @@ const icons: Partial<Record<ActionType, React.ReactNode>> = {
click: <CursorArrowIcon className="h-4 w-4" />,
hover: <HandIcon className="h-4 w-4" />,
input_text: <InputIcon className="h-4 w-4" />,
download_file: <DownloadIcon className="h-4 w-4" />,
};
function ActionTypePill({ actionType }: Props) {

View File

@@ -3,6 +3,7 @@ import {
CheckCircledIcon,
CursorArrowIcon,
HandIcon,
DownloadIcon,
InputIcon,
QuestionMarkIcon,
} from "@radix-ui/react-icons";
@@ -17,6 +18,7 @@ const icons: Partial<Record<ActionType, React.ReactNode>> = {
hover: <HandIcon className="h-4 w-4" />,
complete: <CheckCircledIcon className="h-4 w-4" />,
input_text: <InputIcon className="h-4 w-4" />,
download_file: <DownloadIcon className="h-4 w-4" />,
};
function ActionTypePillMinimal({ actionType }: Props) {

View File

@@ -104,6 +104,14 @@ class ImaginaryFileUrl(SkyvernException):
super().__init__(f"File url {file_url} is imaginary.")
class DownloadedFileNotFound(SkyvernException):
    """Raised when a download completed but the resulting file is absent on disk."""

    def __init__(self, downloaded_path: str, download_url: str | None = None) -> None:
        # Build the message in parts: the path is always reported; the source
        # URL is appended only when the caller knows it.
        parts = [
            f"Downloaded file does not exist at path: {downloaded_path}. This may indicate the download failed silently or the file was removed."
        ]
        if download_url:
            parts.append(f" Download URL: {download_url}")
        super().__init__("".join(parts))
class MissingBrowserState(SkyvernException):
def __init__(self, task_id: str | None = None, workflow_run_id: str | None = None) -> None:
task_str = f"task_id={task_id}" if task_id else ""
@@ -878,3 +886,19 @@ class InvalidSchemaError(SkyvernException):
self.message = message
self.validation_errors = validation_errors or []
super().__init__(self.message)
class PDFEmbedBase64DecodeError(SkyvernException):
    """Raised when failed to extract or decode base64 data from PDF embed src attribute."""

    def __init__(self, pdf_embed_src: str | None = None, reason: str | None = None):
        self.pdf_embed_src = pdf_embed_src
        self.reason = reason

        # Assemble the message incrementally; optional details are appended
        # only when provided.
        segments = ["Failed to extract or decode base64 data from PDF embed src"]
        if reason:
            segments.append(f". Reason: {reason}")
        if pdf_embed_src:
            # Truncate long base64 strings for logging
            if len(pdf_embed_src) > 100:
                src_preview = pdf_embed_src[:100] + "..."
            else:
                src_preview = pdf_embed_src
            segments.append(f". PDF embed src: {src_preview}")
        super().__init__("".join(segments))

View File

@@ -6,6 +6,7 @@ import os
import random
import re
import string
import uuid
from asyncio.exceptions import CancelledError
from dataclasses import dataclass
from datetime import UTC, datetime
@@ -51,6 +52,7 @@ from skyvern.exceptions import (
MissingBrowserStatePage,
MissingExtractActionsResponse,
NoTOTPVerificationCodeFound,
PDFEmbedBase64DecodeError,
ScrapingFailed,
SkyvernException,
StepTerminationError,
@@ -110,6 +112,7 @@ from skyvern.webeye.actions.actions import (
CompleteAction,
CompleteVerifyResult,
DecisiveAction,
DownloadFileAction,
ExtractAction,
GotoUrlAction,
ReloadPageAction,
@@ -1035,16 +1038,73 @@ class ForgeAgent:
if json_response is None:
raise MissingExtractActionsResponse()
try:
otp_json_response, otp_actions = await self.handle_potential_OTP_actions(
task, step, scraped_page, browser_state, json_response
)
if otp_actions:
detailed_agent_step_output.llm_response = otp_json_response
actions = otp_actions
if pdf_embed_src := scraped_page.check_pdf_viewer_embed():
LOG.info("Generate DownloadFileAction for PDF viewer page", step_id=step.step_id)
pdf_bytes: bytes | None = None
download_url: str | None = None
# Check if the embed src is a data URI with base64 encoded PDF
# Format: data:application/pdf[;charset=...];base64,<base64_data>
if pdf_embed_src.startswith("data:application/pdf"):
# Use more precise regex to extract base64 data after the base64, prefix
# This pattern matches: data:application/pdf[;optional_params];base64,<data>
m = re.search(r"data:application/pdf[^;]*;base64,(.+)", pdf_embed_src, re.S)
if not m:
raise PDFEmbedBase64DecodeError(
pdf_embed_src=pdf_embed_src,
reason="Failed to extract base64 data from PDF embed src. Expected format: data:application/pdf[;charset=...];base64,<data>",
)
base64_data = m.group(1)
LOG.info(
"Found base64 data in PDF embed src",
step_id=step.step_id,
base64_data_length=len(base64_data),
)
# Decode base64 data with error handling
try:
pdf_bytes = base64.b64decode(base64_data, validate=True)
except Exception as e:
raise PDFEmbedBase64DecodeError(
pdf_embed_src=pdf_embed_src,
reason=f"Failed to decode base64 data: {str(e)}",
) from e
else:
# If not a data URI, treat it as a URL
LOG.info(
"Found PDF embed src as URL (not base64 data)",
step_id=step.step_id,
download_url=pdf_embed_src,
)
download_url = pdf_embed_src
actions = [
DownloadFileAction(
reasoning="Downloading the file from the PDF viewer.",
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=0,
file_name=f"{uuid.uuid4()}.pdf",
byte=pdf_bytes,
download_url=download_url,
download=True,
)
]
else:
actions = parse_actions(
task, step.step_id, step.order, scraped_page, json_response["actions"]
otp_json_response, otp_actions = await self.handle_potential_OTP_actions(
task, step, scraped_page, browser_state, json_response
)
if otp_actions:
detailed_agent_step_output.llm_response = otp_json_response
actions = otp_actions
else:
actions = parse_actions(
task, step.step_id, step.order, scraped_page, json_response["actions"]
)
if context:
context.pop_totp_code(task.task_id)
@@ -1762,6 +1822,11 @@ class ForgeAgent:
persist_artifacts=False,
)
if scraped_page.check_pdf_viewer_embed():
next_step.is_speculative = False
LOG.info("Skipping speculative extract-actions for PDF viewer page", step_id=current_step.step_id)
return None
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
task.llm_key,
default=app.LLM_API_HANDLER,

View File

@@ -246,13 +246,15 @@ class UploadFileAction(WebAction):
return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
# this is a deprecated action type
class DownloadFileAction(WebAction):
# This action is deprecated in 'extract-actions' prompt. Only used for the download action triggered by the code.
class DownloadFileAction(Action):
action_type: ActionType = ActionType.DOWNLOAD_FILE
file_name: str
byte: Annotated[bytes | None, Field(exclude=True)] = None # bytes data
download_url: str | None = None # URL to download file from
def __repr__(self) -> str:
return f"DownloadFileAction(element_id={self.element_id}, file_name={self.file_name})"
return f"DownloadFileAction(file_name={self.file_name}, download_url={self.download_url}, has_byte={self.byte is not None})"
class NullAction(Action):

View File

@@ -2,6 +2,7 @@ import asyncio
import copy
import json
import os
import shutil
import time
import urllib.parse
import uuid
@@ -21,11 +22,11 @@ from skyvern.constants import (
BROWSER_DOWNLOAD_MAX_WAIT_TIME,
BROWSER_DOWNLOAD_TIMEOUT,
DROPDOWN_MENU_MAX_DISTANCE,
REPO_ROOT_DIR,
SKYVERN_ID_ATTR,
)
from skyvern.errors.errors import TOTPExpiredError
from skyvern.exceptions import (
DownloadedFileNotFound,
DownloadFileMaxWaitingTime,
EmptySelect,
ErrEmptyTweakValue,
@@ -60,6 +61,7 @@ from skyvern.exceptions import (
from skyvern.experimentation.wait_utils import get_or_create_wait_config, get_wait_time
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.files import download_file as download_file_api
from skyvern.forge.sdk.api.files import (
get_download_dir,
list_downloading_files_in_directory,
@@ -102,6 +104,7 @@ from skyvern.webeye.actions.actions import (
WebAction,
)
from skyvern.webeye.actions.responses import ActionAbort, ActionFailure, ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import initialize_download_dir
from skyvern.webeye.scraper.scraped_page import (
CleanupElementTreeFunc,
ElementTreeBuilder,
@@ -1591,7 +1594,8 @@ async def handle_upload_file_action(
)
# This function is deprecated. Downloads are handled by the click action handler now.
# This function is deprecated in 'extract-actions' prompt. Downloads are handled by the click action handler now.
# Currently, it's only used for the download action triggered by the code.
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
async def handle_download_file_action(
action: actions.DownloadFileAction,
@@ -1600,42 +1604,72 @@ async def handle_download_file_action(
task: Task,
step: Step,
) -> list[ActionResult]:
# Get wait config once for this handler
wait_config = await get_or_create_wait_config(task.task_id, task.workflow_run_id, task.organization_id)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
download_folder = initialize_download_dir()
full_file_path = f"{download_folder}/{file_name}"
try:
# Start waiting for the download
async with page.expect_download() as download_info:
await asyncio.sleep(get_wait_time(wait_config, "post_click_delay", default=0.3))
# Priority 1: If byte data is provided, save it directly
if action.byte is not None:
with open(full_file_path, "wb") as f:
f.write(action.byte)
locator = skyvern_element.locator
await locator.click(
timeout=settings.BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
LOG.info(
"DownloadFileAction: Saved file from byte data",
action=action,
full_file_path=full_file_path,
file_size=len(action.byte),
)
return [ActionSuccess(download_triggered=True)]
download = await download_info.value
# Priority 2: If download_url is provided, download from URL
if action.download_url is not None:
downloaded_path = await download_file_api(action.download_url)
# Check if the downloaded file actually exists
if not os.path.exists(downloaded_path):
LOG.error(
"DownloadFileAction: Downloaded file path does not exist",
action=action,
downloaded_path=downloaded_path,
download_url=action.download_url,
full_file_path=full_file_path,
)
return [ActionFailure(DownloadedFileNotFound(downloaded_path, action.download_url))]
# Move the downloaded file to the target location
# If the downloaded file has a different name, use it; otherwise use the specified file_name
if os.path.basename(downloaded_path) != file_name:
# Copy to target location with specified file_name
shutil.copy2(downloaded_path, full_file_path)
# Optionally remove the temporary file
try:
os.remove(downloaded_path)
except Exception:
pass # Ignore errors when removing temp file
else:
# Move to target location
shutil.move(downloaded_path, full_file_path)
LOG.info(
"DownloadFileAction: Downloaded file from URL",
action=action,
full_file_path=full_file_path,
download_url=action.download_url,
)
return [ActionSuccess(download_triggered=True)]
return [ActionSuccess(download_triggered=False)]
# Create download folders if they don't exist
download_folder = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}"
os.makedirs(download_folder, exist_ok=True)
# Wait for the download process to complete and save the downloaded file
await download.save_as(full_file_path)
except Exception as e:
LOG.exception(
"DownloadFileAction: Failed to download file",
action=action,
full_file_path=full_file_path,
download_url=action.download_url,
has_byte=action.byte is not None,
)
return [ActionFailure(e)]
return [ActionSuccess(data={"file_path": full_file_path})]
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
async def handle_null_action(
@@ -2250,7 +2284,7 @@ ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captch
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)

View File

@@ -1794,6 +1794,16 @@ async function buildElementTree(
elementObj = await buildElementObject(frame, element, interactable);
} else if (tagName === "div" && isDOMNodeRepresentDiv(element)) {
elementObj = await buildElementObject(frame, element, interactable);
} else if (
tagName === "embed" &&
element.getAttribute("type")?.toLowerCase() === "application/pdf"
) {
elementObj = await buildElementObject(
frame,
element,
interactable,
true,
);
} else if (
getElementText(element).length > 0 &&
getElementText(element).length <= 5000

View File

@@ -178,6 +178,32 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
self._clean_up_func = clean_up_func
self._scrape_exclude = scrape_exclude
def check_pdf_viewer_embed(self) -> str | None:
    """
    Detect whether this scraped page is a bare PDF viewer.

    A page qualifies when the element tree contains exactly one element,
    that element is an <embed>, and its ``type`` attribute (case-insensitive)
    is ``application/pdf``.

    Returns the embed's ``src`` attribute ("" when the attribute is absent)
    for a PDF viewer page, or None otherwise.
    """
    # A PDF viewer page is a single top-level <embed>; anything else disqualifies it.
    if len(self.elements) != 1:
        return None

    candidate = self.elements[0]
    if candidate.get("tagName", "") != "embed":
        return None

    attributes: dict = candidate.get("attributes", {})
    if not attributes:
        return None

    embed_type: str | None = attributes.get("type")
    # Both a missing type and a non-PDF type mean this is not a PDF viewer.
    if not embed_type or embed_type.lower() != "application/pdf":
        return None

    LOG.info("Found a PDF viewer page", element=candidate)
    return attributes.get("src", "")
def support_economy_elements_tree(self) -> bool:
    # ScrapedPage always supports the economy (reduced) element-tree representation.
    return True