diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 21589fcf..c5063b01 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -16,6 +16,7 @@ Reply in JSON format with the following keys: "id": int, // The id of the element to take action on. The id has to be one from the elements list "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. + "download": bool, // Can only be true for CLICK actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. "option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE "index": int, // the id corresponding to the optionIndex under the the select element. 
diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
index 1bd64f3b..2ecb2799 100644
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -52,9 +52,7 @@ class DecisiveAction(Action, abc.ABC):
 class ClickAction(WebAction):
     action_type: ActionType = ActionType.CLICK
     file_url: str | None = None
-
-    def __repr__(self) -> str:
-        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url})"
+    download: bool = False  # True asks the click handler to perform the click in download mode
 
 
 class InputTextAction(WebAction):
@@ -162,7 +160,15 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
             actions.append(TerminateAction(reasoning=reasoning, errors=action["errors"] if "errors" in action else []))
         elif action_type == ActionType.CLICK:
             file_url = action["file_url"] if "file_url" in action else None
-            actions.append(ClickAction(element_id=element_id, reasoning=reasoning, file_url=file_url))
+            actions.append(
+                ClickAction(
+                    element_id=element_id,
+                    reasoning=reasoning,
+                    file_url=file_url,
+                    # The model may send null for keys that do not apply; coerce that to False.
+                    download=action.get("download") or False,
+                )
+            )
         elif action_type == ActionType.INPUT_TEXT:
             actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
         elif action_type == ActionType.UPLOAD_FILE:
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
index 68ebbc07..668e0d17 100644
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -142,11 +142,40 @@ async def handle_click_action(
 ) -> list[ActionResult]:
     xpath = await validate_actions_in_dom(action, page, scraped_page)
     await asyncio.sleep(0.3)
+    if action.download:
+        return await handle_click_to_download_file_action(action, page, scraped_page)
     return await chain_click(
         task, page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
     )
 
 
+async def handle_click_to_download_file_action(
+    action: actions.ClickAction,
+    page: Page,
+    scraped_page: ScrapedPage,
+) -> list[ActionResult]:
+    """Click the element behind ``action`` with the Alt modifier held down.
+
+    Reached from ``handle_click_action`` when ``action.download`` is set. The xpath is resolved
+    here through ``validate_actions_in_dom`` rather than being passed in by the caller.
+    NOTE(review): Alt+click is presumably what makes the browser save the target instead of
+    navigating to it -- confirm against the browsers this runs on.
+
+    Returns a one-item list: ``ActionSuccess`` when the click completes, otherwise an
+    ``ActionFailure`` carrying the raised exception with ``download_triggered=False``.
+    """
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    try:
+        await page.click(
+            f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, modifiers=["Alt"]
+        )
+    except Exception as e:
+        LOG.exception("ClickAction with download failed", action=action, exc_info=True)
+        return [ActionFailure(e, download_triggered=False)]
+
+    return [ActionSuccess()]
+
+
 async def handle_input_text_action(
     action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
 ) -> list[ActionResult]:
diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py
index d40d3bf6..7f4dcd4c 100644
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -26,6 +26,7 @@ RESERVED_ATTRIBUTES = {
     "aria-role",
     "aria-selected",  # for option tag
     "checked",
+    "data-original-title",  # for bootstrap tooltip
     "data-ui",
     "for",
     "href",  # For a tags
@@ -373,7 +374,7 @@ def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
             new_attributes[key] = attributes[key]
         if key == "role" and attributes[key] in ["listbox", "option"]:
             new_attributes[key] = attributes[key]
-        if key in RESERVED_ATTRIBUTES:
+        if key in RESERVED_ATTRIBUTES and attributes[key]:
             new_attributes[key] = attributes[key]
 
     return new_attributes