From f49b07f30d0401884eb3a60eabd9727386ed2ec2 Mon Sep 17 00:00:00 2001 From: Mohamed Khalil <69445107+mohamedmamdouh22@users.noreply.github.com> Date: Tue, 9 Dec 2025 17:27:26 +0200 Subject: [PATCH] feat: add hover action support (#3994) Co-authored-by: LawyZheng --- skyvern-frontend/package-lock.json | 2 +- skyvern-frontend/src/api/types.ts | 2 + .../routes/tasks/detail/ActionTypePill.tsx | 1 + .../tasks/detail/ActionTypePillMinimal.tsx | 1 + .../routes/tasks/detail/hooks/useActions.ts | 2 + .../script_generations/generate_script.py | 15 +++++ .../script_generations/script_skyvern_page.py | 1 + .../core/script_generations/skyvern_page.py | 21 ++++++ skyvern/exceptions.py | 5 ++ skyvern/forge/agent.py | 27 ++++++++ .../prompts/skyvern/extract-action-static.j2 | 5 +- .../forge/prompts/skyvern/extract-action.j2 | 5 +- skyvern/forge/sdk/core/skyvern_context.py | 5 ++ skyvern/forge/sdk/db/utils.py | 2 + skyvern/library/skyvern_locator.py | 1 + skyvern/webeye/actions/action_types.py | 3 + skyvern/webeye/actions/actions.py | 5 ++ skyvern/webeye/actions/handler.py | 65 +++++++++++++++++++ skyvern/webeye/actions/parse_actions.py | 4 ++ skyvern/webeye/scraper/domUtils.js | 50 +++++++++++++- skyvern/webeye/scraper/scraper.py | 28 ++++++-- skyvern/webeye/utils/dom.py | 44 +++++++++++++ 22 files changed, 281 insertions(+), 13 deletions(-) diff --git a/skyvern-frontend/package-lock.json b/skyvern-frontend/package-lock.json index c688a0cd..b2baff21 100644 --- a/skyvern-frontend/package-lock.json +++ b/skyvern-frontend/package-lock.json @@ -9458,4 +9458,4 @@ } } } -} +} \ No newline at end of file diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts index c4c839a9..f91a139a 100644 --- a/skyvern-frontend/src/api/types.ts +++ b/skyvern-frontend/src/api/types.ts @@ -264,6 +264,7 @@ export interface CustomCredentialServiceConfigResponse { export const ActionTypes = { InputText: "input_text", Click: "click", + Hover: "hover", SelectOption: 
"select_option", UploadFile: "upload_file", complete: "complete", @@ -290,6 +291,7 @@ export const ReadableActionTypes: { } = { input_text: "Input Text", click: "Click", + hover: "Hover", select_option: "Select Option", upload_file: "Upload File", complete: "Complete", diff --git a/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx b/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx index b58550e2..92930b05 100644 --- a/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx +++ b/skyvern-frontend/src/routes/tasks/detail/ActionTypePill.tsx @@ -7,6 +7,7 @@ type Props = { const icons: Partial> = { click: , + hover: , input_text: , }; diff --git a/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx b/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx index ef873b25..319ec78a 100644 --- a/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx +++ b/skyvern-frontend/src/routes/tasks/detail/ActionTypePillMinimal.tsx @@ -13,6 +13,7 @@ type Props = { const icons: Partial> = { click: , + hover: , complete: , input_text: , }; diff --git a/skyvern-frontend/src/routes/tasks/detail/hooks/useActions.ts b/skyvern-frontend/src/routes/tasks/detail/hooks/useActions.ts index 40cbb0e7..2a60fbac 100644 --- a/skyvern-frontend/src/routes/tasks/detail/hooks/useActions.ts +++ b/skyvern-frontend/src/routes/tasks/detail/hooks/useActions.ts @@ -18,6 +18,8 @@ function getActionInput(action: ActionApiResponse) { input = action.text; } else if (action.action_type === ActionTypes.Click) { input = "Click"; + } else if (action.action_type === ActionTypes.Hover) { + input = "Hover"; } else if (action.action_type === ActionTypes.SelectOption && action.option) { input = action.option.label; } diff --git a/skyvern/core/script_generations/generate_script.py b/skyvern/core/script_generations/generate_script.py index 991b9513..8f1087f0 100644 --- a/skyvern/core/script_generations/generate_script.py +++ 
b/skyvern/core/script_generations/generate_script.py @@ -91,6 +91,7 @@ def sanitize_variable_name(name: str) -> str: ACTION_MAP = { "click": "click", + "hover": "hover", "input_text": "fill", "upload_file": "upload_file", "select_option": "select_option", @@ -108,6 +109,7 @@ ACTION_MAP = { } ACTIONS_WITH_XPATH = [ "click", + "hover", "input_text", "type", "fill", @@ -276,6 +278,19 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output: ), ) ) + elif method == "hover": + hold_seconds = act.get("hold_seconds") + if hold_seconds and hold_seconds > 0: + args.append( + cst.Arg( + keyword=cst.Name("hold_seconds"), + value=_value(hold_seconds), + whitespace_after_arg=cst.ParenthesizedWhitespace( + indent=True, + last_line=cst.SimpleWhitespace(INDENT), + ), + ) + ) elif method in ["type", "fill"]: # Use context.parameters if field_name is available, otherwise fallback to direct value if act.get("field_name"): diff --git a/skyvern/core/script_generations/script_skyvern_page.py b/skyvern/core/script_generations/script_skyvern_page.py index ae75f4e4..30981ad7 100644 --- a/skyvern/core/script_generations/script_skyvern_page.py +++ b/skyvern/core/script_generations/script_skyvern_page.py @@ -136,6 +136,7 @@ class ScriptSkyvernPage(SkyvernPage): ActionType.INPUT_TEXT: "⌨️", ActionType.UPLOAD_FILE: "📤", ActionType.DOWNLOAD_FILE: "📥", + ActionType.HOVER: "🖱️", ActionType.SELECT_OPTION: "🎯", ActionType.WAIT: "⏳", ActionType.SOLVE_CAPTCHA: "🔓", diff --git a/skyvern/core/script_generations/skyvern_page.py b/skyvern/core/script_generations/skyvern_page.py index 1c5f3c0c..2fd02214 100644 --- a/skyvern/core/script_generations/skyvern_page.py +++ b/skyvern/core/script_generations/skyvern_page.py @@ -207,6 +207,27 @@ class SkyvernPage(Page): return selector + @action_wrap(ActionType.HOVER) + async def hover( + self, + selector: str, + *, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + hold_seconds: float = 0.0, + intention: str | None = None, + 
**kwargs: Any, + ) -> str: + """Move the mouse over the element identified by `selector`.""" + if not selector: + raise ValueError("Hover requires a selector.") + + locator = self.page.locator(selector, **kwargs) + await locator.scroll_into_view_if_needed() + await locator.hover(timeout=timeout) + if hold_seconds and hold_seconds > 0: + await asyncio.sleep(hold_seconds) + return selector + @overload async def fill( self, diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 8abbe566..c02faf9d 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -501,6 +501,11 @@ class FailToClick(SkyvernException): super().__init__(f"Failed to click({anchor}). element_id={element_id}, error_msg={msg}") +class FailToHover(SkyvernException): + def __init__(self, element_id: str, msg: str): + super().__init__(f"Failed to hover. element_id={element_id}, error_msg={msg}") + + class FailToSelectByLabel(SkyvernException): def __init__(self, element_id: str): super().__init__(f"Failed to select by label. 
element_id={element_id}") diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index ae9382bd..99d91893 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -2284,6 +2284,33 @@ class ForgeAgent: # If we don't have pre-scraped data, scrape normally if scraped_page is None: + # Scroll back to last hovered element position BEFORE scraping + # This ensures the element is visible in the screenshot taken during scraping + if context and context.last_hovered_element_page_y is not None: + try: + working_page = await browser_state.must_get_working_page() + # Get viewport height to center the element + viewport_height = await working_page.evaluate("window.innerHeight") + # Calculate scroll position to center the element in viewport + target_scroll_y = context.last_hovered_element_page_y - (viewport_height / 2) + target_scroll_y = max(0, target_scroll_y) # Don't scroll to negative + + LOG.info( + "Scrolling back to last hovered element position before scraping", + element_id=context.last_hovered_element_id, + absolute_page_y=context.last_hovered_element_page_y, + target_scroll_y=target_scroll_y, + ) + await working_page.evaluate(f"window.scrollTo(0, {target_scroll_y})") + # Small delay to let the scroll settle + await asyncio.sleep(0.3) + + # Clear the saved position after scrolling back + context.last_hovered_element_page_y = None + context.last_hovered_element_id = None + except Exception: + LOG.warning("Failed to scroll back to hovered element position before scraping", exc_info=True) + # Check PostHog for speed optimizations BEFORE scraping # This decision will be used in both: # 1. 
SVG conversion skip (in agent_functions.py cleanup) diff --git a/skyvern/forge/prompts/skyvern/extract-action-static.j2 b/skyvern/forge/prompts/skyvern/extract-action-static.j2 index 73868a95..abb0116d 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-static.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-static.j2 @@ -3,9 +3,10 @@ Include only the elements that are relevant to the user goal, without altering o Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. Use the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Each interactable element is tagged with an ID. Avoid taking action on a disabled element when there is an alternative action available. +Each interactable element is tagged with an ID. Avoid taking action on a disabled element when there is an alternative action available. For element-based actions (CLICK, HOVER, INPUT_TEXT, UPLOAD_FILE, SELECT_OPTION), the `id` you output MUST exactly match one of the IDs from the provided elements list—never invent IDs (e.g., `INSTER_CARD_WRAPPER`). If the user refers to an element that isn't listed, choose the closest matching real element ID or explain why no suitable element exists. Actions that do not operate on a specific element (e.g., WAIT, SOLVE_CAPTCHA, RELOAD_PAGE, OPEN_TAB) should not provide an `id`. If you see any information in red in the page screenshot, this means a condition wasn't satisfied. prioritize actions with the red information. If you see a popup in the page screenshot, prioritize actions on the popup. 
+If the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name, always return a HOVER action on the element that needs hovering and explicitly wait for an on-hover UI update before attempting the next action. Use a WAIT action after the hover if the new element needs extra time to appear. Reply in JSON format with the following keys: { @@ -18,7 +19,7 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, describe the intention behind the action. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. 
"SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{ "complete criterion has been met" if complete_criterion else "user goal has been achieved" }} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} + "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element (for example to reveal hover-only menus) without clicking. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used only if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take such as hovering or clicking. 
"SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 8973e2c2..9e06868c 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -3,9 +3,10 @@ Include only the elements that are relevant to the user goal, without altering o Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. Use the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. MAKE SURE YOU OUTPUT VALID JSON. 
No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. -Each interactable element is tagged with an ID. Avoid taking action on a disabled element when there is an alternative action available. +Each interactable element is tagged with an ID. Avoid taking action on a disabled element when there is an alternative action available. The `id` you output MUST exactly match one of the IDs from the provided elements list—never invent IDs. If the user refers to an element that isn't listed, choose the closest matching real element ID or explain why no suitable element exists. If you see any information in red in the page screenshot, this means a condition wasn't satisfied. prioritize actions with the red information. If you see a popup in the page screenshot, prioritize actions on the popup. +If the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name, always return a HOVER action on the element that needs hovering and explicitly wait for an on-hover UI update before attempting the next action. Use a WAIT action after the hover if the new element needs extra time to appear. Reply in JSON format with the following keys: { @@ -18,7 +19,7 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. 
Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, describe the intention behind the action. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. 
All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} + "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element (for example, to reveal hover-only menus or buttons before clicking). "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used only if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take such as hovering or clicking. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. 
The id has to be one from the elements list "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index 8f3edc47..0d59cb66 100644 --- a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -67,6 +67,11 @@ class SkyvernContext: next_step_pre_scraped_data: dict[str, Any] | None = None speculative_plans: dict[str, Any] = field(default_factory=dict) + # Store absolute page position of last hovered element to scroll back after re-scraping + # This is the Y position relative to the entire page (not viewport) + last_hovered_element_page_y: float | None = None + last_hovered_element_id: str | None = None + """ Example output value: {"loop_value": "str", "output_parameter": "the key of the parameter", "output_value": Any} diff --git a/skyvern/forge/sdk/db/utils.py b/skyvern/forge/sdk/db/utils.py index 96ff62e4..6a470efd 100644 --- a/skyvern/forge/sdk/db/utils.py +++ b/skyvern/forge/sdk/db/utils.py @@ -70,6 +70,7 @@ from skyvern.webeye.actions.actions import ( DragAction, ExtractAction, GotoUrlAction, + HoverAction, InputTextAction, KeypressAction, LeftMouseAction, @@ -138,6 +139,7 @@ ACTION_TYPE_TO_CLASS = { ActionType.SELECT_OPTION: SelectOptionAction, ActionType.CHECKBOX: CheckboxAction, ActionType.WAIT: WaitAction, + ActionType.HOVER: HoverAction, ActionType.SOLVE_CAPTCHA: SolveCaptchaAction, ActionType.RELOAD_PAGE: ReloadPageAction, ActionType.EXTRACT: ExtractAction, diff --git a/skyvern/library/skyvern_locator.py b/skyvern/library/skyvern_locator.py index bdaf17c0..8409ddd0 100644 --- a/skyvern/library/skyvern_locator.py +++ b/skyvern/library/skyvern_locator.py @@ -48,6 +48,7 @@ class SkyvernLocator: async def hover(self, 
**kwargs: Any) -> None: """Hover over the element.""" + await self._locator.scroll_into_view_if_needed() await self._locator.hover(**kwargs) async def focus(self, **kwargs: Any) -> None: diff --git a/skyvern/webeye/actions/action_types.py b/skyvern/webeye/actions/action_types.py index fc92b2ec..7c933d94 100644 --- a/skyvern/webeye/actions/action_types.py +++ b/skyvern/webeye/actions/action_types.py @@ -12,6 +12,7 @@ class ActionType(StrEnum): SELECT_OPTION = "select_option" CHECKBOX = "checkbox" WAIT = "wait" + HOVER = "hover" NULL_ACTION = "null_action" SOLVE_CAPTCHA = "solve_captcha" TERMINATE = "terminate" @@ -37,11 +38,13 @@ class ActionType(StrEnum): ActionType.DOWNLOAD_FILE, ActionType.SELECT_OPTION, ActionType.CHECKBOX, + ActionType.HOVER, ] POST_ACTION_EXECUTION_ACTION_TYPES = [ ActionType.CLICK, + ActionType.HOVER, ActionType.INPUT_TEXT, ActionType.UPLOAD_FILE, ActionType.DOWNLOAD_FILE, diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 3e525f3a..743f0df6 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -282,6 +282,11 @@ class WaitAction(Action): seconds: int = 20 +class HoverAction(WebAction): + action_type: ActionType = ActionType.HOVER + hold_seconds: float = 0.0 + + class TerminateAction(DecisiveAction): action_type: ActionType = ActionType.TERMINATE diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index db448e2c..63da6d55 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -32,6 +32,7 @@ from skyvern.exceptions import ( ErrFoundSelectableElement, FailedToFetchSecret, FailToClick, + FailToHover, FailToSelectByIndex, FailToSelectByLabel, FailToSelectByValue, @@ -1998,6 +1999,68 @@ async def handle_wait_action( return [ActionFailure(exception=Exception("Wait action is treated as a failure"))] +@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) +async def handle_hover_action( + action: 
actions.HoverAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + dom = DomUtil(scraped_page=scraped_page, page=page) + try: + skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) + except Exception as exc: + LOG.warning( + "Failed to resolve element for hover action", + action=action, + workflow_run_id=task.workflow_run_id, + exc_info=True, + ) + return [ActionFailure(exception=exc)] + + try: + await skyvern_element.hover_to_reveal() + await skyvern_element.get_locator().scroll_into_view_if_needed() + await skyvern_element.get_locator().hover(timeout=settings.BROWSER_ACTION_TIMEOUT_MS) + + # Save the absolute page position of the hovered element + # This allows us to scroll back to this position after re-scraping + try: + bounding_box = await skyvern_element.get_locator().bounding_box(timeout=settings.BROWSER_ACTION_TIMEOUT_MS) + if bounding_box: + # Get current scroll position + scroll_y = await page.evaluate("window.scrollY") + # Calculate absolute page Y = viewport Y + scroll offset + absolute_page_y = bounding_box["y"] + scroll_y + + context = skyvern_context.current() + if context: + context.last_hovered_element_page_y = absolute_page_y + context.last_hovered_element_id = action.element_id + LOG.info( + "Saved hovered element absolute position", + element_id=action.element_id, + viewport_y=bounding_box["y"], + scroll_y=scroll_y, + absolute_page_y=absolute_page_y, + ) + except Exception: + LOG.warning("Failed to save hovered element position", exc_info=True) + + if action.hold_seconds and action.hold_seconds > 0: + await asyncio.sleep(action.hold_seconds) + return [ActionSuccess()] + except Exception as exc: + LOG.warning( + "Hover action failed", + action=action, + workflow_run_id=task.workflow_run_id, + exc_info=True, + ) + return [ActionFailure(FailToHover(skyvern_element.get_id(), msg=str(exc)))] + + @TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) async def 
handle_terminate_action( action: actions.TerminateAction, @@ -2205,6 +2268,7 @@ ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_ac ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action) ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action) ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action) +ActionHandler.register_action_type(ActionType.HOVER, handle_hover_action) ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action) ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action) ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action) @@ -2284,6 +2348,7 @@ async def chain_click( :param css: css of the element to click """ try: + await skyvern_element.hover_to_reveal() if not await skyvern_element.navigate_to_a_href(page=page): await locator.click(timeout=timeout) LOG.info("Chain click: main element click succeeded", action=action, locator=locator) diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index a97b4638..2c27ede1 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -27,6 +27,7 @@ from skyvern.webeye.actions.actions import ( DownloadFileAction, DragAction, GotoUrlAction, + HoverAction, InputOrSelectContext, InputTextAction, KeypressAction, @@ -166,6 +167,9 @@ def parse_action( if action_type == ActionType.WAIT: return WaitAction(**base_action_dict) + if action_type == ActionType.HOVER: + return HoverAction(**base_action_dict, hold_seconds=action.get("hold_seconds", 0) or 0) + if action_type == ActionType.COMPLETE: return CompleteAction( **base_action_dict, diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index b63ae5da..33fdc81c 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -411,6 +411,44 @@ function 
hasASPClientControl() { return typeof ASPxClientControl !== "undefined"; } +// Check if element is only visible on hover (e.g., hover-only buttons) +function isHoverOnlyElement(element) { + // Check for common hover-only patterns in class names + const className = element.className?.toString() ?? ""; + const parentClassName = element.parentElement?.className?.toString() ?? ""; + + // Common hover-only class patterns + if ( + className.includes("hover-") || + className.includes("-hover") || + parentClassName.includes("hover-") || + parentClassName.includes("-hover") + ) { + return true; + } + + // Check if parent has hover-related attributes or classes that might reveal this element + let parent = element.parentElement; + let depth = 0; + // Cap recursion to avoid walking the entire tree and bloating prompts + const maxDepth = 5; + while (parent && parent !== document.body && depth < maxDepth) { + const parentClass = parent.className?.toString() ?? ""; + if ( + parentClass.includes("hover") || + parentClass.includes("card") || + parentClass.includes("item") + ) { + // This element might be revealed on parent hover + return true; + } + parent = parent.parentElement; + depth += 1; + } + + return false; +} + // from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L100-L119 // NOTE: According this logic, some elements with aria-hidden won't be considered as invisible. And the result shows they are indeed interactable. 
function isElementVisible(element) {
@@ -450,6 +488,10 @@ function isElementVisible(element) {
   if (!isElementStyleVisibilityVisible(element, style)) return false;
   const rect = element.getBoundingClientRect();
   if (rect.width <= 0 || rect.height <= 0) {
+    // Check if this element might be visible on hover before marking as invisible
+    if (isHoverOnlyElement(element)) {
+      return true;
+    }
     return false;
   }
 
@@ -824,7 +866,12 @@ function isInteractable(element, hoverStylesMap) {
   // https://developer.mozilla.org/en-US/docs/Web/CSS/pointer-events#none
   const elementPointerEvent = getElementComputedStyle(element)?.pointerEvents;
   if (elementPointerEvent === "none" && !element.disabled) {
-    return false;
+    // Some CTAs stay hidden until the parent is hovered
+    // When we can infer that the element is revealed on hover, keep it interactable so the agent
+    // has a chance to hover the parent before clicking.
+    if (!isHoverOnlyElement(element)) {
+      return false;
+    }
   }
 
   if (isInteractableInput(element, hoverStylesMap)) {
@@ -1569,6 +1616,7 @@ async function buildElementObject(
     frame: frame,
     frame_index: window.GlobalSkyvernFrameIndex,
     interactable: interactable,
+    hoverOnly: isHoverOnlyElement(element),
     tagName: elementTagNameLower,
     attributes: attrs,
     beforePseudoText: getPseudoContent(element, "::before"),
diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py
index 7eb56f93..7417cf8b 100644
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -2,6 +2,7 @@ import asyncio
 import copy
 import json
 from collections import defaultdict
+from typing import Any
 
 import structlog
 from playwright._impl._errors import TimeoutError
@@ -92,6 +93,14 @@ def load_js_script() -> str:
 JS_FUNCTION_DEFS = load_js_script()
 
 
+def build_attribute(key: str, value: Any) -> str:
+    """Render one HTML attribute as ``key="value"`` (bare ``key`` for falsy non-int values)."""
+    # bool is a subclass of int, so a single isinstance check covers both; they render lower-cased
+    if isinstance(value, int):
+        return f'{key}="{str(value).lower()}"'
+    return f'{key}="{value!s}"' if value else key
+
+
 def clean_element_before_hashing(element: dict) -> dict:
     def clean_nested(element: dict) -> dict:
         element_cleaned = {key: value for key, value in element.items() if key not in {"id", "rect", "frame_index"}}
@@ -125,7 +134,7 @@ def build_element_dict(
 
     for element in elements:
         element_id: str = element.get("id", "")
-        # get_interactable_element_tree marks each interactable element with a unique_id attribute
+        # get_interactable_element_tree marks each interactable element with a SKYVERN_ID_ATTR attribute
         id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
         id_to_element_dict[element_id] = element
         id_to_frame_dict[element_id] = element["frame"]
@@ -409,16 +418,18 @@ async def add_frame_interactable_elements(
         # it will get stuck when we `frame.evaluate()` on an invisible iframe
         if not await frame_element.is_visible():
             return elements, element_tree
-        unique_id = await frame_element.get_attribute("unique_id")
-        if not unique_id:
+        skyvern_id = await frame_element.get_attribute(SKYVERN_ID_ATTR)
+        if not skyvern_id:
             LOG.info(
-                "No unique_id found for frame, skipping",
+                "No Skyvern id found for frame, skipping",
                 frame_index=frame_index,
+                attr=SKYVERN_ID_ATTR,
             )
             return elements, element_tree
     except Exception:
         LOG.warning(
-            "Unable to get unique_id from frame_element",
+            "Unable to get Skyvern id from frame_element",
+            attr=SKYVERN_ID_ATTR,
             exc_info=True,
         )
         return elements, element_tree
@@ -427,11 +438,11 @@ async def add_frame_interactable_elements(
     await skyvern_frame.safe_wait_for_animation_end()
 
     frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
-        frame_name=unique_id, frame_index=frame_index
+        frame_name=skyvern_id, frame_index=frame_index
     )
 
     for element in elements:
-        if element["id"] == unique_id:
+        if element["id"] == skyvern_id:
             element["children"] = frame_element_tree
 
     elements = elements + frame_elements
@@ -638,6 +649,9 @@ def _should_keep_unique_id(element: dict) -> bool:
     # 1.
no readonly attr and not disable attr and no interactable
     # 2. readonly=false and disable=false and interactable=false
 
+    if element.get("hoverOnly"):
+        return True
+
     attributes = element.get("attributes", {})
     if (
         "disabled" not in attributes
diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py
index 8324afd6..1254f28b 100644
--- a/skyvern/webeye/utils/dom.py
+++ b/skyvern/webeye/utils/dom.py
@@ -133,6 +134,7 @@ class SkyvernElement:
         self._id_cache = static_element.get("id", "")
         self._tag_name = static_element.get("tagName", "")
         self._selectable = static_element.get("isSelectable", False)
+        self._hover_only = static_element.get("hoverOnly", False)
         self._frame_id = static_element.get("frame", "")
         self._attributes = static_element.get("attributes", {})
         self._rect: FloatRect | None = None
@@ -401,6 +402,64 @@ class SkyvernElement:
     def get_attributes(self) -> dict:
         return self._attributes
 
+    def requires_hover(self) -> bool:
+        """Return whether the scraped element was flagged as ``hoverOnly``."""
+        return bool(self._hover_only)
+
+    async def hover_to_reveal(
+        self,
+        max_depth: int = 4,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+        settle_delay_s: float = 0.15,
+    ) -> bool:
+        """Hover this element, then its ancestors one by one, until the element is visible.
+
+        Returns False immediately unless ``requires_hover()`` is true. Errors raised by a hover
+        attempt or by the parent lookup are logged at debug level and not re-raised.
+
+        :param max_depth: maximum number of hover targets to try, the element itself included
+        :param timeout: timeout passed to each Playwright scroll and hover call
+        :param settle_delay_s: seconds to sleep after a hover before checking visibility
+        :return: True once the element is visible after a hover, otherwise False
+        """
+        if not self.requires_hover():
+            return False
+
+        hover_target = self.get_locator()
+        for depth in range(max_depth):
+            try:
+                # Bound the scroll too; otherwise it falls back to Playwright's default timeout,
+                # which a still-hidden target can exhaust at every level before hover() even runs.
+                await hover_target.scroll_into_view_if_needed(timeout=timeout)
+                await hover_target.hover(timeout=timeout)
+                await asyncio.sleep(settle_delay_s)
+                # is_visible() never waits: Playwright documents its timeout option as ignored.
+                if await self.get_locator().is_visible():
+                    LOG.debug("Hover reveal succeeded", element_id=self.get_id(), depth=depth)
+                    return True
+            except Exception:
+                LOG.debug(
+                    "Hover attempt failed while trying to reveal element",
+                    exc_info=True,
+                    element_id=self.get_id(),
+                    depth=depth,
+                )
+
+            # Retry on the direct parent; stop when it does not resolve to exactly one node.
+            parent_locator = hover_target.locator("..")
+            try:
+                if await parent_locator.count() != 1:
+                    break
+            except Exception:
+                LOG.debug(
+                    "Unable to evaluate parent locator during hover reveal", exc_info=True, element_id=self.get_id()
+                )
+                break
+            hover_target = parent_locator
+
+        LOG.debug("Hover reveal attempts exhausted", element_id=self.get_id())
+        return False
+
     def get_options(self) -> list[SkyvernOptionType]:
         options = self.__static_element.get("options", None)
         if options is None: