From 53a89c3e446eaa7ef43cfda5e03360af4fdf4a66 Mon Sep 17 00:00:00 2001 From: pedrohsdb Date: Fri, 13 Feb 2026 09:42:49 -0800 Subject: [PATCH] Add SCROLL action to extract-action prompt (#SKY-7924) (#4743) --- skyvern/constants.py | 1 + .../prompts/skyvern/extract-action-static.j2 | 3 +- .../forge/prompts/skyvern/extract-action.j2 | 3 +- skyvern/webeye/actions/action_types.py | 1 + skyvern/webeye/actions/handler.py | 35 ++++++++++++- skyvern/webeye/actions/parse_actions.py | 23 +++++++-- skyvern/webeye/scraper/domUtils.js | 50 +++++++++++++++++++ 7 files changed, 109 insertions(+), 7 deletions(-) diff --git a/skyvern/constants.py b/skyvern/constants.py index 13454af5..bade3349 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -38,6 +38,7 @@ DEFAULT_MAX_TOKENS = 100000 MAX_FILE_PARSE_INPUT_TOKENS = 256_000 MAX_IMAGE_MESSAGES = 10 SCROLL_AMOUNT_MULTIPLIER = 100 +EXTRACT_ACTION_SCROLL_AMOUNT = 500 # pixels per scroll action from extract-action prompt # Text input constants TEXT_INPUT_DELAY = 10 # 10ms between each character input diff --git a/skyvern/forge/prompts/skyvern/extract-action-static.j2 b/skyvern/forge/prompts/skyvern/extract-action-static.j2 index febaf001..11f432a9 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-static.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-static.j2 @@ -18,11 +18,12 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. IMPORTANT: This question must be user information agnostic and must NOT contain specific user data (like names, IDs, specific values, etc.). Ask what information is needed generically. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. 
If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?", "Which member should I take action on?". NEVER include specific user data in this question - keep it generic so it can be answered with different user contexts. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. This answer CAN contain specific user data. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE", "KEYPRESS"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. 
"COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. "KEYPRESS" is used to press a keyboard key when no clickable button or element achieves the same result. Only use KEYPRESS when pressing a key is the sole way to proceed (e.g., pressing Enter to submit a search with no search button, or Escape to close a modal with no close button). KEYPRESS does not require an element id. Requires the "key" field.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} + "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE", "KEYPRESS", "SCROLL"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. 
"WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. "KEYPRESS" is used to press a keyboard key when no clickable button or element achieves the same result. Only use KEYPRESS when pressing a key is the sole way to proceed (e.g., pressing Enter to submit a search with no search button, or Escape to close a modal with no close button). KEYPRESS does not require an element id. Requires the "key" field. "SCROLL" is used to scroll within a specific scrollable container on the page (not the page itself). Only use SCROLL when a required action is blocked because the target element is hidden, disabled, or unreachable until the container is scrolled (e.g., an "Agree" button that only becomes enabled after scrolling to the bottom of a terms and conditions box). Do not use SCROLL for general page navigation. Provide the element id of an interactable element within or near the scrollable container. Requires the "direction" field. 
SCROLL does not require a "text", "option", or "file_url" field.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list "captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER". "text": str, // Text for INPUT_TEXT action only "key": str, // The keyboard key to press for KEYPRESS action only. Allowed values: "Enter", "Tab", "Escape", "ArrowDown", "ArrowUp". null if not KEYPRESS action. + "direction": str, // The direction to scroll for SCROLL action only. Allowed values: "up", "down". null if not SCROLL action. "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. "download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. "option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 4ac92865..27aa8d30 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -18,11 +18,12 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. 
IMPORTANT: This question must be user information agnostic and must NOT contain specific user data (like names, IDs, specific values, etc.). Ask what information is needed generically. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?", "Which member should I take action on?". NEVER include specific user data in this question - keep it generic so it can be answered with different user contexts. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. This answer CAN contain specific user data. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE", "KEYPRESS"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. 
"SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. "KEYPRESS" is used to press a keyboard key when no clickable button or element achieves the same result. Only use KEYPRESS when pressing a key is the sole way to proceed (e.g., pressing Enter to submit a search with no search button, or Escape to close a modal with no close button). KEYPRESS does not require an element id. Requires the "key" field.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} + "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE", "KEYPRESS", "SCROLL"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. 
"HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. "KEYPRESS" is used to press a keyboard key when no clickable button or element achieves the same result. Only use KEYPRESS when pressing a key is the sole way to proceed (e.g., pressing Enter to submit a search with no search button, or Escape to close a modal with no close button). KEYPRESS does not require an element id. Requires the "key" field. "SCROLL" is used to scroll within a specific scrollable container on the page (not the page itself). 
Only use SCROLL when a required action is blocked because the target element is hidden, disabled, or unreachable until the container is scrolled (e.g., an "Agree" button that only becomes enabled after scrolling to the bottom of a terms and conditions box). Do not use SCROLL for general page navigation. Provide the element id of an interactable element within or near the scrollable container. Requires the "direction" field. SCROLL does not require a "text", "option", or "file_url" field.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list "captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER". "text": str, // Text for INPUT_TEXT action only "key": str, // The keyboard key to press for KEYPRESS action only. Allowed values: "Enter", "Tab", "Escape", "ArrowDown", "ArrowUp". null if not KEYPRESS action. + "direction": str, // The direction to scroll for SCROLL action only. Allowed values: "up", "down". null if not SCROLL action. "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. "download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. "option": { // The option to select for SELECT_OPTION action only. 
null if not SELECT_OPTION action diff --git a/skyvern/webeye/actions/action_types.py b/skyvern/webeye/actions/action_types.py index 90441f6b..7296f7cd 100644 --- a/skyvern/webeye/actions/action_types.py +++ b/skyvern/webeye/actions/action_types.py @@ -53,4 +53,5 @@ POST_ACTION_EXECUTION_ACTION_TYPES = [ ActionType.SOLVE_CAPTCHA, ActionType.EXTRACT, ActionType.KEYPRESS, + ActionType.SCROLL, ] diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 14983fbc..79c58b6d 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -2185,9 +2185,40 @@ async def handle_scroll_action( task: Task, step: Step, ) -> list[ActionResult]: - if action.x and action.y: + if action.element_id: + # Element-based scrolling from extract-action prompt. Uses + # scrollNearestScrollableContainer() from domUtils.js which walks the DOM to find + # the nearest scrollable ancestor or sibling container relative to the element. + scroll_direction = "down" if action.scroll_y >= 0 else "up" + scrolled = False + dom = DomUtil(scraped_page=scraped_page, page=page) + skyvern_element = await dom.safe_get_skyvern_element_by_id(action.element_id) + if skyvern_element: + try: + scrolled = await skyvern_element.locator.evaluate( + "(el, direction) => scrollNearestScrollableContainer(el, direction)", + scroll_direction, + ) + except Exception: + LOG.warning( + "JavaScript scroll evaluation failed, falling back to mouse wheel", + element_id=action.element_id, + exc_info=True, + ) + else: + LOG.warning("Could not resolve element for scroll action", element_id=action.element_id) + if not scrolled: + LOG.warning( + "Could not find scrollable container near element, falling back to mouse wheel", + element_id=action.element_id, + ) + await page.mouse.wheel(action.scroll_x, action.scroll_y) + elif action.x and action.y: + # Coordinate-based scrolling from CUA/UI-TARS agents await page.mouse.move(action.x, action.y) - await 
page.mouse.wheel(action.scroll_x, action.scroll_y) + await page.mouse.wheel(action.scroll_x, action.scroll_y) + else: + await page.mouse.wheel(action.scroll_x, action.scroll_y) return [ActionSuccess()] diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index f62dc34c..40785d9b 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -6,7 +6,7 @@ import structlog from openai.types.responses.response import Response as OpenAIResponse from pydantic import ValidationError -from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER +from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType from skyvern.forge import app from skyvern.forge.prompts import prompt_engine @@ -93,10 +93,10 @@ def parse_action( action_type_str = "KEYPRESS" action_type = ActionType[action_type_str] - if not action_type.is_web_action(): + if not action_type.is_web_action() and action_type != ActionType.SCROLL: # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc. # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not - # set for non-web actions. + # set for non-web actions. SCROLL needs element_id to target a specific scrollable container. base_action_dict["element_id"] = None if action_type == ActionType.TERMINATE: @@ -208,6 +208,23 @@ def parse_action( keys = action.get("keys", ["Enter"]) return KeypressAction(**base_action_dict, keys=keys) + if action_type == ActionType.SCROLL: + # SCROLL from extract-action prompt provides a direction and optionally an element_id + # for the scrollable container. Convert direction to scroll_x/scroll_y pixel values. 
+ base_action_dict["skyvern_element_hash"] = None + base_action_dict["skyvern_element_data"] = None + direction = action.get("direction", "down").lower() + if direction not in ("up", "down"): + LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction) + direction = "down" + if direction == "up": + scroll_x = 0 + scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT + else: + scroll_x = 0 + scroll_y = EXTRACT_ACTION_SCROLL_AMOUNT + return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y) + if action_type == ActionType.CLOSE_PAGE: return ClosePageAction(**base_action_dict) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 3c3993c7..055fd908 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -2326,6 +2326,56 @@ function isWindowScrollable() { return true; } +/** + * Find the nearest scrollable container relative to the given element and scroll it. + * Two strategies: + * 1) Walk up from element to find a scrollable ancestor (element is inside container) + * 2) Walk up the DOM checking siblings at each level (element is beside container) + * Returns true if a scrollable container was found and scrolled, false otherwise. 
/**
 * Scroll the scrollable container closest to `element`.
 *
 * Lookup order:
 *   1) `element` itself or its nearest ancestor that is vertically scrollable
 *      (the element sits inside the container);
 *   2) otherwise, for each ancestor level starting at `element.parentElement`,
 *      the first direct child of that level that is vertically scrollable
 *      (the element sits beside the container).
 *
 * A node counts as scrollable when its computed `overflow-y` is "auto" or
 * "scroll" and its content is taller than its box. `<html>` and `<body>` are
 * never treated as containers, so page-level scrolling is left to the caller.
 *
 * @param {Element|null|undefined} element - Anchor element inside or next to
 *   the container to scroll.
 * @param {string} direction - "down" jumps to the bottom of the container;
 *   any other value jumps to the top.
 * @returns {boolean} true when a container was found and its `scrollTop` was
 *   set; false when no container exists or no anchor element was given.
 */
function scrollNearestScrollableContainer(element, direction) {
  // Without an anchor there is nothing to search from. Report "not scrolled"
  // instead of throwing a TypeError when reading `element.parentElement`.
  if (!element) {
    return false;
  }

  const root = document.documentElement;

  // True when `node` is a real container (not <html>/<body>) that can scroll
  // vertically and actually has overflowing content.
  function isContainerScrollable(node) {
    if (!node || node === root || node === document.body) {
      return false;
    }
    const overflowY = window.getComputedStyle(node).overflowY;
    return (
      (overflowY === "auto" || overflowY === "scroll") &&
      node.scrollHeight > node.clientHeight
    );
  }

  // Strategy 1: `start` or the closest ancestor below <html> that scrolls.
  function findScrollableAncestor(start) {
    for (let node = start; node && node !== root; node = node.parentElement) {
      if (isContainerScrollable(node)) {
        return node;
      }
    }
    return null;
  }

  // Strategy 2: walk up level by level and take the first scrollable direct
  // child of each ancestor, i.e. a sibling of `start` or of one of its parents.
  function findScrollableNeighbor(start) {
    for (
      let level = start.parentElement;
      level && level !== root;
      level = level.parentElement
    ) {
      for (const child of level.children) {
        if (isContainerScrollable(child)) {
          return child;
        }
      }
    }
    return null;
  }

  const target =
    findScrollableAncestor(element) || findScrollableNeighbor(element);
  if (!target) {
    return false;
  }

  // Jump straight to the end/start of the container; anything other than
  // "down" is treated as "up".
  target.scrollTop = direction === "down" ? target.scrollHeight : 0;
  return true;
}