Add SCROLL action to extract-action prompt (#SKY-7924) (#4743)

This commit is contained in:
pedrohsdb
2026-02-13 09:42:49 -08:00
committed by GitHub
parent 0243ae1375
commit 53a89c3e44
7 changed files with 109 additions and 7 deletions

View File

@@ -53,4 +53,5 @@ POST_ACTION_EXECUTION_ACTION_TYPES = [
ActionType.SOLVE_CAPTCHA,
ActionType.EXTRACT,
ActionType.KEYPRESS,
ActionType.SCROLL,
]

View File

@@ -2185,9 +2185,40 @@ async def handle_scroll_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if action.x and action.y:
if action.element_id:
# Element-based scrolling from extract-action prompt. Uses
# scrollNearestScrollableContainer() from domUtils.js which walks the DOM to find
# the nearest scrollable ancestor or sibling container relative to the element.
scroll_direction = "down" if action.scroll_y >= 0 else "up"
scrolled = False
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.safe_get_skyvern_element_by_id(action.element_id)
if skyvern_element:
try:
scrolled = await skyvern_element.locator.evaluate(
"(el, direction) => scrollNearestScrollableContainer(el, direction)",
scroll_direction,
)
except Exception:
LOG.warning(
"JavaScript scroll evaluation failed, falling back to mouse wheel",
element_id=action.element_id,
exc_info=True,
)
else:
LOG.warning("Could not resolve element for scroll action", element_id=action.element_id)
if not scrolled:
LOG.warning(
"Could not find scrollable container near element, falling back to mouse wheel",
element_id=action.element_id,
)
await page.mouse.wheel(action.scroll_x, action.scroll_y)
elif action.x and action.y:
# Coordinate-based scrolling from CUA/UI-TARS agents
await page.mouse.move(action.x, action.y)
await page.mouse.wheel(action.scroll_x, action.scroll_y)
await page.mouse.wheel(action.scroll_x, action.scroll_y)
else:
await page.mouse.wheel(action.scroll_x, action.scroll_y)
return [ActionSuccess()]

View File

@@ -6,7 +6,7 @@ import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER
from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER
from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
@@ -93,10 +93,10 @@ def parse_action(
action_type_str = "KEYPRESS"
action_type = ActionType[action_type_str]
if not action_type.is_web_action():
if not action_type.is_web_action() and action_type != ActionType.SCROLL:
# LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
# That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
# set for non-web actions.
# set for non-web actions. SCROLL needs element_id to target a specific scrollable container.
base_action_dict["element_id"] = None
if action_type == ActionType.TERMINATE:
@@ -208,6 +208,23 @@ def parse_action(
keys = action.get("keys", ["Enter"])
return KeypressAction(**base_action_dict, keys=keys)
if action_type == ActionType.SCROLL:
# SCROLL from extract-action prompt provides a direction and optionally an element_id
# for the scrollable container. Convert direction to scroll_x/scroll_y pixel values.
base_action_dict["skyvern_element_hash"] = None
base_action_dict["skyvern_element_data"] = None
direction = action.get("direction", "down").lower()
if direction not in ("up", "down"):
LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction)
direction = "down"
if direction == "up":
scroll_x = 0
scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT
else:
scroll_x = 0
scroll_y = EXTRACT_ACTION_SCROLL_AMOUNT
return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y)
if action_type == ActionType.CLOSE_PAGE:
return ClosePageAction(**base_action_dict)

View File

@@ -2326,6 +2326,56 @@ function isWindowScrollable() {
return true;
}
/**
 * Find the nearest scrollable container relative to the given element and
 * jump it to its top or bottom edge.
 *
 * Two strategies, tried in order:
 *   1) Walk up from the element itself to find a scrollable ancestor
 *      (the element is inside the container).
 *   2) Walk up the DOM and, at each level, inspect that level's children
 *      (the element sits beside the container).
 *
 * A node counts as scrollable when its computed overflow-y is "auto" or
 * "scroll" and its scrollHeight exceeds its clientHeight. <html> and <body>
 * are never treated as containers.
 *
 * @param {Element|null|undefined} element - Starting point for the search.
 * @param {string} direction - "down" jumps to the bottom of the container;
 *   any other value jumps to the top.
 * @returns {boolean} true when a container was found and its scrollTop was
 *   assigned (even if it was already at that edge); false when no container
 *   was found or when element is null/undefined.
 */
function scrollNearestScrollableContainer(element, direction) {
  // Without a starting element there is nothing to search from. Returning
  // false (instead of throwing on element.parentElement below) lets the
  // caller take its normal "no container found" fallback path.
  if (!element) return false;

  function isContainerScrollable(node) {
    // The document root and body are deliberately excluded: only inner
    // containers are considered here.
    if (!node || node === document.documentElement || node === document.body)
      return false;
    const style = window.getComputedStyle(node);
    const oy = style.overflowY;
    return (
      (oy === "auto" || oy === "scroll") &&
      node.scrollHeight > node.clientHeight
    );
  }

  // Strategy 1: walk up from element (inclusive) to find a scrollable ancestor
  let target = element;
  while (target && target !== document.documentElement) {
    if (isContainerScrollable(target)) break;
    target = target.parentElement;
  }

  // Strategy 2: walk up the DOM checking the children of each ancestor level
  if (!target || target === document.documentElement) {
    target = null;
    let level = element.parentElement;
    while (level && level !== document.documentElement && !target) {
      for (const child of level.children) {
        if (isContainerScrollable(child)) {
          target = child;
          break;
        }
      }
      level = level.parentElement;
    }
  }

  if (!target) return false;

  // Jump straight to the edge rather than scrolling by a fixed amount.
  if (direction === "down") {
    target.scrollTop = target.scrollHeight;
  } else {
    target.scrollTop = 0;
  }
  return true;
}
function scrollToElementBottom(element, page_by_page = false) {
const top = page_by_page
? element.clientHeight + element.scrollTop