Add SCROLL action to extract-action prompt (#SKY-7924) (#4743)
This commit is contained in:
@@ -53,4 +53,5 @@ POST_ACTION_EXECUTION_ACTION_TYPES = [
|
||||
ActionType.SOLVE_CAPTCHA,
|
||||
ActionType.EXTRACT,
|
||||
ActionType.KEYPRESS,
|
||||
ActionType.SCROLL,
|
||||
]
|
||||
|
||||
@@ -2185,9 +2185,40 @@ async def handle_scroll_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if action.x and action.y:
|
||||
if action.element_id:
|
||||
# Element-based scrolling from extract-action prompt. Uses
|
||||
# scrollNearestScrollableContainer() from domUtils.js which walks the DOM to find
|
||||
# the nearest scrollable ancestor or sibling container relative to the element.
|
||||
scroll_direction = "down" if action.scroll_y >= 0 else "up"
|
||||
scrolled = False
|
||||
dom = DomUtil(scraped_page=scraped_page, page=page)
|
||||
skyvern_element = await dom.safe_get_skyvern_element_by_id(action.element_id)
|
||||
if skyvern_element:
|
||||
try:
|
||||
scrolled = await skyvern_element.locator.evaluate(
|
||||
"(el, direction) => scrollNearestScrollableContainer(el, direction)",
|
||||
scroll_direction,
|
||||
)
|
||||
except Exception:
|
||||
LOG.warning(
|
||||
"JavaScript scroll evaluation failed, falling back to mouse wheel",
|
||||
element_id=action.element_id,
|
||||
exc_info=True,
|
||||
)
|
||||
else:
|
||||
LOG.warning("Could not resolve element for scroll action", element_id=action.element_id)
|
||||
if not scrolled:
|
||||
LOG.warning(
|
||||
"Could not find scrollable container near element, falling back to mouse wheel",
|
||||
element_id=action.element_id,
|
||||
)
|
||||
await page.mouse.wheel(action.scroll_x, action.scroll_y)
|
||||
elif action.x and action.y:
|
||||
# Coordinate-based scrolling from CUA/UI-TARS agents
|
||||
await page.mouse.move(action.x, action.y)
|
||||
await page.mouse.wheel(action.scroll_x, action.scroll_y)
|
||||
await page.mouse.wheel(action.scroll_x, action.scroll_y)
|
||||
else:
|
||||
await page.mouse.wheel(action.scroll_x, action.scroll_y)
|
||||
return [ActionSuccess()]
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import structlog
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import ValidationError
|
||||
|
||||
from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER
|
||||
from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER
|
||||
from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
@@ -93,10 +93,10 @@ def parse_action(
|
||||
action_type_str = "KEYPRESS"
|
||||
action_type = ActionType[action_type_str]
|
||||
|
||||
if not action_type.is_web_action():
|
||||
if not action_type.is_web_action() and action_type != ActionType.SCROLL:
|
||||
# LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
|
||||
# That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
|
||||
# set for non-web actions.
|
||||
# set for non-web actions. SCROLL needs element_id to target a specific scrollable container.
|
||||
base_action_dict["element_id"] = None
|
||||
|
||||
if action_type == ActionType.TERMINATE:
|
||||
@@ -208,6 +208,23 @@ def parse_action(
|
||||
keys = action.get("keys", ["Enter"])
|
||||
return KeypressAction(**base_action_dict, keys=keys)
|
||||
|
||||
if action_type == ActionType.SCROLL:
|
||||
# SCROLL from extract-action prompt provides a direction and optionally an element_id
|
||||
# for the scrollable container. Convert direction to scroll_x/scroll_y pixel values.
|
||||
base_action_dict["skyvern_element_hash"] = None
|
||||
base_action_dict["skyvern_element_data"] = None
|
||||
direction = action.get("direction", "down").lower()
|
||||
if direction not in ("up", "down"):
|
||||
LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction)
|
||||
direction = "down"
|
||||
if direction == "up":
|
||||
scroll_x = 0
|
||||
scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT
|
||||
else:
|
||||
scroll_x = 0
|
||||
scroll_y = EXTRACT_ACTION_SCROLL_AMOUNT
|
||||
return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y)
|
||||
|
||||
if action_type == ActionType.CLOSE_PAGE:
|
||||
return ClosePageAction(**base_action_dict)
|
||||
|
||||
|
||||
@@ -2326,6 +2326,56 @@ function isWindowScrollable() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Find the nearest scrollable container relative to the given element and
 * jump it all the way to one end.
 *
 * Two lookup strategies are tried in order:
 *   1) Walk up from the element (the element itself included) to find a
 *      scrollable ancestor - the element is inside the container.
 *   2) Walk up the ancestor chain again and, at each level, inspect that
 *      level's direct children - the element is beside the container.
 *      The first scrollable child in DOM order at the lowest such level wins.
 *
 * A node counts as scrollable when its computed overflow-y is "auto" or
 * "scroll" and its scrollHeight exceeds its clientHeight. The <html> and
 * <body> elements are never treated as containers.
 *
 * @param {Element} element - Element the search starts from.
 * @param {string} direction - "down" sets scrollTop to scrollHeight (bottom);
 *   any other value sets scrollTop to 0 (top).
 * @returns {boolean} true if a container was found and its scrollTop was
 *   assigned; false if no container was found or no element was given.
 */
function scrollNearestScrollableContainer(element, direction) {
  function isContainerScrollable(node) {
    // The page-level scrollers are deliberately excluded.
    if (!node || node === document.documentElement || node === document.body)
      return false;
    const style = window.getComputedStyle(node);
    const oy = style.overflowY;
    return (
      (oy === "auto" || oy === "scroll") &&
      node.scrollHeight > node.clientHeight
    );
  }

  // Without an element there is nothing to search from; report "not scrolled"
  // instead of throwing on element.parentElement further down.
  if (!element) return false;

  // Strategy 1: walk up from element to find a scrollable ancestor
  let target = element;
  while (target && target !== document.documentElement) {
    if (isContainerScrollable(target)) break;
    target = target.parentElement;
  }

  // Strategy 2: walk up the DOM checking siblings at each level
  if (!target || target === document.documentElement) {
    target = null;
    let level = element.parentElement;
    while (level && level !== document.documentElement && !target) {
      for (const child of level.children) {
        if (isContainerScrollable(child)) {
          target = child;
          break;
        }
      }
      level = level.parentElement;
    }
  }

  if (!target) return false;

  // Jump to the very end of the container rather than scrolling by a step.
  if (direction === "down") {
    target.scrollTop = target.scrollHeight;
  } else {
    target.scrollTop = 0;
  }
  return true;
}
|
||||
|
||||
function scrollToElementBottom(element, page_by_page = false) {
|
||||
const top = page_by_page
|
||||
? element.clientHeight + element.scrollTop
|
||||
|
||||
Reference in New Issue
Block a user