Add SCROLL action to extract-action prompt (#SKY-7924) (#4743)

This commit is contained in:
pedrohsdb
2026-02-13 09:42:49 -08:00
committed by GitHub
parent 0243ae1375
commit 53a89c3e44
7 changed files with 109 additions and 7 deletions

View File

@@ -6,7 +6,7 @@ import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER
from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER
from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
@@ -93,10 +93,10 @@ def parse_action(
action_type_str = "KEYPRESS"
action_type = ActionType[action_type_str]
if not action_type.is_web_action():
if not action_type.is_web_action() and action_type != ActionType.SCROLL:
# LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
# That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
# set for non-web actions.
# set for non-web actions. SCROLL needs element_id to target a specific scrollable container.
base_action_dict["element_id"] = None
if action_type == ActionType.TERMINATE:
@@ -208,6 +208,23 @@ def parse_action(
keys = action.get("keys", ["Enter"])
return KeypressAction(**base_action_dict, keys=keys)
if action_type == ActionType.SCROLL:
# SCROLL from extract-action prompt provides a direction and optionally an element_id
# for the scrollable container. Convert direction to scroll_x/scroll_y pixel values.
base_action_dict["skyvern_element_hash"] = None
base_action_dict["skyvern_element_data"] = None
direction = action.get("direction", "down").lower()
if direction not in ("up", "down"):
LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction)
direction = "down"
if direction == "up":
scroll_x = 0
scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT
else:
scroll_x = 0
scroll_y = EXTRACT_ACTION_SCROLL_AMOUNT
return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y)
if action_type == ActionType.CLOSE_PAGE:
return ClosePageAction(**base_action_dict)