Add SCROLL action to extract-action prompt (#SKY-7924) (#4743)

2026-02-13 09:42:49 -08:00
parent 0243ae1375
commit 53a89c3e44
7 changed files with 109 additions and 7 deletions
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -6,7 +6,7 @@ import structlog
 from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import ValidationError

-from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER
+from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER
 from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
@@ -93,10 +93,10 @@ def parse_action(
        action_type_str = "KEYPRESS"
    action_type = ActionType[action_type_str]

-    if not action_type.is_web_action():
+    if not action_type.is_web_action() and action_type != ActionType.SCROLL:
        # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
        # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
-        # set for non-web actions.
+        # set for non-web actions. SCROLL needs element_id to target a specific scrollable container.
        base_action_dict["element_id"] = None

    if action_type == ActionType.TERMINATE:
@@ -208,6 +208,23 @@ def parse_action(
            keys = action.get("keys", ["Enter"])
        return KeypressAction(**base_action_dict, keys=keys)

+    if action_type == ActionType.SCROLL:
+        # SCROLL from extract-action prompt provides a direction and optionally an element_id
+        # for the scrollable container. Convert direction to scroll_x/scroll_y pixel values.
+        base_action_dict["skyvern_element_hash"] = None
+        base_action_dict["skyvern_element_data"] = None
+        direction = action.get("direction", "down").lower()
+        if direction not in ("up", "down"):
+            LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction)
+            direction = "down"
+        if direction == "up":
+            scroll_x = 0
+            scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT
+        else:
+            scroll_x = 0
+            scroll_y = EXTRACT_ACTION_SCROLL_AMOUNT
+        return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y)
+
    if action_type == ActionType.CLOSE_PAGE:
        return ClosePageAction(**base_action_dict)