support more anthropic actions (#2257)

2025-04-30 18:42:44 +08:00
parent f932d7d704
commit 1a33810f09
4 changed files with 369 additions and 130 deletions
--- a/skyvern-frontend/src/api/types.ts
+++ b/skyvern-frontend/src/api/types.ts
@@ -182,6 +182,8 @@ export const ActionTypes = {
  Move: "move",
  NullAction: "null_action",
  VerificationCode: "verification_code",
  Drag: "drag",
  LeftMouse: "left_mouse",
 } as const;
 export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes];
@@ -204,6 +206,8 @@ export const ReadableActionTypes: {
  move: "Move",
  null_action: "Screenshot",
  verification_code: "Verification Code",
  drag: "Drag",
  left_mouse: "Left Mouse",
 };
 export type Option = {
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import StrEnum
-from typing import Annotated, Any, Type, TypeVar
+from typing import Annotated, Any, Literal, Type, TypeVar
 import structlog
 from litellm import ConfigDict
@@ -28,12 +28,14 @@ class ActionType(StrEnum):
    RELOAD_PAGE = "reload_page"
    EXTRACT = "extract"
    VERIFICATION_CODE = "verification_code"
    SCROLL = "scroll"
    KEYPRESS = "keypress"
    TYPE = "type"
    MOVE = "move"
    DRAG = "drag"
-    VERIFICATION_CODE = "verification_code"
+    LEFT_MOUSE = "left_mouse"
    def is_web_action(self) -> bool:
        return self in [
@@ -271,8 +273,8 @@ class ExtractAction(Action):
 class ScrollAction(Action):
    # Scroll request. handle_scroll_action moves the mouse to (x, y) first when both
    # values are truthy (so a 0 coordinate skips the move), then evaluates
    # window.scrollBy(scroll_x, scroll_y) in the page.
    action_type: ActionType = ActionType.SCROLL
    # x / y: optional pointer position to hover before scrolling.
-    x: int
+    x: int | None = None
-    y: int
+    y: int | None = None
    # scroll_x / scroll_y: the two arguments handed to window.scrollBy.
    scroll_x: int
    scroll_y: int
@@ -280,6 +282,8 @@ class ScrollAction(Action):
 class KeypressAction(Action):
    # Keyboard request. handle_keypress_action joins the (normalized) keys with "+"
    # and either presses the combination once or, when `hold` is set, holds it down,
    # sleeps for `duration`, and releases it.
    action_type: ActionType = ActionType.KEYPRESS
    # Keys making up the combination.
    keys: list[str] = []
    # When True the combination is held rather than pressed once.
    hold: bool = False
    # Passed to asyncio.sleep while the keys are held, i.e. a number of seconds.
    duration: int = 0
 class MoveAction(Action):
@@ -290,8 +294,8 @@ class MoveAction(Action):
 class DragAction(Action):
    # Drag request. handle_drag_action moves the mouse to (start_x, start_y) when both
    # values are truthy, calls page.mouse.down(), then iterates over `path`.
    action_type: ActionType = ActionType.DRAG
    # start_x / start_y: optional pointer position where the drag begins.
-    start_x: int
+    start_x: int | None = None
-    start_y: int
+    start_y: int | None = None
    # (x, y) points visited during the drag; the Anthropic parser supplies a single
    # destination point.
    path: list[tuple[int, int]] = []
@@ -300,6 +304,13 @@ class VerificationCodeAction(Action):
    verification_code: str
 class LeftMouseAction(Action):
    # Mouse button press/release request. handle_left_mouse_action calls
    # page.mouse.down() for "down" and page.mouse.up() for "up".
    action_type: ActionType = ActionType.LEFT_MOUSE
    # Which half of the click to perform.
    direction: Literal["down", "up"]
    # x / y: optional pointer position the handler moves to before acting.
    x: int | None = None
    y: int | None = None
 class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1563,6 +1563,7 @@ async def handle_scroll_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
    if action.x and action.y:
        await page.mouse.move(action.x, action.y)
    await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
    return [ActionSuccess()]
@@ -1614,6 +1615,11 @@ async def handle_keypress_action(
        else:
            updated_keys.append(key)
    keypress_str = "+".join(updated_keys)
    if action.hold:
        await page.keyboard.down(keypress_str)
        await asyncio.sleep(action.duration)
        await page.keyboard.up(keypress_str)
    else:
        await page.keyboard.press(keypress_str)
    return [ActionSuccess()]
@@ -1636,6 +1642,7 @@ async def handle_drag_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
    if action.start_x and action.start_y:
        await page.mouse.move(action.start_x, action.start_y)
    await page.mouse.down()
    for point in action.path:
@@ -1663,9 +1670,22 @@ async def handle_verification_code_action(
    return [ActionSuccess()]
-ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
+async def handle_left_mouse_action(
-ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
+    action: actions.LeftMouseAction,
-ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
+    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
    if action.x and action.y:
        await page.mouse.move(action.x, action.y)
    if action.direction == "down":
        await page.mouse.down()
    elif action.direction == "up":
        await page.mouse.up()
    return [ActionSuccess()]
 ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
 # ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
 ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
@@ -1679,6 +1699,7 @@ ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
 ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
 ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
 ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action)
 ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action)
 async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -20,6 +20,7 @@ from skyvern.webeye.actions.actions import (
    DragAction,
    InputTextAction,
    KeypressAction,
    LeftMouseAction,
    MoveAction,
    NullAction,
    ScrollAction,
@@ -287,7 +288,8 @@ async def parse_cua_actions(
                        intention=reasoning,
                    )
                case "move":
-                    reasoning = reasoning or f"Move mouse to: ({cua_action.x}, {cua_action.y})"
+                    response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
                    reasoning = reasoning or response
                    action = MoveAction(
                        x=cua_action.x,
                        y=cua_action.y,
@@ -350,7 +352,7 @@ async def parse_cua_actions(
            step_id=step.step_id,
            organization_id=task.organization_id,
            workflow_run_id=task.workflow_run_id,
-            response=response.dict(),
+            response=response.model_dump(),
        )
        reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
        assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
@@ -366,11 +368,21 @@ async def parse_anthropic_actions(
    screenshot_resize_target_dimension: Resolution,
 ) -> list[Action]:
    tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
    reasonings = [block for block in assistant_content if block["type"] == "thinking"]
    LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
    if len(reasonings) > 1:
        LOG.warning(
            "Anthropic CUA: multiple reasonings in assistant content",
            task_id=task.task_id,
            step_id=step.step_id,
            assistant_content=assistant_content,
        )
    reasoning = reasonings[0]["thinking"] if reasonings else None
    idx = 0
    actions: list[Action] = []
    LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content)
    while idx < len(tool_calls):
        tool_call = tool_calls[idx]
        try:
            tool_call_id = tool_call["id"]
            tool_call_input = tool_call.get("input")
            if not tool_call_input:
@@ -378,15 +390,27 @@ async def parse_anthropic_actions(
                continue
            action = tool_call_input["action"]
            if action == "mouse_move":
-            original_x, original_y = tool_call_input["coordinate"]
+                coordinate = tool_call_input.get("coordinate")
-            # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
+                if not coordinate:
-            x, y = scale_coordinates(
+                    LOG.warning(
-                (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
+                        "Anthropic CUA error: mouse move action has no coordinate",
                        tool_call=tool_call,
                    )
                    idx += 1
                    continue
                # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
                x, y = validate_and_get_coordinates(
                    coordinate, screenshot_resize_target_dimension, browser_window_dimension
                )
                response = f"Move mouse to: ({x}, {y})"
                reasoning = reasoning or response
                actions.append(
                    # TODO: add response by adding specifying the element to move to
                    MoveAction(
                        x=x,
                        y=y,
                        reasoning=reasoning,
                        intention=reasoning,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
@@ -406,21 +430,25 @@ async def parse_anthropic_actions(
                if not coordinate:
                    LOG.warning(
-                    "Left click action has no coordinate and it doesn't have mouse_move before it",
+                        "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
                        tool_call=tool_call,
                    )
                    idx += 1
                    continue
-            original_x, original_y = coordinate
+                x, y = validate_and_get_coordinates(
-            x, y = scale_coordinates(
+                    coordinate, screenshot_resize_target_dimension, browser_window_dimension
                (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
                )
                response = f"Click at: ({x}, {y})"
                reasoning = reasoning or response
                actions.append(
                    ClickAction(
                        element_id="",
                        x=x,
                        y=y,
                        button="left",
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
@@ -434,7 +462,7 @@ async def parse_anthropic_actions(
                text = tool_call_input.get("text")
                if not text:
                    LOG.warning(
-                    "Anthropic type action has no text",
+                        "Anthropic CUA error: type action has no text",
                        tool_call=tool_call,
                    )
                    idx += 1
@@ -443,6 +471,9 @@ async def parse_anthropic_actions(
                    InputTextAction(
                        element_id="",
                        text=text,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=text,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
@@ -452,19 +483,30 @@ async def parse_anthropic_actions(
                        tool_call_id=tool_call_id,
                    )
                )
-        elif action == "key":
+            elif action in ["key", "hold_key"]:
                text = tool_call_input.get("text")
                if not text:
                    LOG.warning(
-                    "Key action has no text",
+                        "Anthropic CUA error: key action has no text",
                        tool_call=tool_call,
                    )
                    idx += 1
                    continue
                response = f"Press keys: {text}"
                hold = action == "hold_key"
                duration = tool_call_input.get("duration", 0)
                if hold:
                    response = f"Hold keys for {duration} seconds: {text}"
                reasoning = reasoning or response
                actions.append(
                    KeypressAction(
                        element_id="",
                        keys=[text],
                        hold=hold,
                        duration=duration,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
@@ -477,6 +519,139 @@ async def parse_anthropic_actions(
            elif action == "screenshot":
                actions.append(
                    NullAction(
                        reasoning=reasoning,
                        intention=reasoning,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
                        step_id=step.step_id,
                        step_order=step.order,
                        action_order=idx,
                        tool_call_id=tool_call_id,
                    )
                )
            elif action == "scroll":
                x, y = None, None
                coordinate = tool_call_input.get("coordinate")
                if coordinate:
                    x, y = validate_and_get_coordinates(
                        coordinate, browser_window_dimension, screenshot_resize_target_dimension
                    )
                scroll_direction = tool_call_input.get("scroll_direction")
                scroll_amount = tool_call_input.get("scroll_amount")
                if scroll_direction == "up":
                    scroll_x = 0
                    scroll_y = -scroll_amount
                elif scroll_direction == "down":
                    scroll_x = 0
                    scroll_y = scroll_amount
                elif scroll_direction == "left":
                    scroll_x = -scroll_amount
                    scroll_y = 0
                elif scroll_direction == "right":
                    scroll_x = scroll_amount
                    scroll_y = 0
                else:
                    LOG.warning(
                        "Anthropic CUA error: unsupported scroll direction",
                        tool_call=tool_call,
                    )
                    idx += 1
                    continue
                response = f"Scroll by: ({scroll_x}, {scroll_y})"
                reasoning = reasoning or response
                actions.append(
                    ScrollAction(
                        element_id="",
                        x=x,
                        y=y,
                        scroll_x=scroll_x,
                        scroll_y=scroll_y,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
                        step_id=step.step_id,
                        step_order=step.order,
                        action_order=idx,
                        tool_call_id=tool_call_id,
                    )
                )
            elif action in ["left_mouse_down", "left_mouse_up"]:
                coordinate = tool_call_input.get("coordinate")
                x, y = None, None
                if coordinate:
                    x, y = validate_and_get_coordinates(
                        coordinate, browser_window_dimension, screenshot_resize_target_dimension
                    )
                direction = "down" if action == "left_mouse_down" else "up"
                response = f"Left mouse {direction} at: ({x}, {y})"
                reasoning = reasoning or response
                actions.append(
                    LeftMouseAction(
                        x=x,
                        y=y,
                        direction=direction,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
                        step_id=step.step_id,
                        step_order=step.order,
                        action_order=idx,
                        tool_call_id=tool_call_id,
                    )
                )
            elif action == "left_click_drag":
                coordinate = tool_call_input.get("coordinate")
                start_coordinate = tool_call_input.get("start_coordinate")
                LOG.info(
                    "Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
                )
                if not coordinate or not start_coordinate:
                    LOG.warning(
                        "Anthropic CUA error: left click drag action has no coordinate or start coordinate",
                        tool_call=tool_call,
                    )
                    idx += 1
                    continue
                x, y = validate_and_get_coordinates(
                    coordinate, browser_window_dimension, screenshot_resize_target_dimension
                )
                start_x, start_y = validate_and_get_coordinates(
                    start_coordinate, browser_window_dimension, screenshot_resize_target_dimension
                )
                response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
                reasoning = reasoning or response
                actions.append(
                    DragAction(
                        start_x=start_x,
                        start_y=start_y,
                        path=[(x, y)],
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
                        step_id=step.step_id,
                        step_order=step.order,
                        action_order=idx,
                        tool_call_id=tool_call_id,
                    )
                )
            elif action == "wait":
                duration = tool_call_input.get("duration", 5)
                actions.append(
                    WaitAction(
                        seconds=duration,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=f"Wait for {duration} seconds",
                        organization_id=task.organization_id,
                        workflow_run_id=task.workflow_run_id,
                        task_id=task.task_id,
@@ -488,13 +663,41 @@ async def parse_anthropic_actions(
                )
            else:
                LOG.error(
-                "Unsupported action",
+                    "Anthropic CUA error: unsupported action",
                    tool_call=tool_call,
                )
            idx += 1
        except Exception:
            LOG.exception(
                "Anthropic CUA error: failed to parse action",
                task_id=task.task_id,
                step_id=step.step_id,
                tool_call=tool_call,
            )
            break
    if not actions:
        reasoning = reasonings[0]["thinking"] if reasonings else None
        assistant_messages = [block for block in assistant_content if block["type"] == "text"]
        assistant_message = assistant_messages[0]["text"] if assistant_messages else None
        actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
    return actions
 # function from anthropic's quickstart guide
 # https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
 def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
 ) -> tuple[int, int]:
    """Validate an (x, y) pair and rescale it via ``scale_coordinates``.

    Args:
        coordinate: Exactly two non-negative ints, as a tuple or list.
        current_dimension: Forwarded to ``scale_coordinates`` as its second argument
            (the resolution the coordinate is currently expressed in).
        target_dimension: Forwarded to ``scale_coordinates`` as its third argument
            (the resolution to convert to).

    Returns:
        The result of ``scale_coordinates`` for the validated pair.

    Raises:
        ValueError: If ``coordinate`` does not contain exactly two items, or any
            item is not a non-negative int.
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")
    # bool is a subclass of int, so exclude it explicitly; otherwise a payload such
    # as [True, False] would silently be scaled as the point (1, 0).
    if not all(isinstance(i, int) and not isinstance(i, bool) and i >= 0 for i in coordinate):
        raise ValueError(f"{coordinate} must be a tuple of non-negative ints")
    return scale_coordinates((coordinate[0], coordinate[1]), current_dimension, target_dimension)
 async def generate_cua_fallback_actions(
    task: Task,
    step: Step,