From 1a33810f0900b6cfdf0b708e3bc4e1810feedf85 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Wed, 30 Apr 2025 18:42:44 +0800 Subject: [PATCH] support more anthropic actions (#2257) --- skyvern-frontend/src/api/types.ts | 4 + skyvern/webeye/actions/actions.py | 23 +- skyvern/webeye/actions/handler.py | 33 +- skyvern/webeye/actions/parse_actions.py | 439 +++++++++++++++++------- 4 files changed, 369 insertions(+), 130 deletions(-) diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts index 92f008e5..206f9166 100644 --- a/skyvern-frontend/src/api/types.ts +++ b/skyvern-frontend/src/api/types.ts @@ -182,6 +182,8 @@ export const ActionTypes = { Move: "move", NullAction: "null_action", VerificationCode: "verification_code", + Drag: "drag", + LeftMouse: "left_mouse", } as const; export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes]; @@ -204,6 +206,8 @@ export const ReadableActionTypes: { move: "Move", null_action: "Screenshot", verification_code: "Verification Code", + drag: "Drag", + left_mouse: "Left Mouse", }; export type Option = { diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index a3d3e8ae..dcd6b52a 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -1,6 +1,6 @@ from datetime import datetime from enum import StrEnum -from typing import Annotated, Any, Type, TypeVar +from typing import Annotated, Any, Literal, Type, TypeVar import structlog from litellm import ConfigDict @@ -28,12 +28,14 @@ class ActionType(StrEnum): RELOAD_PAGE = "reload_page" EXTRACT = "extract" + VERIFICATION_CODE = "verification_code" + SCROLL = "scroll" KEYPRESS = "keypress" TYPE = "type" MOVE = "move" DRAG = "drag" - VERIFICATION_CODE = "verification_code" + LEFT_MOUSE = "left_mouse" def is_web_action(self) -> bool: return self in [ @@ -271,8 +273,8 @@ class ExtractAction(Action): class ScrollAction(Action): action_type: ActionType = ActionType.SCROLL - x: int - y: int 
+ x: int | None = None + y: int | None = None scroll_x: int scroll_y: int @@ -280,6 +282,8 @@ class ScrollAction(Action): class KeypressAction(Action): action_type: ActionType = ActionType.KEYPRESS keys: list[str] = [] + hold: bool = False + duration: int = 0 class MoveAction(Action): @@ -290,8 +294,8 @@ class MoveAction(Action): class DragAction(Action): action_type: ActionType = ActionType.DRAG - start_x: int - start_y: int + start_x: int | None = None + start_y: int | None = None path: list[tuple[int, int]] = [] @@ -300,6 +304,13 @@ class VerificationCodeAction(Action): verification_code: str +class LeftMouseAction(Action): + action_type: ActionType = ActionType.LEFT_MOUSE + direction: Literal["down", "up"] + x: int | None = None + y: int | None = None + + class ScrapeResult(BaseModel): """ Scraped response from a webpage, including: diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 874b6925..5e8b0c46 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -1563,7 +1563,8 @@ async def handle_scroll_action( task: Task, step: Step, ) -> list[ActionResult]: - await page.mouse.move(action.x, action.y) + if action.x is not None and action.y is not None: + await page.mouse.move(action.x, action.y) await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})") return [ActionSuccess()] @@ -1614,7 +1615,12 @@ async def handle_keypress_action( else: updated_keys.append(key) keypress_str = "+".join(updated_keys) - await page.keyboard.press(keypress_str) + if action.hold: + await page.keyboard.down(keypress_str) + await asyncio.sleep(action.duration) + await page.keyboard.up(keypress_str) + else: + await page.keyboard.press(keypress_str) return [ActionSuccess()] @@ -1636,7 +1642,8 @@ async def handle_drag_action( task: Task, step: Step, ) -> list[ActionResult]: - await page.mouse.move(action.start_x, action.start_y) + if action.start_x is not None and action.start_y is not None: + await page.mouse.move(action.start_x, action.start_y) 
await page.mouse.down() for point in action.path: x, y = point[0], point[1] @@ -1663,9 +1670,22 @@ async def handle_verification_code_action( return [ActionSuccess()] -ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action) -ActionHandler.register_action_type(ActionType.CLICK, handle_click_action) -ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action) +async def handle_left_mouse_action( + action: actions.LeftMouseAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + if action.x is not None and action.y is not None: + await page.mouse.move(action.x, action.y) + if action.direction == "down": + await page.mouse.down() + elif action.direction == "up": + await page.mouse.up() + return [ActionSuccess()] + + ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action) # ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action) ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action) @@ -1679,6 +1699,7 @@ ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action) ActionHandler.register_action_type(ActionType.MOVE, handle_move_action) ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action) ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action) +ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action) async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index d7050c6f..284d42fb 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -20,6 +20,7 @@ from skyvern.webeye.actions.actions import ( DragAction, InputTextAction, KeypressAction, + LeftMouseAction, MoveAction, NullAction, ScrollAction, @@ -287,7 +288,8 @@ async def 
parse_cua_actions( intention=reasoning, ) case "move": - reasoning = reasoning or f"Move mouse to: ({cua_action.x}, {cua_action.y})" + response = f"Move mouse to: ({cua_action.x}, {cua_action.y})" + reasoning = reasoning or response action = MoveAction( x=cua_action.x, y=cua_action.y, @@ -350,7 +352,7 @@ async def parse_cua_actions( step_id=step.step_id, organization_id=task.organization_id, workflow_run_id=task.workflow_run_id, - response=response.dict(), + response=response.model_dump(), ) reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None assistant_message = assistant_messages[0].content[0].text if assistant_messages else None @@ -366,135 +368,336 @@ async def parse_anthropic_actions( screenshot_resize_target_dimension: Resolution, ) -> list[Action]: tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"] + reasonings = [block for block in assistant_content if block["type"] == "thinking"] + LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content) + if len(reasonings) > 1: + LOG.warning( + "Anthropic CUA: multiple reasonings in assistant content", + task_id=task.task_id, + step_id=step.step_id, + assistant_content=assistant_content, + ) + reasoning = reasonings[0]["thinking"] if reasonings else None idx = 0 actions: list[Action] = [] - LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content) while idx < len(tool_calls): tool_call = tool_calls[idx] - tool_call_id = tool_call["id"] - tool_call_input = tool_call.get("input") - if not tool_call_input: - idx += 1 - continue - action = tool_call_input["action"] - if action == "mouse_move": - original_x, original_y = tool_call_input["coordinate"] - # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension. 
- x, y = scale_coordinates( - (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension - ) - actions.append( - MoveAction( - x=x, - y=y, - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=idx, - tool_call_id=tool_call_id, + try: + tool_call_id = tool_call["id"] + tool_call_input = tool_call.get("input") + if not tool_call_input: + idx += 1 + continue + action = tool_call_input["action"] + if action == "mouse_move": + coordinate = tool_call_input.get("coordinate") + if not coordinate: + LOG.warning( + "Anthropic CUA error: mouse move action has no coordinate", + tool_call=tool_call, + ) + idx += 1 + continue + # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension. + x, y = validate_and_get_coordinates( + coordinate, screenshot_resize_target_dimension, browser_window_dimension ) - ) - elif action == "left_click": - coordinate = tool_call_input.get("coordinate") - if not coordinate and idx - 1 >= 0: - prev_tool_call = tool_calls[idx - 1] - prev_tool_call_input = prev_tool_call.get("input") - if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move": - coordinate = prev_tool_call_input.get("coordinate") + response = f"Move mouse to: ({x}, {y})" + reasoning = reasoning or response + actions.append( + # TODO: add response by adding specifying the element to move to + MoveAction( + x=x, + y=y, + reasoning=reasoning, + intention=reasoning, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "left_click": + coordinate = tool_call_input.get("coordinate") + if not coordinate and idx - 1 >= 0: + prev_tool_call = tool_calls[idx - 1] + prev_tool_call_input = prev_tool_call.get("input") + if 
prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move": + coordinate = prev_tool_call_input.get("coordinate") - if not coordinate: - LOG.warning( - "Left click action has no coordinate and it doesn't have mouse_move before it", + if not coordinate: + LOG.warning( + "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it", + tool_call=tool_call, + ) + idx += 1 + continue + x, y = validate_and_get_coordinates( + coordinate, screenshot_resize_target_dimension, browser_window_dimension + ) + response = f"Click at: ({x}, {y})" + reasoning = reasoning or response + actions.append( + ClickAction( + element_id="", + x=x, + y=y, + button="left", + reasoning=reasoning, + intention=reasoning, + response=response, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "type": + text = tool_call_input.get("text") + if not text: + LOG.warning( + "Anthropic CUA error: type action has no text", + tool_call=tool_call, + ) + idx += 1 + continue + actions.append( + InputTextAction( + element_id="", + text=text, + reasoning=reasoning, + intention=reasoning, + response=text, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action in ["key", "hold_key"]: + text = tool_call_input.get("text") + if not text: + LOG.warning( + "Anthropic CUA error: key action has no text", + tool_call=tool_call, + ) + idx += 1 + continue + response = f"Press keys: {text}" + hold = action == "hold_key" + duration = tool_call_input.get("duration", 0) + if hold: + response = f"Hold keys for {duration} seconds: {text}" + reasoning = reasoning or response + actions.append( + KeypressAction( + element_id="", + 
keys=[text], + hold=hold, + duration=duration, + reasoning=reasoning, + intention=reasoning, + response=response, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "screenshot": + actions.append( + NullAction( + reasoning=reasoning, + intention=reasoning, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "scroll": + x, y = None, None + coordinate = tool_call_input.get("coordinate") + if coordinate: + x, y = validate_and_get_coordinates( + coordinate, screenshot_resize_target_dimension, browser_window_dimension + ) + scroll_direction = tool_call_input.get("scroll_direction") + scroll_amount = tool_call_input.get("scroll_amount") + if scroll_direction == "up": + scroll_x = 0 + scroll_y = -scroll_amount + elif scroll_direction == "down": + scroll_x = 0 + scroll_y = scroll_amount + elif scroll_direction == "left": + scroll_x = -scroll_amount + scroll_y = 0 + elif scroll_direction == "right": + scroll_x = scroll_amount + scroll_y = 0 + else: + LOG.warning( + "Anthropic CUA error: unsupported scroll direction", + tool_call=tool_call, + ) + idx += 1 + continue + response = f"Scroll by: ({scroll_x}, {scroll_y})" + reasoning = reasoning or response + actions.append( + ScrollAction( + element_id="", + x=x, + y=y, + scroll_x=scroll_x, + scroll_y=scroll_y, + reasoning=reasoning, + intention=reasoning, + response=response, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action in ["left_mouse_down", "left_mouse_up"]: + coordinate = tool_call_input.get("coordinate") 
+ x, y = None, None + if coordinate: + x, y = validate_and_get_coordinates( + coordinate, screenshot_resize_target_dimension, browser_window_dimension + ) + direction = "down" if action == "left_mouse_down" else "up" + response = f"Left mouse {direction} at: ({x}, {y})" + reasoning = reasoning or response + actions.append( + LeftMouseAction( + x=x, + y=y, + direction=direction, + reasoning=reasoning, + intention=reasoning, + response=response, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "left_click_drag": + coordinate = tool_call_input.get("coordinate") + start_coordinate = tool_call_input.get("start_coordinate") + LOG.info( + "Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate + ) + if not coordinate or not start_coordinate: + LOG.warning( + "Anthropic CUA error: left click drag action has no coordinate or start coordinate", + tool_call=tool_call, + ) + idx += 1 + continue + x, y = validate_and_get_coordinates( + coordinate, screenshot_resize_target_dimension, browser_window_dimension + ) + start_x, start_y = validate_and_get_coordinates( + start_coordinate, screenshot_resize_target_dimension, browser_window_dimension + ) + response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})" + reasoning = reasoning or response + actions.append( + DragAction( + start_x=start_x, + start_y=start_y, + path=[(x, y)], + reasoning=reasoning, + intention=reasoning, + response=response, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + elif action == "wait": + duration = tool_call_input.get("duration", 5) + actions.append( + WaitAction( + seconds=duration, + reasoning=reasoning, + 
intention=reasoning, + response=f"Wait for {duration} seconds", + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) + else: + LOG.error( + "Anthropic CUA error: unsupported action", tool_call=tool_call, ) - idx += 1 - continue - original_x, original_y = coordinate - x, y = scale_coordinates( - (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension - ) - actions.append( - ClickAction( - element_id="", - x=x, - y=y, - button="left", - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=idx, - tool_call_id=tool_call_id, - ) - ) - elif action == "type": - text = tool_call_input.get("text") - if not text: - LOG.warning( - "Anthropic type action has no text", - tool_call=tool_call, - ) - idx += 1 - continue - actions.append( - InputTextAction( - element_id="", - text=text, - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=idx, - tool_call_id=tool_call_id, - ) - ) - elif action == "key": - text = tool_call_input.get("text") - if not text: - LOG.warning( - "Key action has no text", - tool_call=tool_call, - ) - idx += 1 - continue - actions.append( - KeypressAction( - element_id="", - keys=[text], - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=idx, - tool_call_id=tool_call_id, - ) - ) - elif action == "screenshot": - actions.append( - NullAction( - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=idx, - 
tool_call_id=tool_call_id, - ) - ) - else: - LOG.error( - "Unsupported action", + idx += 1 + except Exception: + LOG.exception( + "Anthropic CUA error: failed to parse action", + task_id=task.task_id, + step_id=step.step_id, tool_call=tool_call, ) - idx += 1 + break + if not actions: + reasoning = reasonings[0]["thinking"] if reasonings else None + assistant_messages = [block for block in assistant_content if block["type"] == "text"] + assistant_message = assistant_messages[0]["text"] if assistant_messages else None + actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning) return actions +# function from anthropic's quickstart guide +# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1 +def validate_and_get_coordinates( + coordinate: tuple[int, int] | list[int], + current_dimension: Resolution, + target_dimension: Resolution, +) -> tuple[int, int]: + if len(coordinate) != 2: + raise ValueError(f"{coordinate} must be a tuple of length 2") + if not all(isinstance(i, int) and i >= 0 for i in coordinate): + raise ValueError(f"{coordinate} must be a tuple of non-negative ints") + + return scale_coordinates((coordinate[0], coordinate[1]), current_dimension, target_dimension) + + async def generate_cua_fallback_actions( task: Task, step: Step,