support more anthropic actions (#2257)

This commit is contained in:
Shuchang Zheng
2025-04-30 18:42:44 +08:00
committed by GitHub
parent f932d7d704
commit 1a33810f09
4 changed files with 369 additions and 130 deletions

View File

@@ -182,6 +182,8 @@ export const ActionTypes = {
Move: "move", Move: "move",
NullAction: "null_action", NullAction: "null_action",
VerificationCode: "verification_code", VerificationCode: "verification_code",
Drag: "drag",
LeftMouse: "left_mouse",
} as const; } as const;
export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes]; export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes];
@@ -204,6 +206,8 @@ export const ReadableActionTypes: {
move: "Move", move: "Move",
null_action: "Screenshot", null_action: "Screenshot",
verification_code: "Verification Code", verification_code: "Verification Code",
drag: "Drag",
left_mouse: "Left Mouse",
}; };
export type Option = { export type Option = {

View File

@@ -1,6 +1,6 @@
from datetime import datetime from datetime import datetime
from enum import StrEnum from enum import StrEnum
from typing import Annotated, Any, Type, TypeVar from typing import Annotated, Any, Literal, Type, TypeVar
import structlog import structlog
from litellm import ConfigDict from litellm import ConfigDict
@@ -28,12 +28,14 @@ class ActionType(StrEnum):
RELOAD_PAGE = "reload_page" RELOAD_PAGE = "reload_page"
EXTRACT = "extract" EXTRACT = "extract"
VERIFICATION_CODE = "verification_code"
SCROLL = "scroll" SCROLL = "scroll"
KEYPRESS = "keypress" KEYPRESS = "keypress"
TYPE = "type" TYPE = "type"
MOVE = "move" MOVE = "move"
DRAG = "drag" DRAG = "drag"
VERIFICATION_CODE = "verification_code" LEFT_MOUSE = "left_mouse"
def is_web_action(self) -> bool: def is_web_action(self) -> bool:
return self in [ return self in [
@@ -271,8 +273,8 @@ class ExtractAction(Action):
class ScrollAction(Action): class ScrollAction(Action):
action_type: ActionType = ActionType.SCROLL action_type: ActionType = ActionType.SCROLL
x: int x: int | None = None
y: int y: int | None = None
scroll_x: int scroll_x: int
scroll_y: int scroll_y: int
@@ -280,6 +282,8 @@ class ScrollAction(Action):
class KeypressAction(Action): class KeypressAction(Action):
action_type: ActionType = ActionType.KEYPRESS action_type: ActionType = ActionType.KEYPRESS
keys: list[str] = [] keys: list[str] = []
hold: bool = False
duration: int = 0
class MoveAction(Action): class MoveAction(Action):
@@ -290,8 +294,8 @@ class MoveAction(Action):
class DragAction(Action): class DragAction(Action):
action_type: ActionType = ActionType.DRAG action_type: ActionType = ActionType.DRAG
start_x: int start_x: int | None = None
start_y: int start_y: int | None = None
path: list[tuple[int, int]] = [] path: list[tuple[int, int]] = []
@@ -300,6 +304,13 @@ class VerificationCodeAction(Action):
verification_code: str verification_code: str
class LeftMouseAction(Action):
    # Action describing a left mouse button press ("down") or release ("up").
    action_type: ActionType = ActionType.LEFT_MOUSE
    # Required: which half of the click to perform.
    direction: Literal["down", "up"]
    # Optional pointer position; both coordinates default to None.
    x: int | None = None
    y: int | None = None
class ScrapeResult(BaseModel): class ScrapeResult(BaseModel):
""" """
Scraped response from a webpage, including: Scraped response from a webpage, including:

View File

@@ -1563,7 +1563,8 @@ async def handle_scroll_action(
task: Task, task: Task,
step: Step, step: Step,
) -> list[ActionResult]: ) -> list[ActionResult]:
await page.mouse.move(action.x, action.y) if action.x and action.y:
await page.mouse.move(action.x, action.y)
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})") await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
return [ActionSuccess()] return [ActionSuccess()]
@@ -1614,7 +1615,12 @@ async def handle_keypress_action(
else: else:
updated_keys.append(key) updated_keys.append(key)
keypress_str = "+".join(updated_keys) keypress_str = "+".join(updated_keys)
await page.keyboard.press(keypress_str) if action.hold:
await page.keyboard.down(keypress_str)
await asyncio.sleep(action.duration)
await page.keyboard.up(keypress_str)
else:
await page.keyboard.press(keypress_str)
return [ActionSuccess()] return [ActionSuccess()]
@@ -1636,7 +1642,8 @@ async def handle_drag_action(
task: Task, task: Task,
step: Step, step: Step,
) -> list[ActionResult]: ) -> list[ActionResult]:
await page.mouse.move(action.start_x, action.start_y) if action.start_x and action.start_y:
await page.mouse.move(action.start_x, action.start_y)
await page.mouse.down() await page.mouse.down()
for point in action.path: for point in action.path:
x, y = point[0], point[1] x, y = point[0], point[1]
@@ -1663,9 +1670,22 @@ async def handle_verification_code_action(
return [ActionSuccess()] return [ActionSuccess()]
async def handle_left_mouse_action(
    action: actions.LeftMouseAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Press or release the left mouse button, optionally moving the pointer first.

    Args:
        action: Carries ``direction`` ("down" or "up") and optional ``x``/``y``.
        page: Page whose ``mouse`` object receives the move/down/up calls.
        scraped_page: Unused here; part of the common handler signature.
        task: Unused here; part of the common handler signature.
        step: Unused here; part of the common handler signature.

    Returns:
        A single-element list containing ``ActionSuccess``.
    """
    # Compare against None rather than relying on truthiness: 0 is a valid
    # coordinate (top/left edge) and `if action.x and action.y` would silently
    # skip the move for it.
    if action.x is not None and action.y is not None:
        await page.mouse.move(action.x, action.y)
    if action.direction == "down":
        await page.mouse.down()
    elif action.direction == "up":
        await page.mouse.up()
    return [ActionSuccess()]


# NOTE(review): in the diff these three registrations appear only on the removed
# side of this hunk. They are kept here so the handlers stay registered; confirm
# whether the new revision registers them elsewhere before dropping them.
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action) ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action) # ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action) ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
@@ -1679,6 +1699,7 @@ ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
ActionHandler.register_action_type(ActionType.MOVE, handle_move_action) ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action) ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action) ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action)
ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action)
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:

View File

@@ -20,6 +20,7 @@ from skyvern.webeye.actions.actions import (
DragAction, DragAction,
InputTextAction, InputTextAction,
KeypressAction, KeypressAction,
LeftMouseAction,
MoveAction, MoveAction,
NullAction, NullAction,
ScrollAction, ScrollAction,
@@ -287,7 +288,8 @@ async def parse_cua_actions(
intention=reasoning, intention=reasoning,
) )
case "move": case "move":
reasoning = reasoning or f"Move mouse to: ({cua_action.x}, {cua_action.y})" response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
reasoning = reasoning or response
action = MoveAction( action = MoveAction(
x=cua_action.x, x=cua_action.x,
y=cua_action.y, y=cua_action.y,
@@ -350,7 +352,7 @@ async def parse_cua_actions(
step_id=step.step_id, step_id=step.step_id,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
response=response.dict(), response=response.model_dump(),
) )
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
@@ -366,135 +368,336 @@ async def parse_anthropic_actions(
screenshot_resize_target_dimension: Resolution, screenshot_resize_target_dimension: Resolution,
) -> list[Action]: ) -> list[Action]:
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"] tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
reasonings = [block for block in assistant_content if block["type"] == "thinking"]
LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
if len(reasonings) > 1:
LOG.warning(
"Anthropic CUA: multiple reasonings in assistant content",
task_id=task.task_id,
step_id=step.step_id,
assistant_content=assistant_content,
)
reasoning = reasonings[0]["thinking"] if reasonings else None
idx = 0 idx = 0
actions: list[Action] = [] actions: list[Action] = []
LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content)
while idx < len(tool_calls): while idx < len(tool_calls):
tool_call = tool_calls[idx] tool_call = tool_calls[idx]
tool_call_id = tool_call["id"] try:
tool_call_input = tool_call.get("input") tool_call_id = tool_call["id"]
if not tool_call_input: tool_call_input = tool_call.get("input")
idx += 1 if not tool_call_input:
continue idx += 1
action = tool_call_input["action"] continue
if action == "mouse_move": action = tool_call_input["action"]
original_x, original_y = tool_call_input["coordinate"] if action == "mouse_move":
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension. coordinate = tool_call_input.get("coordinate")
x, y = scale_coordinates( if not coordinate:
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension LOG.warning(
) "Anthropic CUA error: mouse move action has no coordinate",
actions.append( tool_call=tool_call,
MoveAction( )
x=x, idx += 1
y=y, continue
organization_id=task.organization_id, # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
workflow_run_id=task.workflow_run_id, x, y = validate_and_get_coordinates(
task_id=task.task_id, coordinate, screenshot_resize_target_dimension, browser_window_dimension
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
) )
) response = f"Move mouse to: ({x}, {y})"
elif action == "left_click": reasoning = reasoning or response
coordinate = tool_call_input.get("coordinate") actions.append(
if not coordinate and idx - 1 >= 0: # TODO: add response by adding specifying the element to move to
prev_tool_call = tool_calls[idx - 1] MoveAction(
prev_tool_call_input = prev_tool_call.get("input") x=x,
if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move": y=y,
coordinate = prev_tool_call_input.get("coordinate") reasoning=reasoning,
intention=reasoning,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "left_click":
coordinate = tool_call_input.get("coordinate")
if not coordinate and idx - 1 >= 0:
prev_tool_call = tool_calls[idx - 1]
prev_tool_call_input = prev_tool_call.get("input")
if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
coordinate = prev_tool_call_input.get("coordinate")
if not coordinate: if not coordinate:
LOG.warning( LOG.warning(
"Left click action has no coordinate and it doesn't have mouse_move before it", "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
tool_call=tool_call,
)
idx += 1
continue
x, y = validate_and_get_coordinates(
coordinate, screenshot_resize_target_dimension, browser_window_dimension
)
response = f"Click at: ({x}, {y})"
reasoning = reasoning or response
actions.append(
ClickAction(
element_id="",
x=x,
y=y,
button="left",
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "type":
text = tool_call_input.get("text")
if not text:
LOG.warning(
"Anthropic CUA error: type action has no text",
tool_call=tool_call,
)
idx += 1
continue
actions.append(
InputTextAction(
element_id="",
text=text,
reasoning=reasoning,
intention=reasoning,
response=text,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action in ["key", "hold_key"]:
text = tool_call_input.get("text")
if not text:
LOG.warning(
"Anthropic CUA error: key action has no text",
tool_call=tool_call,
)
idx += 1
continue
response = f"Press keys: {text}"
hold = action == "hold_key"
duration = tool_call_input.get("duration", 0)
if hold:
response = f"Hold keys for {duration} seconds: {text}"
reasoning = reasoning or response
actions.append(
KeypressAction(
element_id="",
keys=[text],
hold=hold,
duration=duration,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "screenshot":
actions.append(
NullAction(
reasoning=reasoning,
intention=reasoning,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "scroll":
x, y = None, None
coordinate = tool_call_input.get("coordinate")
if coordinate:
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
scroll_direction = tool_call_input.get("scroll_direction")
scroll_amount = tool_call_input.get("scroll_amount")
if scroll_direction == "up":
scroll_x = 0
scroll_y = -scroll_amount
elif scroll_direction == "down":
scroll_x = 0
scroll_y = scroll_amount
elif scroll_direction == "left":
scroll_x = -scroll_amount
scroll_y = 0
elif scroll_direction == "right":
scroll_x = scroll_amount
scroll_y = 0
else:
LOG.warning(
"Anthropic CUA error: unsupported scroll direction",
tool_call=tool_call,
)
idx += 1
continue
response = f"Scroll by: ({scroll_x}, {scroll_y})"
reasoning = reasoning or response
actions.append(
ScrollAction(
element_id="",
x=x,
y=y,
scroll_x=scroll_x,
scroll_y=scroll_y,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action in ["left_mouse_down", "left_mouse_up"]:
coordinate = tool_call_input.get("coordinate")
x, y = None, None
if coordinate:
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
direction = "down" if action == "left_mouse_down" else "up"
response = f"Left mouse {direction} at: ({x}, {y})"
reasoning = reasoning or response
actions.append(
LeftMouseAction(
x=x,
y=y,
direction=direction,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "left_click_drag":
coordinate = tool_call_input.get("coordinate")
start_coordinate = tool_call_input.get("start_coordinate")
LOG.info(
"Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
)
if not coordinate or not start_coordinate:
LOG.warning(
"Anthropic CUA error: left click drag action has no coordinate or start coordinate",
tool_call=tool_call,
)
idx += 1
continue
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
start_x, start_y = validate_and_get_coordinates(
start_coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
reasoning = reasoning or response
actions.append(
DragAction(
start_x=start_x,
start_y=start_y,
path=[(x, y)],
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "wait":
duration = tool_call_input.get("duration", 5)
actions.append(
WaitAction(
seconds=duration,
reasoning=reasoning,
intention=reasoning,
response=f"Wait for {duration} seconds",
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
else:
LOG.error(
"Anthropic CUA error: unsupported action",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
continue except Exception:
original_x, original_y = coordinate LOG.exception(
x, y = scale_coordinates( "Anthropic CUA error: failed to parse action",
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension task_id=task.task_id,
) step_id=step.step_id,
actions.append(
ClickAction(
element_id="",
x=x,
y=y,
button="left",
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "type":
text = tool_call_input.get("text")
if not text:
LOG.warning(
"Anthropic type action has no text",
tool_call=tool_call,
)
idx += 1
continue
actions.append(
InputTextAction(
element_id="",
text=text,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "key":
text = tool_call_input.get("text")
if not text:
LOG.warning(
"Key action has no text",
tool_call=tool_call,
)
idx += 1
continue
actions.append(
KeypressAction(
element_id="",
keys=[text],
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "screenshot":
actions.append(
NullAction(
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
else:
LOG.error(
"Unsupported action",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 break
if not actions:
reasoning = reasonings[0]["thinking"] if reasonings else None
assistant_messages = [block for block in assistant_content if block["type"] == "text"]
assistant_message = assistant_messages[0]["text"] if assistant_messages else None
actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
return actions return actions
# function from anthropic's quickstart guide
# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Validate an ``(x, y)`` pair and scale it from one resolution to another.

    Args:
        coordinate: Two-element sequence of non-negative integers.
        current_dimension: Resolution the coordinate is currently expressed in;
            forwarded unchanged to ``scale_coordinates``.
        target_dimension: Resolution to scale the coordinate to; forwarded
            unchanged to ``scale_coordinates``.

    Returns:
        Whatever ``scale_coordinates`` returns for the validated pair.

    Raises:
        ValueError: If ``coordinate`` does not have exactly two elements, or if
            any element is not a non-negative ``int`` (booleans are rejected).
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")
    # bool is a subclass of int, so a bare isinstance(i, int) would accept
    # [True, False] as (1, 0); exclude it explicitly.
    if not all(isinstance(i, int) and not isinstance(i, bool) and i >= 0 for i in coordinate):
        raise ValueError(f"{coordinate} must be a tuple of non-negative ints")
    return scale_coordinates((coordinate[0], coordinate[1]), current_dimension, target_dimension)
async def generate_cua_fallback_actions( async def generate_cua_fallback_actions(
task: Task, task: Task,
step: Step, step: Step,