support more anthropic actions (#2257)

This commit is contained in:
Shuchang Zheng
2025-04-30 18:42:44 +08:00
committed by GitHub
parent f932d7d704
commit 1a33810f09
4 changed files with 369 additions and 130 deletions

View File

@@ -182,6 +182,8 @@ export const ActionTypes = {
Move: "move", Move: "move",
NullAction: "null_action", NullAction: "null_action",
VerificationCode: "verification_code", VerificationCode: "verification_code",
Drag: "drag",
LeftMouse: "left_mouse",
} as const; } as const;
export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes]; export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes];
@@ -204,6 +206,8 @@ export const ReadableActionTypes: {
move: "Move", move: "Move",
null_action: "Screenshot", null_action: "Screenshot",
verification_code: "Verification Code", verification_code: "Verification Code",
drag: "Drag",
left_mouse: "Left Mouse",
}; };
export type Option = { export type Option = {

View File

@@ -1,6 +1,6 @@
from datetime import datetime from datetime import datetime
from enum import StrEnum from enum import StrEnum
from typing import Annotated, Any, Type, TypeVar from typing import Annotated, Any, Literal, Type, TypeVar
import structlog import structlog
from litellm import ConfigDict from litellm import ConfigDict
@@ -28,12 +28,14 @@ class ActionType(StrEnum):
RELOAD_PAGE = "reload_page" RELOAD_PAGE = "reload_page"
EXTRACT = "extract" EXTRACT = "extract"
VERIFICATION_CODE = "verification_code"
SCROLL = "scroll" SCROLL = "scroll"
KEYPRESS = "keypress" KEYPRESS = "keypress"
TYPE = "type" TYPE = "type"
MOVE = "move" MOVE = "move"
DRAG = "drag" DRAG = "drag"
VERIFICATION_CODE = "verification_code" LEFT_MOUSE = "left_mouse"
def is_web_action(self) -> bool: def is_web_action(self) -> bool:
return self in [ return self in [
@@ -271,8 +273,8 @@ class ExtractAction(Action):
class ScrollAction(Action): class ScrollAction(Action):
action_type: ActionType = ActionType.SCROLL action_type: ActionType = ActionType.SCROLL
x: int x: int | None = None
y: int y: int | None = None
scroll_x: int scroll_x: int
scroll_y: int scroll_y: int
@@ -280,6 +282,8 @@ class ScrollAction(Action):
class KeypressAction(Action): class KeypressAction(Action):
action_type: ActionType = ActionType.KEYPRESS action_type: ActionType = ActionType.KEYPRESS
keys: list[str] = [] keys: list[str] = []
hold: bool = False
duration: int = 0
class MoveAction(Action): class MoveAction(Action):
@@ -290,8 +294,8 @@ class MoveAction(Action):
class DragAction(Action): class DragAction(Action):
action_type: ActionType = ActionType.DRAG action_type: ActionType = ActionType.DRAG
start_x: int start_x: int | None = None
start_y: int start_y: int | None = None
path: list[tuple[int, int]] = [] path: list[tuple[int, int]] = []
@@ -300,6 +304,13 @@ class VerificationCodeAction(Action):
verification_code: str verification_code: str
# Action that presses or releases the left mouse button, optionally after
# moving the pointer to a given position first.
class LeftMouseAction(Action):
    action_type: ActionType = ActionType.LEFT_MOUSE
    # Which half of a click to perform: "down" presses the button, "up" releases it.
    direction: Literal["down", "up"]
    # Optional pointer position to move to before the button event; both default
    # to None, meaning no position was supplied.
    x: int | None = None
    y: int | None = None
class ScrapeResult(BaseModel): class ScrapeResult(BaseModel):
""" """
Scraped response from a webpage, including: Scraped response from a webpage, including:

View File

@@ -1563,6 +1563,7 @@ async def handle_scroll_action(
task: Task, task: Task,
step: Step, step: Step,
) -> list[ActionResult]: ) -> list[ActionResult]:
if action.x and action.y:
await page.mouse.move(action.x, action.y) await page.mouse.move(action.x, action.y)
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})") await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
return [ActionSuccess()] return [ActionSuccess()]
@@ -1614,6 +1615,11 @@ async def handle_keypress_action(
else: else:
updated_keys.append(key) updated_keys.append(key)
keypress_str = "+".join(updated_keys) keypress_str = "+".join(updated_keys)
if action.hold:
await page.keyboard.down(keypress_str)
await asyncio.sleep(action.duration)
await page.keyboard.up(keypress_str)
else:
await page.keyboard.press(keypress_str) await page.keyboard.press(keypress_str)
return [ActionSuccess()] return [ActionSuccess()]
@@ -1636,6 +1642,7 @@ async def handle_drag_action(
task: Task, task: Task,
step: Step, step: Step,
) -> list[ActionResult]: ) -> list[ActionResult]:
if action.start_x and action.start_y:
await page.mouse.move(action.start_x, action.start_y) await page.mouse.move(action.start_x, action.start_y)
await page.mouse.down() await page.mouse.down()
for point in action.path: for point in action.path:
@@ -1663,9 +1670,22 @@ async def handle_verification_code_action(
return [ActionSuccess()] return [ActionSuccess()]
async def handle_left_mouse_action(
    action: actions.LeftMouseAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Press or release the left mouse button, optionally moving the pointer first.

    Args:
        action: The left-mouse action; ``direction`` selects press ("down") or
            release ("up"), and ``x``/``y`` optionally give a position to move to.
        page: The page whose mouse is driven.
        scraped_page: Unused here; part of the common handler signature.
        task: Unused here; part of the common handler signature.
        step: Unused here; part of the common handler signature.

    Returns:
        A single-element list containing ``ActionSuccess``.
    """
    # Compare against None instead of relying on truthiness: 0 is a valid
    # coordinate (top/left edge) and must not be treated as "no coordinate".
    if action.x is not None and action.y is not None:
        await page.mouse.move(action.x, action.y)
    if action.direction == "down":
        await page.mouse.down()
    elif action.direction == "up":
        await page.mouse.up()
    return [ActionSuccess()]
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action) ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action) # ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action) ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
@@ -1679,6 +1699,7 @@ ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
ActionHandler.register_action_type(ActionType.MOVE, handle_move_action) ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action) ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action) ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action)
ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action)
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:

View File

@@ -20,6 +20,7 @@ from skyvern.webeye.actions.actions import (
DragAction, DragAction,
InputTextAction, InputTextAction,
KeypressAction, KeypressAction,
LeftMouseAction,
MoveAction, MoveAction,
NullAction, NullAction,
ScrollAction, ScrollAction,
@@ -287,7 +288,8 @@ async def parse_cua_actions(
intention=reasoning, intention=reasoning,
) )
case "move": case "move":
reasoning = reasoning or f"Move mouse to: ({cua_action.x}, {cua_action.y})" response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
reasoning = reasoning or response
action = MoveAction( action = MoveAction(
x=cua_action.x, x=cua_action.x,
y=cua_action.y, y=cua_action.y,
@@ -350,7 +352,7 @@ async def parse_cua_actions(
step_id=step.step_id, step_id=step.step_id,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
response=response.dict(), response=response.model_dump(),
) )
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
@@ -366,11 +368,21 @@ async def parse_anthropic_actions(
screenshot_resize_target_dimension: Resolution, screenshot_resize_target_dimension: Resolution,
) -> list[Action]: ) -> list[Action]:
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"] tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
reasonings = [block for block in assistant_content if block["type"] == "thinking"]
LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
if len(reasonings) > 1:
LOG.warning(
"Anthropic CUA: multiple reasonings in assistant content",
task_id=task.task_id,
step_id=step.step_id,
assistant_content=assistant_content,
)
reasoning = reasonings[0]["thinking"] if reasonings else None
idx = 0 idx = 0
actions: list[Action] = [] actions: list[Action] = []
LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content)
while idx < len(tool_calls): while idx < len(tool_calls):
tool_call = tool_calls[idx] tool_call = tool_calls[idx]
try:
tool_call_id = tool_call["id"] tool_call_id = tool_call["id"]
tool_call_input = tool_call.get("input") tool_call_input = tool_call.get("input")
if not tool_call_input: if not tool_call_input:
@@ -378,15 +390,27 @@ async def parse_anthropic_actions(
continue continue
action = tool_call_input["action"] action = tool_call_input["action"]
if action == "mouse_move": if action == "mouse_move":
original_x, original_y = tool_call_input["coordinate"] coordinate = tool_call_input.get("coordinate")
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension. if not coordinate:
x, y = scale_coordinates( LOG.warning(
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension "Anthropic CUA error: mouse move action has no coordinate",
tool_call=tool_call,
) )
idx += 1
continue
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
x, y = validate_and_get_coordinates(
coordinate, screenshot_resize_target_dimension, browser_window_dimension
)
response = f"Move mouse to: ({x}, {y})"
reasoning = reasoning or response
actions.append( actions.append(
# TODO: add response by adding specifying the element to move to
MoveAction( MoveAction(
x=x, x=x,
y=y, y=y,
reasoning=reasoning,
intention=reasoning,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
task_id=task.task_id, task_id=task.task_id,
@@ -406,21 +430,25 @@ async def parse_anthropic_actions(
if not coordinate: if not coordinate:
LOG.warning( LOG.warning(
"Left click action has no coordinate and it doesn't have mouse_move before it", "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
continue continue
original_x, original_y = coordinate x, y = validate_and_get_coordinates(
x, y = scale_coordinates( coordinate, screenshot_resize_target_dimension, browser_window_dimension
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
) )
response = f"Click at: ({x}, {y})"
reasoning = reasoning or response
actions.append( actions.append(
ClickAction( ClickAction(
element_id="", element_id="",
x=x, x=x,
y=y, y=y,
button="left", button="left",
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
task_id=task.task_id, task_id=task.task_id,
@@ -434,7 +462,7 @@ async def parse_anthropic_actions(
text = tool_call_input.get("text") text = tool_call_input.get("text")
if not text: if not text:
LOG.warning( LOG.warning(
"Anthropic type action has no text", "Anthropic CUA error: type action has no text",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
@@ -443,6 +471,9 @@ async def parse_anthropic_actions(
InputTextAction( InputTextAction(
element_id="", element_id="",
text=text, text=text,
reasoning=reasoning,
intention=reasoning,
response=text,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
task_id=task.task_id, task_id=task.task_id,
@@ -452,19 +483,30 @@ async def parse_anthropic_actions(
tool_call_id=tool_call_id, tool_call_id=tool_call_id,
) )
) )
elif action == "key": elif action in ["key", "hold_key"]:
text = tool_call_input.get("text") text = tool_call_input.get("text")
if not text: if not text:
LOG.warning( LOG.warning(
"Key action has no text", "Anthropic CUA error: key action has no text",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
continue continue
response = f"Press keys: {text}"
hold = action == "hold_key"
duration = tool_call_input.get("duration", 0)
if hold:
response = f"Hold keys for {duration} seconds: {text}"
reasoning = reasoning or response
actions.append( actions.append(
KeypressAction( KeypressAction(
element_id="", element_id="",
keys=[text], keys=[text],
hold=hold,
duration=duration,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
task_id=task.task_id, task_id=task.task_id,
@@ -477,6 +519,139 @@ async def parse_anthropic_actions(
elif action == "screenshot": elif action == "screenshot":
actions.append( actions.append(
NullAction( NullAction(
reasoning=reasoning,
intention=reasoning,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "scroll":
x, y = None, None
coordinate = tool_call_input.get("coordinate")
if coordinate:
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
scroll_direction = tool_call_input.get("scroll_direction")
scroll_amount = tool_call_input.get("scroll_amount")
if scroll_direction == "up":
scroll_x = 0
scroll_y = -scroll_amount
elif scroll_direction == "down":
scroll_x = 0
scroll_y = scroll_amount
elif scroll_direction == "left":
scroll_x = -scroll_amount
scroll_y = 0
elif scroll_direction == "right":
scroll_x = scroll_amount
scroll_y = 0
else:
LOG.warning(
"Anthropic CUA error: unsupported scroll direction",
tool_call=tool_call,
)
idx += 1
continue
response = f"Scroll by: ({scroll_x}, {scroll_y})"
reasoning = reasoning or response
actions.append(
ScrollAction(
element_id="",
x=x,
y=y,
scroll_x=scroll_x,
scroll_y=scroll_y,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action in ["left_mouse_down", "left_mouse_up"]:
coordinate = tool_call_input.get("coordinate")
x, y = None, None
if coordinate:
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
direction = "down" if action == "left_mouse_down" else "up"
response = f"Left mouse {direction} at: ({x}, {y})"
reasoning = reasoning or response
actions.append(
LeftMouseAction(
x=x,
y=y,
direction=direction,
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "left_click_drag":
coordinate = tool_call_input.get("coordinate")
start_coordinate = tool_call_input.get("start_coordinate")
LOG.info(
"Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
)
if not coordinate or not start_coordinate:
LOG.warning(
"Anthropic CUA error: left click drag action has no coordinate or start coordinate",
tool_call=tool_call,
)
idx += 1
continue
x, y = validate_and_get_coordinates(
coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
start_x, start_y = validate_and_get_coordinates(
start_coordinate, browser_window_dimension, screenshot_resize_target_dimension
)
response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
reasoning = reasoning or response
actions.append(
DragAction(
start_x=start_x,
start_y=start_y,
path=[(x, y)],
reasoning=reasoning,
intention=reasoning,
response=response,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
tool_call_id=tool_call_id,
)
)
elif action == "wait":
duration = tool_call_input.get("duration", 5)
actions.append(
WaitAction(
seconds=duration,
reasoning=reasoning,
intention=reasoning,
response=f"Wait for {duration} seconds",
organization_id=task.organization_id, organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id, workflow_run_id=task.workflow_run_id,
task_id=task.task_id, task_id=task.task_id,
@@ -488,13 +663,41 @@ async def parse_anthropic_actions(
) )
else: else:
LOG.error( LOG.error(
"Unsupported action", "Anthropic CUA error: unsupported action",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
except Exception:
LOG.exception(
"Anthropic CUA error: failed to parse action",
task_id=task.task_id,
step_id=step.step_id,
tool_call=tool_call,
)
break
if not actions:
reasoning = reasonings[0]["thinking"] if reasonings else None
assistant_messages = [block for block in assistant_content if block["type"] == "text"]
assistant_message = assistant_messages[0]["text"] if assistant_messages else None
actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
return actions return actions
# Adapted from Anthropic's computer-use quickstart guide:
# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Validate an (x, y) pair and scale it from one resolution to another.

    Args:
        coordinate: Two non-negative ints, ``(x, y)``.
        current_dimension: Passed to ``scale_coordinates`` as the resolution the
            coordinate is currently expressed in.
        target_dimension: Passed to ``scale_coordinates`` as the resolution to
            scale the coordinate to.

    Returns:
        Whatever ``scale_coordinates`` returns for the validated pair.

    Raises:
        ValueError: If ``coordinate`` does not have exactly two items, or if any
            item is not a non-negative int.
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")
    # bool is a subclass of int, so it has to be excluded explicitly; otherwise a
    # payload such as [True, False] would be accepted as the point (1, 0).
    if not all(isinstance(i, int) and not isinstance(i, bool) and i >= 0 for i in coordinate):
        raise ValueError(f"{coordinate} must be a tuple of non-negative ints")
    return scale_coordinates((coordinate[0], coordinate[1]), current_dimension, target_dimension)
async def generate_cua_fallback_actions( async def generate_cua_fallback_actions(
task: Task, task: Task,
step: Step, step: Step,