support more anthropic actions (#2257)
This commit is contained in:
@@ -182,6 +182,8 @@ export const ActionTypes = {
|
|||||||
Move: "move",
|
Move: "move",
|
||||||
NullAction: "null_action",
|
NullAction: "null_action",
|
||||||
VerificationCode: "verification_code",
|
VerificationCode: "verification_code",
|
||||||
|
Drag: "drag",
|
||||||
|
LeftMouse: "left_mouse",
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes];
|
export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes];
|
||||||
@@ -204,6 +206,8 @@ export const ReadableActionTypes: {
|
|||||||
move: "Move",
|
move: "Move",
|
||||||
null_action: "Screenshot",
|
null_action: "Screenshot",
|
||||||
verification_code: "Verification Code",
|
verification_code: "Verification Code",
|
||||||
|
drag: "Drag",
|
||||||
|
left_mouse: "Left Mouse",
|
||||||
};
|
};
|
||||||
|
|
||||||
export type Option = {
|
export type Option = {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import StrEnum
|
from enum import StrEnum
|
||||||
from typing import Annotated, Any, Type, TypeVar
|
from typing import Annotated, Any, Literal, Type, TypeVar
|
||||||
|
|
||||||
import structlog
|
import structlog
|
||||||
from litellm import ConfigDict
|
from litellm import ConfigDict
|
||||||
@@ -28,12 +28,14 @@ class ActionType(StrEnum):
|
|||||||
RELOAD_PAGE = "reload_page"
|
RELOAD_PAGE = "reload_page"
|
||||||
|
|
||||||
EXTRACT = "extract"
|
EXTRACT = "extract"
|
||||||
|
VERIFICATION_CODE = "verification_code"
|
||||||
|
|
||||||
SCROLL = "scroll"
|
SCROLL = "scroll"
|
||||||
KEYPRESS = "keypress"
|
KEYPRESS = "keypress"
|
||||||
TYPE = "type"
|
TYPE = "type"
|
||||||
MOVE = "move"
|
MOVE = "move"
|
||||||
DRAG = "drag"
|
DRAG = "drag"
|
||||||
VERIFICATION_CODE = "verification_code"
|
LEFT_MOUSE = "left_mouse"
|
||||||
|
|
||||||
def is_web_action(self) -> bool:
|
def is_web_action(self) -> bool:
|
||||||
return self in [
|
return self in [
|
||||||
@@ -271,8 +273,8 @@ class ExtractAction(Action):
|
|||||||
|
|
||||||
class ScrollAction(Action):
|
class ScrollAction(Action):
|
||||||
action_type: ActionType = ActionType.SCROLL
|
action_type: ActionType = ActionType.SCROLL
|
||||||
x: int
|
x: int | None = None
|
||||||
y: int
|
y: int | None = None
|
||||||
scroll_x: int
|
scroll_x: int
|
||||||
scroll_y: int
|
scroll_y: int
|
||||||
|
|
||||||
@@ -280,6 +282,8 @@ class ScrollAction(Action):
|
|||||||
class KeypressAction(Action):
|
class KeypressAction(Action):
|
||||||
action_type: ActionType = ActionType.KEYPRESS
|
action_type: ActionType = ActionType.KEYPRESS
|
||||||
keys: list[str] = []
|
keys: list[str] = []
|
||||||
|
hold: bool = False
|
||||||
|
duration: int = 0
|
||||||
|
|
||||||
|
|
||||||
class MoveAction(Action):
|
class MoveAction(Action):
|
||||||
@@ -290,8 +294,8 @@ class MoveAction(Action):
|
|||||||
|
|
||||||
class DragAction(Action):
|
class DragAction(Action):
|
||||||
action_type: ActionType = ActionType.DRAG
|
action_type: ActionType = ActionType.DRAG
|
||||||
start_x: int
|
start_x: int | None = None
|
||||||
start_y: int
|
start_y: int | None = None
|
||||||
path: list[tuple[int, int]] = []
|
path: list[tuple[int, int]] = []
|
||||||
|
|
||||||
|
|
||||||
@@ -300,6 +304,13 @@ class VerificationCodeAction(Action):
|
|||||||
verification_code: str
|
verification_code: str
|
||||||
|
|
||||||
|
|
||||||
|
# Action model for pressing ("down") or releasing ("up") the left mouse button.
# Its handler, handle_left_mouse_action, may move the pointer to (x, y) before
# changing the button state.
class LeftMouseAction(Action):
|
||||||
|
# Identifies the action kind; defaults to ActionType.LEFT_MOUSE.
action_type: ActionType = ActionType.LEFT_MOUSE
|
||||||
|
# Required: which way the button goes, "down" or "up".
direction: Literal["down", "up"]
|
||||||
|
# Optional pointer position; both default to None (no coordinate supplied).
x: int | None = None
|
||||||
|
y: int | None = None
|
||||||
|
|
||||||
|
|
||||||
class ScrapeResult(BaseModel):
|
class ScrapeResult(BaseModel):
|
||||||
"""
|
"""
|
||||||
Scraped response from a webpage, including:
|
Scraped response from a webpage, including:
|
||||||
|
|||||||
@@ -1563,7 +1563,8 @@ async def handle_scroll_action(
|
|||||||
task: Task,
|
task: Task,
|
||||||
step: Step,
|
step: Step,
|
||||||
) -> list[ActionResult]:
|
) -> list[ActionResult]:
|
||||||
await page.mouse.move(action.x, action.y)
|
if action.x and action.y:
|
||||||
|
await page.mouse.move(action.x, action.y)
|
||||||
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
|
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
|
||||||
return [ActionSuccess()]
|
return [ActionSuccess()]
|
||||||
|
|
||||||
@@ -1614,7 +1615,12 @@ async def handle_keypress_action(
|
|||||||
else:
|
else:
|
||||||
updated_keys.append(key)
|
updated_keys.append(key)
|
||||||
keypress_str = "+".join(updated_keys)
|
keypress_str = "+".join(updated_keys)
|
||||||
await page.keyboard.press(keypress_str)
|
if action.hold:
|
||||||
|
await page.keyboard.down(keypress_str)
|
||||||
|
await asyncio.sleep(action.duration)
|
||||||
|
await page.keyboard.up(keypress_str)
|
||||||
|
else:
|
||||||
|
await page.keyboard.press(keypress_str)
|
||||||
return [ActionSuccess()]
|
return [ActionSuccess()]
|
||||||
|
|
||||||
|
|
||||||
@@ -1636,7 +1642,8 @@ async def handle_drag_action(
|
|||||||
task: Task,
|
task: Task,
|
||||||
step: Step,
|
step: Step,
|
||||||
) -> list[ActionResult]:
|
) -> list[ActionResult]:
|
||||||
await page.mouse.move(action.start_x, action.start_y)
|
if action.start_x and action.start_y:
|
||||||
|
await page.mouse.move(action.start_x, action.start_y)
|
||||||
await page.mouse.down()
|
await page.mouse.down()
|
||||||
for point in action.path:
|
for point in action.path:
|
||||||
x, y = point[0], point[1]
|
x, y = point[0], point[1]
|
||||||
@@ -1663,9 +1670,22 @@ async def handle_verification_code_action(
|
|||||||
return [ActionSuccess()]
|
return [ActionSuccess()]
|
||||||
|
|
||||||
|
|
||||||
async def handle_left_mouse_action(
    action: actions.LeftMouseAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Press or release the left mouse button, optionally moving the pointer first.

    When both ``action.x`` and ``action.y`` are provided, the pointer is moved
    to that position before the button state changes; otherwise the button is
    pressed or released wherever the pointer currently is.

    Args:
        action: The left-mouse action; ``direction`` selects press ("down")
            or release ("up").
        page: Page whose ``mouse`` is driven (presumably a Playwright page --
            confirm against the module imports).
        scraped_page: Not used by this handler.
        task: Not used by this handler.
        step: Not used by this handler.

    Returns:
        A single-element list containing ``ActionSuccess()``.
    """
    # Compare against None instead of relying on truthiness: 0 is a valid
    # coordinate (top/left edge) and must not suppress the pointer move.
    if action.x is not None and action.y is not None:
        await page.mouse.move(action.x, action.y)

    if action.direction == "down":
        await page.mouse.down()
    elif action.direction == "up":
        await page.mouse.up()
    return [ActionSuccess()]


ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
|
||||||
|
|
||||||
|
|
||||||
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
|
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
|
||||||
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
|
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
|
||||||
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
|
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
|
||||||
@@ -1679,6 +1699,7 @@ ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
|
|||||||
ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
|
ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
|
||||||
ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
|
ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
|
||||||
ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action)
|
ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action)
|
||||||
|
ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action)
|
||||||
|
|
||||||
|
|
||||||
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
|
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from skyvern.webeye.actions.actions import (
|
|||||||
DragAction,
|
DragAction,
|
||||||
InputTextAction,
|
InputTextAction,
|
||||||
KeypressAction,
|
KeypressAction,
|
||||||
|
LeftMouseAction,
|
||||||
MoveAction,
|
MoveAction,
|
||||||
NullAction,
|
NullAction,
|
||||||
ScrollAction,
|
ScrollAction,
|
||||||
@@ -287,7 +288,8 @@ async def parse_cua_actions(
|
|||||||
intention=reasoning,
|
intention=reasoning,
|
||||||
)
|
)
|
||||||
case "move":
|
case "move":
|
||||||
reasoning = reasoning or f"Move mouse to: ({cua_action.x}, {cua_action.y})"
|
response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
|
||||||
|
reasoning = reasoning or response
|
||||||
action = MoveAction(
|
action = MoveAction(
|
||||||
x=cua_action.x,
|
x=cua_action.x,
|
||||||
y=cua_action.y,
|
y=cua_action.y,
|
||||||
@@ -350,7 +352,7 @@ async def parse_cua_actions(
|
|||||||
step_id=step.step_id,
|
step_id=step.step_id,
|
||||||
organization_id=task.organization_id,
|
organization_id=task.organization_id,
|
||||||
workflow_run_id=task.workflow_run_id,
|
workflow_run_id=task.workflow_run_id,
|
||||||
response=response.dict(),
|
response=response.model_dump(),
|
||||||
)
|
)
|
||||||
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
||||||
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
||||||
@@ -366,135 +368,336 @@ async def parse_anthropic_actions(
|
|||||||
screenshot_resize_target_dimension: Resolution,
|
screenshot_resize_target_dimension: Resolution,
|
||||||
) -> list[Action]:
|
) -> list[Action]:
|
||||||
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
|
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
|
||||||
|
reasonings = [block for block in assistant_content if block["type"] == "thinking"]
|
||||||
|
LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
|
||||||
|
if len(reasonings) > 1:
|
||||||
|
LOG.warning(
|
||||||
|
"Anthropic CUA: multiple reasonings in assistant content",
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
assistant_content=assistant_content,
|
||||||
|
)
|
||||||
|
reasoning = reasonings[0]["thinking"] if reasonings else None
|
||||||
idx = 0
|
idx = 0
|
||||||
actions: list[Action] = []
|
actions: list[Action] = []
|
||||||
LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content)
|
|
||||||
while idx < len(tool_calls):
|
while idx < len(tool_calls):
|
||||||
tool_call = tool_calls[idx]
|
tool_call = tool_calls[idx]
|
||||||
tool_call_id = tool_call["id"]
|
try:
|
||||||
tool_call_input = tool_call.get("input")
|
tool_call_id = tool_call["id"]
|
||||||
if not tool_call_input:
|
tool_call_input = tool_call.get("input")
|
||||||
idx += 1
|
if not tool_call_input:
|
||||||
continue
|
idx += 1
|
||||||
action = tool_call_input["action"]
|
continue
|
||||||
if action == "mouse_move":
|
action = tool_call_input["action"]
|
||||||
original_x, original_y = tool_call_input["coordinate"]
|
if action == "mouse_move":
|
||||||
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
|
coordinate = tool_call_input.get("coordinate")
|
||||||
x, y = scale_coordinates(
|
if not coordinate:
|
||||||
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
|
LOG.warning(
|
||||||
)
|
"Anthropic CUA error: mouse move action has no coordinate",
|
||||||
actions.append(
|
tool_call=tool_call,
|
||||||
MoveAction(
|
)
|
||||||
x=x,
|
idx += 1
|
||||||
y=y,
|
continue
|
||||||
organization_id=task.organization_id,
|
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
|
||||||
workflow_run_id=task.workflow_run_id,
|
x, y = validate_and_get_coordinates(
|
||||||
task_id=task.task_id,
|
coordinate, screenshot_resize_target_dimension, browser_window_dimension
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=idx,
|
|
||||||
tool_call_id=tool_call_id,
|
|
||||||
)
|
)
|
||||||
)
|
response = f"Move mouse to: ({x}, {y})"
|
||||||
elif action == "left_click":
|
reasoning = reasoning or response
|
||||||
coordinate = tool_call_input.get("coordinate")
|
actions.append(
|
||||||
if not coordinate and idx - 1 >= 0:
|
# TODO: add response by adding specifying the element to move to
|
||||||
prev_tool_call = tool_calls[idx - 1]
|
MoveAction(
|
||||||
prev_tool_call_input = prev_tool_call.get("input")
|
x=x,
|
||||||
if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
|
y=y,
|
||||||
coordinate = prev_tool_call_input.get("coordinate")
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "left_click":
|
||||||
|
coordinate = tool_call_input.get("coordinate")
|
||||||
|
if not coordinate and idx - 1 >= 0:
|
||||||
|
prev_tool_call = tool_calls[idx - 1]
|
||||||
|
prev_tool_call_input = prev_tool_call.get("input")
|
||||||
|
if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
|
||||||
|
coordinate = prev_tool_call_input.get("coordinate")
|
||||||
|
|
||||||
if not coordinate:
|
if not coordinate:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"Left click action has no coordinate and it doesn't have mouse_move before it",
|
"Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
|
||||||
|
tool_call=tool_call,
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
x, y = validate_and_get_coordinates(
|
||||||
|
coordinate, screenshot_resize_target_dimension, browser_window_dimension
|
||||||
|
)
|
||||||
|
response = f"Click at: ({x}, {y})"
|
||||||
|
reasoning = reasoning or response
|
||||||
|
actions.append(
|
||||||
|
ClickAction(
|
||||||
|
element_id="",
|
||||||
|
x=x,
|
||||||
|
y=y,
|
||||||
|
button="left",
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=response,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "type":
|
||||||
|
text = tool_call_input.get("text")
|
||||||
|
if not text:
|
||||||
|
LOG.warning(
|
||||||
|
"Anthropic CUA error: type action has no text",
|
||||||
|
tool_call=tool_call,
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
actions.append(
|
||||||
|
InputTextAction(
|
||||||
|
element_id="",
|
||||||
|
text=text,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=text,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action in ["key", "hold_key"]:
|
||||||
|
text = tool_call_input.get("text")
|
||||||
|
if not text:
|
||||||
|
LOG.warning(
|
||||||
|
"Anthropic CUA error: key action has no text",
|
||||||
|
tool_call=tool_call,
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
response = f"Press keys: {text}"
|
||||||
|
hold = action == "hold_key"
|
||||||
|
duration = tool_call_input.get("duration", 0)
|
||||||
|
if hold:
|
||||||
|
response = f"Hold keys for {duration} seconds: {text}"
|
||||||
|
reasoning = reasoning or response
|
||||||
|
actions.append(
|
||||||
|
KeypressAction(
|
||||||
|
element_id="",
|
||||||
|
keys=[text],
|
||||||
|
hold=hold,
|
||||||
|
duration=duration,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=response,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "screenshot":
|
||||||
|
actions.append(
|
||||||
|
NullAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "scroll":
|
||||||
|
x, y = None, None
|
||||||
|
coordinate = tool_call_input.get("coordinate")
|
||||||
|
if coordinate:
|
||||||
|
x, y = validate_and_get_coordinates(
|
||||||
|
coordinate, browser_window_dimension, screenshot_resize_target_dimension
|
||||||
|
)
|
||||||
|
scroll_direction = tool_call_input.get("scroll_direction")
|
||||||
|
scroll_amount = tool_call_input.get("scroll_amount")
|
||||||
|
if scroll_direction == "up":
|
||||||
|
scroll_x = 0
|
||||||
|
scroll_y = -scroll_amount
|
||||||
|
elif scroll_direction == "down":
|
||||||
|
scroll_x = 0
|
||||||
|
scroll_y = scroll_amount
|
||||||
|
elif scroll_direction == "left":
|
||||||
|
scroll_x = -scroll_amount
|
||||||
|
scroll_y = 0
|
||||||
|
elif scroll_direction == "right":
|
||||||
|
scroll_x = scroll_amount
|
||||||
|
scroll_y = 0
|
||||||
|
else:
|
||||||
|
LOG.warning(
|
||||||
|
"Anthropic CUA error: unsupported scroll direction",
|
||||||
|
tool_call=tool_call,
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
response = f"Scroll by: ({scroll_x}, {scroll_y})"
|
||||||
|
reasoning = reasoning or response
|
||||||
|
actions.append(
|
||||||
|
ScrollAction(
|
||||||
|
element_id="",
|
||||||
|
x=x,
|
||||||
|
y=y,
|
||||||
|
scroll_x=scroll_x,
|
||||||
|
scroll_y=scroll_y,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=response,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action in ["left_mouse_down", "left_mouse_up"]:
|
||||||
|
coordinate = tool_call_input.get("coordinate")
|
||||||
|
x, y = None, None
|
||||||
|
if coordinate:
|
||||||
|
x, y = validate_and_get_coordinates(
|
||||||
|
coordinate, browser_window_dimension, screenshot_resize_target_dimension
|
||||||
|
)
|
||||||
|
direction = "down" if action == "left_mouse_down" else "up"
|
||||||
|
response = f"Left mouse {direction} at: ({x}, {y})"
|
||||||
|
reasoning = reasoning or response
|
||||||
|
actions.append(
|
||||||
|
LeftMouseAction(
|
||||||
|
x=x,
|
||||||
|
y=y,
|
||||||
|
direction=direction,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=response,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "left_click_drag":
|
||||||
|
coordinate = tool_call_input.get("coordinate")
|
||||||
|
start_coordinate = tool_call_input.get("start_coordinate")
|
||||||
|
LOG.info(
|
||||||
|
"Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
|
||||||
|
)
|
||||||
|
if not coordinate or not start_coordinate:
|
||||||
|
LOG.warning(
|
||||||
|
"Anthropic CUA error: left click drag action has no coordinate or start coordinate",
|
||||||
|
tool_call=tool_call,
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
x, y = validate_and_get_coordinates(
|
||||||
|
coordinate, browser_window_dimension, screenshot_resize_target_dimension
|
||||||
|
)
|
||||||
|
start_x, start_y = validate_and_get_coordinates(
|
||||||
|
start_coordinate, browser_window_dimension, screenshot_resize_target_dimension
|
||||||
|
)
|
||||||
|
response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
|
||||||
|
reasoning = reasoning or response
|
||||||
|
actions.append(
|
||||||
|
DragAction(
|
||||||
|
start_x=start_x,
|
||||||
|
start_y=start_y,
|
||||||
|
path=[(x, y)],
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=response,
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif action == "wait":
|
||||||
|
duration = tool_call_input.get("duration", 5)
|
||||||
|
actions.append(
|
||||||
|
WaitAction(
|
||||||
|
seconds=duration,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
response=f"Wait for {duration} seconds",
|
||||||
|
organization_id=task.organization_id,
|
||||||
|
workflow_run_id=task.workflow_run_id,
|
||||||
|
task_id=task.task_id,
|
||||||
|
step_id=step.step_id,
|
||||||
|
step_order=step.order,
|
||||||
|
action_order=idx,
|
||||||
|
tool_call_id=tool_call_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
LOG.error(
|
||||||
|
"Anthropic CUA error: unsupported action",
|
||||||
tool_call=tool_call,
|
tool_call=tool_call,
|
||||||
)
|
)
|
||||||
idx += 1
|
idx += 1
|
||||||
continue
|
except Exception:
|
||||||
original_x, original_y = coordinate
|
LOG.exception(
|
||||||
x, y = scale_coordinates(
|
"Anthropic CUA error: failed to parse action",
|
||||||
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
|
task_id=task.task_id,
|
||||||
)
|
step_id=step.step_id,
|
||||||
actions.append(
|
|
||||||
ClickAction(
|
|
||||||
element_id="",
|
|
||||||
x=x,
|
|
||||||
y=y,
|
|
||||||
button="left",
|
|
||||||
organization_id=task.organization_id,
|
|
||||||
workflow_run_id=task.workflow_run_id,
|
|
||||||
task_id=task.task_id,
|
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=idx,
|
|
||||||
tool_call_id=tool_call_id,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif action == "type":
|
|
||||||
text = tool_call_input.get("text")
|
|
||||||
if not text:
|
|
||||||
LOG.warning(
|
|
||||||
"Anthropic type action has no text",
|
|
||||||
tool_call=tool_call,
|
|
||||||
)
|
|
||||||
idx += 1
|
|
||||||
continue
|
|
||||||
actions.append(
|
|
||||||
InputTextAction(
|
|
||||||
element_id="",
|
|
||||||
text=text,
|
|
||||||
organization_id=task.organization_id,
|
|
||||||
workflow_run_id=task.workflow_run_id,
|
|
||||||
task_id=task.task_id,
|
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=idx,
|
|
||||||
tool_call_id=tool_call_id,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif action == "key":
|
|
||||||
text = tool_call_input.get("text")
|
|
||||||
if not text:
|
|
||||||
LOG.warning(
|
|
||||||
"Key action has no text",
|
|
||||||
tool_call=tool_call,
|
|
||||||
)
|
|
||||||
idx += 1
|
|
||||||
continue
|
|
||||||
actions.append(
|
|
||||||
KeypressAction(
|
|
||||||
element_id="",
|
|
||||||
keys=[text],
|
|
||||||
organization_id=task.organization_id,
|
|
||||||
workflow_run_id=task.workflow_run_id,
|
|
||||||
task_id=task.task_id,
|
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=idx,
|
|
||||||
tool_call_id=tool_call_id,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif action == "screenshot":
|
|
||||||
actions.append(
|
|
||||||
NullAction(
|
|
||||||
organization_id=task.organization_id,
|
|
||||||
workflow_run_id=task.workflow_run_id,
|
|
||||||
task_id=task.task_id,
|
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=idx,
|
|
||||||
tool_call_id=tool_call_id,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
LOG.error(
|
|
||||||
"Unsupported action",
|
|
||||||
tool_call=tool_call,
|
tool_call=tool_call,
|
||||||
)
|
)
|
||||||
idx += 1
|
break
|
||||||
|
if not actions:
|
||||||
|
reasoning = reasonings[0]["thinking"] if reasonings else None
|
||||||
|
assistant_messages = [block for block in assistant_content if block["type"] == "text"]
|
||||||
|
assistant_message = assistant_messages[0]["text"] if assistant_messages else None
|
||||||
|
actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
|
|
||||||
|
# function from anthropic's quickstart guide
|
||||||
|
# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
|
||||||
|
def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Validate an ``(x, y)`` pair, then hand it to ``scale_coordinates``.

    Args:
        coordinate: Exactly two non-negative ints, as a tuple or a list.
        current_dimension: Passed to ``scale_coordinates`` as its second
            argument (by its name, the resolution the coordinate is
            expressed in).
        target_dimension: Passed to ``scale_coordinates`` as its third
            argument (by its name, the resolution to scale to).

    Returns:
        The result of ``scale_coordinates`` for the validated pair.

    Raises:
        ValueError: If ``coordinate`` does not hold exactly two items, or if
            any item is not a non-negative int. ``bool`` values are rejected
            even though ``bool`` is a subclass of ``int``.
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")

    for value in coordinate:
        # isinstance(True, int) is True, so bools must be excluded explicitly;
        # otherwise [True, 1] would be accepted as the point (1, 1).
        if isinstance(value, bool) or not isinstance(value, int) or value < 0:
            raise ValueError(f"{coordinate} must be a tuple of non-negative ints")

    return scale_coordinates((coordinate[0], coordinate[1]), current_dimension, target_dimension)
|
||||||
|
|
||||||
|
|
||||||
async def generate_cua_fallback_actions(
|
async def generate_cua_fallback_actions(
|
||||||
task: Task,
|
task: Task,
|
||||||
step: Step,
|
step: Step,
|
||||||
|
|||||||
Reference in New Issue
Block a user