anthropic CUA (#2231)

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
2025-04-28 09:49:44 +08:00
parent 5582998490
commit 0a0228b341
18 changed files with 378 additions and 45 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -113,6 +113,7 @@ class Action(BaseModel):
    element_id: Annotated[str, Field(coerce_numbers_to_str=True)] | None = None
    skyvern_element_hash: str | None = None
    skyvern_element_data: dict[str, Any] | None = None
+    tool_call_id: str | None = None

    # DecisiveAction (CompleteAction, TerminateAction) fields
    errors: list[UserDefinedError] | None = None
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -59,6 +59,7 @@ from skyvern.forge.sdk.api.files import (
    list_files_in_directory,
    wait_for_download_finished,
 )
+from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCallerManager
 from skyvern.forge.sdk.api.llm.exceptions import LLMProviderError
 from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.core.aiohttp_helper import aiohttp_post
@@ -363,9 +364,26 @@ class ActionHandler:
                handler = ActionHandler._handled_action_types[action.action_type]
                results = await handler(action, page, scraped_page, task, step)
                actions_result.extend(results)
+                llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
                if not results or not isinstance(actions_result[-1], ActionSuccess):
+                    if llm_caller and action.tool_call_id:
+                        # add failure message to the llm caller
+                        tool_call_result = {
+                            "type": "tool_result",
+                            "tool_use_id": action.tool_call_id,
+                            "content": {"result": "Tool execution failed"},
+                        }
+                        llm_caller.add_tool_result(tool_call_result)
                    return actions_result

+                if llm_caller and action.tool_call_id:
+                    tool_call_result = {
+                        "type": "tool_result",
+                        "tool_use_id": action.tool_call_id,
+                        "content": {"result": "Tool executed successfully"},
+                    }
+                    llm_caller.add_tool_result(tool_call_result)
+
                # do the teardown
                teardown = ActionHandler._teardown_action_types.get(action.action_type)
                if teardown:
@@ -1532,7 +1550,7 @@ async def handle_keypress_action(
 ) -> list[ActionResult]:
    updated_keys = []
    for key in action.keys:
-        if key.lower() == "enter":
+        if key.lower() in ("enter", "return"):
            updated_keys.append("Enter")
        elif key.lower() == "space":
            updated_keys.append(" ")
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -1,3 +1,4 @@
+import json
 from typing import Any, Dict

 import structlog
@@ -448,3 +449,133 @@ async def parse_cua_actions(
        action.action_order = 0
        return [action]
    return actions
+
+
+async def parse_anthropic_actions(
+    task: Task,
+    step: Step,
+    assistant_content: list[dict[str, Any]],
+) -> list[Action]:
+    """Convert the ``tool_use`` blocks of an Anthropic assistant message into actions.
+
+    Only ``computer`` tool calls whose arguments can be parsed are considered.
+    Supported computer actions are ``mouse_move``, ``left_click``, ``type`` and
+    ``key``; anything else is logged and skipped.
+
+    Args:
+        task: Task the actions belong to (supplies organization/workflow/task ids).
+        step: Step the actions belong to (supplies step id and order).
+        assistant_content: Content blocks of the assistant message.
+
+    Returns:
+        The parsed actions, in the order of their tool calls. ``action_order`` is
+        the position of the originating call among the ``tool_use`` blocks, so
+        skipped calls leave gaps but two actions never share an order.
+    """
+    tool_calls = [block for block in assistant_content if block.get("type") == "tool_use"]
+    actions: list[Action] = []
+    for idx, tool_call in enumerate(tool_calls):
+        tool_call_id = tool_call["id"]
+        parsed_args = _parse_anthropic_computer_args(tool_call)
+        if not parsed_args:
+            continue
+
+        # Fields shared by every action built from this tool call. Using the
+        # same index for all action kinds keeps action_order consistent.
+        common_fields: dict[str, Any] = {
+            "organization_id": task.organization_id,
+            "workflow_run_id": task.workflow_run_id,
+            "task_id": task.task_id,
+            "step_id": step.step_id,
+            "step_order": step.order,
+            "action_order": idx,
+            "tool_call_id": tool_call_id,
+        }
+
+        action = parsed_args.get("action")
+        if action == "mouse_move":
+            coordinate = parsed_args.get("coordinate")
+            if not coordinate:
+                LOG.warning(
+                    "Mouse move action has no coordinate",
+                    tool_call=tool_call,
+                )
+                continue
+            x, y = coordinate
+            actions.append(MoveAction(x=x, y=y, **common_fields))
+        elif action == "left_click":
+            # Prefer the click's own coordinate; only fall back to the position
+            # of an immediately preceding mouse_move when the click has none.
+            coordinate = parsed_args.get("coordinate")
+            if not coordinate and idx > 0:
+                prev_parsed_args = _parse_anthropic_computer_args(tool_calls[idx - 1])
+                if prev_parsed_args and prev_parsed_args.get("action") == "mouse_move":
+                    coordinate = prev_parsed_args.get("coordinate")
+            if not coordinate:
+                LOG.warning(
+                    "Left click action has no coordinate and it doesn't have mouse_move before it",
+                    tool_call=tool_call,
+                )
+                continue
+            x, y = coordinate
+            actions.append(
+                ClickAction(
+                    element_id="",
+                    x=x,
+                    y=y,
+                    button="left",
+                    **common_fields,
+                )
+            )
+        elif action == "type":
+            text = parsed_args.get("text")
+            if not text:
+                LOG.warning(
+                    "Type action has no text",
+                    tool_call=tool_call,
+                )
+                continue
+            actions.append(InputTextAction(element_id="", text=text, **common_fields))
+        elif action == "key":
+            text = parsed_args.get("text")
+            if not text:
+                LOG.warning(
+                    "Key action has no text",
+                    tool_call=tool_call,
+                )
+                continue
+            actions.append(KeypressAction(element_id="", keys=[text], **common_fields))
+        else:
+            LOG.error(
+                "Unsupported action",
+                tool_call=tool_call,
+            )
+    return actions
+
+
+def _parse_anthropic_computer_args(tool_call: dict[str, Any]) -> dict[str, Any] | None:
+    """Return the argument mapping of a ``computer`` tool call, or None.
+
+    Two layouts are accepted:
+
+    * Anthropic content block:
+      ``{"type": "tool_use", "name": "computer", "input": {...}}``
+    * Function-call style:
+      ``{"type": "function", "function": {"name": "computer", "arguments": "<json>"}}``
+
+    Args:
+        tool_call: A single tool call block.
+
+    Returns:
+        The arguments as a dict, or None when the block is not a ``computer``
+        tool call or its arguments are not a (JSON) object.
+    """
+    tool_call_type = tool_call.get("type")
+    if tool_call_type == "tool_use":
+        # Anthropic puts the tool name and the already-decoded input at the top level.
+        tool_call_name = tool_call.get("name")
+        tool_call_arguments = tool_call.get("input")
+    elif tool_call_type == "function":
+        function = tool_call.get("function") or {}
+        tool_call_name = function.get("name")
+        tool_call_arguments = function.get("arguments")
+    else:
+        return None
+    if tool_call_name != "computer":
+        return None
+    if isinstance(tool_call_arguments, str):
+        # Function-call style carries the arguments as a JSON-encoded string.
+        try:
+            tool_call_arguments = json.loads(tool_call_arguments)
+        except json.JSONDecodeError:
+            LOG.warning(
+                "Failed to decode computer tool call arguments",
+                tool_call=tool_call,
+            )
+            return None
+    if not isinstance(tool_call_arguments, dict):
+        return None
+    return tool_call_arguments
--- a/skyvern/webeye/actions/responses.py
+++ b/skyvern/webeye/actions/responses.py
@@ -18,6 +18,7 @@ class ActionResult(BaseModel):
    interacted_with_sibling: bool | None = None
    interacted_with_parent: bool | None = None
    skip_remaining_actions: bool | None = None
+    tool_call_result: dict[str, Any] | None = None

    def __str__(self) -> str:
        results = [f"ActionResult(success={self.success}"]