Files
Dorod-Sky/skyvern/webeye/actions/parse_actions.py
2025-06-13 13:23:39 +08:00

1224 lines
47 KiB
Python

from typing import Any, Dict
import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.constants import SCROLL_AMOUNT_MULTIPLIER
from skyvern.exceptions import NoTOTPVerificationCodeFound, UnsupportedActionType
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.utils.image_resizer import Resolution, scale_coordinates
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import (
Action,
CheckboxAction,
ClickAction,
CompleteAction,
DownloadFileAction,
DragAction,
InputTextAction,
KeypressAction,
LeftMouseAction,
MoveAction,
NullAction,
ScrollAction,
SelectOption,
SelectOptionAction,
SolveCaptchaAction,
TerminateAction,
UploadFileAction,
VerificationCodeAction,
WaitAction,
)
from skyvern.webeye.actions.handler import poll_verification_code
from skyvern.webeye.scraper.scraper import ScrapedPage
LOG = structlog.get_logger()
def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extraction_goal: str | None = None) -> Action:
    """Build a typed ``Action`` from one raw action dictionary returned by the LLM.

    The element id is read from ``id`` (preferred) or ``element_id`` and is used to
    look up the element hash and element data on ``scraped_page``.

    Args:
        action: Raw action dictionary. ``action_type`` selects the concrete action
            class; a missing or ``None`` ``action_type`` yields a ``NullAction``.
        scraped_page: Page snapshot providing ``id_to_element_hash`` and
            ``id_to_element_dict``.
        data_extraction_goal: Forwarded to ``CompleteAction`` only.

    Returns:
        The concrete ``Action`` instance for the requested action type.

    Raises:
        KeyError: A field required by the action type is missing (e.g. ``text`` for
            INPUT_TEXT, ``file_url`` for UPLOAD_FILE), or the ``ActionType[...]``
            name lookup fails.
        ValueError: SELECT_OPTION has no ``option``, or the option carries none of
            ``label``, ``value`` and ``index``.
        UnsupportedActionType: No branch below handles the action type.
    """
    if "id" in action:
        element_id = action["id"]
    elif "element_id" in action:
        element_id = action["element_id"]
    else:
        element_id = None

    skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None
    skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None

    # TODO: currently action intention and response are only used for Q&A actions, like input_text
    # When we start supporting click action, intention will be the reasoning for the click action (why take the action)
    base_action_dict = {
        "element_id": element_id,
        "skyvern_element_hash": skyvern_element_hash,
        "skyvern_element_data": skyvern_element_data,
        "reasoning": action.get("reasoning"),
        "confidence_float": action.get("confidence_float"),
        "intention": action.get("user_detail_query"),
        "response": action.get("user_detail_answer"),
    }

    if action.get("action_type") is None:
        return NullAction(**base_action_dict)

    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
    action_type = ActionType[action["action_type"].upper()]

    if not action_type.is_web_action():
        # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
        # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
        # set for non-web actions.
        base_action_dict["element_id"] = None

    if action_type == ActionType.TERMINATE:
        return TerminateAction(**base_action_dict, errors=action.get("errors", []))

    if action_type == ActionType.CLICK:
        return ClickAction(
            **base_action_dict,
            file_url=action.get("file_url"),
            download=action.get("download", False),
        )

    if action_type == ActionType.INPUT_TEXT:
        return InputTextAction(**base_action_dict, text=action["text"])

    if action_type == ActionType.UPLOAD_FILE:
        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
        return UploadFileAction(
            **base_action_dict,
            file_url=action["file_url"],
        )

    # This action is not used in the current implementation. Click actions are used instead.
    if action_type == ActionType.DOWNLOAD_FILE:
        return DownloadFileAction(**base_action_dict, file_name=action["file_name"])

    if action_type == ActionType.SELECT_OPTION:
        # `.get` so that a missing key reaches the descriptive ValueError below
        # instead of surfacing as a bare KeyError.
        option = action.get("option")
        if option is None:
            raise ValueError("SelectOptionAction requires an 'option' field")
        label = option.get("label")
        value = option.get("value")
        index = option.get("index")
        if label is None and value is None and index is None:
            raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
        return SelectOptionAction(
            **base_action_dict,
            option=SelectOption(
                label=label,
                value=value,
                index=index,
            ),
        )

    if action_type == ActionType.CHECKBOX:
        return CheckboxAction(
            **base_action_dict,
            is_checked=action["is_checked"],
        )

    if action_type == ActionType.WAIT:
        return WaitAction(**base_action_dict)

    if action_type == ActionType.COMPLETE:
        return CompleteAction(
            **base_action_dict,
            data_extraction_goal=data_extraction_goal,
            errors=action.get("errors", []),
        )

    # NOTE(review): this compares an ActionType member against the literal "null".
    # Whether it can ever match depends on ActionType's member values, which are
    # defined elsewhere -- confirm, or compare against the relevant member instead.
    if action_type == "null":
        return NullAction(**base_action_dict)

    if action_type == ActionType.SOLVE_CAPTCHA:
        return SolveCaptchaAction(**base_action_dict)

    raise UnsupportedActionType(action_type=action_type)
def parse_actions(
    task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]]
) -> list[Action]:
    """Parse every raw action of an LLM response into typed ``Action`` objects.

    Each entry of ``json_response`` goes through ``parse_action``; the result is
    stamped with the task/step identifiers and its position in the response.
    Entries that fail to parse are logged and skipped, so the returned list may be
    shorter than ``json_response``.

    Args:
        task: Task supplying the organization, workflow-run and task ids, and the
            data extraction goal.
        step_id: Identifier stored on every parsed action.
        step_order: Step order stored on every parsed action.
        scraped_page: Page snapshot used to resolve element ids.
        json_response: Raw action dictionaries.

    Returns:
        The successfully parsed actions, in response order.
    """
    parsed: list[Action] = []

    for position, raw in enumerate(json_response):
        try:
            instance = parse_action(
                action=raw,
                scraped_page=scraped_page,
                data_extraction_goal=task.data_extraction_goal,
            )
            instance.organization_id = task.organization_id
            instance.workflow_run_id = task.workflow_run_id
            instance.task_id = task.task_id
            instance.step_id = step_id
            instance.step_order = step_order
            instance.action_order = position

            if isinstance(instance, TerminateAction):
                # Logged before appending, so `actions` holds only the earlier actions.
                LOG.warning(
                    "Agent decided to terminate",
                    task_id=task.task_id,
                    llm_response=json_response,
                    reasoning=instance.reasoning,
                    actions=parsed,
                )
            parsed.append(instance)
        except UnsupportedActionType:
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                raw_action=raw,
                exc_info=True,
            )
        except (ValidationError, ValueError):
            LOG.warning(
                "Invalid action",
                task_id=task.task_id,
                raw_action=raw,
                exc_info=True,
            )
        except Exception:
            LOG.error(
                "Failed to marshal action",
                task_id=task.task_id,
                raw_action=raw,
                exc_info=True,
            )

    ############################ This part of code might not be needed ############################
    # Reason #1. validation can be done in action handler but not in parser
    # Reason #2. no need to validate whether the element_id has a hash.
    # If there's no hash, we can fall back to normal operation
    all_element_ids = []
    for item in parsed:
        if item.element_id:
            all_element_ids.append(item.element_id)

    known_hashes = scraped_page.id_to_element_hash
    missing_element_ids = [candidate for candidate in all_element_ids if candidate not in known_hashes]
    if missing_element_ids:
        LOG.warning(
            "Missing elements in scraped page",
            task_id=task.task_id,
            missing_element_ids=missing_element_ids,
            all_element_ids=all_element_ids,
        )
    ############################ This part of code might not be needed ############################

    return parsed
async def parse_cua_actions(
task: Task,
step: Step,
response: OpenAIResponse,
) -> list[Action]:
computer_calls = [item for item in response.output if item.type == "computer_call"]
reasonings = [item for item in response.output if item.type == "reasoning"]
assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
actions: list[Action] = []
for idx, computer_call in enumerate(computer_calls):
cua_action = computer_call.action
action_type = cua_action.type
try:
reasoning = None
if idx < len(reasonings):
try:
reasoning = reasonings[idx].summary[0].text
except Exception:
LOG.exception(
"Failed to parse reasoning",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
match action_type:
case "click":
button = cua_action.button
if button != "left" and button != "right":
button = "left"
reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
action = ClickAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
button=button,
reasoning=reasoning,
intention=reasoning,
response=f"Click at: ({cua_action.x}, {cua_action.y})",
)
case "scroll":
reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
action = ScrollAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
scroll_x=cua_action.scroll_x,
scroll_y=cua_action.scroll_y,
reasoning=reasoning,
intention=reasoning,
response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
)
case "keypress":
reasoning_str = f"Press keys: {cua_action.keys}"
if len(cua_action.keys) == 1:
reasoning_str = f"Press the '{cua_action.keys[0]}' key"
reasoning = reasoning or reasoning_str
action = KeypressAction(
element_id="",
keys=cua_action.keys,
reasoning=reasoning,
intention=reasoning,
response=str(cua_action.keys),
)
case "type":
action = InputTextAction(
element_id="",
text=cua_action.text,
reasoning=reasoning,
intention=reasoning,
response=cua_action.text,
)
case "wait":
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
case "move":
response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
reasoning = reasoning or response
action = MoveAction(
x=cua_action.x,
y=cua_action.y,
reasoning=reasoning,
intention=reasoning,
)
case "drag":
whole_path = cua_action.path
if not whole_path or len(whole_path) < 2:
LOG.warning(
"Invalid drag action",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
whole_path=whole_path,
)
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
else:
start_x, start_y = whole_path[0][0], whole_path[0][1]
reasoning = reasoning or f"Drag action path: {whole_path}"
action = DragAction(
start_x=start_x,
start_y=start_y,
path=whole_path[1:],
reasoning=reasoning,
intention=reasoning,
)
case "screenshot":
action = NullAction(
reasoning=reasoning,
intention=reasoning,
)
case _:
raise ValueError(f"Unsupported action type: {action_type}")
action.organization_id = task.organization_id
action.workflow_run_id = task.workflow_run_id
action.task_id = task.task_id
action.step_id = step.step_id
action.step_order = step.order
action.action_order = idx
actions.append(action)
except Exception:
LOG.exception(
"Failed to parse action",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
break
if not actions:
LOG.info(
"Empty action returned by CUA",
task_id=task.task_id,
step_id=step.step_id,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
response=response.model_dump(),
)
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
return actions
async def parse_anthropic_actions(
    task: Task,
    step: Step,
    assistant_content: list[dict[str, Any]],
    browser_window_dimension: Resolution,
    screenshot_resize_target_dimension: Resolution,
) -> list[Action]:
    """Convert Anthropic ``computer`` tool calls into actions.

    Only ``tool_use`` blocks named ``computer`` are considered. The first
    ``thinking`` block, if any, is used as the reasoning for every action; an
    action without it falls back to its own response text. Tool calls with missing
    required input are logged and skipped. Parsing stops at the first tool call
    that raises. When no action could be produced, the result of
    ``generate_cua_fallback_actions`` is returned instead.

    Args:
        task: Task supplying the ids stamped onto each action.
        step: Step supplying ``step_id`` and ``order``.
        assistant_content: Content blocks of the assistant message.
        browser_window_dimension: Dimension coordinates are scaled to.
        screenshot_resize_target_dimension: Dimension coordinates are scaled from.

    Returns:
        The parsed actions, or the fallback actions when none were parsed.
    """
    tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
    reasonings = [block for block in assistant_content if block["type"] == "thinking"]
    LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
    if len(reasonings) > 1:
        LOG.warning(
            "Anthropic CUA: multiple reasonings in assistant content",
            task_id=task.task_id,
            step_id=step.step_id,
            assistant_content=assistant_content,
        )
    reasoning = reasonings[0]["thinking"] if reasonings else None

    def to_browser_coordinates(coordinate: tuple[int, int] | list[int]) -> tuple[int, int]:
        # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
        # Every action type goes through this one helper so the scaling direction cannot diverge.
        return validate_and_get_coordinates(coordinate, screenshot_resize_target_dimension, browser_window_dimension)

    actions: list[Action] = []
    for idx, tool_call in enumerate(tool_calls):
        try:
            tool_call_id = tool_call["id"]
            tool_call_input = tool_call.get("input")
            if not tool_call_input:
                continue
            action = tool_call_input["action"]

            # Fields shared by every action built from this tool call.
            metadata = {
                "organization_id": task.organization_id,
                "workflow_run_id": task.workflow_run_id,
                "task_id": task.task_id,
                "step_id": step.step_id,
                "step_order": step.order,
                "action_order": idx,
                "tool_call_id": tool_call_id,
            }

            if action == "mouse_move":
                coordinate = tool_call_input.get("coordinate")
                if not coordinate:
                    LOG.warning(
                        "Anthropic CUA error: mouse move action has no coordinate",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                response = f"Move mouse to: ({x}, {y})"
                # The fallback stays local so it does not leak into later actions.
                action_reasoning = reasoning or response
                actions.append(
                    # TODO: add response by adding specifying the element to move to
                    MoveAction(
                        x=x,
                        y=y,
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        **metadata,
                    )
                )
            elif action in ["left_click", "double_click", "triple_click", "right_click"]:
                coordinate = tool_call_input.get("coordinate")
                if not coordinate and idx > 0:
                    # A click without a coordinate reuses the target of an immediately preceding mouse_move.
                    prev_tool_call_input = tool_calls[idx - 1].get("input")
                    if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
                        coordinate = prev_tool_call_input.get("coordinate")
                if not coordinate:
                    LOG.warning(
                        "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                repeat = {"double_click": 2, "triple_click": 3}.get(action, 1)
                button = "right" if action == "right_click" else "left"
                response = f"Click at: ({x}, {y})"
                action_reasoning = reasoning or response
                actions.append(
                    ClickAction(
                        element_id="",
                        x=x,
                        y=y,
                        button=button,
                        repeat=repeat,
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        response=response,
                        **metadata,
                    )
                )
            elif action == "type":
                text = tool_call_input.get("text")
                if not text:
                    LOG.warning(
                        "Anthropic CUA error: type action has no text",
                        tool_call=tool_call,
                    )
                    continue
                actions.append(
                    InputTextAction(
                        element_id="",
                        text=text,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=text,
                        **metadata,
                    )
                )
            elif action in ["key", "hold_key"]:
                text = tool_call_input.get("text", "")
                if not text:
                    LOG.warning(
                        "Anthropic CUA error: key action has no text",
                        tool_call=tool_call,
                    )
                    continue
                keys = text.split("+")
                hold = action == "hold_key"
                duration = tool_call_input.get("duration", 0)
                if hold:
                    response = f"Hold keys for {duration} seconds: {text}"
                else:
                    response = f"Press keys: {text}"
                action_reasoning = reasoning or response
                actions.append(
                    KeypressAction(
                        element_id="",
                        keys=keys,
                        hold=hold,
                        duration=duration,
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        response=response,
                        **metadata,
                    )
                )
            elif action == "screenshot":
                actions.append(
                    NullAction(
                        reasoning=reasoning,
                        intention=reasoning,
                        **metadata,
                    )
                )
            elif action == "scroll":
                x, y = None, None
                coordinate = tool_call_input.get("coordinate")
                if coordinate:
                    x, y = to_browser_coordinates(coordinate)
                scroll_direction = tool_call_input.get("scroll_direction")
                scroll_amount = tool_call_input.get("scroll_amount")
                if scroll_direction == "up":
                    scroll_x = 0
                    scroll_y = -scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                elif scroll_direction == "down":
                    scroll_x = 0
                    scroll_y = scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                elif scroll_direction == "left":
                    scroll_x = -scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                    scroll_y = 0
                elif scroll_direction == "right":
                    scroll_x = scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                    scroll_y = 0
                else:
                    LOG.warning(
                        "Anthropic CUA error: unsupported scroll direction",
                        tool_call=tool_call,
                    )
                    continue
                response = f"Scroll by: ({scroll_x}, {scroll_y})"
                action_reasoning = reasoning or response
                actions.append(
                    ScrollAction(
                        element_id="",
                        x=x,
                        y=y,
                        scroll_x=scroll_x,
                        scroll_y=scroll_y,
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        response=response,
                        **metadata,
                    )
                )
            elif action in ["left_mouse_down", "left_mouse_up"]:
                coordinate = tool_call_input.get("coordinate")
                x, y = None, None
                if coordinate:
                    x, y = to_browser_coordinates(coordinate)
                direction = "down" if action == "left_mouse_down" else "up"
                response = f"Left mouse {direction} at: ({x}, {y})"
                action_reasoning = reasoning or response
                actions.append(
                    LeftMouseAction(
                        x=x,
                        y=y,
                        direction=direction,
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        response=response,
                        **metadata,
                    )
                )
            elif action == "left_click_drag":
                coordinate = tool_call_input.get("coordinate")
                start_coordinate = tool_call_input.get("start_coordinate")
                LOG.info(
                    "Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
                )
                if not coordinate or not start_coordinate:
                    LOG.warning(
                        "Anthropic CUA error: left click drag action has no coordinate or start coordinate",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                start_x, start_y = to_browser_coordinates(start_coordinate)
                response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
                action_reasoning = reasoning or response
                actions.append(
                    DragAction(
                        start_x=start_x,
                        start_y=start_y,
                        path=[(x, y)],
                        reasoning=action_reasoning,
                        intention=action_reasoning,
                        response=response,
                        **metadata,
                    )
                )
            elif action == "wait":
                duration = tool_call_input.get("duration", 5)
                actions.append(
                    WaitAction(
                        seconds=duration,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=f"Wait for {duration} seconds",
                        **metadata,
                    )
                )
            else:
                LOG.error(
                    "Anthropic CUA error: unsupported action",
                    tool_call=tool_call,
                )
        except Exception:
            LOG.exception(
                "Anthropic CUA error: failed to parse action",
                task_id=task.task_id,
                step_id=step.step_id,
                tool_call=tool_call,
            )
            # Stop at the first failure; later tool calls are not attempted.
            break

    if not actions:
        assistant_messages = [block for block in assistant_content if block["type"] == "text"]
        assistant_message = assistant_messages[0]["text"] if assistant_messages else None
        actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)

    return actions
# function from anthropic's quickstart guide
# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Validate an ``(x, y)`` pair and scale it between two dimensions.

    Args:
        coordinate: Pair of non-negative ints.
        current_dimension: Passed to ``scale_coordinates`` as the current dimension.
        target_dimension: Passed to ``scale_coordinates`` as the target dimension.

    Returns:
        Whatever ``scale_coordinates`` returns for the validated pair.

    Raises:
        ValueError: ``coordinate`` does not have exactly two entries, or an entry
            is not a non-negative ``int``.
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")

    for component in coordinate:
        if not isinstance(component, int) or component < 0:
            raise ValueError(f"{coordinate} must be a tuple of non-negative ints")

    point = (coordinate[0], coordinate[1])
    return scale_coordinates(point, current_dimension, target_dimension)
async def generate_cua_fallback_actions(
task: Task,
step: Step,
assistant_message: str | None,
reasoning: str | None,
) -> list[Action]:
fallback_action_prompt = prompt_engine.load_prompt(
"cua-fallback-action",
navigation_goal=task.navigation_goal,
assistant_message=assistant_message,
assistant_reasoning=reasoning,
)
action_response = await app.LLM_API_HANDLER(
prompt=fallback_action_prompt,
prompt_name="cua-fallback-action",
)
LOG.info("Fallback action response", action_response=action_response)
skyvern_action_type = action_response.get("action")
useful_information = action_response.get("useful_information")
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
if skyvern_action_type == "complete":
LOG.info(
"Updating task with useful information",
task_id=task.task_id,
organization_id=task.organization_id,
useful_information=useful_information,
assistant_message=assistant_message,
reasoning=reasoning,
)
await app.DATABASE.update_task(
task.task_id,
organization_id=task.organization_id,
extracted_information=assistant_message,
)
action = CompleteAction(
reasoning=reasoning,
intention=reasoning,
verified=True,
data_extraction_goal=task.data_extraction_goal,
)
elif skyvern_action_type == "terminate":
action = TerminateAction(
reasoning=reasoning,
intention=reasoning,
)
elif skyvern_action_type == "solve_captcha":
action = SolveCaptchaAction(
reasoning=reasoning,
intention=reasoning,
)
elif skyvern_action_type == "get_verification_code":
if (task.totp_verification_url or task.totp_identifier) and task.organization_id:
LOG.info(
"Getting verification code for CUA",
task_id=task.task_id,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
totp_verification_url=task.totp_verification_url,
totp_identifier=task.totp_identifier,
)
try:
verification_code = await poll_verification_code(
task.task_id,
task.organization_id,
workflow_run_id=task.workflow_run_id,
totp_verification_url=task.totp_verification_url,
totp_identifier=task.totp_identifier,
)
reasoning = reasoning or f"Received verification code: {verification_code}"
action = VerificationCodeAction(
verification_code=verification_code,
reasoning=reasoning,
intention=reasoning,
)
except NoTOTPVerificationCodeFound:
reasoning_suffix = "No verification code found"
reasoning = f"{reasoning}. {reasoning_suffix}" if reasoning else reasoning_suffix
action = TerminateAction(
reasoning=reasoning,
intention=reasoning,
)
else:
action = TerminateAction(
reasoning=reasoning,
intention=reasoning,
)
action.organization_id = task.organization_id
action.workflow_run_id = task.workflow_run_id
action.task_id = task.task_id
action.step_id = step.step_id
action.step_order = step.order
action.action_order = 0
return [action]
async def parse_ui_tars_actions(
    task: Task,
    step: Step,
    response_content: str,
    browser_window_dimension: Resolution,
) -> list[Action]:
    """Turn a UI-TARS text response into a list of Skyvern actions.

    Parsed actions that cannot be converted are logged and skipped. Any failure
    while parsing the response as a whole is logged and yields an empty list.

    Args:
        task: Task supplying the ids stamped onto each action.
        step: Step supplying ``step_id`` and ``order``.
        response_content: Raw UI-TARS response text.
        browser_window_dimension: Dimension used to resolve coordinates.

    Returns:
        The converted actions; possibly empty.
    """
    try:
        # Parse the UI-TARS response text
        parsed_actions = _parse_ui_tars_response(response_content, browser_window_dimension)

        converted: list[Action] = []
        for position, parsed_action in enumerate(parsed_actions):
            try:
                built = _create_ui_tars_action(parsed_action, task, step, browser_window_dimension, position)
                if built:
                    converted.append(built)
            except Exception:
                LOG.exception(
                    "Failed to create UI-TARS action",
                    task_id=task.task_id,
                    step_id=step.step_id,
                    parsed_action=parsed_action,
                )

        if len(converted) == 0:
            LOG.warning(
                "No valid actions generated from UI-TARS response",
                task_id=task.task_id,
                step_id=step.step_id,
                response_preview=response_content[:200],
            )
        return converted
    except Exception:
        LOG.exception(
            "Failed to parse UI-TARS actions",
            task_id=task.task_id,
            step_id=step.step_id,
            response_content=response_content[:200],
        )
        return []
def _parse_ui_tars_response(response_content: str, browser_window_dimension: Resolution) -> list[dict[str, Any]]:
    """Split a UI-TARS response into structured action dictionaries.

    The text after the last ``Action: `` marker is split into individual action
    calls, each of which is handed to ``_parse_single_action``. Every parsed action
    is annotated with the extracted thought and with ``browser_window_dimension``.

    Raises:
        ValueError: The response contains no ``Action:`` section.
    """
    import re

    text = response_content.strip()

    # Convert point format to coordinates if needed
    if "<point>" in text:
        text = _convert_point_to_coordinates(text)

    # Normalize parameter names; the generic "point=" rule must run last.
    for old_name, new_name in (
        ("start_point=", "start_box="),
        ("end_point=", "end_box="),
        ("point=", "start_box="),
    ):
        text = text.replace(old_name, new_name)

    # Extract thought/reasoning; the first pattern that matches wins.
    thought = None
    thought_patterns = (
        r"Thought: (.+?)(?=\s*Action: |$)",
        r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action: |$)",
        r"Action_Summary: (.+?)(?=\s*Action: |$)",
    )
    for pattern in thought_patterns:
        found = re.search(pattern, text, re.DOTALL)
        if found is None:
            continue
        group_count = len(found.groups())
        if group_count == 1:
            thought = found.group(1).strip()
        elif group_count == 2:
            thought = found.group(2).strip()  # Use Action_Summary
        break

    if "Action:" not in text:
        raise ValueError("No Action section found in UI-TARS response")

    # Extract action string, then split multiple actions
    raw_actions = text.split("Action: ")[-1].split(")\n\n")

    all_actions = []
    for piece in raw_actions:
        piece = piece.strip()
        if not piece:
            continue

        # Handle type action with content specially
        if "type(content" in piece:
            if not piece.endswith(")"):
                piece += ")"
            # Extract content from type action
            content_match = re.search(r"type\(content='(.*?)'\)", piece)
            if content_match:
                # Escape single quotes in content
                escaped = content_match.group(1).replace("'", "\\'")
                piece = f"type(content='{escaped}')"

        # Splitting on ")\n\n" strips the closing parenthesis; restore it.
        if not piece.endswith(")"):
            piece += ")"
        all_actions.append(piece)

    # Parse each action
    parsed_actions = []
    for single in all_actions:
        try:
            parsed_action = _parse_single_action(single)
            if parsed_action:
                parsed_action["thought"] = thought
                parsed_action["browser_window_dimension"] = browser_window_dimension
                parsed_actions.append(parsed_action)
        except Exception:
            LOG.warning(
                "Failed to parse individual UI-TARS action",
                action_str=single,
                exc_info=True,
            )

    return parsed_actions
def _parse_single_action(action_str: str) -> dict[str, Any] | None:
"""Parse a single action string into structured data."""
import ast
try:
# Clean up the action string
action_str = action_str.replace("\n", "\\n").strip()
# Parse as Python expression
node = ast.parse(action_str, mode="eval")
if not isinstance(node, ast.Expression) or not isinstance(node.body, ast.Call):
return None
call = node.body
# Get function name
if isinstance(call.func, ast.Name):
func_name = call.func.id
elif isinstance(call.func, ast.Attribute):
func_name = call.func.attr
else:
return None
# Get arguments
action_inputs = {}
for kw in call.keywords:
if kw.arg and isinstance(kw.value, (ast.Constant, ast.Str)):
if isinstance(kw.value, ast.Constant):
value = kw.value.value
else: # ast.Str for older Python versions
value = kw.value.s
action_inputs[kw.arg] = value
return {
"action_type": func_name,
"action_inputs": action_inputs,
}
except Exception:
LOG.debug(f"Failed to parse action string: {action_str}", exc_info=True)
return None
def _convert_point_to_coordinates(text: str) -> str:
"""Convert <point>x y</point> format to (x,y) format."""
import re
from typing import Match
pattern = r"<point>(\d+)\s+(\d+)</point>"
def replace_match(match: Match[str]) -> str:
x, y = map(int, match.groups())
return f"({x},{y})"
return re.sub(pattern, replace_match, text)
def _create_ui_tars_action(
parsed_action: dict[str, Any],
task: Task,
step: Step,
browser_window_dimension: Resolution,
action_order: int,
) -> Action | None:
"""Create a Skyvern action from parsed UI-TARS data."""
action_type = parsed_action.get("action_type", "")
action_inputs = parsed_action.get("action_inputs", {})
thought = parsed_action.get("thought", "")
base_params = {
"reasoning": thought,
"intention": thought,
"organization_id": task.organization_id,
"workflow_run_id": task.workflow_run_id,
"task_id": task.task_id,
"step_id": step.step_id,
"step_order": step.order,
"action_order": action_order,
}
if action_type == "click":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
response=f"Click at ({x}, {y})",
**base_params,
)
elif action_type == "left_double":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
button="left",
repeat=2,
response=f"Double click at ({x}, {y})",
**base_params,
)
elif action_type == "right_single":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
button="right",
response=f"Right click at ({x}, {y})",
**base_params,
)
elif action_type == "type":
content = action_inputs.get("content", "")
if not content:
return None
return InputTextAction(
element_id="",
text=content,
response=f"Type: {content[:50]}{'...' if len(content) > 50 else ''}",
**base_params,
)
elif action_type in ["drag", "select"]:
start_x, start_y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
end_x, end_y = _extract_ui_tars_coordinates(action_inputs.get("end_box", ""), browser_window_dimension)
if None in (start_x, start_y, end_x, end_y):
return None
return DragAction(
start_x=start_x,
start_y=start_y,
path=[(end_x, end_y)],
response=f"Drag from ({start_x}, {start_y}) to ({end_x}, {end_y})",
**base_params,
)
elif action_type == "hotkey":
key_combo = action_inputs.get("key", action_inputs.get("hotkey", ""))
if not key_combo:
return None
keys = key_combo.split()
return KeypressAction(
keys=keys,
response=f"Hotkey: {key_combo}",
**base_params,
)
elif action_type == "scroll":
direction = action_inputs.get("direction", "down").lower()
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
# Use center of screen as fallback
x = browser_window_dimension["width"] // 2
y = browser_window_dimension["height"] // 2
scroll_amount = 300
if direction == "down":
scroll_x, scroll_y = 0, scroll_amount
elif direction == "up":
scroll_x, scroll_y = 0, -scroll_amount
elif direction == "right":
scroll_x, scroll_y = scroll_amount, 0
elif direction == "left":
scroll_x, scroll_y = -scroll_amount, 0
else:
scroll_x, scroll_y = 0, scroll_amount
return ScrollAction(
element_id="",
x=x,
y=y,
scroll_x=scroll_x,
scroll_y=scroll_y,
response=f"Scroll {direction} at ({x}, {y})",
**base_params,
)
elif action_type == "wait":
return WaitAction(
seconds=5,
**base_params,
)
elif action_type == "finished":
return CompleteAction(
data_extraction_goal=task.data_extraction_goal,
verified=True, # UI-TARS has already determined completion, skip Skyvern validation
**base_params,
)
else:
LOG.warning(f"Unsupported UI-TARS action type: {action_type}")
return None
def _extract_ui_tars_coordinates(box_str: str, browser_window_dimension: Resolution) -> tuple[int | None, int | None]:
    """Resolve a UI-TARS box string to absolute pixel coordinates.

    Accepts a literal pair such as ``"(450,320)"`` or a four-value bounding box
    such as ``"[0.5, 0.3, 0.5, 0.3]"`` (its centre point is used). When either
    value exceeds 1, both are divided by 1000; values that then lie in the 0-1
    range are multiplied by the window width and height.

    Args:
        box_str: Literal text of the coordinates.
        browser_window_dimension: Mapping with ``width`` and ``height``.

    Returns:
        ``(x, y)`` as ints, or ``(None, None)`` when the text is empty, is not a
        list/tuple of two or four values, or cannot be parsed.
    """
    import ast

    if not box_str:
        return None, None

    try:
        # Parse coordinates from string format like "(450,320)" or "[0.5, 0.3, 0.5, 0.3]"
        coords = ast.literal_eval(box_str)
        if not isinstance(coords, (list, tuple)):
            return None, None

        count = len(coords)
        if count == 2:
            # Direct coordinates like (450, 320) or (0.5, 0.3)
            x, y = coords
            is_bbox = False
        elif count == 4:
            # Bounding box format [x1, y1, x2, y2] - take center point
            left, top, right, bottom = coords
            x = (left + right) / 2
            y = (top + bottom) / 2
            is_bbox = True
        else:
            return None, None

        # UI-TARS outputs coordinates that need to be divided by 1000 first
        if x > 1 or y > 1:  # Likely UI-TARS format needing factor conversion
            original_x, original_y = x, y
            x = x / 1000.0
            y = y / 1000.0
            if is_bbox:
                LOG.debug(
                    f"Applied UI-TARS factor conversion to bbox center: ({original_x}, {original_y}) -> ({x}, {y})"
                )
            else:
                LOG.debug(f"Applied UI-TARS factor conversion: ({original_x}, {original_y}) -> ({x}, {y})")

        # Values outside the 0-1 range are returned truncated, without scaling.
        if not (0 <= x <= 1 and 0 <= y <= 1):
            return int(x), int(y)

        # Convert relative coordinates (0-1) to absolute screen coordinates
        abs_x = int(x * browser_window_dimension["width"])
        abs_y = int(y * browser_window_dimension["height"])
        if is_bbox:
            LOG.debug(
                f"Converted bbox center to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
            )
        else:
            LOG.debug(
                f"Converted to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
            )
        return abs_x, abs_y
    except Exception:
        LOG.debug(f"Failed to parse UI-TARS coordinates: {box_str}", exc_info=True)
        return None, None