ui-tars integration (#2656)

This commit is contained in:
Wyatt Marshall
2025-06-13 01:23:39 -04:00
committed by GitHub
parent 47cf755d9c
commit 15d46aab82
18 changed files with 986 additions and 13 deletions

View File

@@ -1745,6 +1745,9 @@ async def handle_keypress_action(
updated_keys.append("Escape")
elif key_lower_case == "alt":
updated_keys.append("Alt")
elif key_lower_case.startswith("f") and key_lower_case[1:].isdigit():
# Handle function keys: f1 -> F1, f5 -> F5, etc.
updated_keys.append(key_lower_case.upper())
else:
updated_keys.append(key)
keypress_str = "+".join(updated_keys)

View File

@@ -809,3 +809,415 @@ async def generate_cua_fallback_actions(
action.step_order = step.order
action.action_order = 0
return [action]
async def parse_ui_tars_actions(
task: Task,
step: Step,
response_content: str,
browser_window_dimension: Resolution,
) -> list[Action]:
"""Parse UI-TARS response and convert to Skyvern actions."""
try:
# Parse the UI-TARS response text
parsed_actions = _parse_ui_tars_response(response_content, browser_window_dimension)
actions: list[Action] = []
for idx, parsed_action in enumerate(parsed_actions):
try:
action = _create_ui_tars_action(parsed_action, task, step, browser_window_dimension, idx)
if action:
actions.append(action)
except Exception:
LOG.exception(
"Failed to create UI-TARS action",
task_id=task.task_id,
step_id=step.step_id,
parsed_action=parsed_action,
)
continue
if not actions:
LOG.warning(
"No valid actions generated from UI-TARS response",
task_id=task.task_id,
step_id=step.step_id,
response_preview=response_content[:200],
)
return actions
except Exception:
LOG.exception(
"Failed to parse UI-TARS actions",
task_id=task.task_id,
step_id=step.step_id,
response_content=response_content[:200],
)
return []
def _parse_ui_tars_response(response_content: str, browser_window_dimension: Resolution) -> list[dict[str, Any]]:
"""Parse UI-TARS response text into structured action data.
Extracts essential parsing logic from action_parser.py without the complex coordinate transformations.
"""
import re
text = response_content.strip()
# Convert point format to coordinates if needed
if "<point>" in text:
text = _convert_point_to_coordinates(text)
# Normalize parameter names
text = text.replace("start_point=", "start_box=")
text = text.replace("end_point=", "end_box=")
text = text.replace("point=", "start_box=")
# Extract thought/reasoning
thought = None
thought_patterns = [
r"Thought: (.+?)(?=\s*Action: |$)",
r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action: |$)",
r"Action_Summary: (.+?)(?=\s*Action: |$)",
]
for pattern in thought_patterns:
thought_match = re.search(pattern, text, re.DOTALL)
if thought_match:
if len(thought_match.groups()) == 1:
thought = thought_match.group(1).strip()
elif len(thought_match.groups()) == 2:
thought = thought_match.group(2).strip() # Use Action_Summary
break
if "Action:" not in text:
raise ValueError("No Action section found in UI-TARS response")
# Extract action string
action_str = text.split("Action: ")[-1]
# Split multiple actions
action_parts = action_str.split(")\n\n")
all_actions = []
for action_part in action_parts:
action_part = action_part.strip()
if not action_part:
continue
# Handle type action with content specially
if "type(content" in action_part:
if not action_part.endswith(")"):
action_part += ")"
# Extract content from type action
pattern = r"type\(content='(.*?)'\)"
match = re.search(pattern, action_part)
if match:
content = match.group(1)
# Escape single quotes in content
content = content.replace("'", "\\'")
action_part = f"type(content='{content}')"
if not action_part.endswith(")"):
action_part += ")"
all_actions.append(action_part)
# Parse each action
parsed_actions = []
for action_str in all_actions:
try:
parsed_action = _parse_single_action(action_str)
if parsed_action:
parsed_action["thought"] = thought
parsed_action["browser_window_dimension"] = browser_window_dimension
parsed_actions.append(parsed_action)
except Exception:
LOG.warning(
"Failed to parse individual UI-TARS action",
action_str=action_str,
exc_info=True,
)
continue
return parsed_actions
def _parse_single_action(action_str: str) -> dict[str, Any] | None:
"""Parse a single action string into structured data."""
import ast
try:
# Clean up the action string
action_str = action_str.replace("\n", "\\n").strip()
# Parse as Python expression
node = ast.parse(action_str, mode="eval")
if not isinstance(node, ast.Expression) or not isinstance(node.body, ast.Call):
return None
call = node.body
# Get function name
if isinstance(call.func, ast.Name):
func_name = call.func.id
elif isinstance(call.func, ast.Attribute):
func_name = call.func.attr
else:
return None
# Get arguments
action_inputs = {}
for kw in call.keywords:
if kw.arg and isinstance(kw.value, (ast.Constant, ast.Str)):
if isinstance(kw.value, ast.Constant):
value = kw.value.value
else: # ast.Str for older Python versions
value = kw.value.s
action_inputs[kw.arg] = value
return {
"action_type": func_name,
"action_inputs": action_inputs,
}
except Exception:
LOG.debug(f"Failed to parse action string: {action_str}", exc_info=True)
return None
def _convert_point_to_coordinates(text: str) -> str:
"""Convert <point>x y</point> format to (x,y) format."""
import re
from typing import Match
pattern = r"<point>(\d+)\s+(\d+)</point>"
def replace_match(match: Match[str]) -> str:
x, y = map(int, match.groups())
return f"({x},{y})"
return re.sub(pattern, replace_match, text)
def _create_ui_tars_action(
parsed_action: dict[str, Any],
task: Task,
step: Step,
browser_window_dimension: Resolution,
action_order: int,
) -> Action | None:
"""Create a Skyvern action from parsed UI-TARS data."""
action_type = parsed_action.get("action_type", "")
action_inputs = parsed_action.get("action_inputs", {})
thought = parsed_action.get("thought", "")
base_params = {
"reasoning": thought,
"intention": thought,
"organization_id": task.organization_id,
"workflow_run_id": task.workflow_run_id,
"task_id": task.task_id,
"step_id": step.step_id,
"step_order": step.order,
"action_order": action_order,
}
if action_type == "click":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
response=f"Click at ({x}, {y})",
**base_params,
)
elif action_type == "left_double":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
button="left",
repeat=2,
response=f"Double click at ({x}, {y})",
**base_params,
)
elif action_type == "right_single":
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
return None
return ClickAction(
element_id="",
x=x,
y=y,
button="right",
response=f"Right click at ({x}, {y})",
**base_params,
)
elif action_type == "type":
content = action_inputs.get("content", "")
if not content:
return None
return InputTextAction(
element_id="",
text=content,
response=f"Type: {content[:50]}{'...' if len(content) > 50 else ''}",
**base_params,
)
elif action_type in ["drag", "select"]:
start_x, start_y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
end_x, end_y = _extract_ui_tars_coordinates(action_inputs.get("end_box", ""), browser_window_dimension)
if None in (start_x, start_y, end_x, end_y):
return None
return DragAction(
start_x=start_x,
start_y=start_y,
path=[(end_x, end_y)],
response=f"Drag from ({start_x}, {start_y}) to ({end_x}, {end_y})",
**base_params,
)
elif action_type == "hotkey":
key_combo = action_inputs.get("key", action_inputs.get("hotkey", ""))
if not key_combo:
return None
keys = key_combo.split()
return KeypressAction(
keys=keys,
response=f"Hotkey: {key_combo}",
**base_params,
)
elif action_type == "scroll":
direction = action_inputs.get("direction", "down").lower()
x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
if x is None or y is None:
# Use center of screen as fallback
x = browser_window_dimension["width"] // 2
y = browser_window_dimension["height"] // 2
scroll_amount = 300
if direction == "down":
scroll_x, scroll_y = 0, scroll_amount
elif direction == "up":
scroll_x, scroll_y = 0, -scroll_amount
elif direction == "right":
scroll_x, scroll_y = scroll_amount, 0
elif direction == "left":
scroll_x, scroll_y = -scroll_amount, 0
else:
scroll_x, scroll_y = 0, scroll_amount
return ScrollAction(
element_id="",
x=x,
y=y,
scroll_x=scroll_x,
scroll_y=scroll_y,
response=f"Scroll {direction} at ({x}, {y})",
**base_params,
)
elif action_type == "wait":
return WaitAction(
seconds=5,
**base_params,
)
elif action_type == "finished":
return CompleteAction(
data_extraction_goal=task.data_extraction_goal,
verified=True, # UI-TARS has already determined completion, skip Skyvern validation
**base_params,
)
else:
LOG.warning(f"Unsupported UI-TARS action type: {action_type}")
return None
def _extract_ui_tars_coordinates(box_str: str, browser_window_dimension: Resolution) -> tuple[int | None, int | None]:
"""Extract coordinates from UI-TARS box format with proper coordinate conversion.
UI-TARS coordinates need to be divided by 1000 to convert from the model's output
format to relative coordinates (0-1 range), then multiplied by screen dimensions
to get absolute pixel coordinates.
"""
import ast
if not box_str:
return None, None
try:
# Parse coordinates from string format like "(450,320)" or "[0.5, 0.3, 0.5, 0.3]"
coords = ast.literal_eval(box_str)
if not isinstance(coords, (list, tuple)):
return None, None
if len(coords) == 2:
# Direct coordinates like (450, 320) or (0.5, 0.3)
x, y = coords
# UI-TARS specific coordinate conversion
# UI-TARS outputs coordinates that need to be divided by 1000 first
if x > 1 or y > 1: # Likely UI-TARS format needing factor conversion
original_x, original_y = x, y
x = x / 1000.0
y = y / 1000.0
LOG.debug(f"Applied UI-TARS factor conversion: ({original_x}, {original_y}) -> ({x}, {y})")
# Convert relative coordinates (0-1) to absolute screen coordinates
if 0 <= x <= 1 and 0 <= y <= 1:
abs_x = int(x * browser_window_dimension["width"])
abs_y = int(y * browser_window_dimension["height"])
LOG.debug(
f"Converted to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
)
return abs_x, abs_y
return int(x), int(y)
elif len(coords) == 4:
# Bounding box format [x1, y1, x2, y2] - take center point
x1, y1, x2, y2 = coords
x = (x1 + x2) / 2
y = (y1 + y2) / 2
# UI-TARS specific coordinate conversion for bounding boxes
if x > 1 or y > 1: # Likely UI-TARS format needing factor conversion
original_x, original_y = x, y
x = x / 1000.0
y = y / 1000.0
LOG.debug(
f"Applied UI-TARS factor conversion to bbox center: ({original_x}, {original_y}) -> ({x}, {y})"
)
# Convert relative coordinates (0-1) to absolute screen coordinates
if 0 <= x <= 1 and 0 <= y <= 1:
abs_x = int(x * browser_window_dimension["width"])
abs_y = int(y * browser_window_dimension["height"])
LOG.debug(
f"Converted bbox center to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
)
return abs_x, abs_y
return int(x), int(y)
else:
return None, None
except Exception:
LOG.debug(f"Failed to parse UI-TARS coordinates: {box_str}", exc_info=True)
return None, None