anthropic CUA - support double and triple clicks (#2264)

This commit is contained in:
Shuchang Zheng
2025-05-01 07:30:16 +08:00
committed by GitHub
parent e1feb6cb45
commit 7dae328c0c
3 changed files with 19 additions and 2 deletions

View File

@@ -189,6 +189,8 @@ class ClickAction(WebAction):
x: int | None = None
y: int | None = None
button: str = "left"
# normal click: 1, double click: 2, triple click: 3
repeat: int = 1
def __repr__(self) -> str:
    """Return a compact, single-line debug representation of this click action.

    The ``repeat`` field is included so that normal (1), double (2) and
    triple (3) clicks can be told apart when the action is logged.
    """
    return (
        f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, "
        f"download={self.download}, x={self.x}, y={self.y}, button={self.button}, "
        f"repeat={self.repeat}, tool_call_id={self.tool_call_id})"
    )

View File

@@ -505,7 +505,15 @@ async def handle_click_action(
)
LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
await page.mouse.click(x=action.x, y=action.y, button=action.button)
if action.repeat == 1:
await page.mouse.click(x=action.x, y=action.y, button=action.button)
elif action.repeat == 2:
await page.mouse.dblclick(x=action.x, y=action.y, button=action.button)
elif action.repeat == 3:
await page.mouse.click(x=action.x, y=action.y, button=action.button, click_count=3)
else:
raise ValueError(f"Invalid repeat value: {action.repeat}")
return [ActionSuccess()]
dom = DomUtil(scraped_page=scraped_page, page=page)

View File

@@ -420,7 +420,7 @@ async def parse_anthropic_actions(
tool_call_id=tool_call_id,
)
)
elif action == "left_click":
elif action in ["left_click", "double_click", "triple_click"]:
coordinate = tool_call_input.get("coordinate")
if not coordinate and idx - 1 >= 0:
prev_tool_call = tool_calls[idx - 1]
@@ -438,6 +438,12 @@ async def parse_anthropic_actions(
x, y = validate_and_get_coordinates(
coordinate, screenshot_resize_target_dimension, browser_window_dimension
)
repeat = 1
if action == "double_click":
repeat = 2
elif action == "triple_click":
repeat = 3
response = f"Click at: ({x}, {y})"
reasoning = reasoning or response
actions.append(
@@ -446,6 +452,7 @@ async def parse_anthropic_actions(
x=x,
y=y,
button="left",
repeat=repeat,
reasoning=reasoning,
intention=reasoning,
response=response,