diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index dcd6b52a..38c6ad01 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -189,6 +189,8 @@ class ClickAction(WebAction): x: int | None = None y: int | None = None button: str = "left" + # normal click: 1, double click: 2, triple click: 3 + repeat: int = 1 def __repr__(self) -> str: return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download}, x={self.x}, y={self.y}, button={self.button}, tool_call_id={self.tool_call_id})" diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index d85768ef..33a18bef 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -505,7 +505,15 @@ async def handle_click_action( ) LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button) - await page.mouse.click(x=action.x, y=action.y, button=action.button) + if action.repeat == 1: + await page.mouse.click(x=action.x, y=action.y, button=action.button) + elif action.repeat == 2: + await page.mouse.dblclick(x=action.x, y=action.y, button=action.button) + elif action.repeat == 3: + await page.mouse.click(x=action.x, y=action.y, button=action.button, click_count=3) + else: + raise ValueError(f"Invalid repeat value: {action.repeat}") + return [ActionSuccess()] dom = DomUtil(scraped_page=scraped_page, page=page) diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 2cdc662c..cdfa6ffd 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -420,7 +420,7 @@ async def parse_anthropic_actions( tool_call_id=tool_call_id, ) ) - elif action == "left_click": + elif action in ["left_click", "double_click", "triple_click"]: coordinate = tool_call_input.get("coordinate") if not coordinate and idx - 1 >= 0: prev_tool_call = tool_calls[idx - 1] @@ -438,6 +438,12 @@ async def parse_anthropic_actions( x, y = validate_and_get_coordinates( coordinate, screenshot_resize_target_dimension, browser_window_dimension ) + repeat = 1 + if action == "double_click": + repeat = 2 + elif action == "triple_click": + repeat = 3 + response = f"Click at: ({x}, {y})" reasoning = reasoning or response actions.append( @@ -446,6 +452,7 @@ async def parse_anthropic_actions( x=x, y=y, button="left", + repeat=repeat, reasoning=reasoning, intention=reasoning, response=response,