From f6d755da6b6cae8d89e9ba3dcda0af7305d72a0d Mon Sep 17 00:00:00 2001
From: Shuchang Zheng
Date: Tue, 15 Apr 2025 19:07:37 -0700
Subject: [PATCH] CUA feature - support drag action (#2160)

---
Review notes (not part of the commit message):
- The captured patch had its whitespace collapsed; line breaks and
  indentation below are reconstructed and should be checked against the tree.
- handle_drag_action now releases the mouse button in a `finally` block so a
  failed move cannot leave the button held down.
- Comments and a docstring were added to the new code. The diffstat and hunk
  headers were updated to match; the `index` blob ids still refer to the
  original commit.

 skyvern/webeye/actions/actions.py       | 10 ++++++++
 skyvern/webeye/actions/handler.py       | 27 +++++++++++++++++++++
 skyvern/webeye/actions/parse_actions.py | 32 +++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
index e91d6cc3..6cc4a064 100644
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -32,6 +32,7 @@ class ActionType(StrEnum):
     KEYPRESS = "keypress"
     TYPE = "type"
     MOVE = "move"
+    DRAG = "drag"
 
     def is_web_action(self) -> bool:
         return self in [
@@ -285,6 +286,15 @@ class MoveAction(Action):
     y: int
 
 
+class DragAction(Action):
+    # Mouse drag: press at (start_x, start_y), move through `path`, then release.
+    action_type: ActionType = ActionType.DRAG
+    start_x: int
+    start_y: int
+    # Points visited in order after the start point, as (x, y) pairs.
+    path: list[tuple[int, int]] = []
+
+
 class ScrapeResult(BaseModel):
     """
     Scraped response from a webpage, including:
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
index 83e0fec0..e7b2ba63 100644
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1543,6 +1543,32 @@ async def handle_move_action(
     return [ActionSuccess()]
 
 
+async def handle_drag_action(
+    action: actions.DragAction,
+    page: Page,
+    scraped_page: ScrapedPage,
+    task: Task,
+    step: Step,
+) -> list[ActionResult]:
+    """Perform a mouse drag described by a DragAction.
+
+    Moves the mouse to (start_x, start_y), presses the button, visits each
+    point of ``action.path`` in order, then releases the button.
+    ``scraped_page``, ``task`` and ``step`` are not used by this handler.
+
+    Returns a one-element list containing ``ActionSuccess``.
+    """
+    await page.mouse.move(action.start_x, action.start_y)
+    await page.mouse.down()
+    try:
+        for x, y in action.path:
+            await page.mouse.move(x, y)
+    finally:
+        # Release even if a move fails, so the button is never left held down.
+        await page.mouse.up()
+    return [ActionSuccess()]
+
+
 ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
 ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
 ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
@@ -1557,6 +1583,7 @@ ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
 ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
 ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
 ActionHandler.register_action_type(ActionType.MOVE, handle_move_action)
+ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action)
 
 
 async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py
index bd865fd8..ff75f181 100644
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -16,6 +16,7 @@ from skyvern.webeye.actions.actions import (
     ClickAction,
     CompleteAction,
     DownloadFileAction,
+    DragAction,
     InputTextAction,
     KeypressAction,
     MoveAction,
@@ -290,6 +291,37 @@ async def parse_cua_actions(
                         reasoning=reasoning,
                         intention=reasoning,
                     )
+                case "drag":
+                    whole_path = cua_action.path
+                    # A drag needs a start point and at least one more point to move to.
+                    if not whole_path or len(whole_path) < 2:
+                        LOG.warning(
+                            "Invalid drag action",
+                            task_id=task.task_id,
+                            step_id=step.step_id,
+                            step_order=step.order,
+                            action_order=idx,
+                            whole_path=whole_path,
+                        )
+                        # Substitute a wait for the unusable drag.
+                        action = WaitAction(
+                            seconds=5,
+                            reasoning=reasoning,
+                            intention=reasoning,
+                        )
+                    else:
+                        # NOTE(review): assumes every path point can be indexed as (x, y);
+                        # confirm against the type of cua_action.path.
+                        start_x, start_y = whole_path[0][0], whole_path[0][1]
+                        reasoning = reasoning or f"Drag action path: {whole_path}"
+                        # The first point is the press position; the rest is the path to follow.
+                        action = DragAction(
+                            start_x=start_x,
+                            start_y=start_y,
+                            path=whole_path[1:],
+                            reasoning=reasoning,
+                            intention=reasoning,
+                        )
                 case _:
                     raise ValueError(f"Unsupported action type: {action_type}")
             action.organization_id = task.organization_id