From 7630aa41ea099337f6cbc30009b9b2e8a34bc1f3 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Tue, 22 Jul 2025 14:38:47 +0800 Subject: [PATCH] use separate user goal check for sequential click agent (#3011) --- .../forge/prompts/skyvern/check-user-goal.j2 | 8 ++- skyvern/webeye/actions/handler.py | 69 +++++++++++++++---- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/skyvern/forge/prompts/skyvern/check-user-goal.j2 b/skyvern/forge/prompts/skyvern/check-user-goal.j2 index 7aa5d9b0..e47327e4 100644 --- a/skyvern/forge/prompts/skyvern/check-user-goal.j2 +++ b/skyvern/forge/prompts/skyvern/check-user-goal.j2 @@ -1,4 +1,4 @@ -Your are here to help the user determine if the user has completed their goal on the web{{ " according to the complete criterion" if complete_criterion else "" }}. Use the content of the elements parsed from the page, the screenshots of the page, the user goal and user details to determine whether the {{ "complete criterion has been met" if complete_criterion else "user goal has been completed" }} or not. +Your are here to help the user determine if the user has completed their goal on the web{{ " according to the complete criterion" if complete_criterion else "" }}. Use the content of the elements parsed from the page,{{ "" if without_screenshots else " the screenshots of the page," }} the user goal and user details to determine whether the {{ "complete criterion has been met" if complete_criterion else "user goal has been completed" }} or not. 
Make sure to ONLY return the JSON object in this format with no additional text before or after it: ```json @@ -28,8 +28,12 @@ Action History: ``` {{ action_history }} ``` +{% endif %}{% if new_elements_ids %} +IDs for emerging HTML elements +``` +{{ new_elements_ids }} +``` {% endif %} - Elements on the page: ``` {{ elements }} diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 35936c95..66975071 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -81,6 +81,7 @@ from skyvern.webeye.actions.actions import ( ActionStatus, CheckboxAction, ClickAction, + CompleteVerifyResult, InputOrSelectContext, InputTextAction, ScrapeResult, @@ -626,6 +627,7 @@ async def handle_click_action( try: if sequential_click_result := await handle_sequential_click_for_dropdown( action=action, + action_history=results, anchor_element=skyvern_element, dom=dom, page=page, @@ -657,6 +659,7 @@ async def handle_click_action( @TraceManager.traced_async(ignore_inputs=["anchor_element", "scraped_page", "page", "incremental_scraped", "dom"]) async def handle_sequential_click_for_dropdown( action: actions.ClickAction, + action_history: list[ActionResult], anchor_element: SkyvernElement, dom: DomUtil, page: Page, @@ -678,6 +681,51 @@ async def handle_sequential_click_for_dropdown( return None LOG.info("Detected new element after clicking", action=action) + scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots() + new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(scraped_page.id_to_css_dict.keys()) + + dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page) + new_interactable_element_ids = [ + element_id + for element_id in new_element_ids + if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable() + ] + + action_history_str = "" + if action_history and len(action_history) > 0: + result = action_history[-1] + action_result = { + 
"action_type": action.action_type, + "reasoning": action.reasoning, + "result": result.success, + } + action_history_str = json.dumps(action_result) + + prompt = load_prompt_with_elements( + element_tree_builder=scraped_page_after_open, + prompt_engine=prompt_engine, + template_name="check-user-goal", + navigation_goal=task.navigation_goal, + navigation_payload=task.navigation_payload, + new_elements_ids=new_element_ids, + without_screenshots=True, + action_history=action_history_str, + local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(), + ) + response = await app.SECONDARY_LLM_API_HANDLER( + prompt=prompt, + step=step, + prompt_name="check-user-goal", + ) + verify_result = CompleteVerifyResult.model_validate(response) + if verify_result.user_goal_achieved: + LOG.info( + "User goal achieved, exiting the sequential click logic", + step_id=step.step_id, + task_id=task.task_id, + ) + return None + dropdown_menu_element = await locate_dropdown_menu( current_anchor_element=anchor_element, incremental_scraped=incremental_scraped, @@ -724,7 +772,8 @@ async def handle_sequential_click_for_dropdown( scraped_page=scraped_page, step=step, task=task, - support_complete_action=True, + scraped_page_after_open=scraped_page_after_open, + new_interactable_element_ids=new_interactable_element_ids, ) @@ -2703,7 +2752,8 @@ async def select_from_emerging_elements( scraped_page: ScrapedPage, step: Step, task: Task, - support_complete_action: bool = False, + scraped_page_after_open: ScrapedPage | None = None, + new_interactable_element_ids: list[str] | None = None, ) -> ActionResult: """ This is the function to select an element from the new showing elements. 
@@ -2711,11 +2761,11 @@ async def select_from_emerging_elements( """ # TODO: support to handle the case when options are loaded by scroll - scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots() + scraped_page_after_open = scraped_page_after_open or await scraped_page.generate_scraped_page_without_screenshots() new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(scraped_page.id_to_css_dict.keys()) dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page) - new_interactable_element_ids = [ + new_interactable_element_ids = new_interactable_element_ids or [ element_id for element_id in new_element_ids if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable() @@ -2734,7 +2784,6 @@ async def select_from_emerging_elements( target_value=options.target_value, navigation_goal=task.navigation_goal, new_elements_ids=new_interactable_element_ids, - support_complete_action=support_complete_action, navigation_payload_str=json.dumps(task.navigation_payload), local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(), ) @@ -2758,17 +2807,9 @@ async def select_from_emerging_elements( action_type_str: str = json_response.get("action_type", "") or "" action_type = ActionType(action_type_str.lower()) element_id: str | None = json_response.get("id", None) - if not element_id or action_type not in [ActionType.CLICK, ActionType.INPUT_TEXT, ActionType.COMPLETE]: + if not element_id or action_type not in [ActionType.CLICK, ActionType.INPUT_TEXT]: raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning")) - if action_type == ActionType.COMPLETE: - LOG.info( - "The user has completed the user goal in the current opened dropdown, although the dropdown might not be closed", - step_id=step.step_id, - task_id=task.task_id, - ) - return ActionSuccess() - if value is not None and action_type == ActionType.INPUT_TEXT: LOG.info( "No clickable option 
found, but found input element to search",