improve custom selection (#2236)

2025-04-28 16:16:36 +08:00
parent a3d9917386
commit 2f0e6e5eb3
3 changed files with 276 additions and 133 deletions
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -696,6 +696,7 @@ async def handle_input_text_action(
        await page.keyboard.type(action.text)
        return [ActionSuccess()]

+    input_or_select_context: InputOrSelectContext | None = None
    dom = DomUtil(scraped_page, page)
    skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -789,8 +790,16 @@ async def handle_input_text_action(
            try_to_quit_dropdown = True
            try:
                # TODO: we don't select by value for the auto completion detect case
+                if input_or_select_context is None:
+                    input_or_select_context = await _get_input_or_select_context(
+                        action=action,
+                        scraped_page=scraped_page,
+                        step=step,
+                    )
+
                select_result = await sequentially_select_from_dropdown(
                    action=select_action,
+                    input_or_select_context=input_or_select_context,
                    page=page,
                    dom=dom,
                    skyvern_element=skyvern_element,
@@ -926,25 +935,12 @@ async def handle_input_text_action(
            return [ActionSuccess()]

        if not await skyvern_element.is_raw_input():
-            prompt = load_prompt_with_elements(
-                scraped_page=scraped_page,
-                prompt_engine=prompt_engine,
-                template_name="parse-input-or-select-context",
-                element_id=action.element_id,
-                action_reasoning=action.reasoning,
-            )
-
-            json_response = await app.SECONDARY_LLM_API_HANDLER(
-                prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
-            )
-            json_response["intention"] = action.intention
-            input_or_select_context = InputOrSelectContext.model_validate(json_response)
-            LOG.info(
-                "Parsed input/select context",
-                context=input_or_select_context,
-                task_id=task.task_id,
-                step_id=step.step_id,
-            )
+            if input_or_select_context is None:
+                input_or_select_context = await _get_input_or_select_context(
+                    action=action,
+                    scraped_page=scraped_page,
+                    step=step,
+                )

            if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
                if result := await input_or_auto_complete_input(
@@ -1327,13 +1323,28 @@ async def handle_select_option_action(
                ),
            )

+        input_or_select_context = await _get_input_or_select_context(
+            action=action, scraped_page=scraped_page, step=step
+        )
+
        if len(incremental_element) == 0:
-            raise NoIncrementalElementFoundForCustomSelection(element_id=skyvern_element.get_id())
+            results.append(
+                await select_from_emerging_elements(
+                    action=action,
+                    input_or_select_context=input_or_select_context,
+                    page=page,
+                    scraped_page=scraped_page,
+                    task=task,
+                    step=step,
+                )
+            )
+            return results

        is_open = True
        # TODO: support sequetially select from dropdown by value, just support single select now
        result = await sequentially_select_from_dropdown(
            action=action,
+            input_or_select_context=input_or_select_context,
            page=page,
            dom=dom,
            skyvern_element=skyvern_element,
@@ -2222,6 +2233,7 @@ async def input_or_auto_complete_input(

 async def sequentially_select_from_dropdown(
    action: SelectOptionAction,
+    input_or_select_context: InputOrSelectContext,
    page: Page,
    dom: DomUtil,
    skyvern_element: SkyvernElement,
@@ -2237,26 +2249,6 @@ async def sequentially_select_from_dropdown(
    TODO: support to return all values retrieved from the sequentially select
    Only return the last value today
    """
-
-    prompt = load_prompt_with_elements(
-        scraped_page=dom.scraped_page,
-        prompt_engine=prompt_engine,
-        template_name="parse-input-or-select-context",
-        action_reasoning=action.reasoning,
-        element_id=action.element_id,
-    )
-    json_response = await app.SECONDARY_LLM_API_HANDLER(
-        prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
-    )
-    json_response["intention"] = action.intention
-    input_or_select_context = InputOrSelectContext.model_validate(json_response)
-    LOG.info(
-        "Parsed input/select context",
-        context=input_or_select_context,
-        task_id=task.task_id,
-        step_id=step.step_id,
-    )
-
    if not force_select and input_or_select_context.is_search_bar:
        LOG.info(
            "Exit custom selection mode since it's a non-force search bar",
@@ -2414,6 +2406,93 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult]
    return result


+async def select_from_emerging_elements(
+    action: SelectOptionAction,
+    input_or_select_context: InputOrSelectContext,
+    page: Page,
+    scraped_page: ScrapedPage,
+    step: Step,
+    task: Task,
+) -> ActionResult:
+    # TODO: support to handle the case when options are loaded by scroll
+    LOG.info(
+        "No incremental elements detected by MutationObserver, using re-scraping the page to find the match element"
+    )
+    scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots()
+    new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(scraped_page.id_to_css_dict.keys())
+
+    dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page)
+    new_interactable_element_ids = [
+        element_id
+        for element_id in new_element_ids
+        if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable()
+    ]
+
+    if len(new_interactable_element_ids) == 0:
+        raise NoIncrementalElementFoundForCustomSelection(element_id=action.element_id)
+
+    prompt = load_prompt_with_elements(
+        scraped_page=scraped_page_after_open,
+        prompt_engine=prompt_engine,
+        template_name="custom-select",
+        is_date_related=input_or_select_context.is_date_related,
+        field_information=input_or_select_context.field
+        if not input_or_select_context.intention
+        else input_or_select_context.intention,
+        required_field=input_or_select_context.is_required,
+        target_value=action.option.label,
+        navigation_goal=task.navigation_goal,
+        new_elements_ids=new_interactable_element_ids,
+        navigation_payload_str=json.dumps(task.navigation_payload),
+        local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
+    )
+    LOG.info(
+        "Calling LLM to find the match element",
+        step_id=step.step_id,
+        task_id=task.task_id,
+    )
+
+    json_response = await app.LLM_API_HANDLER(prompt=prompt, step=step, prompt_name="custom-select")
+    value: str | None = json_response.get("value", None)
+    LOG.info(
+        "LLM response for the matched element",
+        matched_value=value,
+        response=json_response,
+        step_id=step.step_id,
+        task_id=task.task_id,
+    )
+
+    action_type_str: str = json_response.get("action_type", "")
+    action_type = ActionType(action_type_str.lower())
+    element_id: str | None = json_response.get("id", None)
+    if not element_id or action_type not in [ActionType.CLICK, ActionType.INPUT_TEXT]:
+        raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning"))
+
+    if value is not None and action_type == ActionType.INPUT_TEXT:
+        LOG.info(
+            "No clickable option found, but found input element to search",
+            element_id=element_id,
+        )
+        input_element = await dom_after_open.get_skyvern_element_by_id(element_id)
+        await input_element.scroll_into_view()
+        current_text = await get_input_value(input_element.get_tag_name(), input_element.get_locator())
+        if current_text == value:
+            return ActionSuccess()
+
+        await input_element.input_clear()
+        await input_element.input_sequentially(value)
+        return ActionSuccess()
+
+    else:
+        selected_element = await dom_after_open.get_skyvern_element_by_id(element_id)
+        if await selected_element.get_attr("role") == "listbox":
+            return ActionFailure(exception=InteractWithDropdownContainer(element_id=element_id))
+
+    await selected_element.scroll_into_view()
+    await selected_element.click(page=page)
+    return ActionSuccess()
+
+
 async def select_from_dropdown(
    context: InputOrSelectContext,
    page: Page,
@@ -3264,3 +3343,25 @@ async def _get_verification_code_from_db(
            continue
        return totp_code.code
    return None
+
+
+async def _get_input_or_select_context(
+    action: InputTextAction | SelectOptionAction, scraped_page: ScrapedPage, step: Step
+) -> InputOrSelectContext:
+    prompt = load_prompt_with_elements(
+        scraped_page=scraped_page,
+        prompt_engine=prompt_engine,
+        template_name="parse-input-or-select-context",
+        action_reasoning=action.reasoning,
+        element_id=action.element_id,
+    )
+    json_response = await app.SECONDARY_LLM_API_HANDLER(
+        prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
+    )
+    json_response["intention"] = action.intention
+    input_or_select_context = InputOrSelectContext.model_validate(json_response)
+    LOG.info(
+        "Parsed input/select context",
+        context=input_or_select_context,
+    )
+    return input_or_select_context