improve custom selection (#2236)

2025-04-28 16:16:36 +08:00
parent a3d9917386
commit 2f0e6e5eb3
3 changed files with 276 additions and 133 deletions
--- a/skyvern/forge/prompts/skyvern/custom-select.j2
+++ b/skyvern/forge/prompts/skyvern/custom-select.j2
@@ -1,13 +1,14 @@
-You are performing a {{ "multi-level selection" if select_history else "selection" }} action on an HTML page. Assist the user in selecting the most appropriate option(or typing some values to search if neccesary) to advance toward their goal, considering the context, user details, and the HTML elements provided in the list.
+You are performing a {{ "multi-level selection" if select_history else "selection" }} action on an HTML page. Assist the user in selecting the most appropriate option (or typing some values to search if necessary) to advance toward their goal, considering the context, user details,{{" list of emerging HTML element ids" if new_elements_ids else "" }} and the HTML elements provided in the list.

 You can identify the matching element based on the following guidelines:
-  1. Select the most suitable element based on the user goal, user details, and the context.
-  2. If none of the options perfectly match, and there is no search box for input, but there is a fallback option such as "Other", "Others" or "None of the above" in the HTML elements, you can consider it a match.
-  3. If a field is required, do not leave it blank.
-  4. If a field is required, do not select a placeholder value, such as "Please select", "-", or "Select...".
-  5. Exclude loading indicators like "loading more results" as valid options.{% if select_history %}
-  6. The selection history displays the previously selected values for the multi-level selection. Continue to complete the entire selection process.{% if is_date_related %}
-  7. Date picker might be triggered, you goal is to set the correct start date and end date.{% endif %}{% endif %}
+  - Select the most suitable element based on the user goal, user details, and the context.
+  - If none of the options perfectly match, and there is no search box for input, but there is a fallback option such as "Other", "Others" or "None of the above" in the HTML elements, you can consider it a match.
+  - If a field is required, do not leave it blank.
+  - If a field is required, do not select a placeholder value, such as "Please select", "-", or "Select...".
+  - Exclude loading indicators like "loading more results" as valid options.{% if new_elements_ids %}
+  - The matching element can only be in the emerging elements.{% endif %}{% if select_history %}
+  - The selection history displays the previously selected values for the multi-level selection. Continue to complete the entire selection process.{% if is_date_related %}
+  - A date picker might be triggered; your goal is to set the correct start date and end date.{% endif %}{% endif %}

 MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
 Each interactable element is tagged with an ID.
@@ -17,7 +18,7 @@ Reply in JSON format with the following keys:
    "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your action should be based on the current page information.
    "reasoning": str, // The reasoning behind the current single action. Be specific, referencing the value and the element id in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point.
    "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
-    "id": str, // The id of the element to take action on. The id has to be one from the elements list
+    "id": str, // The id of the element to take action on. The id has to be one from {{ "the emerging HTML elements list" if new_elements_ids else "the HTML elements list" }}.
    "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT". "CLICK" is an option you'd like to click to choose. "INPUT_TEXT" is an element you'd like to input text into for searching, but it only should be used when there's no valid option to click.
    "value": str, // The value to select.{% if target_value %}
    "relevant": bool, // True if the value you select is relevant to the target value, otherwise False. If the value is a fallback option according to the guidelines, it's still relevant.{% endif %}
@@ -42,7 +43,12 @@ User details:
 ```
 {{ navigation_payload_str }}
 ```
-
+{% if new_elements_ids %}
+IDs for emerging HTML elements:
+```
+{{ new_elements_ids }}
+```
+{% endif %}
 HTML elements:
 ```
 {{ elements }}
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -696,6 +696,7 @@ async def handle_input_text_action(
        await page.keyboard.type(action.text)
        return [ActionSuccess()]

+    input_or_select_context: InputOrSelectContext | None = None
    dom = DomUtil(scraped_page, page)
    skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -789,8 +790,16 @@ async def handle_input_text_action(
            try_to_quit_dropdown = True
            try:
                # TODO: we don't select by value for the auto completion detect case
+                if input_or_select_context is None:
+                    input_or_select_context = await _get_input_or_select_context(
+                        action=action,
+                        scraped_page=scraped_page,
+                        step=step,
+                    )
+
                select_result = await sequentially_select_from_dropdown(
                    action=select_action,
+                    input_or_select_context=input_or_select_context,
                    page=page,
                    dom=dom,
                    skyvern_element=skyvern_element,
@@ -926,25 +935,12 @@ async def handle_input_text_action(
            return [ActionSuccess()]

        if not await skyvern_element.is_raw_input():
-            prompt = load_prompt_with_elements(
-                scraped_page=scraped_page,
-                prompt_engine=prompt_engine,
-                template_name="parse-input-or-select-context",
-                element_id=action.element_id,
-                action_reasoning=action.reasoning,
-            )
-
-            json_response = await app.SECONDARY_LLM_API_HANDLER(
-                prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
-            )
-            json_response["intention"] = action.intention
-            input_or_select_context = InputOrSelectContext.model_validate(json_response)
-            LOG.info(
-                "Parsed input/select context",
-                context=input_or_select_context,
-                task_id=task.task_id,
-                step_id=step.step_id,
-            )
+            if input_or_select_context is None:
+                input_or_select_context = await _get_input_or_select_context(
+                    action=action,
+                    scraped_page=scraped_page,
+                    step=step,
+                )

            if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
                if result := await input_or_auto_complete_input(
@@ -1327,13 +1323,28 @@ async def handle_select_option_action(
                ),
            )

+        input_or_select_context = await _get_input_or_select_context(
+            action=action, scraped_page=scraped_page, step=step
+        )
+
        if len(incremental_element) == 0:
-            raise NoIncrementalElementFoundForCustomSelection(element_id=skyvern_element.get_id())
+            results.append(
+                await select_from_emerging_elements(
+                    action=action,
+                    input_or_select_context=input_or_select_context,
+                    page=page,
+                    scraped_page=scraped_page,
+                    task=task,
+                    step=step,
+                )
+            )
+            return results

        is_open = True
        # TODO: support sequetially select from dropdown by value, just support single select now
        result = await sequentially_select_from_dropdown(
            action=action,
+            input_or_select_context=input_or_select_context,
            page=page,
            dom=dom,
            skyvern_element=skyvern_element,
@@ -2222,6 +2233,7 @@ async def input_or_auto_complete_input(

 async def sequentially_select_from_dropdown(
    action: SelectOptionAction,
+    input_or_select_context: InputOrSelectContext,
    page: Page,
    dom: DomUtil,
    skyvern_element: SkyvernElement,
@@ -2237,26 +2249,6 @@ async def sequentially_select_from_dropdown(
    TODO: support to return all values retrieved from the sequentially select
    Only return the last value today
    """
-
-    prompt = load_prompt_with_elements(
-        scraped_page=dom.scraped_page,
-        prompt_engine=prompt_engine,
-        template_name="parse-input-or-select-context",
-        action_reasoning=action.reasoning,
-        element_id=action.element_id,
-    )
-    json_response = await app.SECONDARY_LLM_API_HANDLER(
-        prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
-    )
-    json_response["intention"] = action.intention
-    input_or_select_context = InputOrSelectContext.model_validate(json_response)
-    LOG.info(
-        "Parsed input/select context",
-        context=input_or_select_context,
-        task_id=task.task_id,
-        step_id=step.step_id,
-    )
-
    if not force_select and input_or_select_context.is_search_bar:
        LOG.info(
            "Exit custom selection mode since it's a non-force search bar",
@@ -2414,6 +2406,93 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult]
    return result


+async def select_from_emerging_elements(
+    action: SelectOptionAction,
+    input_or_select_context: InputOrSelectContext,
+    page: Page,
+    scraped_page: ScrapedPage,
+    step: Step,
+    task: Task,
+) -> ActionResult:
+    # TODO: support to handle the case when options are loaded by scroll
+    LOG.info(
+        "No incremental elements detected by MutationObserver, using re-scraping the page to find the match element"
+    )
+    scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots()
+    new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(scraped_page.id_to_css_dict.keys())
+
+    dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page)
+    new_interactable_element_ids = [
+        element_id
+        for element_id in new_element_ids
+        if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable()
+    ]
+
+    if len(new_interactable_element_ids) == 0:
+        raise NoIncrementalElementFoundForCustomSelection(element_id=action.element_id)
+
+    prompt = load_prompt_with_elements(
+        scraped_page=scraped_page_after_open,
+        prompt_engine=prompt_engine,
+        template_name="custom-select",
+        is_date_related=input_or_select_context.is_date_related,
+        field_information=input_or_select_context.field
+        if not input_or_select_context.intention
+        else input_or_select_context.intention,
+        required_field=input_or_select_context.is_required,
+        target_value=action.option.label,
+        navigation_goal=task.navigation_goal,
+        new_elements_ids=new_interactable_element_ids,
+        navigation_payload_str=json.dumps(task.navigation_payload),
+        local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
+    )
+    LOG.info(
+        "Calling LLM to find the match element",
+        step_id=step.step_id,
+        task_id=task.task_id,
+    )
+
+    json_response = await app.LLM_API_HANDLER(prompt=prompt, step=step, prompt_name="custom-select")
+    value: str | None = json_response.get("value", None)
+    LOG.info(
+        "LLM response for the matched element",
+        matched_value=value,
+        response=json_response,
+        step_id=step.step_id,
+        task_id=task.task_id,
+    )
+
+    action_type_str: str = json_response.get("action_type", "")
+    action_type = ActionType(action_type_str.lower())
+    element_id: str | None = json_response.get("id", None)
+    if not element_id or action_type not in [ActionType.CLICK, ActionType.INPUT_TEXT]:
+        raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning"))
+
+    if value is not None and action_type == ActionType.INPUT_TEXT:
+        LOG.info(
+            "No clickable option found, but found input element to search",
+            element_id=element_id,
+        )
+        input_element = await dom_after_open.get_skyvern_element_by_id(element_id)
+        await input_element.scroll_into_view()
+        current_text = await get_input_value(input_element.get_tag_name(), input_element.get_locator())
+        if current_text == value:
+            return ActionSuccess()
+
+        await input_element.input_clear()
+        await input_element.input_sequentially(value)
+        return ActionSuccess()
+
+    else:
+        selected_element = await dom_after_open.get_skyvern_element_by_id(element_id)
+        if await selected_element.get_attr("role") == "listbox":
+            return ActionFailure(exception=InteractWithDropdownContainer(element_id=element_id))
+
+    await selected_element.scroll_into_view()
+    await selected_element.click(page=page)
+    return ActionSuccess()
+
+
 async def select_from_dropdown(
    context: InputOrSelectContext,
    page: Page,
@@ -3264,3 +3343,25 @@ async def _get_verification_code_from_db(
            continue
        return totp_code.code
    return None
+
+
+async def _get_input_or_select_context(
+    action: InputTextAction | SelectOptionAction, scraped_page: ScrapedPage, step: Step
+) -> InputOrSelectContext:
+    prompt = load_prompt_with_elements(
+        scraped_page=scraped_page,
+        prompt_engine=prompt_engine,
+        template_name="parse-input-or-select-context",
+        action_reasoning=action.reasoning,
+        element_id=action.element_id,
+    )
+    json_response = await app.SECONDARY_LLM_API_HANDLER(
+        prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
+    )
+    json_response["intention"] = action.intention
+    input_or_select_context = InputOrSelectContext.model_validate(json_response)
+    LOG.info(
+        "Parsed input/select context",
+        context=input_or_select_context,
+    )
+    return input_or_select_context
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -513,6 +513,20 @@ function isReadonlyElement(element) {
  return false;
 }

+function isDropdownRelatedElement(element) {
+  const tagName = element.tagName?.toLowerCase();
+  if (tagName === "select") {
+    return true;
+  }
+
+  const role = element.getAttribute("role")?.toLowerCase();
+  if (role === "option" || role === "listbox") {
+    return true;
+  }
+
+  return false;
+}
+
 function hasAngularClickBinding(element) {
  return (
    element.hasAttribute("ng-click") || element.hasAttribute("data-ng-click")
@@ -2272,94 +2286,116 @@ if (window.globalObserverForDOMIncrement === undefined) {
  ) {
    // TODO: how to detect duplicated recreate element?
    for (const mutation of mutationsList) {
-      if (mutation.type === "attributes") {
-        if (mutation.attributeName === "hidden") {
-          const node = mutation.target;
-          if (!node.hidden) {
-            window.globalOneTimeIncrementElements.push({
-              targetNode: node,
-              newNodes: [node],
-            });
-            await addIncrementalNodeToMap(node, [node]);
-          }
-        }
-        if (mutation.attributeName === "style") {
-          // TODO: need to confirm that elemnent is hidden previously
-          const node = mutation.target;
-          if (node.nodeType === Node.TEXT_NODE) continue;
-          if (node.tagName.toLowerCase() === "body") continue;
-          const newStyle = getElementComputedStyle(node);
-          const newDisplay = newStyle?.display;
-          if (newDisplay !== "none") {
-            window.globalOneTimeIncrementElements.push({
-              targetNode: node,
-              newNodes: [node],
-            });
-            await addIncrementalNodeToMap(node, [node]);
-          }
-        }
-        if (mutation.attributeName === "class") {
-          const node = mutation.target;
-          if (node.nodeType === Node.TEXT_NODE) continue;
-          if (node.tagName.toLowerCase() === "body") continue;
-          if (!mutation.oldValue) continue;
-          const currentClassName = node.className
-            ? node.className.toString()
-            : "";
-          if (
-            !isClassNameIncludesHidden(mutation.oldValue) &&
-            !isClassNameIncludesActivatedStatus(currentClassName) &&
-            !node.hasAttribute("data-menu-uid") && // google framework use this to trace dropdown menu
-            !mutation.oldValue.includes("select__items") &&
-            !(
-              node.hasAttribute("data-testid") &&
-              node.getAttribute("data-testid").includes("select-dropdown")
-            )
-          )
-            continue;
-          const newStyle = getElementComputedStyle(node);
-          const newDisplay = newStyle?.display;
-          if (newDisplay !== "none") {
-            window.globalOneTimeIncrementElements.push({
-              targetNode: node,
-              newNodes: [node],
-            });
-            await addIncrementalNodeToMap(node, [node]);
-          }
-        }
+      const node = mutation.target;
+      if (node.nodeType === Node.TEXT_NODE) continue;
+      const tagName = node.tagName?.toLowerCase();
+
+      // if the changing element is dropdown related elements, we should consider
+      // they're the new element as long as the element is still visible on the page
+      if (
+        isDropdownRelatedElement(node) &&
+        getElementComputedStyle(node)?.display !== "none"
+      ) {
+        window.globalOneTimeIncrementElements.push({
+          targetNode: node,
+          newNodes: [node],
+        });
+        await addIncrementalNodeToMap(node, [node]);
+        continue;
      }

-      if (mutation.type === "childList") {
-        if (mutation.target.nodeType === Node.TEXT_NODE) continue;
-        const node = mutation.target;
-        let changedNode = {
-          targetNode: node, // TODO: for future usage, when we want to parse new elements into a tree
-        };
-        let newNodes = [];
-        if (mutation.addedNodes && mutation.addedNodes.length > 0) {
-          for (const node of mutation.addedNodes) {
-            // skip the text nodes, they won't be interactable
-            if (node.nodeType === Node.TEXT_NODE) continue;
-            newNodes.push(node);
+      // if they're not the dropdown related elements
+      // we detect the element based on the following rules
+      switch (mutation.type) {
+        case "attributes": {
+          switch (mutation.attributeName) {
+            case "hidden": {
+              if (!node.hidden) {
+                window.globalOneTimeIncrementElements.push({
+                  targetNode: node,
+                  newNodes: [node],
+                });
+                await addIncrementalNodeToMap(node, [node]);
+              }
+              break;
+            }
+            case "style": {
+              // TODO: need to confirm that elemnent is hidden previously
+              if (tagName === "body") continue;
+              if (
+                (getElementComputedStyle(node)?.display !== "none") !==
+                "none"
+              ) {
+                window.globalOneTimeIncrementElements.push({
+                  targetNode: node,
+                  newNodes: [node],
+                });
+                await addIncrementalNodeToMap(node, [node]);
+              }
+              break;
+            }
+            case "class": {
+              if (tagName === "body") continue;
+              if (!mutation.oldValue) continue;
+              const currentClassName = node.className
+                ? node.className.toString()
+                : "";
+              if (
+                !isClassNameIncludesHidden(mutation.oldValue) &&
+                !isClassNameIncludesActivatedStatus(currentClassName) &&
+                !node.hasAttribute("data-menu-uid") && // google framework use this to trace dropdown menu
+                !mutation.oldValue.includes("select__items") &&
+                !(
+                  node.hasAttribute("data-testid") &&
+                  node.getAttribute("data-testid").includes("select-dropdown")
+                )
+              )
+                continue;
+              if (
+                (getElementComputedStyle(node)?.display !== "none") !==
+                "none"
+              ) {
+                window.globalOneTimeIncrementElements.push({
+                  targetNode: node,
+                  newNodes: [node],
+                });
+                await addIncrementalNodeToMap(node, [node]);
+              }
+              break;
+            }
          }
        }
-        if (
-          newNodes.length == 0 &&
-          (node.tagName.toLowerCase() === "ul" ||
-            (node.tagName.toLowerCase() === "div" &&
-              node.hasAttribute("role") &&
-              node.getAttribute("role").toLowerCase() === "listbox"))
-        ) {
-          newNodes.push(node);
-        }
+        case "childList": {
+          let changedNode = {
+            targetNode: node, // TODO: for future usage, when we want to parse new elements into a tree
+          };
+          let newNodes = [];
+          if (mutation.addedNodes && mutation.addedNodes.length > 0) {
+            for (const node of mutation.addedNodes) {
+              // skip the text nodes, they won't be interactable
+              if (node.nodeType === Node.TEXT_NODE) continue;
+              newNodes.push(node);
+            }
+          }
+          if (
+            newNodes.length == 0 &&
+            (tagName === "ul" ||
+              (tagName === "div" &&
+                node.hasAttribute("role") &&
+                node.getAttribute("role").toLowerCase() === "listbox"))
+          ) {
+            newNodes.push(node);
+          }

-        if (newNodes.length > 0) {
-          changedNode.newNodes = newNodes;
-          window.globalOneTimeIncrementElements.push(changedNode);
-          await addIncrementalNodeToMap(
-            changedNode.targetNode,
-            changedNode.newNodes,
-          );
+          if (newNodes.length > 0) {
+            changedNode.newNodes = newNodes;
+            window.globalOneTimeIncrementElements.push(changedNode);
+            await addIncrementalNodeToMap(
+              changedNode.targetNode,
+              changedNode.newNodes,
+            );
+          }
+          break;
        }
      }
    }