extend auto completion coverage (#1165)

2024-11-11 18:57:59 +08:00
parent 9130640fc2
commit dd3869b3b7
9 changed files with 128 additions and 33 deletions
--- a/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2
+++ b/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2
@@ -1,5 +1,5 @@
 You're doing an auto completion input action on HTML page. The current filled value doesn't match any option.
-Based on the context and current value, give ten most potential values with the same meaning as the current value.
+Based on the context, current value, user goal and user details, give ten most potential values with the same meaning as the current value.
 You can provide values like:
    - Subset or superset meaning from the current value
    - Summarized from the current value
@@ -26,4 +26,14 @@ Choose an auto-completion suggestion for "{{ field_information }}"
 Current Value:
 ```
 {{ current_value }}
+```
+
+User goal:
+```
+{{ navigation_goal }}
+```
+
+User details:
+```
+{{ navigation_payload_str }}
 ```
--- a/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2
+++ b/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2
@@ -1,5 +1,5 @@
 You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match.
-Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information.
+Based on the context, current value, tried values, user goal, user details and option elements popped up while typing, tweak the value into a reasonable one based on the information.
 You can try to change the value under the following rules:
    1. the value must be reasonably changed from the current value, like superset, subset of the current value
    2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept.
@@ -32,6 +32,16 @@ Tried Values:
 {{ tried_values }}
 ```

+User goal:
+```
+{{ navigation_goal }}
+```
+
+User details:
+```
+{{ navigation_payload_str }}
+```
+
 Popped up elements:
 ```
 {{ popped_up_elements }}
--- a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2
+++ b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2
@@ -8,6 +8,7 @@ Reply in the following JSON format:
    "field": str, // Which field is this action intended to fill out?
    "is_required": bool, // True if this is a required field, otherwise false.
    "is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false.
+    "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information.
 }

 Existing reasoning context:
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -69,9 +69,10 @@ class InputOrSelectContext(BaseModel):
    field: str | None = None
    is_required: bool | None = None
    is_search_bar: bool | None = None  # don't trigger custom-selection logic when it's a search bar
+    is_location_input: bool | None = None  # address input usually requires auto completion

    def __repr__(self) -> str:
-        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar})"
+        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"


 class Action(BaseModel):
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -18,7 +18,6 @@ from skyvern.exceptions import (
    ErrFoundSelectableElement,
    FailedToFetchSecret,
    FailToClick,
-    FailToFindAutocompleteOption,
    FailToSelectByIndex,
    FailToSelectByLabel,
    FailToSelectByValue,
@@ -27,6 +26,8 @@ from skyvern.exceptions import (
    InteractWithDisabledElement,
    InvalidElementForTextInput,
    MissingElement,
+    MissingElementDict,
+    MissingElementInCSSMap,
    MissingFileUrl,
    MultipleElementsFound,
    NoAutoCompleteOptionMeetCondition,
@@ -72,6 +73,7 @@ from skyvern.webeye.scraper.scraper import (
    ElementTreeFormat,
    IncrementalScrapePage,
    ScrapedPage,
+    hash_element,
    json_to_html,
    trim_element_tree,
 )
@@ -169,6 +171,7 @@ def clean_and_remove_element_tree_factory(
        )
        for check_exist in check_exist_funcs:
            element_tree = remove_exist_elements(element_tree=element_tree, check_exist=check_exist)
+
        return element_tree

    return helper_func
@@ -441,6 +444,7 @@ async def handle_input_text_action(
        return [ActionFailure(InteractWithDisabledElement(skyvern_element.get_id()))]

    incremental_element: list[dict] = []
+    auto_complete_hacky_flag: bool = False
    # check if it's selectable
    if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_raw_input():
        select_action = SelectOptionAction(
@@ -489,6 +493,7 @@ async def handle_input_text_action(
            )
            await incremental_scraped.stop_listen_dom_increment()
        else:
+            auto_complete_hacky_flag = True
            try:
                # TODO: we don't select by value for the auto completion detect case
                result, _ = await sequentially_select_from_dropdown(
@@ -545,9 +550,26 @@ async def handle_input_text_action(
        if len(text) == 0:
            return [ActionSuccess()]

-        if await skyvern_element.is_auto_completion_input():
+        # parse the input context to help executing input action
+        prompt = prompt_engine.load_prompt(
+            "parse-input-or-select-context",
+            element_id=action.element_id,
+            action_reasoning=action.reasoning,
+            elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
+        )
+
+        json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
+        input_or_select_context = InputOrSelectContext.model_validate(json_response)
+        LOG.info(
+            "Parsed input/select context",
+            context=input_or_select_context,
+            task_id=task.task_id,
+            step_id=step.step_id,
+        )
+
+        if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
            if result := await input_or_auto_complete_input(
-                action=action,
+                input_or_select_context=input_or_select_context,
                page=page,
                dom=dom,
                text=text,
@@ -557,11 +579,22 @@ async def handle_input_text_action(
            ):
                return [result]

-        await skyvern_element.input_sequentially(text=text)
+        await incremental_scraped.start_listen_dom_increment()
+
+        try:
+            await skyvern_element.input_sequentially(text=text)
+        finally:
+            incremental_element = await incremental_scraped.get_incremental_element_tree(
+                clean_and_remove_element_tree_factory(task=task, step=step, check_exist_funcs=[dom.check_id_in_dom]),
+            )
+            if len(incremental_element) > 0:
+                auto_complete_hacky_flag = True
+            await incremental_scraped.stop_listen_dom_increment()
+
        return [ActionSuccess()]
    finally:
        # HACK: force to finish missing auto completion input
-        if len(incremental_element) > 0:
+        if auto_complete_hacky_flag:
            LOG.debug(
                "Trigger input-selection hack, pressing Tab to choose one",
                action=action,
@@ -1240,7 +1273,8 @@ async def choose_auto_completion_dropdown(
        if len(incremental_element) == 0:
            raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)

-        html = incremental_scraped.build_html_tree(incremental_element)
+        cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
+        html = incremental_scraped.build_html_tree(cleaned_incremental_element)
        auto_completion_confirm_prompt = prompt_engine.load_prompt(
            "auto-completion-choose-option",
            field_information=context.field,
@@ -1305,8 +1339,20 @@ async def choose_auto_completion_dropdown(
            await skyvern_element.input_clear()


+def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]:
+    cache_map = set()
+    new_elements: list[dict] = []
+    for element in elements:
+        key = hash_element(element=element)
+        if key in cache_map:
+            continue
+        cache_map.add(key)
+        new_elements.append(element)
+    return new_elements
+
+
 async def input_or_auto_complete_input(
-    action: actions.InputTextAction,
+    input_or_select_context: InputOrSelectContext,
    page: Page,
    dom: DomUtil,
    text: str,
@@ -1321,22 +1367,6 @@ async def input_or_auto_complete_input(
        element_id=skyvern_element.get_id(),
    )

-    prompt = prompt_engine.load_prompt(
-        "parse-input-or-select-context",
-        element_id=action.element_id,
-        action_reasoning=action.reasoning,
-        elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
-    )
-
-    json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
-    input_or_select_context = InputOrSelectContext.model_validate(json_response)
-    LOG.info(
-        "Parsed input/select context",
-        context=input_or_select_context,
-        task_id=task.task_id,
-        step_id=step.step_id,
-    )
-
    # 1. press the original text to see if there's a match
    # 2. call LLM to find 5 potential values based on the original text
    # 3. try each potential values from #2
@@ -1388,6 +1418,8 @@ async def input_or_auto_complete_input(
            "auto-completion-potential-answers",
            field_information=input_or_select_context.field,
            current_value=current_value,
+            navigation_goal=task.navigation_goal,
+            navigation_payload_str=json.dumps(task.navigation_payload),
        )

        LOG.info(
@@ -1439,12 +1471,15 @@ async def input_or_auto_complete_input(
                current_value=current_value,
                current_attemp=current_attemp,
            )
+            cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
            prompt = prompt_engine.load_prompt(
                "auto-completion-tweak-value",
                field_information=input_or_select_context.field,
                current_value=current_value,
+                navigation_goal=task.navigation_goal,
+                navigation_payload_str=json.dumps(task.navigation_payload),
                tried_values=json.dumps(tried_values),
-                popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
+                popped_up_elements="".join([json_to_html(element) for element in cleaned_new_elements]),
            )
            json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
            context_reasoning = json_respone.get("reasoning")
@@ -1462,7 +1497,13 @@ async def input_or_auto_complete_input(
            current_value = new_current_value

    else:
-        return ActionFailure(FailToFindAutocompleteOption(current_value=text))
+        LOG.warning(
+            "Auto completion didn't finish, this might leave the input value to be empty.",
+            context=input_or_select_context,
+            step_id=step.step_id,
+            task_id=task.task_id,
+        )
+        return None


 async def sequentially_select_from_dropdown(
@@ -1723,7 +1764,7 @@ async def select_from_dropdown(
        await selected_element.get_locator().click(timeout=timeout)
        single_select_result.action_result = ActionSuccess()
        return single_select_result
-    except MissingElement:
+    except (MissingElement, MissingElementDict, MissingElementInCSSMap, MultipleElementsFound):
        if not value:
            raise

--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -216,6 +216,10 @@ function isElementStyleVisibilityVisible(element, style) {
  return true;
 }

+// Reports whether an identifier named ASPxClientControl is defined on the current page.
+// `typeof` is used so that an undeclared identifier yields false instead of throwing a ReferenceError.
+// NOTE(review): presumably this is the DevExpress ASP.NET client-side API — confirm.
+function hasASPClientControl() {
+  return typeof ASPxClientControl !== "undefined";
+}
+
 // from playwright
 function isElementVisible(element) {
  // TODO: This is a hack to not check visibility for option elements
@@ -496,8 +500,16 @@ function isInteractable(element) {
    if (element.className.toString().includes("hover:cursor-pointer")) {
      return true;
    }
+
+    // auto for <a> is equal to pointer for <a>
+    if (tagName == "a" && computedStyle.cursor === "auto") {
+      return true;
+    }
  }

+  if (hasASPClientControl() && tagName === "tr") {
+    return true;
+  }
  return false;
 }

--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -600,6 +600,7 @@ class IncrementalScrapePage:
            return None, False

        if not interactable:
+            LOG.debug("Find the target element by text, but the element is not interactable", text=text)
            return None, True

        return parent_locator, True
--- a/skyvern/webeye/utils/dom.py
+++ b/skyvern/webeye/utils/dom.py
@@ -143,10 +143,6 @@ class SkyvernElement:
        if autocomplete and autocomplete == "list":
            return True

-        element_id = await self.get_attr("id")
-        if element_id == "location-input":
-            return True
-
        return False

    async def is_custom_option(self) -> bool:
@@ -527,6 +523,25 @@ class SkyvernElement:
            await self.focus(timeout=timeout)
        await asyncio.sleep(2)  # wait for scrolling into the target

+    async def calculate_vertical_distance_to(
+        self,
+        target_locator: Locator,
+        mode: typing.Literal["inner", "outer"],
+        timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+    ) -> float:
+        self_rect = await self.get_locator().bounding_box(timeout=timeout)
+        if self_rect is None:
+            raise Exception("Can't Skyvern element rect")
+
+        target_rect = await target_locator.bounding_box(timeout=timeout)
+        if self_rect is None or target_rect is None:
+            raise Exception("Can't get the target element rect")
+
+        if mode == "inner":
+            return abs(self_rect["y"] + self_rect["height"] - target_rect["y"])
+        else:
+            return abs(self_rect["y"] - (target_rect["y"] + target_rect["height"]))
+

 class DomUtil:
    """
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -223,3 +223,7 @@ class SkyvernFrame:
    async def is_window_scrollable(self) -> bool:
        """Evaluate the `isWindowScrollable()` script helper in this frame and return its result."""
        return await self.evaluate(frame=self.frame, expression="() => isWindowScrollable()")
+
+    async def has_ASP_client_control(self) -> bool:
+        js_script = "() => hasASPClientControl()"
+        return await self.evaluate(frame=self.frame, expression=js_script)