extend auto completion coverage (#1165)

2024-11-11 18:57:59 +08:00
parent 9130640fc2
commit dd3869b3b7
9 changed files with 128 additions and 33 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -69,9 +69,10 @@ class InputOrSelectContext(BaseModel):
    field: str | None = None
    is_required: bool | None = None
    is_search_bar: bool | None = None  # don't trigger custom-selection logic when it's a search bar
+    is_location_input: bool | None = None  # address input usually requires auto completion

    def __repr__(self) -> str:
-        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar})"
+        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"


 class Action(BaseModel):
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -18,7 +18,6 @@ from skyvern.exceptions import (
    ErrFoundSelectableElement,
    FailedToFetchSecret,
    FailToClick,
-    FailToFindAutocompleteOption,
    FailToSelectByIndex,
    FailToSelectByLabel,
    FailToSelectByValue,
@@ -27,6 +26,8 @@ from skyvern.exceptions import (
    InteractWithDisabledElement,
    InvalidElementForTextInput,
    MissingElement,
+    MissingElementDict,
+    MissingElementInCSSMap,
    MissingFileUrl,
    MultipleElementsFound,
    NoAutoCompleteOptionMeetCondition,
@@ -72,6 +73,7 @@ from skyvern.webeye.scraper.scraper import (
    ElementTreeFormat,
    IncrementalScrapePage,
    ScrapedPage,
+    hash_element,
    json_to_html,
    trim_element_tree,
 )
@@ -169,6 +171,7 @@ def clean_and_remove_element_tree_factory(
        )
        for check_exist in check_exist_funcs:
            element_tree = remove_exist_elements(element_tree=element_tree, check_exist=check_exist)
+
        return element_tree

    return helper_func
@@ -441,6 +444,7 @@ async def handle_input_text_action(
        return [ActionFailure(InteractWithDisabledElement(skyvern_element.get_id()))]

    incremental_element: list[dict] = []
+    auto_complete_hacky_flag: bool = False
    # check if it's selectable
    if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_raw_input():
        select_action = SelectOptionAction(
@@ -489,6 +493,7 @@ async def handle_input_text_action(
            )
            await incremental_scraped.stop_listen_dom_increment()
        else:
+            auto_complete_hacky_flag = True
            try:
                # TODO: we don't select by value for the auto completion detect case
                result, _ = await sequentially_select_from_dropdown(
@@ -545,9 +550,26 @@ async def handle_input_text_action(
        if len(text) == 0:
            return [ActionSuccess()]

-        if await skyvern_element.is_auto_completion_input():
+        # parse the input context to help executing input action
+        prompt = prompt_engine.load_prompt(
+            "parse-input-or-select-context",
+            element_id=action.element_id,
+            action_reasoning=action.reasoning,
+            elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
+        )
+
+        json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
+        input_or_select_context = InputOrSelectContext.model_validate(json_response)
+        LOG.info(
+            "Parsed input/select context",
+            context=input_or_select_context,
+            task_id=task.task_id,
+            step_id=step.step_id,
+        )
+
+        if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
            if result := await input_or_auto_complete_input(
-                action=action,
+                input_or_select_context=input_or_select_context,
                page=page,
                dom=dom,
                text=text,
@@ -557,11 +579,22 @@ async def handle_input_text_action(
            ):
                return [result]

-        await skyvern_element.input_sequentially(text=text)
+        await incremental_scraped.start_listen_dom_increment()
+
+        try:
+            await skyvern_element.input_sequentially(text=text)
+        finally:
+            incremental_element = await incremental_scraped.get_incremental_element_tree(
+                clean_and_remove_element_tree_factory(task=task, step=step, check_exist_funcs=[dom.check_id_in_dom]),
+            )
+            if len(incremental_element) > 0:
+                auto_complete_hacky_flag = True
+            await incremental_scraped.stop_listen_dom_increment()
+
        return [ActionSuccess()]
    finally:
        # HACK: force to finish missing auto completion input
-        if len(incremental_element) > 0:
+        if auto_complete_hacky_flag:
            LOG.debug(
                "Trigger input-selection hack, pressing Tab to choose one",
                action=action,
@@ -1240,7 +1273,8 @@ async def choose_auto_completion_dropdown(
        if len(incremental_element) == 0:
            raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)

-        html = incremental_scraped.build_html_tree(incremental_element)
+        cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
+        html = incremental_scraped.build_html_tree(cleaned_incremental_element)
        auto_completion_confirm_prompt = prompt_engine.load_prompt(
            "auto-completion-choose-option",
            field_information=context.field,
@@ -1305,8 +1339,20 @@ async def choose_auto_completion_dropdown(
            await skyvern_element.input_clear()


+def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]:
+    cache_map = set()
+    new_elements: list[dict] = []
+    for element in elements:
+        key = hash_element(element=element)
+        if key in cache_map:
+            continue
+        cache_map.add(key)
+        new_elements.append(element)
+    return new_elements
+
+
 async def input_or_auto_complete_input(
-    action: actions.InputTextAction,
+    input_or_select_context: InputOrSelectContext,
    page: Page,
    dom: DomUtil,
    text: str,
@@ -1321,22 +1367,6 @@ async def input_or_auto_complete_input(
        element_id=skyvern_element.get_id(),
    )

-    prompt = prompt_engine.load_prompt(
-        "parse-input-or-select-context",
-        element_id=action.element_id,
-        action_reasoning=action.reasoning,
-        elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
-    )
-
-    json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
-    input_or_select_context = InputOrSelectContext.model_validate(json_response)
-    LOG.info(
-        "Parsed input/select context",
-        context=input_or_select_context,
-        task_id=task.task_id,
-        step_id=step.step_id,
-    )
-
    # 1. press the orignal text to see if there's a match
    # 2. call LLM to find 5 potential values based on the orginal text
    # 3. try each potential values from #2
@@ -1388,6 +1418,8 @@ async def input_or_auto_complete_input(
            "auto-completion-potential-answers",
            field_information=input_or_select_context.field,
            current_value=current_value,
+            navigation_goal=task.navigation_goal,
+            navigation_payload_str=json.dumps(task.navigation_payload),
        )

        LOG.info(
@@ -1439,12 +1471,15 @@ async def input_or_auto_complete_input(
                current_value=current_value,
                current_attemp=current_attemp,
            )
+            cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
            prompt = prompt_engine.load_prompt(
                "auto-completion-tweak-value",
                field_information=input_or_select_context.field,
                current_value=current_value,
+                navigation_goal=task.navigation_goal,
+                navigation_payload_str=json.dumps(task.navigation_payload),
                tried_values=json.dumps(tried_values),
-                popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
+                popped_up_elements="".join([json_to_html(element) for element in cleaned_new_elements]),
            )
            json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
            context_reasoning = json_respone.get("reasoning")
@@ -1462,7 +1497,13 @@ async def input_or_auto_complete_input(
            current_value = new_current_value

    else:
-        return ActionFailure(FailToFindAutocompleteOption(current_value=text))
+        LOG.warning(
+            "Auto completion didn't finish, this might leave the input value to be empty.",
+            context=input_or_select_context,
+            step_id=step.step_id,
+            task_id=task.task_id,
+        )
+        return None


 async def sequentially_select_from_dropdown(
@@ -1723,7 +1764,7 @@ async def select_from_dropdown(
        await selected_element.get_locator().click(timeout=timeout)
        single_select_result.action_result = ActionSuccess()
        return single_select_result
-    except MissingElement:
+    except (MissingElement, MissingElementDict, MissingElementInCSSMap, MultipleElementsFound):
        if not value:
            raise

--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -216,6 +216,10 @@ function isElementStyleVisibilityVisible(element, style) {
  return true;
 }

+function hasASPClientControl() {
+  return typeof ASPxClientControl !== "undefined";
+}
+
 // from playwright
 function isElementVisible(element) {
  // TODO: This is a hack to not check visibility for option elements
@@ -496,8 +500,16 @@ function isInteractable(element) {
    if (element.className.toString().includes("hover:cursor-pointer")) {
      return true;
    }
+
+    // on an <a>, a computed cursor of "auto" behaves like "pointer", so treat it as interactable
+    if (tagName == "a" && computedStyle.cursor === "auto") {
+      return true;
+    }
  }

+  if (hasASPClientControl() && tagName === "tr") {
+    return true;
+  }
  return false;
 }

--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -600,6 +600,7 @@ class IncrementalScrapePage:
            return None, False

        if not interactable:
+            LOG.debug("Find the target element by text, but the element is not interactable", text=text)
            return None, True

        return parent_locator, True
--- a/skyvern/webeye/utils/dom.py
+++ b/skyvern/webeye/utils/dom.py
@@ -143,10 +143,6 @@ class SkyvernElement:
        if autocomplete and autocomplete == "list":
            return True

-        element_id = await self.get_attr("id")
-        if element_id == "location-input":
-            return True
-
        return False

    async def is_custom_option(self) -> bool:
@@ -527,6 +523,25 @@ class SkyvernElement:
            await self.focus(timeout=timeout)
        await asyncio.sleep(2)  # wait for scrolling into the target

+    async def calculate_vertical_distance_to(
+        self,
+        target_locator: Locator,
+        mode: typing.Literal["inner", "outer"],
+        timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+    ) -> float:
+        self_rect = await self.get_locator().bounding_box(timeout=timeout)
+        if self_rect is None:
+            raise Exception("Can't Skyvern element rect")
+
+        target_rect = await target_locator.bounding_box(timeout=timeout)
+        if self_rect is None or target_rect is None:
+            raise Exception("Can't get the target element rect")
+
+        if mode == "inner":
+            return abs(self_rect["y"] + self_rect["height"] - target_rect["y"])
+        else:
+            return abs(self_rect["y"] - (target_rect["y"] + target_rect["height"]))
+

 class DomUtil:
    """
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -223,3 +223,7 @@ class SkyvernFrame:
    async def is_window_scrollable(self) -> bool:
        js_script = "() => isWindowScrollable()"
        return await self.evaluate(frame=self.frame, expression=js_script)
+
+    async def has_ASP_client_control(self) -> bool:
+        js_script = "() => hasASPClientControl()"
+        return await self.evaluate(frame=self.frame, expression=js_script)