From dd3869b3b766badf6fdb3e7d61460e5575c20202 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Mon, 11 Nov 2024 18:57:59 +0800 Subject: [PATCH] extend auto completion coverage (#1165) --- .../auto-completion-potential-answers.j2 | 12 ++- .../skyvern/auto-completion-tweak-value.j2 | 12 ++- .../skyvern/parse-input-or-select-context.j2 | 1 + skyvern/webeye/actions/actions.py | 3 +- skyvern/webeye/actions/handler.py | 93 +++++++++++++------ skyvern/webeye/scraper/domUtils.js | 12 +++ skyvern/webeye/scraper/scraper.py | 1 + skyvern/webeye/utils/dom.py | 23 ++++- skyvern/webeye/utils/page.py | 4 + 9 files changed, 128 insertions(+), 33 deletions(-) diff --git a/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 b/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 index 5fc4b662..5e6b7a18 100644 --- a/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 +++ b/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 @@ -1,5 +1,5 @@ You're doing an auto completion input action on HTML page. The current filled value doesn't match any option. -Based on the context and current value, give ten most potential values with the same meaning as the current value. +Based on the context, current value, user goal and user details, give ten most potential values with the same meaning as the current value. 
You can provide values like: - Subset or superset meaning from the current value - Summarized from the current value @@ -26,4 +26,14 @@ Choose an auto-completion suggestion for "{{ field_information }}" Current Value: ``` {{ current_value }} +``` + +User goal: +``` +{{ navigation_goal }} +``` + +User details: +``` +{{ navigation_payload_str }} ``` \ No newline at end of file diff --git a/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 b/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 index 0825f06e..3f965b33 100644 --- a/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 +++ b/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 @@ -1,5 +1,5 @@ You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match. -Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information. +Based on the context, current value, tried values, user goal, user details and option elements popped up while typing, tweak the value into a reasonable one based on the information. You can try to change the value under the following rules: 1. the value must be reasonably changed from the current value, like superset, subset of the current value 2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept. 
@@ -32,6 +32,16 @@ Tried Values: {{ tried_values }} ``` +User goal: +``` +{{ navigation_goal }} +``` + +User details: +``` +{{ navigation_payload_str }} +``` + Popped up elements: ``` {{ popped_up_elements }} diff --git a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 index c81cdbe0..8096f1ca 100644 --- a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 +++ b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 @@ -8,6 +8,7 @@ Reply in the following JSON format: "field": str, // Which field is this action intended to fill out? "is_required": bool, // True if this is a required field, otherwise false. "is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false. + "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. 
} Existing reasoning context: diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index ffb2eda9..0e23c0cc 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -69,9 +69,10 @@ class InputOrSelectContext(BaseModel): field: str | None = None is_required: bool | None = None is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar + is_location_input: bool | None = None # address input usually requires auto completion def __repr__(self) -> str: - return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar})" + return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})" class Action(BaseModel): diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index ab1e7e5e..07ca6ecb 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -18,7 +18,6 @@ from skyvern.exceptions import ( ErrFoundSelectableElement, FailedToFetchSecret, FailToClick, - FailToFindAutocompleteOption, FailToSelectByIndex, FailToSelectByLabel, FailToSelectByValue, @@ -27,6 +26,8 @@ from skyvern.exceptions import ( InteractWithDisabledElement, InvalidElementForTextInput, MissingElement, + MissingElementDict, + MissingElementInCSSMap, MissingFileUrl, MultipleElementsFound, NoAutoCompleteOptionMeetCondition, @@ -72,6 +73,7 @@ from skyvern.webeye.scraper.scraper import ( ElementTreeFormat, IncrementalScrapePage, ScrapedPage, + hash_element, json_to_html, trim_element_tree, ) @@ -169,6 +171,7 @@ def clean_and_remove_element_tree_factory( ) for check_exist in check_exist_funcs: element_tree = remove_exist_elements(element_tree=element_tree, check_exist=check_exist) + return element_tree return helper_func @@ -441,6 +444,7 @@ async def handle_input_text_action( return 
[ActionFailure(InteractWithDisabledElement(skyvern_element.get_id()))] incremental_element: list[dict] = [] + auto_complete_hacky_flag: bool = False # check if it's selectable if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_raw_input(): select_action = SelectOptionAction( @@ -489,6 +493,7 @@ async def handle_input_text_action( ) await incremental_scraped.stop_listen_dom_increment() else: + auto_complete_hacky_flag = True try: # TODO: we don't select by value for the auto completion detect case result, _ = await sequentially_select_from_dropdown( @@ -545,9 +550,26 @@ async def handle_input_text_action( if len(text) == 0: return [ActionSuccess()] - if await skyvern_element.is_auto_completion_input(): + # parse the input context to help executing input action + prompt = prompt_engine.load_prompt( + "parse-input-or-select-context", + element_id=action.element_id, + action_reasoning=action.reasoning, + elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML), + ) + + json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step) + input_or_select_context = InputOrSelectContext.model_validate(json_response) + LOG.info( + "Parsed input/select context", + context=input_or_select_context, + task_id=task.task_id, + step_id=step.step_id, + ) + + if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input: if result := await input_or_auto_complete_input( - action=action, + input_or_select_context=input_or_select_context, page=page, dom=dom, text=text, @@ -557,11 +579,22 @@ async def handle_input_text_action( ): return [result] - await skyvern_element.input_sequentially(text=text) + await incremental_scraped.start_listen_dom_increment() + + try: + await skyvern_element.input_sequentially(text=text) + finally: + incremental_element = await incremental_scraped.get_incremental_element_tree( + clean_and_remove_element_tree_factory(task=task, step=step, 
check_exist_funcs=[dom.check_id_in_dom]), + ) + if len(incremental_element) > 0: + auto_complete_hacky_flag = True + await incremental_scraped.stop_listen_dom_increment() + return [ActionSuccess()] finally: # HACK: force to finish missing auto completion input - if len(incremental_element) > 0: + if auto_complete_hacky_flag: LOG.debug( "Trigger input-selection hack, pressing Tab to choose one", action=action, @@ -1240,7 +1273,8 @@ async def choose_auto_completion_dropdown( if len(incremental_element) == 0: raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text) - html = incremental_scraped.build_html_tree(incremental_element) + cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element) + html = incremental_scraped.build_html_tree(cleaned_incremental_element) auto_completion_confirm_prompt = prompt_engine.load_prompt( "auto-completion-choose-option", field_information=context.field, @@ -1305,8 +1339,20 @@ async def choose_auto_completion_dropdown( await skyvern_element.input_clear() +def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]: + cache_map = set() + new_elements: list[dict] = [] + for element in elements: + key = hash_element(element=element) + if key in cache_map: + continue + cache_map.add(key) + new_elements.append(element) + return new_elements + + async def input_or_auto_complete_input( - action: actions.InputTextAction, + input_or_select_context: InputOrSelectContext, page: Page, dom: DomUtil, text: str, @@ -1321,22 +1367,6 @@ async def input_or_auto_complete_input( element_id=skyvern_element.get_id(), ) - prompt = prompt_engine.load_prompt( - "parse-input-or-select-context", - element_id=action.element_id, - action_reasoning=action.reasoning, - elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML), - ) - - json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step) - input_or_select_context = 
InputOrSelectContext.model_validate(json_response) - LOG.info( - "Parsed input/select context", - context=input_or_select_context, - task_id=task.task_id, - step_id=step.step_id, - ) - # 1. press the orignal text to see if there's a match # 2. call LLM to find 5 potential values based on the orginal text # 3. try each potential values from #2 @@ -1388,6 +1418,8 @@ async def input_or_auto_complete_input( "auto-completion-potential-answers", field_information=input_or_select_context.field, current_value=current_value, + navigation_goal=task.navigation_goal, + navigation_payload_str=json.dumps(task.navigation_payload), ) LOG.info( @@ -1439,12 +1471,15 @@ async def input_or_auto_complete_input( current_value=current_value, current_attemp=current_attemp, ) + cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements) prompt = prompt_engine.load_prompt( "auto-completion-tweak-value", field_information=input_or_select_context.field, current_value=current_value, + navigation_goal=task.navigation_goal, + navigation_payload_str=json.dumps(task.navigation_payload), tried_values=json.dumps(tried_values), - popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]), + popped_up_elements="".join([json_to_html(element) for element in cleaned_new_elements]), ) json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step) context_reasoning = json_respone.get("reasoning") @@ -1462,7 +1497,13 @@ async def input_or_auto_complete_input( current_value = new_current_value else: - return ActionFailure(FailToFindAutocompleteOption(current_value=text)) + LOG.warning( + "Auto completion didn't finish, this might leave the input value to be empty.", + context=input_or_select_context, + step_id=step.step_id, + task_id=task.task_id, + ) + return None async def sequentially_select_from_dropdown( @@ -1723,7 +1764,7 @@ async def select_from_dropdown( await selected_element.get_locator().click(timeout=timeout) 
single_select_result.action_result = ActionSuccess() return single_select_result - except MissingElement: + except (MissingElement, MissingElementDict, MissingElementInCSSMap, MultipleElementsFound): if not value: raise diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 0cb720a4..8dd09687 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -216,6 +216,10 @@ function isElementStyleVisibilityVisible(element, style) { return true; } +function hasASPClientControl() { + return typeof ASPxClientControl !== "undefined"; +} + // from playwright function isElementVisible(element) { // TODO: This is a hack to not check visibility for option elements @@ -496,8 +500,16 @@ function isInteractable(element) { if (element.className.toString().includes("hover:cursor-pointer")) { return true; } + + // for an anchor (a) tag, cursor "auto" is equivalent to cursor "pointer" + if (tagName == "a" && computedStyle.cursor === "auto") { + return true; + } } + if (hasASPClientControl() && tagName === "tr") { + return true; + } return false; } diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index b464bb68..57a17726 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -600,6 +600,7 @@ class IncrementalScrapePage: return None, False if not interactable: + LOG.debug("Find the target element by text, but the element is not interactable", text=text) return None, True return parent_locator, True diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 6673b48b..569888cd 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -143,10 +143,6 @@ class SkyvernElement: if autocomplete and autocomplete == "list": return True - element_id = await self.get_attr("id") - if element_id == "location-input": - return True - return False async def is_custom_option(self) -> bool: @@ -527,6 +523,25 @@ class SkyvernElement: await self.focus(timeout=timeout) 
await asyncio.sleep(2) # wait for scrolling into the target + async def calculate_vertical_distance_to( + self, + target_locator: Locator, + mode: typing.Literal["inner", "outer"], + timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, + ) -> float: + self_rect = await self.get_locator().bounding_box(timeout=timeout) + if self_rect is None: + raise Exception("Can't Skyvern element rect") + + target_rect = await target_locator.bounding_box(timeout=timeout) + if self_rect is None or target_rect is None: + raise Exception("Can't get the target element rect") + + if mode == "inner": + return abs(self_rect["y"] + self_rect["height"] - target_rect["y"]) + else: + return abs(self_rect["y"] - (target_rect["y"] + target_rect["height"])) + class DomUtil: """ diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 596f4aba..44924f52 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -223,3 +223,7 @@ class SkyvernFrame: async def is_window_scrollable(self) -> bool: js_script = "() => isWindowScrollable()" return await self.evaluate(frame=self.frame, expression=js_script) + + async def has_ASP_client_control(self) -> bool: + js_script = "() => hasASPClientControl()" + return await self.evaluate(frame=self.frame, expression=js_script)