extend auto completion coverage (#1165)

This commit is contained in:
LawyZheng
2024-11-11 18:57:59 +08:00
committed by GitHub
parent 9130640fc2
commit dd3869b3b7
9 changed files with 128 additions and 33 deletions

View File

@@ -1,5 +1,5 @@
You're doing an auto completion input action on HTML page. The current filled value doesn't match any option.
Based on the context and current value, give ten most potential values with the same meaning as the current value.
Based on the context, current value, user goal and user details, give the ten most likely values that have the same meaning as the current value.
You can provide values like:
- Subset or superset meaning from the current value
- Summarized from the current value
@@ -26,4 +26,14 @@ Choose an auto-completion suggestion for "{{ field_information }}"
Current Value:
```
{{ current_value }}
```
User goal:
```
{{ navigation_goal }}
```
User details:
```
{{ navigation_payload_str }}
```

View File

@@ -1,5 +1,5 @@
You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match.
Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information.
Based on the context, current value, tried values, user goal, user details and option elements popped up while typing, tweak the value into a reasonable one based on the information.
You can try to change the value under the following rules:
1. the value must be reasonably changed from the current value, like superset, subset of the current value
2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept.
@@ -32,6 +32,16 @@ Tried Values:
{{ tried_values }}
```
User goal:
```
{{ navigation_goal }}
```
User details:
```
{{ navigation_payload_str }}
```
Popped up elements:
```
{{ popped_up_elements }}

View File

@@ -8,6 +8,7 @@ Reply in the following JSON format:
"field": str, // Which field is this action intended to fill out?
"is_required": bool, // True if this is a required field, otherwise false.
"is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false.
"is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information.
}
Existing reasoning context:

View File

@@ -69,9 +69,10 @@ class InputOrSelectContext(BaseModel):
field: str | None = None
is_required: bool | None = None
is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar
is_location_input: bool | None = None # address input usually requires auto completion
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"
class Action(BaseModel):

View File

@@ -18,7 +18,6 @@ from skyvern.exceptions import (
ErrFoundSelectableElement,
FailedToFetchSecret,
FailToClick,
FailToFindAutocompleteOption,
FailToSelectByIndex,
FailToSelectByLabel,
FailToSelectByValue,
@@ -27,6 +26,8 @@ from skyvern.exceptions import (
InteractWithDisabledElement,
InvalidElementForTextInput,
MissingElement,
MissingElementDict,
MissingElementInCSSMap,
MissingFileUrl,
MultipleElementsFound,
NoAutoCompleteOptionMeetCondition,
@@ -72,6 +73,7 @@ from skyvern.webeye.scraper.scraper import (
ElementTreeFormat,
IncrementalScrapePage,
ScrapedPage,
hash_element,
json_to_html,
trim_element_tree,
)
@@ -169,6 +171,7 @@ def clean_and_remove_element_tree_factory(
)
for check_exist in check_exist_funcs:
element_tree = remove_exist_elements(element_tree=element_tree, check_exist=check_exist)
return element_tree
return helper_func
@@ -441,6 +444,7 @@ async def handle_input_text_action(
return [ActionFailure(InteractWithDisabledElement(skyvern_element.get_id()))]
incremental_element: list[dict] = []
auto_complete_hacky_flag: bool = False
# check if it's selectable
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_raw_input():
select_action = SelectOptionAction(
@@ -489,6 +493,7 @@ async def handle_input_text_action(
)
await incremental_scraped.stop_listen_dom_increment()
else:
auto_complete_hacky_flag = True
try:
# TODO: we don't select by value for the auto completion detect case
result, _ = await sequentially_select_from_dropdown(
@@ -545,9 +550,26 @@ async def handle_input_text_action(
if len(text) == 0:
return [ActionSuccess()]
if await skyvern_element.is_auto_completion_input():
# parse the input context to help executing input action
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
if result := await input_or_auto_complete_input(
action=action,
input_or_select_context=input_or_select_context,
page=page,
dom=dom,
text=text,
@@ -557,11 +579,22 @@ async def handle_input_text_action(
):
return [result]
await skyvern_element.input_sequentially(text=text)
await incremental_scraped.start_listen_dom_increment()
try:
await skyvern_element.input_sequentially(text=text)
finally:
incremental_element = await incremental_scraped.get_incremental_element_tree(
clean_and_remove_element_tree_factory(task=task, step=step, check_exist_funcs=[dom.check_id_in_dom]),
)
if len(incremental_element) > 0:
auto_complete_hacky_flag = True
await incremental_scraped.stop_listen_dom_increment()
return [ActionSuccess()]
finally:
# HACK: force to finish missing auto completion input
if len(incremental_element) > 0:
if auto_complete_hacky_flag:
LOG.debug(
"Trigger input-selection hack, pressing Tab to choose one",
action=action,
@@ -1240,7 +1273,8 @@ async def choose_auto_completion_dropdown(
if len(incremental_element) == 0:
raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)
html = incremental_scraped.build_html_tree(incremental_element)
cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
html = incremental_scraped.build_html_tree(cleaned_incremental_element)
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
field_information=context.field,
@@ -1305,8 +1339,20 @@ async def choose_auto_completion_dropdown(
await skyvern_element.input_clear()
def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]:
    """Return ``elements`` without repeats, keeping first occurrences in order.

    Two elements are treated as duplicates when ``hash_element`` produces the
    same key for both; only the first element seen for each key is kept.

    Args:
        elements: Element dicts to de-duplicate. The list itself is not modified.

    Returns:
        A new list holding the first element seen for every distinct hash key.
    """
    seen_hashes = set()
    unique_elements: list[dict] = []
    for candidate in elements:
        digest = hash_element(element=candidate)
        if digest not in seen_hashes:
            seen_hashes.add(digest)
            unique_elements.append(candidate)
    return unique_elements
async def input_or_auto_complete_input(
action: actions.InputTextAction,
input_or_select_context: InputOrSelectContext,
page: Page,
dom: DomUtil,
text: str,
@@ -1321,22 +1367,6 @@ async def input_or_auto_complete_input(
element_id=skyvern_element.get_id(),
)
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
# 1. press the original text to see if there's a match
# 2. call LLM to find 5 potential values based on the original text
# 3. try each potential value from #2
@@ -1388,6 +1418,8 @@ async def input_or_auto_complete_input(
"auto-completion-potential-answers",
field_information=input_or_select_context.field,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
)
LOG.info(
@@ -1439,12 +1471,15 @@ async def input_or_auto_complete_input(
current_value=current_value,
current_attemp=current_attemp,
)
cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
field_information=input_or_select_context.field,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
tried_values=json.dumps(tried_values),
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
popped_up_elements="".join([json_to_html(element) for element in cleaned_new_elements]),
)
json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
context_reasoning = json_respone.get("reasoning")
@@ -1462,7 +1497,13 @@ async def input_or_auto_complete_input(
current_value = new_current_value
else:
return ActionFailure(FailToFindAutocompleteOption(current_value=text))
LOG.warning(
"Auto completion didn't finish, this might leave the input value to be empty.",
context=input_or_select_context,
step_id=step.step_id,
task_id=task.task_id,
)
return None
async def sequentially_select_from_dropdown(
@@ -1723,7 +1764,7 @@ async def select_from_dropdown(
await selected_element.get_locator().click(timeout=timeout)
single_select_result.action_result = ActionSuccess()
return single_select_result
except MissingElement:
except (MissingElement, MissingElementDict, MissingElementInCSSMap, MultipleElementsFound):
if not value:
raise

View File

@@ -216,6 +216,10 @@ function isElementStyleVisibilityVisible(element, style) {
return true;
}
/**
 * Report whether a global named `ASPxClientControl` is defined on this page.
 *
 * Uses `typeof` so that an undeclared identifier yields "undefined" instead
 * of throwing a ReferenceError.
 *
 * @returns {boolean} true when `ASPxClientControl` is defined, otherwise false.
 */
function hasASPClientControl() {
return typeof ASPxClientControl !== "undefined";
}
// from playwright
function isElementVisible(element) {
// TODO: This is a hack to not check visibility for option elements
@@ -496,8 +500,16 @@ function isInteractable(element) {
if (element.className.toString().includes("hover:cursor-pointer")) {
return true;
}
// auto for <a> is equal to pointer for <a>
if (tagName == "a" && computedStyle.cursor === "auto") {
return true;
}
}
if (hasASPClientControl() && tagName === "tr") {
return true;
}
return false;
}

View File

@@ -600,6 +600,7 @@ class IncrementalScrapePage:
return None, False
if not interactable:
LOG.debug("Find the target element by text, but the element is not interactable", text=text)
return None, True
return parent_locator, True

View File

@@ -143,10 +143,6 @@ class SkyvernElement:
if autocomplete and autocomplete == "list":
return True
element_id = await self.get_attr("id")
if element_id == "location-input":
return True
return False
async def is_custom_option(self) -> bool:
@@ -527,6 +523,25 @@ class SkyvernElement:
await self.focus(timeout=timeout)
await asyncio.sleep(2) # wait for scrolling into the target
async def calculate_vertical_distance_to(
    self,
    target_locator: Locator,
    mode: typing.Literal["inner", "outer"],
    timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> float:
    """Return the absolute vertical gap between this element and ``target_locator``.

    Both bounding boxes are fetched with ``bounding_box(timeout=timeout)``.

    Args:
        target_locator: Locator of the element to measure against.
        mode: ``"inner"`` measures from this element's bottom edge
            (``y + height``) to the target's top edge (``y``). Any other
            value, i.e. ``"outer"``, measures from this element's top edge
            to the target's bottom edge.
        timeout: Timeout passed to each ``bounding_box`` call
            (presumably milliseconds, judging by the setting name).

    Returns:
        The non-negative distance between the two chosen edges.

    Raises:
        Exception: If either bounding box comes back as ``None``.
    """
    self_rect = await self.get_locator().bounding_box(timeout=timeout)
    if self_rect is None:
        raise Exception("Can't get the Skyvern element rect")

    target_rect = await target_locator.bounding_box(timeout=timeout)
    # self_rect is already known to be non-None here, so only the target needs checking.
    if target_rect is None:
        raise Exception("Can't get the target element rect")

    if mode == "inner":
        return abs(self_rect["y"] + self_rect["height"] - target_rect["y"])
    return abs(self_rect["y"] - (target_rect["y"] + target_rect["height"]))
class DomUtil:
"""

View File

@@ -223,3 +223,7 @@ class SkyvernFrame:
async def is_window_scrollable(self) -> bool:
    """Run the page-side ``isWindowScrollable()`` helper through ``self.evaluate`` on ``self.frame`` and return its result."""
    return await self.evaluate(frame=self.frame, expression="() => isWindowScrollable()")
async def has_ASP_client_control(self) -> bool:
    """Run the page-side ``hasASPClientControl()`` helper through ``self.evaluate`` on ``self.frame`` and return its result."""
    return await self.evaluate(frame=self.frame, expression="() => hasASPClientControl()")