extend auto completion coverage (#1165)

This commit is contained in:
LawyZheng
2024-11-11 18:57:59 +08:00
committed by GitHub
parent 9130640fc2
commit dd3869b3b7
9 changed files with 128 additions and 33 deletions

View File

@@ -69,9 +69,10 @@ class InputOrSelectContext(BaseModel):
field: str | None = None
is_required: bool | None = None
is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar
is_location_input: bool | None = None # address input usually requires auto completion
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"
class Action(BaseModel):

View File

@@ -18,7 +18,6 @@ from skyvern.exceptions import (
ErrFoundSelectableElement,
FailedToFetchSecret,
FailToClick,
FailToFindAutocompleteOption,
FailToSelectByIndex,
FailToSelectByLabel,
FailToSelectByValue,
@@ -27,6 +26,8 @@ from skyvern.exceptions import (
InteractWithDisabledElement,
InvalidElementForTextInput,
MissingElement,
MissingElementDict,
MissingElementInCSSMap,
MissingFileUrl,
MultipleElementsFound,
NoAutoCompleteOptionMeetCondition,
@@ -72,6 +73,7 @@ from skyvern.webeye.scraper.scraper import (
ElementTreeFormat,
IncrementalScrapePage,
ScrapedPage,
hash_element,
json_to_html,
trim_element_tree,
)
@@ -169,6 +171,7 @@ def clean_and_remove_element_tree_factory(
)
for check_exist in check_exist_funcs:
element_tree = remove_exist_elements(element_tree=element_tree, check_exist=check_exist)
return element_tree
return helper_func
@@ -441,6 +444,7 @@ async def handle_input_text_action(
return [ActionFailure(InteractWithDisabledElement(skyvern_element.get_id()))]
incremental_element: list[dict] = []
auto_complete_hacky_flag: bool = False
# check if it's selectable
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_raw_input():
select_action = SelectOptionAction(
@@ -489,6 +493,7 @@ async def handle_input_text_action(
)
await incremental_scraped.stop_listen_dom_increment()
else:
auto_complete_hacky_flag = True
try:
# TODO: we don't select by value for the auto completion detect case
result, _ = await sequentially_select_from_dropdown(
@@ -545,9 +550,26 @@ async def handle_input_text_action(
if len(text) == 0:
return [ActionSuccess()]
if await skyvern_element.is_auto_completion_input():
# parse the input context to help executing input action
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
if result := await input_or_auto_complete_input(
action=action,
input_or_select_context=input_or_select_context,
page=page,
dom=dom,
text=text,
@@ -557,11 +579,22 @@ async def handle_input_text_action(
):
return [result]
await skyvern_element.input_sequentially(text=text)
await incremental_scraped.start_listen_dom_increment()
try:
await skyvern_element.input_sequentially(text=text)
finally:
incremental_element = await incremental_scraped.get_incremental_element_tree(
clean_and_remove_element_tree_factory(task=task, step=step, check_exist_funcs=[dom.check_id_in_dom]),
)
if len(incremental_element) > 0:
auto_complete_hacky_flag = True
await incremental_scraped.stop_listen_dom_increment()
return [ActionSuccess()]
finally:
# HACK: force to finish missing auto completion input
if len(incremental_element) > 0:
if auto_complete_hacky_flag:
LOG.debug(
"Trigger input-selection hack, pressing Tab to choose one",
action=action,
@@ -1240,7 +1273,8 @@ async def choose_auto_completion_dropdown(
if len(incremental_element) == 0:
raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)
html = incremental_scraped.build_html_tree(incremental_element)
cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
html = incremental_scraped.build_html_tree(cleaned_incremental_element)
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
field_information=context.field,
@@ -1305,8 +1339,20 @@ async def choose_auto_completion_dropdown(
await skyvern_element.input_clear()
def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]:
    """Return a new list with repeated elements dropped, keeping first occurrences.

    Two elements are treated as duplicates when ``hash_element`` produces the
    same key for both. The surviving elements keep their original relative
    order, and the input list is not modified.
    """
    # A dict preserves insertion order and ``setdefault`` never overwrites an
    # existing entry, so the first element seen for each key is the one kept.
    first_seen: dict = {}
    for candidate in elements:
        first_seen.setdefault(hash_element(element=candidate), candidate)
    return list(first_seen.values())
async def input_or_auto_complete_input(
action: actions.InputTextAction,
input_or_select_context: InputOrSelectContext,
page: Page,
dom: DomUtil,
text: str,
@@ -1321,22 +1367,6 @@ async def input_or_auto_complete_input(
element_id=skyvern_element.get_id(),
)
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
# 1. press the original text to see if there's a match
# 2. call LLM to find 5 potential values based on the original text
# 3. try each potential value from #2
@@ -1388,6 +1418,8 @@ async def input_or_auto_complete_input(
"auto-completion-potential-answers",
field_information=input_or_select_context.field,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
)
LOG.info(
@@ -1439,12 +1471,15 @@ async def input_or_auto_complete_input(
current_value=current_value,
current_attemp=current_attemp,
)
cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
field_information=input_or_select_context.field,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
tried_values=json.dumps(tried_values),
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
popped_up_elements="".join([json_to_html(element) for element in cleaned_new_elements]),
)
json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
context_reasoning = json_respone.get("reasoning")
@@ -1462,7 +1497,13 @@ async def input_or_auto_complete_input(
current_value = new_current_value
else:
return ActionFailure(FailToFindAutocompleteOption(current_value=text))
LOG.warning(
"Auto completion didn't finish, this might leave the input value to be empty.",
context=input_or_select_context,
step_id=step.step_id,
task_id=task.task_id,
)
return None
async def sequentially_select_from_dropdown(
@@ -1723,7 +1764,7 @@ async def select_from_dropdown(
await selected_element.get_locator().click(timeout=timeout)
single_select_result.action_result = ActionSuccess()
return single_select_result
except MissingElement:
except (MissingElement, MissingElementDict, MissingElementInCSSMap, MultipleElementsFound):
if not value:
raise

View File

@@ -216,6 +216,10 @@ function isElementStyleVisibilityVisible(element, style) {
return true;
}
/**
 * Report whether a global binding named `ASPxClientControl` is visible
 * from this script.
 *
 * `typeof` is used on purpose: applied to an undeclared identifier it
 * evaluates to "undefined" instead of throwing a ReferenceError.
 *
 * @returns {boolean} true when `ASPxClientControl` is defined.
 */
function hasASPClientControl() {
  const controlType = typeof ASPxClientControl;
  return controlType !== "undefined";
}
// from playwright
function isElementVisible(element) {
// TODO: This is a hack to not check visibility for option elements
@@ -496,8 +500,16 @@ function isInteractable(element) {
if (element.className.toString().includes("hover:cursor-pointer")) {
return true;
}
// auto for <a> is equal to pointer for <a>
if (tagName == "a" && computedStyle.cursor === "auto") {
return true;
}
}
if (hasASPClientControl() && tagName === "tr") {
return true;
}
return false;
}

View File

@@ -600,6 +600,7 @@ class IncrementalScrapePage:
return None, False
if not interactable:
LOG.debug("Find the target element by text, but the element is not interactable", text=text)
return None, True
return parent_locator, True

View File

@@ -143,10 +143,6 @@ class SkyvernElement:
if autocomplete and autocomplete == "list":
return True
element_id = await self.get_attr("id")
if element_id == "location-input":
return True
return False
async def is_custom_option(self) -> bool:
@@ -527,6 +523,25 @@ class SkyvernElement:
await self.focus(timeout=timeout)
await asyncio.sleep(2) # wait for scrolling into the target
async def calculate_vertical_distance_to(
    self,
    target_locator: Locator,
    mode: typing.Literal["inner", "outer"],
    timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> float:
    """Return the absolute vertical gap between this element and a target element.

    Both rectangles are read with ``bounding_box``; the result is computed
    from their ``y`` and ``height`` values.

    Args:
        target_locator: Locator of the element to measure against.
        mode: ``"inner"`` measures from this element's bottom edge
            (``y + height``) to the target's top edge (``y``). Any other
            value is handled as ``"outer"``: from this element's top edge to
            the target's bottom edge.
        timeout: Timeout forwarded to each ``bounding_box`` call.

    Returns:
        The non-negative distance between the two chosen edges.

    Raises:
        Exception: If the bounding box of this element or of the target
            cannot be obtained (``bounding_box`` returned ``None``).
    """
    self_rect = await self.get_locator().bounding_box(timeout=timeout)
    if self_rect is None:
        raise Exception("Can't get the Skyvern element rect")

    # self_rect is known to be non-None here, so only the target needs checking.
    target_rect = await target_locator.bounding_box(timeout=timeout)
    if target_rect is None:
        raise Exception("Can't get the target element rect")

    if mode == "inner":
        return abs(self_rect["y"] + self_rect["height"] - target_rect["y"])
    return abs(self_rect["y"] - (target_rect["y"] + target_rect["height"]))
class DomUtil:
"""

View File

@@ -223,3 +223,7 @@ class SkyvernFrame:
async def is_window_scrollable(self) -> bool:
    """Evaluate the page-side ``isWindowScrollable()`` helper in this frame and return its result."""
    return await self.evaluate(frame=self.frame, expression="() => isWindowScrollable()")
async def has_ASP_client_control(self) -> bool:
    """Evaluate the page-side ``hasASPClientControl()`` helper in this frame and return its result."""
    return await self.evaluate(frame=self.frame, expression="() => hasASPClientControl()")