optimize multiple select (#1703)

This commit is contained in:
Shuchang Zheng
2025-02-03 19:19:39 +08:00
committed by GitHub
parent 36de8bde99
commit b4f2ec945c
4 changed files with 46 additions and 17 deletions

View File

@@ -451,7 +451,7 @@ class FailedToFetchSecret(SkyvernException):
class NoIncrementalElementFoundForCustomSelection(SkyvernException):
def __init__(self, element_id: str) -> None:
super().__init__(
f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}"
f"No incremental element found, try it again later or try another element. element_id={element_id}"
)

View File

@@ -1,13 +1,22 @@
Confirm if the user has finished the multi-level selection based on the screenshot, user details, the HTML elements and select history provided in the list.
Confirm if the user has finished the mini goal in the current opened dropdown selection based on the screenshot, user details, the HTML elements and select history provided in the list.
NOTE:
- Only consider the mini goal achieved when one or more valid options are selected in the dropdown.
- Sometimes it's a multi-level selection dropdown, where you need to select multiple times to pick a valid option (sub-option).
Reply in JSON format with the following keys:
{
"page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your action should be based on the current page information.
"think": str, // Think step by step. Describe how you think the user has finished the multi-level selection.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"is_finished": bool, // True if the user has finished the multi-level selection, False otherwise.
"page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements and the screenshot. Your decision should be based on the current page information.
"think": str, // Think step by step. Describe how you think the user has finished the mini goal in the current opened dropdown selection.
"is_multiple_selection": bool, // True if it's a multi-level selection, otherwise False.
"is_mini_goal_finished": bool, // True if the user has finished the mini goal in the current opened dropdown selection, False otherwise.
}
Mini Goal:
```
Select an option for "{{ mini_goal }}"
```
User goal:
```
{{ navigation_goal }}

View File

@@ -75,6 +75,7 @@ class CompleteVerifyResult(BaseModel):
class InputOrSelectContext(BaseModel):
intention: str | None = None
field: str | None = None
is_required: bool | None = None
is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar
@@ -82,7 +83,7 @@ class InputOrSelectContext(BaseModel):
is_date_related: bool | None = None # date picker mini agent requires some special logic
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
class Action(BaseModel):

View File

@@ -582,6 +582,7 @@ async def handle_input_text_action(
reasoning=action.reasoning,
element_id=skyvern_element.get_id(),
option=SelectOption(label=text),
intention=action.intention,
)
if skyvern_element.get_selectable():
LOG.info(
@@ -761,6 +762,7 @@ async def handle_input_text_action(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
@@ -999,6 +1001,7 @@ async def handle_select_option_action(
reasoning=action.reasoning,
element_id=selectable_child.get_id(),
option=action.option,
intention=action.intention,
)
action = select_action
skyvern_element = selectable_child
@@ -1045,6 +1048,7 @@ async def handle_select_option_action(
reasoning=action.reasoning,
element_id=blocking_element.get_id(),
option=action.option,
intention=action.intention,
)
action = select_action
skyvern_element = blocking_element
@@ -1666,7 +1670,7 @@ async def choose_auto_completion_dropdown(
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
is_search=context.is_search_bar,
field_information=context.field,
field_information=context.field if not context.intention else context.intention,
filled_value=text,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1815,10 +1819,16 @@ async def input_or_auto_complete_input(
tried_values.append(current_value)
whole_new_elements.extend(result.incremental_elements)
field_information = (
input_or_select_context.field
if not input_or_select_context.intention
else input_or_select_context.intention
)
prompt = prompt_engine.load_prompt(
"auto-completion-potential-answers",
potential_value_count=AUTO_COMPLETION_POTENTIAL_VALUES_COUNT,
field_information=input_or_select_context.field,
field_information=field_information,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1879,7 +1889,7 @@ async def input_or_auto_complete_input(
cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
field_information=input_or_select_context.field,
field_information=field_information,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1940,6 +1950,7 @@ async def sequentially_select_from_dropdown(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
@@ -2038,21 +2049,26 @@ async def sequentially_select_from_dropdown(
# it's for typing. it's been verified in `single_select_result.is_done()`
assert single_select_result.dropdown_menu is not None
screenshot = await single_select_result.dropdown_menu.get_locator().screenshot(
timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS
screenshot = await page.screenshot(timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS)
mini_goal = (
input_or_select_context.field
if not input_or_select_context.intention
else input_or_select_context.intention
)
prompt = prompt_engine.load_prompt(
"confirm-multi-selection-finish",
mini_goal=mini_goal,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
elements="".join(json_to_html(element) for element in secondary_increment_element),
select_history=json.dumps(build_sequential_select_history(select_history)),
local_datetime=datetime.now(ensure_context().tz_info).isoformat(),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
json_response = await app.LLM_API_HANDLER(
prompt=prompt, screenshots=[screenshot], step=step, prompt_name="confirm-multi-selection-finish"
)
if json_response.get("is_finished", False):
if json_response.get("is_mini_goal_finished", False):
LOG.info("The user has finished the selection for the current opened dropdown", step_id=step.step_id)
return single_select_result.action_result, values[-1] if len(values) > 0 else None
return select_history[-1].action_result if len(select_history) > 0 else None, values[-1] if len(
@@ -2138,7 +2154,7 @@ async def select_from_dropdown(
prompt = prompt_engine.load_prompt(
"custom-select",
is_date_related=context.is_date_related,
field_information=context.field,
field_information=context.field if not context.intention else context.intention,
required_field=context.is_required,
target_value="" if force_select else target_value,
navigation_goal=task.navigation_goal,
@@ -2588,6 +2604,7 @@ async def normal_select(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
@@ -2597,10 +2614,12 @@ async def normal_select(
)
options_html = skyvern_element.build_HTML()
field_information = (
input_or_select_context.field if not input_or_select_context.intention else input_or_select_context.intention
)
prompt = prompt_engine.load_prompt(
"normal-select",
field_information=input_or_select_context.field,
field_information=field_information,
required_field=input_or_select_context.is_required,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),