improve custom selection (#2236)

This commit is contained in:
Shuchang Zheng
2025-04-28 16:16:36 +08:00
committed by GitHub
parent a3d9917386
commit 2f0e6e5eb3
3 changed files with 276 additions and 133 deletions

View File

@@ -696,6 +696,7 @@ async def handle_input_text_action(
await page.keyboard.type(action.text)
return [ActionSuccess()]
input_or_select_context: InputOrSelectContext | None = None
dom = DomUtil(scraped_page, page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -789,8 +790,16 @@ async def handle_input_text_action(
try_to_quit_dropdown = True
try:
# TODO: we don't select by value for the auto completion detect case
if input_or_select_context is None:
input_or_select_context = await _get_input_or_select_context(
action=action,
scraped_page=scraped_page,
step=step,
)
select_result = await sequentially_select_from_dropdown(
action=select_action,
input_or_select_context=input_or_select_context,
page=page,
dom=dom,
skyvern_element=skyvern_element,
@@ -926,25 +935,12 @@ async def handle_input_text_action(
return [ActionSuccess()]
if not await skyvern_element.is_raw_input():
prompt = load_prompt_with_elements(
scraped_page=scraped_page,
prompt_engine=prompt_engine,
template_name="parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
if input_or_select_context is None:
input_or_select_context = await _get_input_or_select_context(
action=action,
scraped_page=scraped_page,
step=step,
)
if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
if result := await input_or_auto_complete_input(
@@ -1327,13 +1323,28 @@ async def handle_select_option_action(
),
)
input_or_select_context = await _get_input_or_select_context(
action=action, scraped_page=scraped_page, step=step
)
if len(incremental_element) == 0:
raise NoIncrementalElementFoundForCustomSelection(element_id=skyvern_element.get_id())
results.append(
await select_from_emerging_elements(
action=action,
input_or_select_context=input_or_select_context,
page=page,
scraped_page=scraped_page,
task=task,
step=step,
)
)
return results
is_open = True
# TODO: support sequetially select from dropdown by value, just support single select now
result = await sequentially_select_from_dropdown(
action=action,
input_or_select_context=input_or_select_context,
page=page,
dom=dom,
skyvern_element=skyvern_element,
@@ -2222,6 +2233,7 @@ async def input_or_auto_complete_input(
async def sequentially_select_from_dropdown(
action: SelectOptionAction,
input_or_select_context: InputOrSelectContext,
page: Page,
dom: DomUtil,
skyvern_element: SkyvernElement,
@@ -2237,26 +2249,6 @@ async def sequentially_select_from_dropdown(
TODO: support to return all values retrieved from the sequentially select
Only return the last value today
"""
prompt = load_prompt_with_elements(
scraped_page=dom.scraped_page,
prompt_engine=prompt_engine,
template_name="parse-input-or-select-context",
action_reasoning=action.reasoning,
element_id=action.element_id,
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
if not force_select and input_or_select_context.is_search_bar:
LOG.info(
"Exit custom selection mode since it's a non-force search bar",
@@ -2414,6 +2406,93 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult]
return result
async def select_from_emerging_elements(
action: SelectOptionAction,
input_or_select_context: InputOrSelectContext,
page: Page,
scraped_page: ScrapedPage,
step: Step,
task: Task,
) -> ActionResult:
# TODO: support to handle the case when options are loaded by scroll
LOG.info(
"No incremental elements detected by MutationObserver, using re-scraping the page to find the match element"
)
scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots()
new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(scraped_page.id_to_css_dict.keys())
dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page)
new_interactable_element_ids = [
element_id
for element_id in new_element_ids
if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable()
]
if len(new_interactable_element_ids) == 0:
raise NoIncrementalElementFoundForCustomSelection(element_id=action.element_id)
prompt = load_prompt_with_elements(
scraped_page=scraped_page_after_open,
prompt_engine=prompt_engine,
template_name="custom-select",
is_date_related=input_or_select_context.is_date_related,
field_information=input_or_select_context.field
if not input_or_select_context.intention
else input_or_select_context.intention,
required_field=input_or_select_context.is_required,
target_value=action.option.label,
navigation_goal=task.navigation_goal,
new_elements_ids=new_interactable_element_ids,
navigation_payload_str=json.dumps(task.navigation_payload),
local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
)
LOG.info(
"Calling LLM to find the match element",
step_id=step.step_id,
task_id=task.task_id,
)
json_response = await app.LLM_API_HANDLER(prompt=prompt, step=step, prompt_name="custom-select")
value: str | None = json_response.get("value", None)
LOG.info(
"LLM response for the matched element",
matched_value=value,
response=json_response,
step_id=step.step_id,
task_id=task.task_id,
)
action_type_str: str = json_response.get("action_type", "")
action_type = ActionType(action_type_str.lower())
element_id: str | None = json_response.get("id", None)
if not element_id or action_type not in [ActionType.CLICK, ActionType.INPUT_TEXT]:
raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning"))
if value is not None and action_type == ActionType.INPUT_TEXT:
LOG.info(
"No clickable option found, but found input element to search",
element_id=element_id,
)
input_element = await dom_after_open.get_skyvern_element_by_id(element_id)
await input_element.scroll_into_view()
current_text = await get_input_value(input_element.get_tag_name(), input_element.get_locator())
if current_text == value:
return ActionSuccess()
await input_element.input_clear()
await input_element.input_sequentially(value)
return ActionSuccess()
else:
selected_element = await dom_after_open.get_skyvern_element_by_id(element_id)
if await selected_element.get_attr("role") == "listbox":
return ActionFailure(exception=InteractWithDropdownContainer(element_id=element_id))
await selected_element.scroll_into_view()
await selected_element.click(page=page)
return ActionSuccess()
async def select_from_dropdown(
context: InputOrSelectContext,
page: Page,
@@ -3264,3 +3343,25 @@ async def _get_verification_code_from_db(
continue
return totp_code.code
return None
async def _get_input_or_select_context(
action: InputTextAction | SelectOptionAction, scraped_page: ScrapedPage, step: Step
) -> InputOrSelectContext:
prompt = load_prompt_with_elements(
scraped_page=scraped_page,
prompt_engine=prompt_engine,
template_name="parse-input-or-select-context",
action_reasoning=action.reasoning,
element_id=action.element_id,
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
)
return input_or_select_context