better context for select and auto-complete (#816)

This commit is contained in:
LawyZheng
2024-09-12 15:00:25 +08:00
committed by GitHub
parent 3e82a6a5f9
commit f16b6f3c8d
7 changed files with 40 additions and 11 deletions

View File

@@ -45,6 +45,8 @@ class SelectOption(BaseModel):
class Action(BaseModel):
action_type: ActionType
field_information: str | None = None
required_field: bool | None = None
confidence_float: float | None = None
description: str | None = None
reasoning: str | None = None
@@ -160,6 +162,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
reasoning = action["reasoning"] if "reasoning" in action else None
confidence_float = action["confidence_float"] if "confidence_float" in action else None
field_information = action["field_information"] if "field_information" in action else None
required_field = action["required_field"] if "required_field" in action else None
if "action_type" not in action or action["action_type"] is None:
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
@@ -177,6 +181,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.CLICK:
file_url = action["file_url"] if "file_url" in action else None
return ClickAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
reasoning=reasoning,
confidence_float=confidence_float,
@@ -186,6 +192,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.INPUT_TEXT:
return InputTextAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
text=action["text"],
reasoning=reasoning,
@@ -195,6 +203,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.UPLOAD_FILE:
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
return UploadFileAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
confidence_float=confidence_float,
file_url=action["file_url"],
@@ -204,6 +214,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
# This action is not used in the current implementation. Click actions are used instead.
if action_type == ActionType.DOWNLOAD_FILE:
return DownloadFileAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
file_name=action["file_name"],
reasoning=reasoning,
@@ -220,6 +232,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if label is None and value is None and index is None:
raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
return SelectOptionAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
option=SelectOption(
label=label,
@@ -232,6 +246,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.CHECKBOX:
return CheckboxAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
is_checked=action["is_checked"],
reasoning=reasoning,

View File

@@ -418,7 +418,11 @@ async def handle_input_text_action(
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_spinbtn_input():
await skyvern_element.scroll_into_view()
select_action = SelectOptionAction(
reasoning=action.reasoning, element_id=skyvern_element.get_id(), option=SelectOption(label=text)
field_information=action.field_information,
required_field=action.required_field,
reasoning=action.reasoning,
element_id=skyvern_element.get_id(),
option=SelectOption(label=text),
)
if skyvern_element.get_selectable():
LOG.info(
@@ -685,7 +689,13 @@ async def handle_select_option_action(
tag_name=selectable_child.get_tag_name(),
element_id=selectable_child.get_id(),
)
select_action = SelectOptionAction(element_id=selectable_child.get_id(), option=action.option)
select_action = SelectOptionAction(
reasoning=action.reasoning,
field_information=action.field_information,
required_field=action.required_field,
element_id=selectable_child.get_id(),
option=action.option,
)
return await handle_select_option_action(select_action, page, scraped_page, task, step)
if tag_name == InteractiveElement.SELECT:
@@ -1122,7 +1132,7 @@ async def choose_auto_completion_dropdown(
html = incremental_scraped.build_html_tree(incremental_element)
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
context_reasoning=action.reasoning,
field_information=action.field_information,
filled_value=text,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1241,7 +1251,7 @@ async def input_or_auto_complete_input(
prompt = prompt_engine.load_prompt(
"auto-completion-potential-answers",
context_reasoning=context_reasoning,
field_information=action.field_information,
current_value=current_value,
)
@@ -1296,7 +1306,7 @@ async def input_or_auto_complete_input(
)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
context_reasoning=context_reasoning,
field_information=action.field_information,
current_value=current_value,
tried_values=json.dumps(tried_values),
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
@@ -1310,7 +1320,7 @@ async def input_or_auto_complete_input(
"Ask LLM tweaked the current value with a new value",
step_id=step.step_id,
task_id=task.task_id,
reasoning=context_reasoning,
field_information=action.field_information,
current_value=current_value,
new_value=new_current_value,
)
@@ -1485,7 +1495,8 @@ async def select_from_dropdown(
prompt = prompt_engine.load_prompt(
"custom-select",
context_reasoning=action.reasoning,
field_information=action.field_information,
required_field=action.required_field,
target_value=target_value if not force_select and should_relevant else "",
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),