refactor custom-select/auto-complete context (#830)

This commit is contained in:
LawyZheng
2024-09-14 17:28:08 +08:00
committed by GitHub
parent 6571604fa5
commit 8c2a733ba2
6 changed files with 86 additions and 53 deletions

View File

@@ -43,10 +43,16 @@ class SelectOption(BaseModel):
return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
class InputOrSelectContext(BaseModel):
field: str | None = None
is_required: bool | None = None
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required})"
class Action(BaseModel):
action_type: ActionType
field_information: str | None = None
required_field: bool | None = None
confidence_float: float | None = None
description: str | None = None
reasoning: str | None = None
@@ -162,8 +168,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
reasoning = action["reasoning"] if "reasoning" in action else None
confidence_float = action["confidence_float"] if "confidence_float" in action else None
field_information = action["field_information"] if "field_information" in action else None
required_field = action["required_field"] if "required_field" in action else None
if "action_type" not in action or action["action_type"] is None:
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
@@ -181,8 +185,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.CLICK:
file_url = action["file_url"] if "file_url" in action else None
return ClickAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
reasoning=reasoning,
confidence_float=confidence_float,
@@ -192,8 +194,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.INPUT_TEXT:
return InputTextAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
text=action["text"],
reasoning=reasoning,
@@ -203,8 +203,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.UPLOAD_FILE:
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
return UploadFileAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
confidence_float=confidence_float,
file_url=action["file_url"],
@@ -214,8 +212,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
# This action is not used in the current implementation. Click actions are used instead.
if action_type == ActionType.DOWNLOAD_FILE:
return DownloadFileAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
file_name=action["file_name"],
reasoning=reasoning,
@@ -232,8 +228,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if label is None and value is None and index is None:
raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
return SelectOptionAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
option=SelectOption(
label=label,
@@ -246,8 +240,6 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
if action_type == ActionType.CHECKBOX:
return CheckboxAction(
field_information=field_information,
required_field=required_field,
element_id=element_id,
is_checked=action["is_checked"],
reasoning=reasoning,

View File

@@ -29,10 +29,10 @@ from skyvern.exceptions import (
MissingFileUrl,
MultipleElementsFound,
NoAutoCompleteOptionMeetCondition,
NoAvailableOptionFoundForCustomSelection,
NoElementMatchedForTargetOption,
NoIncrementalElementFoundForAutoCompletion,
NoIncrementalElementFoundForCustomSelection,
NoLabelOrValueForCustomSelection,
NoSuitableAutoCompleteOption,
OptionIndexOutOfBound,
WrongElementToUploadFile,
@@ -58,6 +58,7 @@ from skyvern.webeye.actions.actions import (
ActionType,
CheckboxAction,
ClickAction,
InputOrSelectContext,
ScrapeResult,
SelectOption,
SelectOptionAction,
@@ -418,8 +419,6 @@ async def handle_input_text_action(
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_spinbtn_input():
await skyvern_element.scroll_into_view()
select_action = SelectOptionAction(
field_information=action.field_information,
required_field=action.required_field,
reasoning=action.reasoning,
element_id=skyvern_element.get_id(),
option=SelectOption(label=text),
@@ -464,6 +463,7 @@ async def handle_input_text_action(
llm_handler=app.SECONDARY_LLM_API_HANDLER,
step=step,
task=task,
target_value=text,
)
if result is not None:
return [result]
@@ -692,8 +692,6 @@ async def handle_select_option_action(
)
select_action = SelectOptionAction(
reasoning=action.reasoning,
field_information=action.field_information,
required_field=action.required_field,
element_id=selectable_child.get_id(),
option=action.option,
)
@@ -1069,7 +1067,7 @@ async def chain_click(
async def choose_auto_completion_dropdown(
action: actions.InputTextAction,
context: InputOrSelectContext,
page: Page,
dom: DomUtil,
text: str,
@@ -1133,7 +1131,7 @@ async def choose_auto_completion_dropdown(
html = incremental_scraped.build_html_tree(incremental_element)
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
field_information=action.field_information,
field_information=context.field,
filled_value=text,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1211,6 +1209,22 @@ async def input_or_auto_complete_input(
element_id=skyvern_element.get_id(),
)
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
element_id=action.element_id,
action_reasoning=action.reasoning,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
# 1. press the orignal text to see if there's a match
# 2. call LLM to find 5 potential values based on the orginal text
# 3. try each potential values from #2
@@ -1219,7 +1233,6 @@ async def input_or_auto_complete_input(
# FIXME: try the whole loop for twice now, to prevent too many LLM calls
MAX_AUTO_COMPLETE_ATTEMP = 2
current_attemp = 0
context_reasoning = action.reasoning
current_value = text
result = AutoCompletionResult()
@@ -1235,7 +1248,7 @@ async def input_or_auto_complete_input(
input_value=current_value,
)
result = await choose_auto_completion_dropdown(
action=action,
context=input_or_select_context,
page=page,
dom=dom,
text=current_value,
@@ -1252,7 +1265,7 @@ async def input_or_auto_complete_input(
prompt = prompt_engine.load_prompt(
"auto-completion-potential-answers",
field_information=action.field_information,
field_information=input_or_select_context.field,
current_value=current_value,
)
@@ -1282,7 +1295,7 @@ async def input_or_auto_complete_input(
input_value=value,
)
result = await choose_auto_completion_dropdown(
action=action,
context=input_or_select_context,
page=page,
dom=dom,
text=value,
@@ -1307,7 +1320,7 @@ async def input_or_auto_complete_input(
)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
field_information=action.field_information,
field_information=input_or_select_context.field,
current_value=current_value,
tried_values=json.dumps(tried_values),
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
@@ -1321,7 +1334,7 @@ async def input_or_auto_complete_input(
"Ask LLM tweaked the current value with a new value",
step_id=step.step_id,
task_id=task.task_id,
field_information=action.field_information,
field_information=input_or_select_context.field,
current_value=current_value,
new_value=new_current_value,
)
@@ -1341,13 +1354,28 @@ async def sequentially_select_from_dropdown(
step: Step,
task: Task,
force_select: bool = False,
should_relevant: bool = True,
target_value: str = "",
) -> tuple[ActionResult | None, str | None]:
"""
TODO: support to return all values retrieved from the sequentially select
Only return the last value today
"""
prompt = prompt_engine.load_prompt(
"parse-input-or-select-context",
action_reasoning=action.reasoning,
element_id=action.element_id,
elements=dom.scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
json_response = await llm_handler(prompt=prompt, step=step)
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
# TODO: only suport the third-level dropdown selection now
MAX_SELECT_DEPTH = 3
values: list[str | None] = []
@@ -1356,7 +1384,7 @@ async def sequentially_select_from_dropdown(
check_exist_funcs: list[CheckExistIDFunc] = [dom.check_id_in_dom]
for i in range(MAX_SELECT_DEPTH):
single_select_result = await select_from_dropdown(
action=action,
context=input_or_select_context,
page=page,
skyvern_frame=skyvern_frame,
incremental_scraped=incremental_scraped,
@@ -1366,7 +1394,7 @@ async def sequentially_select_from_dropdown(
task=task,
select_history=select_history,
force_select=force_select,
should_relevant=should_relevant,
target_value=target_value,
)
select_history.append(single_select_result)
values.append(single_select_result.value)
@@ -1431,7 +1459,7 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult]
async def select_from_dropdown(
action: SelectOptionAction,
context: InputOrSelectContext,
page: Page,
skyvern_frame: SkyvernFrame,
incremental_scraped: IncrementalScrapePage,
@@ -1441,11 +1469,11 @@ async def select_from_dropdown(
task: Task,
select_history: list[CustomSingleSelectResult] | None = None,
force_select: bool = False,
should_relevant: bool = True,
target_value: str = "",
) -> CustomSingleSelectResult:
"""
force_select: is used to choose an element to click even there's no dropdown menu;
should_relevant: only valid when force_select is "False". When "True", the chosen value must be relevant to the target value;
targe_value: only valid when force_select is "False". When target_value is not empty, the matched option must be relevent to target value;
None will be only returned when:
1. force_select is false and no dropdown menu popped
2. force_select is false and match value is not relevant to the target value
@@ -1490,15 +1518,11 @@ async def select_from_dropdown(
html = incremental_scraped.build_html_tree(element_tree=trimmed_element_tree)
target_value = action.option.label or action.option.value
if target_value is None:
raise NoLabelOrValueForCustomSelection(element_id=action.element_id)
prompt = prompt_engine.load_prompt(
"custom-select",
field_information=action.field_information,
required_field=action.required_field,
target_value=target_value if not force_select and should_relevant else "",
field_information=context.field,
required_field=context.is_required,
target_value="" if force_select else target_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
elements=html,
@@ -1526,11 +1550,11 @@ async def select_from_dropdown(
element_id: str | None = json_response.get("id", None)
if not element_id:
raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning"))
raise NoAvailableOptionFoundForCustomSelection(reason=json_response.get("reasoning"))
if not force_select and should_relevant:
if not force_select and target_value:
if not json_response.get("relevant", False):
LOG.debug(
LOG.info(
"The selected option is not relevant to the target value",
element_id=element_id,
task_id=task.task_id,