better context for select and auto-complete (#816)
This commit is contained in:
@@ -25,7 +25,7 @@ Reply in JSON format with the following keys:
|
|||||||
|
|
||||||
Context:
|
Context:
|
||||||
```
|
```
|
||||||
{{ context_reasoning }}
|
Choose an auto-completion suggestion for "{{ field_information }}"
|
||||||
```
|
```
|
||||||
|
|
||||||
Input value:
|
Input value:
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ Reply in JSON format with the following keys:
|
|||||||
|
|
||||||
Context:
|
Context:
|
||||||
```
|
```
|
||||||
{{ context_reasoning }}
|
Choose an auto-completion suggestion for "{{ field_information }}"
|
||||||
```
|
```
|
||||||
|
|
||||||
Current Value:
|
Current Value:
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ Reply in JSON format with the following keys:
|
|||||||
|
|
||||||
Context:
|
Context:
|
||||||
```
|
```
|
||||||
{{ context_reasoning }}
|
Choose an auto-completion suggestion for "{{ field_information }}"
|
||||||
```
|
```
|
||||||
|
|
||||||
Current Value:
|
Current Value:
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ Reply in JSON format with the following keys:
|
|||||||
|
|
||||||
Context:
|
Context:
|
||||||
```
|
```
|
||||||
{{ context_reasoning }}
|
Select an option for "{{ field_information }}". It's {{ "a required" if required_field else "an optional" }} field.
|
||||||
```
|
```
|
||||||
{% if target_value %}
|
{% if target_value %}
|
||||||
Target value:
|
Target value:
|
||||||
|
|||||||
@@ -17,6 +17,8 @@ Reply in JSON format with the following keys:
|
|||||||
"reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
|
"reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
|
||||||
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
|
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
|
||||||
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
|
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
|
||||||
|
"field_information": str, // The target field for the action. Only for INPUT_TEXT and SELECT_OPTION actions. Otherwise it should be null.
|
||||||
|
"required_field": bool, // True if it's a required field, otherwise false.
|
||||||
"id": str, // The id of the element to take action on. The id has to be one from the elements list
|
"id": str, // The id of the element to take action on. The id has to be one from the elements list
|
||||||
"text": str, // Text for INPUT_TEXT action only
|
"text": str, // Text for INPUT_TEXT action only
|
||||||
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
|
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
|
||||||
|
|||||||
@@ -45,6 +45,8 @@ class SelectOption(BaseModel):
|
|||||||
|
|
||||||
class Action(BaseModel):
|
class Action(BaseModel):
|
||||||
action_type: ActionType
|
action_type: ActionType
|
||||||
|
field_information: str | None = None
|
||||||
|
required_field: bool | None = None
|
||||||
confidence_float: float | None = None
|
confidence_float: float | None = None
|
||||||
description: str | None = None
|
description: str | None = None
|
||||||
reasoning: str | None = None
|
reasoning: str | None = None
|
||||||
@@ -160,6 +162,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
|
|
||||||
reasoning = action["reasoning"] if "reasoning" in action else None
|
reasoning = action["reasoning"] if "reasoning" in action else None
|
||||||
confidence_float = action["confidence_float"] if "confidence_float" in action else None
|
confidence_float = action["confidence_float"] if "confidence_float" in action else None
|
||||||
|
field_information = action["field_information"] if "field_information" in action else None
|
||||||
|
required_field = action["required_field"] if "required_field" in action else None
|
||||||
|
|
||||||
if "action_type" not in action or action["action_type"] is None:
|
if "action_type" not in action or action["action_type"] is None:
|
||||||
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
|
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
|
||||||
@@ -177,6 +181,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
if action_type == ActionType.CLICK:
|
if action_type == ActionType.CLICK:
|
||||||
file_url = action["file_url"] if "file_url" in action else None
|
file_url = action["file_url"] if "file_url" in action else None
|
||||||
return ClickAction(
|
return ClickAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
confidence_float=confidence_float,
|
confidence_float=confidence_float,
|
||||||
@@ -186,6 +192,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
|
|
||||||
if action_type == ActionType.INPUT_TEXT:
|
if action_type == ActionType.INPUT_TEXT:
|
||||||
return InputTextAction(
|
return InputTextAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
text=action["text"],
|
text=action["text"],
|
||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
@@ -195,6 +203,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
if action_type == ActionType.UPLOAD_FILE:
|
if action_type == ActionType.UPLOAD_FILE:
|
||||||
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
|
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
|
||||||
return UploadFileAction(
|
return UploadFileAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
confidence_float=confidence_float,
|
confidence_float=confidence_float,
|
||||||
file_url=action["file_url"],
|
file_url=action["file_url"],
|
||||||
@@ -204,6 +214,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
# This action is not used in the current implementation. Click actions are used instead.
|
# This action is not used in the current implementation. Click actions are used instead.
|
||||||
if action_type == ActionType.DOWNLOAD_FILE:
|
if action_type == ActionType.DOWNLOAD_FILE:
|
||||||
return DownloadFileAction(
|
return DownloadFileAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
file_name=action["file_name"],
|
file_name=action["file_name"],
|
||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
@@ -220,6 +232,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
if label is None and value is None and index is None:
|
if label is None and value is None and index is None:
|
||||||
raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
|
raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
|
||||||
return SelectOptionAction(
|
return SelectOptionAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
option=SelectOption(
|
option=SelectOption(
|
||||||
label=label,
|
label=label,
|
||||||
@@ -232,6 +246,8 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
|||||||
|
|
||||||
if action_type == ActionType.CHECKBOX:
|
if action_type == ActionType.CHECKBOX:
|
||||||
return CheckboxAction(
|
return CheckboxAction(
|
||||||
|
field_information=field_information,
|
||||||
|
required_field=required_field,
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
is_checked=action["is_checked"],
|
is_checked=action["is_checked"],
|
||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
|
|||||||
@@ -418,7 +418,11 @@ async def handle_input_text_action(
|
|||||||
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_spinbtn_input():
|
if skyvern_element.get_tag_name() == InteractiveElement.INPUT and not await skyvern_element.is_spinbtn_input():
|
||||||
await skyvern_element.scroll_into_view()
|
await skyvern_element.scroll_into_view()
|
||||||
select_action = SelectOptionAction(
|
select_action = SelectOptionAction(
|
||||||
reasoning=action.reasoning, element_id=skyvern_element.get_id(), option=SelectOption(label=text)
|
field_information=action.field_information,
|
||||||
|
required_field=action.required_field,
|
||||||
|
reasoning=action.reasoning,
|
||||||
|
element_id=skyvern_element.get_id(),
|
||||||
|
option=SelectOption(label=text),
|
||||||
)
|
)
|
||||||
if skyvern_element.get_selectable():
|
if skyvern_element.get_selectable():
|
||||||
LOG.info(
|
LOG.info(
|
||||||
@@ -685,7 +689,13 @@ async def handle_select_option_action(
|
|||||||
tag_name=selectable_child.get_tag_name(),
|
tag_name=selectable_child.get_tag_name(),
|
||||||
element_id=selectable_child.get_id(),
|
element_id=selectable_child.get_id(),
|
||||||
)
|
)
|
||||||
select_action = SelectOptionAction(element_id=selectable_child.get_id(), option=action.option)
|
select_action = SelectOptionAction(
|
||||||
|
reasoning=action.reasoning,
|
||||||
|
field_information=action.field_information,
|
||||||
|
required_field=action.required_field,
|
||||||
|
element_id=selectable_child.get_id(),
|
||||||
|
option=action.option,
|
||||||
|
)
|
||||||
return await handle_select_option_action(select_action, page, scraped_page, task, step)
|
return await handle_select_option_action(select_action, page, scraped_page, task, step)
|
||||||
|
|
||||||
if tag_name == InteractiveElement.SELECT:
|
if tag_name == InteractiveElement.SELECT:
|
||||||
@@ -1122,7 +1132,7 @@ async def choose_auto_completion_dropdown(
|
|||||||
html = incremental_scraped.build_html_tree(incremental_element)
|
html = incremental_scraped.build_html_tree(incremental_element)
|
||||||
auto_completion_confirm_prompt = prompt_engine.load_prompt(
|
auto_completion_confirm_prompt = prompt_engine.load_prompt(
|
||||||
"auto-completion-choose-option",
|
"auto-completion-choose-option",
|
||||||
context_reasoning=action.reasoning,
|
field_information=action.field_information,
|
||||||
filled_value=text,
|
filled_value=text,
|
||||||
navigation_goal=task.navigation_goal,
|
navigation_goal=task.navigation_goal,
|
||||||
navigation_payload_str=json.dumps(task.navigation_payload),
|
navigation_payload_str=json.dumps(task.navigation_payload),
|
||||||
@@ -1241,7 +1251,7 @@ async def input_or_auto_complete_input(
|
|||||||
|
|
||||||
prompt = prompt_engine.load_prompt(
|
prompt = prompt_engine.load_prompt(
|
||||||
"auto-completion-potential-answers",
|
"auto-completion-potential-answers",
|
||||||
context_reasoning=context_reasoning,
|
field_information=action.field_information,
|
||||||
current_value=current_value,
|
current_value=current_value,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1296,7 +1306,7 @@ async def input_or_auto_complete_input(
|
|||||||
)
|
)
|
||||||
prompt = prompt_engine.load_prompt(
|
prompt = prompt_engine.load_prompt(
|
||||||
"auto-completion-tweak-value",
|
"auto-completion-tweak-value",
|
||||||
context_reasoning=context_reasoning,
|
field_information=action.field_information,
|
||||||
current_value=current_value,
|
current_value=current_value,
|
||||||
tried_values=json.dumps(tried_values),
|
tried_values=json.dumps(tried_values),
|
||||||
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
|
popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
|
||||||
@@ -1310,7 +1320,7 @@ async def input_or_auto_complete_input(
|
|||||||
"Ask LLM tweaked the current value with a new value",
|
"Ask LLM tweaked the current value with a new value",
|
||||||
step_id=step.step_id,
|
step_id=step.step_id,
|
||||||
task_id=task.task_id,
|
task_id=task.task_id,
|
||||||
reasoning=context_reasoning,
|
field_information=action.field_information,
|
||||||
current_value=current_value,
|
current_value=current_value,
|
||||||
new_value=new_current_value,
|
new_value=new_current_value,
|
||||||
)
|
)
|
||||||
@@ -1485,7 +1495,8 @@ async def select_from_dropdown(
|
|||||||
|
|
||||||
prompt = prompt_engine.load_prompt(
|
prompt = prompt_engine.load_prompt(
|
||||||
"custom-select",
|
"custom-select",
|
||||||
context_reasoning=action.reasoning,
|
field_information=action.field_information,
|
||||||
|
required_field=action.required_field,
|
||||||
target_value=target_value if not force_select and should_relevant else "",
|
target_value=target_value if not force_select and should_relevant else "",
|
||||||
navigation_goal=task.navigation_goal,
|
navigation_goal=task.navigation_goal,
|
||||||
navigation_payload_str=json.dumps(task.navigation_payload),
|
navigation_payload_str=json.dumps(task.navigation_payload),
|
||||||
|
|||||||
Reference in New Issue
Block a user