Smarter select_option & input_text actions (#3440)
This commit is contained in:
@@ -353,15 +353,69 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
|
||||
)
|
||||
)
|
||||
elif method == "select_option":
|
||||
option = act.get("option", {})
|
||||
value = option.get("value")
|
||||
if value:
|
||||
if act.get("field_name"):
|
||||
option_value = cst.Subscript(
|
||||
value=cst.Attribute(
|
||||
value=cst.Name("context"),
|
||||
attr=cst.Name("parameters"),
|
||||
),
|
||||
slice=[cst.SubscriptElement(slice=cst.Index(value=_value(act["field_name"])))],
|
||||
)
|
||||
else:
|
||||
option_value = _value(value)
|
||||
args.append(
|
||||
cst.Arg(
|
||||
keyword=cst.Name("value"),
|
||||
value=option_value,
|
||||
whitespace_after_arg=cst.ParenthesizedWhitespace(
|
||||
indent=True,
|
||||
last_line=cst.SimpleWhitespace(INDENT),
|
||||
),
|
||||
),
|
||||
)
|
||||
args.append(
|
||||
cst.Arg(
|
||||
keyword=cst.Name("ai_infer"),
|
||||
value=cst.Name("True"),
|
||||
whitespace_after_arg=cst.ParenthesizedWhitespace(
|
||||
indent=True,
|
||||
last_line=cst.SimpleWhitespace(INDENT),
|
||||
),
|
||||
)
|
||||
)
|
||||
elif method == "upload_file":
|
||||
if act.get("field_name"):
|
||||
file_url_value = cst.Subscript(
|
||||
value=cst.Attribute(
|
||||
value=cst.Name("context"),
|
||||
attr=cst.Name("parameters"),
|
||||
),
|
||||
slice=[cst.SubscriptElement(slice=cst.Index(value=_value(act["field_name"])))],
|
||||
)
|
||||
else:
|
||||
file_url_value = _value(act["file_url"])
|
||||
args.append(
|
||||
cst.Arg(
|
||||
keyword=cst.Name("option"),
|
||||
value=_value(act["option"]["value"]),
|
||||
keyword=cst.Name("files"),
|
||||
value=file_url_value,
|
||||
whitespace_after_arg=cst.ParenthesizedWhitespace(
|
||||
indent=True,
|
||||
last_line=cst.SimpleWhitespace(INDENT),
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
args.append(
|
||||
cst.Arg(
|
||||
keyword=cst.Name("ai_infer"),
|
||||
value=cst.Name("True"),
|
||||
whitespace_after_arg=cst.ParenthesizedWhitespace(
|
||||
indent=True,
|
||||
last_line=cst.SimpleWhitespace(INDENT),
|
||||
),
|
||||
)
|
||||
)
|
||||
elif method == "wait":
|
||||
args.append(
|
||||
|
||||
@@ -15,6 +15,7 @@ LOG = structlog.get_logger(__name__)
|
||||
|
||||
# Initialize prompt engine
|
||||
prompt_engine = PromptEngine("skyvern")
|
||||
CUSTOM_FIELD_ACTIONS = [ActionType.INPUT_TEXT, ActionType.UPLOAD_FILE, ActionType.SELECT_OPTION]
|
||||
|
||||
|
||||
class GeneratedFieldMapping(BaseModel):
|
||||
@@ -39,34 +40,45 @@ async def generate_workflow_parameters_schema(
|
||||
- field_mappings: Dictionary mapping action indices to field names for hydration
|
||||
"""
|
||||
# Extract all input_text actions
|
||||
input_actions = []
|
||||
custom_field_actions = []
|
||||
action_index_map = {}
|
||||
action_counter = 1
|
||||
|
||||
for task_id, actions in actions_by_task.items():
|
||||
for action in actions:
|
||||
if action.get("action_type") == ActionType.INPUT_TEXT:
|
||||
input_actions.append(
|
||||
{
|
||||
"text": action.get("text", ""),
|
||||
"intention": action.get("intention", ""),
|
||||
"task_id": task_id,
|
||||
"action_id": action.get("action_id", ""),
|
||||
}
|
||||
)
|
||||
action_index_map[f"action_index_{action_counter}"] = {
|
||||
action_type = action.get("action_type", "")
|
||||
if action_type not in CUSTOM_FIELD_ACTIONS:
|
||||
continue
|
||||
|
||||
value = ""
|
||||
if action_type == ActionType.INPUT_TEXT:
|
||||
value = action.get("text", "")
|
||||
elif action_type == ActionType.UPLOAD_FILE:
|
||||
value = action.get("file_url", "")
|
||||
elif action_type == ActionType.SELECT_OPTION:
|
||||
value = action.get("option", "")
|
||||
custom_field_actions.append(
|
||||
{
|
||||
"action_type": action_type,
|
||||
"value": value,
|
||||
"intention": action.get("intention", ""),
|
||||
"task_id": task_id,
|
||||
"action_id": action.get("action_id", ""),
|
||||
}
|
||||
action_counter += 1
|
||||
)
|
||||
action_index_map[f"action_index_{action_counter}"] = {
|
||||
"task_id": task_id,
|
||||
"action_id": action.get("action_id", ""),
|
||||
}
|
||||
action_counter += 1
|
||||
|
||||
if not input_actions:
|
||||
LOG.warning("No input_text actions found in workflow run")
|
||||
if not custom_field_actions:
|
||||
LOG.warning("No field_name_actions found in workflow run")
|
||||
return _generate_empty_schema(), {}
|
||||
|
||||
# Generate field names using LLM
|
||||
try:
|
||||
field_mapping = await _generate_field_names_with_llm(input_actions)
|
||||
field_mapping = await _generate_field_names_with_llm(custom_field_actions)
|
||||
|
||||
# Generate the Pydantic schema code
|
||||
schema_code = _generate_pydantic_schema(field_mapping.schema_fields)
|
||||
@@ -86,7 +98,7 @@ async def generate_workflow_parameters_schema(
|
||||
return _generate_empty_schema(), {}
|
||||
|
||||
|
||||
async def _generate_field_names_with_llm(input_actions: List[Dict[str, Any]]) -> GeneratedFieldMapping:
|
||||
async def _generate_field_names_with_llm(custom_field_actions: List[Dict[str, Any]]) -> GeneratedFieldMapping:
|
||||
"""
|
||||
Use LLM to generate field names from input actions.
|
||||
|
||||
@@ -96,7 +108,9 @@ async def _generate_field_names_with_llm(input_actions: List[Dict[str, Any]]) ->
|
||||
Returns:
|
||||
GeneratedFieldMapping with field mappings and schema definitions
|
||||
"""
|
||||
prompt = prompt_engine.load_prompt(template="generate-workflow-parameters", input_actions=input_actions)
|
||||
prompt = prompt_engine.load_prompt(
|
||||
template="generate-workflow-parameters", custom_field_actions=custom_field_actions
|
||||
)
|
||||
|
||||
response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="generate-workflow-parameters")
|
||||
|
||||
@@ -166,22 +180,22 @@ def hydrate_input_text_actions_with_field_names(
|
||||
for action in actions:
|
||||
action_copy = action.copy()
|
||||
|
||||
if action.get("action_type") == ActionType.INPUT_TEXT:
|
||||
if action.get("action_type") in CUSTOM_FIELD_ACTIONS:
|
||||
action_id = action.get("action_id", "")
|
||||
mapping_key = f"{task_id}:{action_id}"
|
||||
|
||||
if mapping_key in field_mappings:
|
||||
action_copy["field_name"] = field_mappings[mapping_key]
|
||||
else:
|
||||
# Fallback field name if mapping not found
|
||||
intention = action.get("intention", "")
|
||||
if intention:
|
||||
# Simple field name generation from intention
|
||||
field_name = intention.lower().replace(" ", "_").replace("?", "").replace("'", "")
|
||||
field_name = "".join(c for c in field_name if c.isalnum() or c == "_")
|
||||
action_copy["field_name"] = field_name or "unknown_field"
|
||||
else:
|
||||
action_copy["field_name"] = "unknown_field"
|
||||
# else:
|
||||
# # Fallback field name if mapping not found
|
||||
# intention = action.get("intention", "")
|
||||
# if intention:
|
||||
# # Simple field name generation from intention
|
||||
# field_name = intention.lower().replace(" ", "_").replace("?", "").replace("'", "")
|
||||
# field_name = "".join(c for c in field_name if c.isalnum() or c == "_")
|
||||
# action_copy["field_name"] = field_name or "unknown_field"
|
||||
# else:
|
||||
# action_copy["field_name"] = "unknown_field"
|
||||
|
||||
updated_actions.append(action_copy)
|
||||
|
||||
|
||||
@@ -23,11 +23,15 @@ from skyvern.forge.sdk.core import skyvern_context
|
||||
from skyvern.utils.prompt_engine import load_prompt_with_elements
|
||||
from skyvern.webeye.actions import handler_utils
|
||||
from skyvern.webeye.actions.action_types import ActionType
|
||||
from skyvern.webeye.actions.actions import Action, ActionStatus, ExtractAction, SelectOption
|
||||
from skyvern.webeye.actions.actions import Action, ActionStatus, ExtractAction, InputTextAction, SelectOption
|
||||
from skyvern.webeye.actions.handler import handle_input_text_action, handle_select_option_action
|
||||
from skyvern.webeye.actions.parse_actions import parse_actions
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage, scrape_website
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
SELECT_OPTION_GOAL = """- The intention to select an option: {intention}.
|
||||
- The overall goal that the user wants to achieve: {prompt}."""
|
||||
|
||||
|
||||
class Driver(StrEnum):
|
||||
@@ -52,6 +56,12 @@ class ActionCall:
|
||||
error: Exception | None = None # populated if failed
|
||||
|
||||
|
||||
async def _get_element_id_by_xpath(xpath: str, page: Page) -> str | None:
|
||||
locator = page.locator(f"xpath={xpath}")
|
||||
element_id = await locator.get_attribute("unique_id")
|
||||
return element_id
|
||||
|
||||
|
||||
class SkyvernPage:
|
||||
"""
|
||||
A minimal adapter around the chosen driver that:
|
||||
@@ -208,17 +218,20 @@ class SkyvernPage:
|
||||
# Create action record. TODO: store more action fields
|
||||
kwargs = kwargs or {}
|
||||
# we're using "value" instead of "text" for input text actions interface
|
||||
text = kwargs.get("value", "")
|
||||
option_value = kwargs.get("option")
|
||||
select_option = SelectOption(value=option_value) if option_value else None
|
||||
text = None
|
||||
select_option = None
|
||||
response: str | None = kwargs.get("response")
|
||||
file_url = kwargs.get("file_url")
|
||||
if not response:
|
||||
if action_type == ActionType.INPUT_TEXT:
|
||||
text = str(call_result)
|
||||
response = text
|
||||
elif action_type == ActionType.SELECT_OPTION:
|
||||
if select_option:
|
||||
response = select_option.value
|
||||
option_value = str(call_result) or ""
|
||||
select_option = SelectOption(value=option_value)
|
||||
response = option_value
|
||||
elif action_type == ActionType.UPLOAD_FILE:
|
||||
file_url = str(call_result)
|
||||
|
||||
action = Action(
|
||||
element_id="",
|
||||
@@ -234,6 +247,7 @@ class SkyvernPage:
|
||||
reasoning=f"Auto-generated action for {action_type.value}",
|
||||
text=text,
|
||||
option=select_option,
|
||||
file_url=file_url,
|
||||
response=response,
|
||||
created_by="script",
|
||||
)
|
||||
@@ -283,7 +297,8 @@ class SkyvernPage:
|
||||
if screenshot:
|
||||
# Create a minimal Step object for artifact creation
|
||||
step = await app.DATABASE.get_step(
|
||||
context.task_id, context.step_id, organization_id=context.organization_id
|
||||
context.step_id,
|
||||
organization_id=context.organization_id,
|
||||
)
|
||||
if not step:
|
||||
return
|
||||
@@ -415,17 +430,24 @@ class SkyvernPage:
|
||||
context = skyvern_context.current()
|
||||
value = value or ""
|
||||
transformed_value = value
|
||||
element_id: str | None = None
|
||||
organization_id = context.organization_id if context else None
|
||||
task_id = context.task_id if context else None
|
||||
step_id = context.step_id if context else None
|
||||
workflow_run_id = context.workflow_run_id if context else None
|
||||
task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
|
||||
step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
|
||||
if ai_infer and intention:
|
||||
try:
|
||||
prompt = context.prompt if context else None
|
||||
# Build the element tree of the current page for the prompt
|
||||
# clean up empty data values
|
||||
data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
|
||||
if (totp_identifier or totp_url) and context and context.organization_id and context.task_id:
|
||||
if (totp_identifier or totp_url) and context and organization_id and task_id:
|
||||
verification_code = await poll_verification_code(
|
||||
organization_id=context.organization_id,
|
||||
task_id=context.task_id,
|
||||
workflow_run_id=context.workflow_run_id,
|
||||
organization_id=organization_id,
|
||||
task_id=task_id,
|
||||
workflow_run_id=workflow_run_id,
|
||||
totp_identifier=totp_identifier,
|
||||
totp_verification_url=totp_url,
|
||||
)
|
||||
@@ -439,6 +461,10 @@ class SkyvernPage:
|
||||
else:
|
||||
data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
|
||||
|
||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
||||
self.scraped_page = refreshed_page
|
||||
# get the element_id by the xpath
|
||||
element_id = await _get_element_id_by_xpath(xpath, self.page)
|
||||
payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
|
||||
script_generation_input_text_prompt = prompt_engine.load_prompt(
|
||||
template="script-generation-input-text-generatiion",
|
||||
@@ -449,7 +475,7 @@ class SkyvernPage:
|
||||
json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
|
||||
prompt=script_generation_input_text_prompt,
|
||||
prompt_name="script-generation-input-text-generatiion",
|
||||
organization_id=context.organization_id if context else None,
|
||||
organization_id=organization_id,
|
||||
)
|
||||
value = json_response.get("answer", value)
|
||||
except Exception:
|
||||
@@ -458,39 +484,119 @@ class SkyvernPage:
|
||||
if context and context.workflow_run_id:
|
||||
transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, value)
|
||||
|
||||
locator = self.page.locator(f"xpath={xpath}")
|
||||
await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout)
|
||||
if element_id and organization_id and task and step:
|
||||
action = InputTextAction(
|
||||
element_id=element_id,
|
||||
text=value,
|
||||
status=ActionStatus.pending,
|
||||
organization_id=organization_id,
|
||||
workflow_run_id=workflow_run_id,
|
||||
task_id=task_id,
|
||||
step_id=context.step_id if context else None,
|
||||
reasoning=intention,
|
||||
intention=intention,
|
||||
response=value,
|
||||
)
|
||||
await handle_input_text_action(action, self.page, self.scraped_page, task, step)
|
||||
else:
|
||||
locator = self.page.locator(f"xpath={xpath}")
|
||||
await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout)
|
||||
return value
|
||||
|
||||
@action_wrap(ActionType.UPLOAD_FILE)
|
||||
async def upload_file(
|
||||
self, xpath: str, file_path: str, intention: str | None = None, data: str | dict[str, Any] | None = None
|
||||
) -> None:
|
||||
# if self.generate_response:
|
||||
# # TODO: regenerate file_path and xpath
|
||||
# pass
|
||||
file = await download_file(file_path)
|
||||
await self.page.set_input_files(xpath, file)
|
||||
self,
|
||||
xpath: str,
|
||||
files: str,
|
||||
ai_infer: bool = False,
|
||||
intention: str | None = None,
|
||||
data: str | dict[str, Any] | None = None,
|
||||
) -> str:
|
||||
if ai_infer and intention:
|
||||
try:
|
||||
context = skyvern_context.current()
|
||||
prompt = context.prompt if context else None
|
||||
data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
|
||||
payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
|
||||
script_generation_file_url_prompt = prompt_engine.load_prompt(
|
||||
template="script-generation-file-url-generation",
|
||||
intention=intention,
|
||||
data=payload_str,
|
||||
goal=prompt,
|
||||
)
|
||||
json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
|
||||
prompt=script_generation_file_url_prompt,
|
||||
prompt_name="script-generation-file-url-generation",
|
||||
organization_id=context.organization_id if context else None,
|
||||
)
|
||||
files = json_response.get("answer", files)
|
||||
except Exception:
|
||||
LOG.exception(f"Failed to adapt value for input text action on xpath={xpath}, file={files}")
|
||||
file_path = await download_file(files)
|
||||
locator = self.page.locator(f"xpath={xpath}")
|
||||
await locator.set_input_files(file_path)
|
||||
return files
|
||||
|
||||
@action_wrap(ActionType.SELECT_OPTION)
|
||||
async def select_option(
|
||||
self,
|
||||
xpath: str,
|
||||
option: str,
|
||||
value: str,
|
||||
ai_infer: bool = False,
|
||||
intention: str | None = None,
|
||||
data: str | dict[str, Any] | None = None,
|
||||
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
|
||||
) -> None:
|
||||
# if self.generate_response:
|
||||
# # TODO: regenerate option
|
||||
# pass
|
||||
locator = self.page.locator(f"xpath={xpath}")
|
||||
try:
|
||||
await locator.click(timeout=timeout)
|
||||
except Exception:
|
||||
print("Failed to click before select action")
|
||||
return
|
||||
await locator.select_option(option, timeout=timeout)
|
||||
) -> str:
|
||||
option_value = value or ""
|
||||
context = skyvern_context.current()
|
||||
if context and context.task_id and context.step_id and context.organization_id:
|
||||
task = await app.DATABASE.get_task(context.task_id, organization_id=context.organization_id)
|
||||
step = await app.DATABASE.get_step(context.step_id, organization_id=context.organization_id)
|
||||
if ai_infer and intention and task and step:
|
||||
try:
|
||||
prompt = context.prompt if context else None
|
||||
data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
|
||||
payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
|
||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
||||
self.scraped_page = refreshed_page
|
||||
element_tree = refreshed_page.build_element_tree()
|
||||
merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt)
|
||||
single_select_prompt = prompt_engine.load_prompt(
|
||||
template="single-select-action",
|
||||
navigation_payload_str=payload_str,
|
||||
navigation_goal=merged_goal,
|
||||
current_url=self.page.url,
|
||||
elements=element_tree,
|
||||
local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
|
||||
)
|
||||
json_response = await app.SELECT_AGENT_LLM_API_HANDLER(
|
||||
prompt=single_select_prompt,
|
||||
prompt_name="single-select-action",
|
||||
organization_id=context.organization_id if context else None,
|
||||
)
|
||||
actions = parse_actions(task, step.step_id, step.order, self.scraped_page, json_response["actions"])
|
||||
if actions:
|
||||
action = actions[0]
|
||||
if not action.option:
|
||||
raise ValueError("SelectOptionAction requires an 'option' field")
|
||||
option_value = action.option.value or action.option.label or ""
|
||||
await handle_select_option_action(
|
||||
action=action,
|
||||
page=self.page,
|
||||
scraped_page=self.scraped_page,
|
||||
task=task,
|
||||
step=step,
|
||||
)
|
||||
else:
|
||||
LOG.exception(
|
||||
f"Failed to parse actions for select option action on xpath={xpath}, value={value}"
|
||||
)
|
||||
except Exception:
|
||||
LOG.exception(f"Failed to adapt value for select option action on xpath={xpath}, value={value}")
|
||||
else:
|
||||
locator = self.page.locator(f"xpath={xpath}")
|
||||
await locator.select_option(option_value, timeout=timeout)
|
||||
return option_value
|
||||
|
||||
@action_wrap(ActionType.WAIT)
|
||||
async def wait(
|
||||
@@ -556,7 +662,8 @@ class SkyvernPage:
|
||||
step = None
|
||||
if context and context.organization_id and context.task_id and context.step_id:
|
||||
step = await app.DATABASE.get_step(
|
||||
task_id=context.task_id, step_id=context.step_id, organization_id=context.organization_id
|
||||
step_id=context.step_id,
|
||||
organization_id=context.organization_id,
|
||||
)
|
||||
|
||||
result = await app.EXTRACTION_LLM_API_HANDLER(
|
||||
|
||||
Reference in New Issue
Block a user