Smarter select_option & input_text actions (#3440)

This commit is contained in:
Shuchang Zheng
2025-09-15 13:16:34 -07:00
committed by GitHub
parent 6f212ff327
commit 6ee329866b
10 changed files with 300 additions and 105 deletions

View File

@@ -353,15 +353,69 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
)
)
elif method == "select_option":
option = act.get("option", {})
value = option.get("value")
if value:
if act.get("field_name"):
option_value = cst.Subscript(
value=cst.Attribute(
value=cst.Name("context"),
attr=cst.Name("parameters"),
),
slice=[cst.SubscriptElement(slice=cst.Index(value=_value(act["field_name"])))],
)
else:
option_value = _value(value)
args.append(
cst.Arg(
keyword=cst.Name("value"),
value=option_value,
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
),
),
)
args.append(
cst.Arg(
keyword=cst.Name("ai_infer"),
value=cst.Name("True"),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
),
)
)
elif method == "upload_file":
if act.get("field_name"):
file_url_value = cst.Subscript(
value=cst.Attribute(
value=cst.Name("context"),
attr=cst.Name("parameters"),
),
slice=[cst.SubscriptElement(slice=cst.Index(value=_value(act["field_name"])))],
)
else:
file_url_value = _value(act["file_url"])
args.append(
cst.Arg(
keyword=cst.Name("option"),
value=_value(act["option"]["value"]),
keyword=cst.Name("files"),
value=file_url_value,
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
),
),
)
)
args.append(
cst.Arg(
keyword=cst.Name("ai_infer"),
value=cst.Name("True"),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
),
)
)
elif method == "wait":
args.append(

View File

@@ -15,6 +15,7 @@ LOG = structlog.get_logger(__name__)
# Initialize prompt engine
prompt_engine = PromptEngine("skyvern")
# Action types that this module collects for LLM field-name generation and
# later tags with a "field_name" when hydrating actions.
CUSTOM_FIELD_ACTIONS = [ActionType.INPUT_TEXT, ActionType.UPLOAD_FILE, ActionType.SELECT_OPTION]
class GeneratedFieldMapping(BaseModel):
@@ -39,34 +40,45 @@ async def generate_workflow_parameters_schema(
- field_mappings: Dictionary mapping action indices to field names for hydration
"""
# Extract all input_text actions
input_actions = []
custom_field_actions = []
action_index_map = {}
action_counter = 1
for task_id, actions in actions_by_task.items():
for action in actions:
if action.get("action_type") == ActionType.INPUT_TEXT:
input_actions.append(
{
"text": action.get("text", ""),
"intention": action.get("intention", ""),
"task_id": task_id,
"action_id": action.get("action_id", ""),
}
)
action_index_map[f"action_index_{action_counter}"] = {
action_type = action.get("action_type", "")
if action_type not in CUSTOM_FIELD_ACTIONS:
continue
value = ""
if action_type == ActionType.INPUT_TEXT:
value = action.get("text", "")
elif action_type == ActionType.UPLOAD_FILE:
value = action.get("file_url", "")
elif action_type == ActionType.SELECT_OPTION:
value = action.get("option", "")
custom_field_actions.append(
{
"action_type": action_type,
"value": value,
"intention": action.get("intention", ""),
"task_id": task_id,
"action_id": action.get("action_id", ""),
}
action_counter += 1
)
action_index_map[f"action_index_{action_counter}"] = {
"task_id": task_id,
"action_id": action.get("action_id", ""),
}
action_counter += 1
if not input_actions:
LOG.warning("No input_text actions found in workflow run")
if not custom_field_actions:
LOG.warning("No field_name_actions found in workflow run")
return _generate_empty_schema(), {}
# Generate field names using LLM
try:
field_mapping = await _generate_field_names_with_llm(input_actions)
field_mapping = await _generate_field_names_with_llm(custom_field_actions)
# Generate the Pydantic schema code
schema_code = _generate_pydantic_schema(field_mapping.schema_fields)
@@ -86,7 +98,7 @@ async def generate_workflow_parameters_schema(
return _generate_empty_schema(), {}
async def _generate_field_names_with_llm(input_actions: List[Dict[str, Any]]) -> GeneratedFieldMapping:
async def _generate_field_names_with_llm(custom_field_actions: List[Dict[str, Any]]) -> GeneratedFieldMapping:
"""
Use LLM to generate field names from input actions.
@@ -96,7 +108,9 @@ async def _generate_field_names_with_llm(input_actions: List[Dict[str, Any]]) ->
Returns:
GeneratedFieldMapping with field mappings and schema definitions
"""
prompt = prompt_engine.load_prompt(template="generate-workflow-parameters", input_actions=input_actions)
prompt = prompt_engine.load_prompt(
template="generate-workflow-parameters", custom_field_actions=custom_field_actions
)
response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="generate-workflow-parameters")
@@ -166,22 +180,22 @@ def hydrate_input_text_actions_with_field_names(
for action in actions:
action_copy = action.copy()
if action.get("action_type") == ActionType.INPUT_TEXT:
if action.get("action_type") in CUSTOM_FIELD_ACTIONS:
action_id = action.get("action_id", "")
mapping_key = f"{task_id}:{action_id}"
if mapping_key in field_mappings:
action_copy["field_name"] = field_mappings[mapping_key]
else:
# Fallback field name if mapping not found
intention = action.get("intention", "")
if intention:
# Simple field name generation from intention
field_name = intention.lower().replace(" ", "_").replace("?", "").replace("'", "")
field_name = "".join(c for c in field_name if c.isalnum() or c == "_")
action_copy["field_name"] = field_name or "unknown_field"
else:
action_copy["field_name"] = "unknown_field"
# else:
# # Fallback field name if mapping not found
# intention = action.get("intention", "")
# if intention:
# # Simple field name generation from intention
# field_name = intention.lower().replace(" ", "_").replace("?", "").replace("'", "")
# field_name = "".join(c for c in field_name if c.isalnum() or c == "_")
# action_copy["field_name"] = field_name or "unknown_field"
# else:
# action_copy["field_name"] = "unknown_field"
updated_actions.append(action_copy)

View File

@@ -23,11 +23,15 @@ from skyvern.forge.sdk.core import skyvern_context
from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions import handler_utils
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import Action, ActionStatus, ExtractAction, SelectOption
from skyvern.webeye.actions.actions import Action, ActionStatus, ExtractAction, InputTextAction, SelectOption
from skyvern.webeye.actions.handler import handle_input_text_action, handle_select_option_action
from skyvern.webeye.actions.parse_actions import parse_actions
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ScrapedPage, scrape_website
LOG = structlog.get_logger()
# Template for the navigation goal handed to the "single-select-action" prompt.
# Formatted with the action's `intention` and the overall user `prompt`.
SELECT_OPTION_GOAL = """- The intention to select an option: {intention}.
- The overall goal that the user wants to achieve: {prompt}."""
class Driver(StrEnum):
@@ -52,6 +56,12 @@ class ActionCall:
error: Exception | None = None # populated if failed
async def _get_element_id_by_xpath(xpath: str, page: Page) -> str | None:
    """Look up the ``unique_id`` attribute of the element matched by ``xpath``.

    Args:
        xpath: XPath expression (without the ``xpath=`` selector prefix).
        page: Page on which the locator is resolved.

    Returns:
        The value of the element's ``unique_id`` attribute, or ``None`` when
        the attribute lookup yields nothing.
    """
    return await page.locator(f"xpath={xpath}").get_attribute("unique_id")
class SkyvernPage:
"""
A minimal adapter around the chosen driver that:
@@ -208,17 +218,20 @@ class SkyvernPage:
# Create action record. TODO: store more action fields
kwargs = kwargs or {}
# we're using "value" instead of "text" for input text actions interface
text = kwargs.get("value", "")
option_value = kwargs.get("option")
select_option = SelectOption(value=option_value) if option_value else None
text = None
select_option = None
response: str | None = kwargs.get("response")
file_url = kwargs.get("file_url")
if not response:
if action_type == ActionType.INPUT_TEXT:
text = str(call_result)
response = text
elif action_type == ActionType.SELECT_OPTION:
if select_option:
response = select_option.value
option_value = str(call_result) or ""
select_option = SelectOption(value=option_value)
response = option_value
elif action_type == ActionType.UPLOAD_FILE:
file_url = str(call_result)
action = Action(
element_id="",
@@ -234,6 +247,7 @@ class SkyvernPage:
reasoning=f"Auto-generated action for {action_type.value}",
text=text,
option=select_option,
file_url=file_url,
response=response,
created_by="script",
)
@@ -283,7 +297,8 @@ class SkyvernPage:
if screenshot:
# Create a minimal Step object for artifact creation
step = await app.DATABASE.get_step(
context.task_id, context.step_id, organization_id=context.organization_id
context.step_id,
organization_id=context.organization_id,
)
if not step:
return
@@ -415,17 +430,24 @@ class SkyvernPage:
context = skyvern_context.current()
value = value or ""
transformed_value = value
element_id: str | None = None
organization_id = context.organization_id if context else None
task_id = context.task_id if context else None
step_id = context.step_id if context else None
workflow_run_id = context.workflow_run_id if context else None
task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
if ai_infer and intention:
try:
prompt = context.prompt if context else None
# Build the element tree of the current page for the prompt
# clean up empty data values
data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
if (totp_identifier or totp_url) and context and context.organization_id and context.task_id:
if (totp_identifier or totp_url) and context and organization_id and task_id:
verification_code = await poll_verification_code(
organization_id=context.organization_id,
task_id=context.task_id,
workflow_run_id=context.workflow_run_id,
organization_id=organization_id,
task_id=task_id,
workflow_run_id=workflow_run_id,
totp_identifier=totp_identifier,
totp_verification_url=totp_url,
)
@@ -439,6 +461,10 @@ class SkyvernPage:
else:
data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
self.scraped_page = refreshed_page
# get the element_id by the xpath
element_id = await _get_element_id_by_xpath(xpath, self.page)
payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
script_generation_input_text_prompt = prompt_engine.load_prompt(
template="script-generation-input-text-generatiion",
@@ -449,7 +475,7 @@ class SkyvernPage:
json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
prompt=script_generation_input_text_prompt,
prompt_name="script-generation-input-text-generatiion",
organization_id=context.organization_id if context else None,
organization_id=organization_id,
)
value = json_response.get("answer", value)
except Exception:
@@ -458,39 +484,119 @@ class SkyvernPage:
if context and context.workflow_run_id:
transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, value)
locator = self.page.locator(f"xpath={xpath}")
await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout)
if element_id and organization_id and task and step:
action = InputTextAction(
element_id=element_id,
text=value,
status=ActionStatus.pending,
organization_id=organization_id,
workflow_run_id=workflow_run_id,
task_id=task_id,
step_id=context.step_id if context else None,
reasoning=intention,
intention=intention,
response=value,
)
await handle_input_text_action(action, self.page, self.scraped_page, task, step)
else:
locator = self.page.locator(f"xpath={xpath}")
await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout)
return value
@action_wrap(ActionType.UPLOAD_FILE)
async def upload_file(
    self,
    xpath: str,
    files: str,
    ai_infer: bool = False,
    intention: str | None = None,
    data: str | dict[str, Any] | None = None,
) -> str:
    """Upload a file into the file input located by ``xpath``.

    When ``ai_infer`` is true and an ``intention`` is given, the
    ``script-generation-file-url-generation`` prompt is sent to the
    single-input LLM handler and its ``answer`` replaces ``files``. Inference
    is best-effort: any failure is logged and the caller-supplied ``files``
    is used unchanged.

    Args:
        xpath: XPath of the file input element.
        files: File reference passed to ``download_file``.
        ai_infer: Whether to ask the LLM for the file reference.
        intention: Description of what the upload is for; required for inference.
        data: Extra payload for the prompt; empty dict entries are dropped.

    Returns:
        The file reference that was actually downloaded and uploaded.
    """
    if ai_infer and intention:
        try:
            context = skyvern_context.current()
            prompt = context.prompt if context else None
            # Strip empty values so the prompt only carries meaningful payload.
            data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
            payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
            script_generation_file_url_prompt = prompt_engine.load_prompt(
                template="script-generation-file-url-generation",
                intention=intention,
                data=payload_str,
                goal=prompt,
            )
            json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
                prompt=script_generation_file_url_prompt,
                prompt_name="script-generation-file-url-generation",
                organization_id=context.organization_id if context else None,
            )
            # Keep the caller's value when the model returns no usable answer;
            # an empty/None answer must not reach download_file.
            files = json_response.get("answer") or files
        except Exception:
            LOG.exception(f"Failed to adapt value for upload file action on xpath={xpath}, file={files}")

    file_path = await download_file(files)
    locator = self.page.locator(f"xpath={xpath}")
    await locator.set_input_files(file_path)
    return files
@action_wrap(ActionType.SELECT_OPTION)
async def select_option(
    self,
    xpath: str,
    value: str,
    ai_infer: bool = False,
    intention: str | None = None,
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str:
    """Select an option on the element located by ``xpath``.

    With ``ai_infer`` and an ``intention``, and when the current task and step
    can be loaded from the run context, the page is re-scraped and the
    ``single-select-action`` prompt is used to let the LLM pick the option,
    which is then executed through ``handle_select_option_action``. In every
    other case ``value`` is selected directly through the locator.

    NOTE(review): SOURCE lost its indentation, so the original nesting of the
    direct-selection branch is ambiguous; this version always falls back to
    direct selection when the AI path is not taken — confirm against callers.

    Args:
        xpath: XPath of the select element.
        value: Option value to select when inference is not used.
        ai_infer: Whether to let the LLM choose the option.
        intention: Description of what should be selected; required for inference.
        data: Extra payload for the prompt; empty dict entries are dropped.
        timeout: Timeout passed to the direct locator selection.

    Returns:
        The option value (or label) that was chosen; ``value`` (or ``""``)
        when inference was not used or did not yield an action.
    """
    option_value = value or ""
    context = skyvern_context.current()

    # Bind task/step up front so the condition below can never hit an
    # unbound local when the run context is missing or incomplete.
    task = None
    step = None
    if context and context.task_id and context.step_id and context.organization_id:
        task = await app.DATABASE.get_task(context.task_id, organization_id=context.organization_id)
        step = await app.DATABASE.get_step(context.step_id, organization_id=context.organization_id)

    if not (ai_infer and intention and context and task and step):
        locator = self.page.locator(f"xpath={xpath}")
        await locator.select_option(option_value, timeout=timeout)
        return option_value

    try:
        prompt = context.prompt
        # Strip empty values so the prompt only carries meaningful payload.
        data = {k: v for k, v in data.items() if v} if isinstance(data, dict) else (data or "")
        payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")

        # Re-scrape so the element tree reflects the page as it is now.
        refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
        self.scraped_page = refreshed_page
        element_tree = refreshed_page.build_element_tree()

        merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt)
        single_select_prompt = prompt_engine.load_prompt(
            template="single-select-action",
            navigation_payload_str=payload_str,
            navigation_goal=merged_goal,
            current_url=self.page.url,
            elements=element_tree,
            local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
        )
        json_response = await app.SELECT_AGENT_LLM_API_HANDLER(
            prompt=single_select_prompt,
            prompt_name="single-select-action",
            organization_id=context.organization_id,
        )
        actions = parse_actions(task, step.step_id, step.order, self.scraped_page, json_response["actions"])
        if not actions:
            # No exception is active here, so log an error rather than
            # LOG.exception (which would attach an empty traceback).
            LOG.error(f"Failed to parse actions for select option action on xpath={xpath}, value={value}")
            return option_value

        action = actions[0]
        if not action.option:
            raise ValueError("SelectOptionAction requires an 'option' field")
        option_value = action.option.value or action.option.label or ""
        await handle_select_option_action(
            action=action,
            page=self.page,
            scraped_page=self.scraped_page,
            task=task,
            step=step,
        )
    except Exception:
        # Best-effort: inference failures are logged, not raised to the script.
        LOG.exception(f"Failed to adapt value for select option action on xpath={xpath}, value={value}")
    return option_value
@action_wrap(ActionType.WAIT)
async def wait(
@@ -556,7 +662,8 @@ class SkyvernPage:
step = None
if context and context.organization_id and context.task_id and context.step_id:
step = await app.DATABASE.get_step(
task_id=context.task_id, step_id=context.step_id, organization_id=context.organization_id
step_id=context.step_id,
organization_id=context.organization_id,
)
result = await app.EXTRACTION_LLM_API_HANDLER(