add ClickContext to support click action ai="fallback" in generated code (#3892)

This commit is contained in:
Shuchang Zheng
2025-11-04 15:31:44 +08:00
committed by GitHub
parent 599fbc8276
commit f43d04ae39
4 changed files with 30 additions and 6 deletions

View File

@@ -27,7 +27,8 @@ from skyvern.schemas.workflows import FileStorageType
from skyvern.webeye.actions.action_types import ActionType
# Module-level structured logger.
LOG = structlog.get_logger(__name__)
# NOTE(review): this flattened diff view shows both this name and the two
# constants below; it looks like the pre-change name they replace — confirm.
GENERATE_CODE_AI_MODE = "proactive"
# Value emitted for the `ai=` keyword argument of generated action calls.
GENERATE_CODE_AI_MODE_PROACTIVE = "proactive"
# Used instead of "proactive" for click actions whose click_context has a
# truthy "single_option_click".
GENERATE_CODE_AI_MODE_FALLBACK = "fallback"
# --------------------------------------------------------------------- #
@@ -250,10 +251,14 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
)
if method == "click":
ai_mode = GENERATE_CODE_AI_MODE_PROACTIVE
click_context = act.get("click_context")
if click_context and isinstance(click_context, dict) and click_context.get("single_option_click"):
ai_mode = GENERATE_CODE_AI_MODE_FALLBACK
args.append(
cst.Arg(
keyword=cst.Name("ai"),
value=_value(GENERATE_CODE_AI_MODE),
value=_value(ai_mode),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
@@ -286,7 +291,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
args.append(
cst.Arg(
keyword=cst.Name("ai"),
value=_value(GENERATE_CODE_AI_MODE),
value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
@@ -343,7 +348,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
args.append(
cst.Arg(
keyword=cst.Name("ai"),
value=_value(GENERATE_CODE_AI_MODE),
value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),
@@ -374,7 +379,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
args.append(
cst.Arg(
keyword=cst.Name("ai"),
value=_value(GENERATE_CODE_AI_MODE),
value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
whitespace_after_arg=cst.ParenthesizedWhitespace(
indent=True,
last_line=cst.SimpleWhitespace(INDENT),

View File

@@ -27,6 +27,10 @@ Reply in JSON format with the following keys:
"label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE
"index": int, // the index corresponding to the option index under the select element.
"value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE
},
"click_context": { // The context for CLICK action only. null if not CLICK action
"thought": str, // Describe how you decided that this action is a single choice option or multi-choice option.
"single_option_click": bool, // True if the click is the only choice to proceed towards the goal, regardless of different user context or input. False if there are multiple valid options that depend on user input. Examples: clicking a login button to login is True (it's the only way to login); clicking a radio button for a multi-choice question (e.g., selecting "male", "female", or "other" for gender) is False (the choice depends on user input). When clicking on radio buttons, dropdown options, or any element that represents one of multiple possible selections, this should be False.
}{% if parse_select_feature_enabled %},
"context": { // The context for INPUT_TEXT or SELECT_OPTION action only. null if not INPUT_TEXT or SELECT_OPTION action. Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements.
"thought": str, // A string to describe how you double-check the context information to ensure the accuracy.

View File

@@ -50,6 +50,11 @@ class InputOrSelectContext(BaseModel):
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
# Optional context attached to a CLICK action. Both fields default to None,
# so the model validates even when a key is missing from the input.
class ClickContext(BaseModel):
    # Free-text explanation of how the click was judged to be a single-choice
    # or a multi-choice selection.
    thought: str | None = None
    # True when the click is the only way to proceed toward the goal regardless
    # of user input; False when it is one of several input-dependent choices
    # (e.g. a radio button or dropdown option). None when not provided.
    single_option_click: bool | None = None
class Action(BaseModel):
model_config = ConfigDict(from_attributes=True)
@@ -88,6 +93,7 @@ class Action(BaseModel):
option: SelectOption | None = None
is_checked: bool | None = None
verified: bool = False
click_context: ClickContext | None = None
# TOTP timing information for multi-field TOTP sequences
totp_timing_info: dict[str, Any] | None = None

View File

@@ -21,6 +21,7 @@ from skyvern.webeye.actions.actions import (
Action,
CheckboxAction,
ClickAction,
ClickContext,
ClosePageAction,
CompleteAction,
DownloadFileAction,
@@ -97,7 +98,15 @@ def parse_action(
if action_type == ActionType.CLICK:
file_url = action["file_url"] if "file_url" in action else None
return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False))
click_context = action.get("click_context", None)
if click_context:
click_context = ClickContext.model_validate(click_context)
return ClickAction(
**base_action_dict,
file_url=file_url,
download=action.get("download", False),
click_context=click_context,
)
if action_type == ActionType.INPUT_TEXT:
context_dict = action.get("context", {})