From f43d04ae39060ecb3f418dc4238b7fbbf1b4f5b2 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Tue, 4 Nov 2025 15:31:44 +0800 Subject: [PATCH] add ClickContext to support click action ai="fallback" in generated code (#3892) --- .../core/script_generations/generate_script.py | 15 ++++++++++----- skyvern/forge/prompts/skyvern/extract-action.j2 | 4 ++++ skyvern/webeye/actions/actions.py | 6 ++++++ skyvern/webeye/actions/parse_actions.py | 11 ++++++++++- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/skyvern/core/script_generations/generate_script.py b/skyvern/core/script_generations/generate_script.py index 1b698950..4f47c9e4 100644 --- a/skyvern/core/script_generations/generate_script.py +++ b/skyvern/core/script_generations/generate_script.py @@ -27,7 +27,8 @@ from skyvern.schemas.workflows import FileStorageType from skyvern.webeye.actions.action_types import ActionType LOG = structlog.get_logger(__name__) -GENERATE_CODE_AI_MODE = "proactive" +GENERATE_CODE_AI_MODE_PROACTIVE = "proactive" +GENERATE_CODE_AI_MODE_FALLBACK = "fallback" # --------------------------------------------------------------------- # @@ -250,10 +251,14 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output: ) if method == "click": + ai_mode = GENERATE_CODE_AI_MODE_PROACTIVE + click_context = act.get("click_context") + if click_context and isinstance(click_context, dict) and click_context.get("single_option_click"): + ai_mode = GENERATE_CODE_AI_MODE_FALLBACK args.append( cst.Arg( keyword=cst.Name("ai"), - value=_value(GENERATE_CODE_AI_MODE), + value=_value(ai_mode), whitespace_after_arg=cst.ParenthesizedWhitespace( indent=True, last_line=cst.SimpleWhitespace(INDENT), @@ -286,7 +291,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output: args.append( cst.Arg( keyword=cst.Name("ai"), - value=_value(GENERATE_CODE_AI_MODE), + value=_value(GENERATE_CODE_AI_MODE_PROACTIVE), whitespace_after_arg=cst.ParenthesizedWhitespace( indent=True, last_line=cst.SimpleWhitespace(INDENT), @@ -343,7 +348,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output: args.append( cst.Arg( keyword=cst.Name("ai"), - value=_value(GENERATE_CODE_AI_MODE), + value=_value(GENERATE_CODE_AI_MODE_PROACTIVE), whitespace_after_arg=cst.ParenthesizedWhitespace( indent=True, last_line=cst.SimpleWhitespace(INDENT), @@ -374,7 +379,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output: args.append( cst.Arg( keyword=cst.Name("ai"), - value=_value(GENERATE_CODE_AI_MODE), + value=_value(GENERATE_CODE_AI_MODE_PROACTIVE), whitespace_after_arg=cst.ParenthesizedWhitespace( indent=True, last_line=cst.SimpleWhitespace(INDENT), diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 9d3e9898..0bb80491 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -27,6 +27,10 @@ Reply in JSON format with the following keys: "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE "index": int, // the index corresponding to the option index under the select element. "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE + }, + "click_context": { // The context for CLICK action only. null if not CLICK action + "thought": str, // Describe how you decided that this action is a single choice option or multi-choice option. + "single_option_click": bool, // True if the click is the only choice to proceed towards the goal, regardless of different user context or input. False if there are multiple valid options that depend on user input. Examples: clicking a login button to login is True (it's the only way to login); clicking a radio button for a multi-choice question (e.g., selecting "male", "female", or "other" for gender) is False (the choice depends on user input). When clicking on radio buttons, dropdown options, or any element that represents one of multiple possible selections, this should be False. }{% if parse_select_feature_enabled %}, "context": { // The context for INPUT_TEXT or SELECT_OPTION action only. null if not INPUT_TEXT or SELECT_OPTION action. Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements. "thought": str, // A string to describe how you double-check the context information to ensure the accuracy. diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index a6229c66..ee374af0 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -50,6 +50,11 @@ class InputOrSelectContext(BaseModel): return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})" +class ClickContext(BaseModel): + thought: str | None = None + single_option_click: bool | None = None + + class Action(BaseModel): model_config = ConfigDict(from_attributes=True) @@ -88,6 +93,7 @@ class Action(BaseModel): option: SelectOption | None = None is_checked: bool | None = None verified: bool = False + click_context: ClickContext | None = None # TOTP timing information for multi-field TOTP sequences totp_timing_info: dict[str, Any] | None = None diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index ac732396..70b23a7e 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -21,6 +21,7 @@ from skyvern.webeye.actions.actions import ( Action, CheckboxAction, ClickAction, + ClickContext, ClosePageAction, CompleteAction, DownloadFileAction, @@ -97,7 +98,15 @@ def parse_action( if action_type == ActionType.CLICK: file_url = action["file_url"] if "file_url" in action else None - return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False)) + click_context = action.get("click_context", None) + if click_context: + click_context = ClickContext.model_validate(click_context) + return ClickAction( + **base_action_dict, + file_url=file_url, + download=action.get("download", False), + click_context=click_context, + ) if action_type == ActionType.INPUT_TEXT: context_dict = action.get("context", {})