add ClickContext to support click action ai="fallback" in generated code (#3892)

2025-11-04 15:31:44 +08:00
parent 599fbc8276
commit f43d04ae39
4 changed files with 30 additions and 6 deletions
--- a/skyvern/core/script_generations/generate_script.py
+++ b/skyvern/core/script_generations/generate_script.py
@@ -27,7 +27,8 @@ from skyvern.schemas.workflows import FileStorageType
 from skyvern.webeye.actions.action_types import ActionType

 LOG = structlog.get_logger(__name__)
-GENERATE_CODE_AI_MODE = "proactive"
+GENERATE_CODE_AI_MODE_PROACTIVE = "proactive"
+GENERATE_CODE_AI_MODE_FALLBACK = "fallback"


 # --------------------------------------------------------------------- #
@@ -250,10 +251,14 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
        )

    if method == "click":
+        ai_mode = GENERATE_CODE_AI_MODE_PROACTIVE
+        click_context = act.get("click_context")
+        if click_context and isinstance(click_context, dict) and click_context.get("single_option_click"):
+            ai_mode = GENERATE_CODE_AI_MODE_FALLBACK
        args.append(
            cst.Arg(
                keyword=cst.Name("ai"),
-                value=_value(GENERATE_CODE_AI_MODE),
+                value=_value(ai_mode),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
@@ -286,7 +291,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
        args.append(
            cst.Arg(
                keyword=cst.Name("ai"),
-                value=_value(GENERATE_CODE_AI_MODE),
+                value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
@@ -343,7 +348,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
            args.append(
                cst.Arg(
                    keyword=cst.Name("ai"),
-                    value=_value(GENERATE_CODE_AI_MODE),
+                    value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
                    whitespace_after_arg=cst.ParenthesizedWhitespace(
                        indent=True,
                        last_line=cst.SimpleWhitespace(INDENT),
@@ -374,7 +379,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
        args.append(
            cst.Arg(
                keyword=cst.Name("ai"),
-                value=_value(GENERATE_CODE_AI_MODE),
+                value=_value(GENERATE_CODE_AI_MODE_PROACTIVE),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
--- a/skyvern/forge/prompts/skyvern/extract-action.j2
+++ b/skyvern/forge/prompts/skyvern/extract-action.j2
@@ -27,6 +27,10 @@ Reply in JSON format with the following keys:
            "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE
            "index": int, // the index corresponding to the option index under the select element.
            "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE
+        },
+        "click_context": {  // The context for CLICK action only. null if not CLICK action
+            "thought": str, // Describe how you decided that this action is a single choice option or multi-choice option.
+            "single_option_click": bool, // True if the click is the only choice to proceed towards the goal, regardless of different user context or input. False if there are multiple valid options that depend on user input. Examples: clicking a login button to login is True (it's the only way to login); clicking a radio button for a multi-choice question (e.g., selecting "male", "female", or "other" for gender) is False (the choice depends on user input). When clicking on radio buttons, dropdown options, or any element that represents one of multiple possible selections, this should be False.
        }{% if parse_select_feature_enabled %},
        "context": { // The context for INPUT_TEXT or SELECT_OPTION action only. null if not INPUT_TEXT or SELECT_OPTION action. Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements.
            "thought": str, // A string to describe how you double-check the context information to ensure the accuracy.
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -50,6 +50,11 @@ class InputOrSelectContext(BaseModel):
        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"


+class ClickContext(BaseModel):  # LLM-reported context attached to CLICK actions (the "click_context" key in the extract-action reply)
+    thought: str | None = None  # model's explanation of how it judged the click to be single-choice vs. multi-choice
+    single_option_click: bool | None = None  # when truthy, script generation emits ai="fallback" for the click instead of ai="proactive"
+
+
 class Action(BaseModel):
    model_config = ConfigDict(from_attributes=True)

@@ -88,6 +93,7 @@ class Action(BaseModel):
    option: SelectOption | None = None
    is_checked: bool | None = None
    verified: bool = False
+    click_context: ClickContext | None = None

    # TOTP timing information for multi-field TOTP sequences
    totp_timing_info: dict[str, Any] | None = None
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -21,6 +21,7 @@ from skyvern.webeye.actions.actions import (
    Action,
    CheckboxAction,
    ClickAction,
+    ClickContext,
    ClosePageAction,
    CompleteAction,
    DownloadFileAction,
@@ -97,7 +98,15 @@ def parse_action(

    if action_type == ActionType.CLICK:
        file_url = action["file_url"] if "file_url" in action else None
-        return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False))
+        click_context = action.get("click_context", None)
+        if click_context:
+            click_context = ClickContext.model_validate(click_context)
+        return ClickAction(
+            **base_action_dict,
+            file_url=file_url,
+            download=action.get("download", False),
+            click_context=click_context,
+        )

    if action_type == ActionType.INPUT_TEXT:
        context_dict = action.get("context", {})