diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index d960c756..0e350719 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1487,7 +1487,11 @@ class ForgeAgent: reason=json_response.get("thought"), error_type=json_response.get("error") ) - action_type: str = json_response.get("action_type") or "" + inferred_actions: list[dict[str, Any]] = json_response.get("inferred_actions", []) + if not inferred_actions: + raise FailedToParseActionInstruction(reason=json_response.get("thought"), error_type="EMPTY_ACTION") + + action_type: str = inferred_actions[0].get("action_type") or "" action_type = ActionType[action_type.upper()] if action_type == ActionType.CLICK: diff --git a/skyvern/forge/prompts/skyvern/infer-action-type.j2 b/skyvern/forge/prompts/skyvern/infer-action-type.j2 index cc343ac1..f018899b 100644 --- a/skyvern/forge/prompts/skyvern/infer-action-type.j2 +++ b/skyvern/forge/prompts/skyvern/infer-action-type.j2 @@ -4,10 +4,14 @@ MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing comma Reply in the following JSON format: { - "thought": str, // A string to describe how to infer the action type from the action instruction. - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION". "CLICK" means user wants to click. "INPUT_TEXT" means user wants to input text. "UPLOAD_FILE" means user wants to upload a file. "SELECT_OPTION" means user wants to select an option. - "error": str, // It's a string enum to describe error. Null if you can identify the action as one of the defined action type. Use "UNKNOWN_ACTION" if none of the defined action type matched. Use "MULTIPLE_ACTIONS" if the instruction includes multiple actions. + "thought": str, // A string to describe how to infer the action types from the action instruction. 
+ "inferred_actions": array // An array of action types. The result should be sorted in descending order by confidence_float. If there are multiple actions, each action should have a unique confidence_float value. + [{ + "reasoning": str, // A string to explain the reason for inferring the action from the instruction. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. + "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION". "CLICK" means user wants to click. "INPUT_TEXT" means user wants to input text. "UPLOAD_FILE" means user wants to upload a file. "SELECT_OPTION" means user wants to select an option. + }], + "error": str, // It's a string enum to describe error. Null if you can identify any defined action from the instruction. Use "UNKNOWN_ACTION" if none of the defined action types matched. } Action instruction