diff --git a/skyvern/forge/prompts/skyvern/infer-action-type.j2 b/skyvern/forge/prompts/skyvern/infer-action-type.j2 new file mode 100644 index 00000000..cc343ac1 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/infer-action-type.j2 @@ -0,0 +1,16 @@ +You are a browser agent performing actions on the web. You are instructed to take a single action. Help identify which action type should be taken according to the action instruction. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. + +Reply in the following JSON format: +{ + "thought": str, // A string to describe how to infer the action type from the action instruction. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. + "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION". "CLICK" means the user wants to click. "INPUT_TEXT" means the user wants to input text. "UPLOAD_FILE" means the user wants to upload a file. "SELECT_OPTION" means the user wants to select an option. + "error": str, // It's a string enum describing the error. Null if you can identify the action as one of the defined action types. Use "UNKNOWN_ACTION" if none of the defined action types match. Use "MULTIPLE_ACTIONS" if the instruction includes multiple actions. +} + +Action instruction +``` +{{ navigation_goal }} +``` \ No newline at end of file diff --git a/skyvern/forge/sdk/workflow/exceptions.py b/skyvern/forge/sdk/workflow/exceptions.py index 96e09813..a49d6fd1 100644 --- a/skyvern/forge/sdk/workflow/exceptions.py +++ b/skyvern/forge/sdk/workflow/exceptions.py @@ -107,3 +107,10 @@ class WorkflowParameterMissingRequiredValue(BaseWorkflowHTTPException): f"Missing required value for workflow parameter. Workflow parameter type: {workflow_parameter_type}. workflow_parameter_key: {workflow_parameter_key}. 
Required value: {required_value}", status_code=status.HTTP_400_BAD_REQUEST, ) + + +class FailedToParseActionInstruction(SkyvernException): + def __init__(self, reason: str | None, error_type: str | None): + super().__init__( + f"Failed to parse the action instruction as '{reason}({error_type})'", + ) diff --git a/skyvern/forge/sdk/workflow/models/yaml.py b/skyvern/forge/sdk/workflow/models/yaml.py index 5dc4dcd0..d22c93b2 100644 --- a/skyvern/forge/sdk/workflow/models/yaml.py +++ b/skyvern/forge/sdk/workflow/models/yaml.py @@ -6,7 +6,6 @@ from pydantic import BaseModel, Field from skyvern.forge.sdk.schemas.tasks import ProxyLocation from skyvern.forge.sdk.workflow.models.block import BlockType, FileType from skyvern.forge.sdk.workflow.models.parameter import ParameterType, WorkflowParameterType -from skyvern.webeye.actions.actions import ActionType class ParameterYAML(BaseModel, abc.ABC): @@ -219,7 +218,6 @@ class ValidationBlockYAML(BlockYAML): class ActionBlockYAML(BlockYAML): - action_type: ActionType block_type: Literal[BlockType.ACTION] = BlockType.ACTION # type: ignore url: str | None = None diff --git a/skyvern/forge/sdk/workflow/service.py b/skyvern/forge/sdk/workflow/service.py index 51fa82e2..48506a25 100644 --- a/skyvern/forge/sdk/workflow/service.py +++ b/skyvern/forge/sdk/workflow/service.py @@ -14,6 +14,7 @@ from skyvern.exceptions import ( WorkflowRunNotFound, ) from skyvern.forge import app +from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.artifact.models import ArtifactType from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.core.security import generate_skyvern_signature @@ -23,6 +24,7 @@ from skyvern.forge.sdk.models import Organization, Step from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task from skyvern.forge.sdk.workflow.exceptions import ( ContextParameterSourceNotDefined, + FailedToParseActionInstruction, InvalidWorkflowDefinition, WorkflowDefinitionHasDuplicateParameterKeys, 
WorkflowDefinitionHasReservedParameterKeys, @@ -1366,18 +1368,35 @@ class WorkflowService: if block_yaml.parameter_keys else [] ) + + if not block_yaml.navigation_goal: + raise Exception("empty action instruction") + + prompt = prompt_engine.load_prompt("infer-action-type", navigation_goal=block_yaml.navigation_goal) + # TODO: no step here, so LLM call won't be saved as an artifact + json_response = await app.LLM_API_HANDLER(prompt=prompt) + if json_response.get("error"): + raise FailedToParseActionInstruction( + reason=json_response.get("thought"), error_type=json_response.get("error") + ) + + action_type: str = json_response.get("action_type") or "" + action_type = action_type.lower() + prompt_template = "" - if block_yaml.action_type == ActionType.CLICK: + if action_type == ActionType.CLICK: prompt_template = TaskPromptTemplate.SingleClickAction - elif block_yaml.action_type == ActionType.INPUT_TEXT: + elif action_type == ActionType.INPUT_TEXT: prompt_template = TaskPromptTemplate.SingleInputAction - elif block_yaml.action_type == ActionType.UPLOAD_FILE: + elif action_type == ActionType.UPLOAD_FILE: prompt_template = TaskPromptTemplate.SingleUploadAction - elif block_yaml.action_type == ActionType.SELECT_OPTION: + elif action_type == ActionType.SELECT_OPTION: prompt_template = TaskPromptTemplate.SingleSelectAction if not prompt_template: - raise Exception("not supported action type for action block") + raise Exception( + f"Not supported action for action block. Currently we only support [click, input_text, upload_file, select_option], but got [{action_type}]" + ) return ActionBlock( prompt_template=prompt_template,