infer action type from instruction (#1231)

2024-11-21 17:38:42 +08:00
parent 9cd1f15763
commit bb6d3e6a37
4 changed files with 47 additions and 7 deletions
--- a/skyvern/forge/prompts/skyvern/infer-action-type.j2
+++ b/skyvern/forge/prompts/skyvern/infer-action-type.j2
@@ -0,0 +1,16 @@
+You are a browser agent performing actions on the web. You are instructed to take a single action. Identify which action type should be taken according to the action instruction.
+
+MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
+
+Reply in the following JSON format:
+{
+    "thought": str, // A string describing how you inferred the action type from the action instruction.
+    "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
+    "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION". "CLICK" means the user wants to click. "INPUT_TEXT" means the user wants to input text. "UPLOAD_FILE" means the user wants to upload a file. "SELECT_OPTION" means the user wants to select an option.
+    "error": str, // It's a string enum to describe the error. Null if you can identify the action as one of the defined action types. Use "UNKNOWN_ACTION" if none of the defined action types matches. Use "MULTIPLE_ACTIONS" if the instruction includes multiple actions.
+}
+
+Action instruction
+```
+{{ navigation_goal }}
+```
--- a/skyvern/forge/sdk/workflow/exceptions.py
+++ b/skyvern/forge/sdk/workflow/exceptions.py
@@ -107,3 +107,10 @@ class WorkflowParameterMissingRequiredValue(BaseWorkflowHTTPException):
            f"Missing required value for workflow parameter. Workflow parameter type: {workflow_parameter_type}. workflow_parameter_key: {workflow_parameter_key}. Required value: {required_value}",
            status_code=status.HTTP_400_BAD_REQUEST,
        )
+
+
+class FailedToParseActionInstruction(SkyvernException):
+    def __init__(self, reason: str | None, error_type: str | None):
+        super().__init__(
+            f"Failed to parse the action instruction as '{reason}({error_type})'",
+        )
--- a/skyvern/forge/sdk/workflow/models/yaml.py
+++ b/skyvern/forge/sdk/workflow/models/yaml.py
@@ -6,7 +6,6 @@ from pydantic import BaseModel, Field
 from skyvern.forge.sdk.schemas.tasks import ProxyLocation
 from skyvern.forge.sdk.workflow.models.block import BlockType, FileType
 from skyvern.forge.sdk.workflow.models.parameter import ParameterType, WorkflowParameterType
-from skyvern.webeye.actions.actions import ActionType


 class ParameterYAML(BaseModel, abc.ABC):
@@ -219,7 +218,6 @@ class ValidationBlockYAML(BlockYAML):


 class ActionBlockYAML(BlockYAML):
-    action_type: ActionType
    block_type: Literal[BlockType.ACTION] = BlockType.ACTION  # type: ignore

    url: str | None = None
--- a/skyvern/forge/sdk/workflow/service.py
+++ b/skyvern/forge/sdk/workflow/service.py
@@ -14,6 +14,7 @@ from skyvern.exceptions import (
    WorkflowRunNotFound,
 )
 from skyvern.forge import app
+from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.artifact.models import ArtifactType
 from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.core.security import generate_skyvern_signature
@@ -23,6 +24,7 @@ from skyvern.forge.sdk.models import Organization, Step
 from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task
 from skyvern.forge.sdk.workflow.exceptions import (
    ContextParameterSourceNotDefined,
+    FailedToParseActionInstruction,
    InvalidWorkflowDefinition,
    WorkflowDefinitionHasDuplicateParameterKeys,
    WorkflowDefinitionHasReservedParameterKeys,
@@ -1366,18 +1368,35 @@ class WorkflowService:
                if block_yaml.parameter_keys
                else []
            )
+
+            if not block_yaml.navigation_goal:
+                raise Exception("empty action instruction")
+
+            prompt = prompt_engine.load_prompt("infer-action-type", navigation_goal=block_yaml.navigation_goal)
+            # TODO: no step here, so LLM call won't be saved as an artifact
+            json_response = await app.LLM_API_HANDLER(prompt=prompt)
+            if json_response.get("error"):
+                raise FailedToParseActionInstruction(
+                    reason=json_response.get("thought"), error_type=json_response.get("error")
+                )
+
+            action_type: str = json_response.get("action_type") or ""
+            action_type = action_type.lower()
+
            prompt_template = ""
-            if block_yaml.action_type == ActionType.CLICK:
+            if action_type == ActionType.CLICK:
                prompt_template = TaskPromptTemplate.SingleClickAction
-            elif block_yaml.action_type == ActionType.INPUT_TEXT:
+            elif action_type == ActionType.INPUT_TEXT:
                prompt_template = TaskPromptTemplate.SingleInputAction
-            elif block_yaml.action_type == ActionType.UPLOAD_FILE:
+            elif action_type == ActionType.UPLOAD_FILE:
                prompt_template = TaskPromptTemplate.SingleUploadAction
-            elif block_yaml.action_type == ActionType.SELECT_OPTION:
+            elif action_type == ActionType.SELECT_OPTION:
                prompt_template = TaskPromptTemplate.SingleSelectAction

            if not prompt_template:
-                raise Exception("not supported action type for action block")
+                raise Exception(
+                    f"Not supported action for action block. Currently we only support [click, input_text, upload_file, select_option], but got [{action_type}]"
+                )

            return ActionBlock(
                prompt_template=prompt_template,