infer action type from instruction (#1231)

This commit is contained in:
LawyZheng
2024-11-21 17:38:42 +08:00
committed by GitHub
parent 9cd1f15763
commit bb6d3e6a37
4 changed files with 47 additions and 7 deletions

View File

@@ -0,0 +1,16 @@
You are a browser agent performing actions on the web. You are instructed to take a single action. Help to identify which action type should be taken according to the action instruction.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in the following JSON format:
{
"thought": str, // A string to describe how to infer the action type from the action instruction.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence.
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION". "CLICK" means user wants to click. "INPUT_TEXT" means user wants to input text. "UPLOAD_FILE" means user wants to upload a file. "SELECT_OPTION" means user wants to select an option.
"error": str, // It's a string enum to describe the error. Null if you can identify the action as one of the defined action types. Use "UNKNOWN_ACTION" if none of the defined action types match. Use "MULTIPLE_ACTIONS" if the instruction includes multiple actions.
}
Action instruction
```
{{ navigation_goal }}
```

View File

@@ -107,3 +107,10 @@ class WorkflowParameterMissingRequiredValue(BaseWorkflowHTTPException):
f"Missing required value for workflow parameter. Workflow parameter type: {workflow_parameter_type}. workflow_parameter_key: {workflow_parameter_key}. Required value: {required_value}",
status_code=status.HTTP_400_BAD_REQUEST,
)
class FailedToParseActionInstruction(SkyvernException):
    """Error signalling that an action instruction could not be parsed.

    The exception message embeds both values it is constructed with, in the
    form ``'<reason>(<error_type>)'``.

    Args:
        reason: Explanation of why parsing failed. May be ``None``, in which
            case the literal text ``None`` appears in the message.
        error_type: Label for the kind of failure, rendered in parentheses
            after the reason. May be ``None``, rendered the same way.
    """

    def __init__(self, reason: str | None, error_type: str | None):
        # Compose the message first so the base-class call stays a one-liner.
        message = f"Failed to parse the action instruction as '{reason}({error_type})'"
        super().__init__(message)

View File

@@ -6,7 +6,6 @@ from pydantic import BaseModel, Field
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
from skyvern.forge.sdk.workflow.models.block import BlockType, FileType
from skyvern.forge.sdk.workflow.models.parameter import ParameterType, WorkflowParameterType
from skyvern.webeye.actions.actions import ActionType
class ParameterYAML(BaseModel, abc.ABC):
@@ -219,7 +218,6 @@ class ValidationBlockYAML(BlockYAML):
class ActionBlockYAML(BlockYAML):
action_type: ActionType
block_type: Literal[BlockType.ACTION] = BlockType.ACTION # type: ignore
url: str | None = None

View File

@@ -14,6 +14,7 @@ from skyvern.exceptions import (
WorkflowRunNotFound,
)
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.artifact.models import ArtifactType
from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.core.security import generate_skyvern_signature
@@ -23,6 +24,7 @@ from skyvern.forge.sdk.models import Organization, Step
from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task
from skyvern.forge.sdk.workflow.exceptions import (
ContextParameterSourceNotDefined,
FailedToParseActionInstruction,
InvalidWorkflowDefinition,
WorkflowDefinitionHasDuplicateParameterKeys,
WorkflowDefinitionHasReservedParameterKeys,
@@ -1366,18 +1368,35 @@ class WorkflowService:
if block_yaml.parameter_keys
else []
)
if not block_yaml.navigation_goal:
raise Exception("empty action instruction")
prompt = prompt_engine.load_prompt("infer-action-type", navigation_goal=block_yaml.navigation_goal)
# TODO: no step here, so LLM call won't be saved as an artifact
json_response = await app.LLM_API_HANDLER(prompt=prompt)
if json_response.get("error"):
raise FailedToParseActionInstruction(
reason=json_response.get("thought"), error_type=json_response.get("error")
)
action_type: str = json_response.get("action_type") or ""
action_type = action_type.lower()
prompt_template = ""
if block_yaml.action_type == ActionType.CLICK:
if action_type == ActionType.CLICK:
prompt_template = TaskPromptTemplate.SingleClickAction
elif block_yaml.action_type == ActionType.INPUT_TEXT:
elif action_type == ActionType.INPUT_TEXT:
prompt_template = TaskPromptTemplate.SingleInputAction
elif block_yaml.action_type == ActionType.UPLOAD_FILE:
elif action_type == ActionType.UPLOAD_FILE:
prompt_template = TaskPromptTemplate.SingleUploadAction
elif block_yaml.action_type == ActionType.SELECT_OPTION:
elif action_type == ActionType.SELECT_OPTION:
prompt_template = TaskPromptTemplate.SingleSelectAction
if not prompt_template:
raise Exception("not supported action type for action block")
raise Exception(
f"Not supported action for action block. Currently we only support [click, input_text, upload_file, select_option], but got [{action_type}]"
)
return ActionBlock(
prompt_template=prompt_template,