skip invalid actions during parsing (#381)

2024-05-30 09:23:58 +08:00
parent 0b86f9fb38
commit 6445fb93b0
2 changed files with 123 additions and 95 deletions
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -267,3 +267,8 @@ class UnknownElementTreeFormat(SkyvernException):
 class StepTerminationError(SkyvernException):
    """Raised when a step cannot be executed and its task is terminated."""

    def __init__(self, step_id: str, reason: str) -> None:
        # Compose the message first so the base-class call stays short.
        message = f"Step {step_id} cannot be executed and task is terminated. Reason: {reason}"
        super().__init__(message)
 class UnsupportedActionType(SkyvernException):
    """Raised for an action type that the action parser does not support."""

    def __init__(self, action_type: str) -> None:
        """Build the error message from the offending action type.

        Args:
            action_type: The unsupported action type; it is interpolated
                into the message as-is.
        """
        super().__init__(f"Unsupported action type: {action_type}")
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -4,8 +4,9 @@ from typing import Any, Dict, List
 import structlog
 from deprecation import deprecated
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 from skyvern.exceptions import UnsupportedActionType
 from skyvern.forge.sdk.schemas.tasks import Task
 LOG = structlog.get_logger()
@@ -133,9 +134,7 @@ class CompleteAction(DecisiveAction):
    data_extraction_goal: str | None = None
-def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
+def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None) -> Action:
    actions = []
    for action in json_response:
    if "id" in action:
        element_id = action["id"]
    elif "element_id" in action:
@@ -144,59 +143,49 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
        element_id = None
    reasoning = action["reasoning"] if "reasoning" in action else None
    if "action_type" not in action or action["action_type"] is None:
-            actions.append(NullAction(reasoning=reasoning))
+        return NullAction(reasoning=reasoning)
-            continue
+
    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
    action_type = ActionType[action["action_type"].upper()]
    if action_type == ActionType.TERMINATE:
-            LOG.warning(
+        return TerminateAction(
                "Agent decided to terminate",
                task_id=task.task_id,
                llm_response=json_response,
                reasoning=reasoning,
                actions=actions,
            )
            actions.append(
                TerminateAction(
            reasoning=reasoning,
            errors=action["errors"] if "errors" in action else [],
        )
-            )
+
-        elif action_type == ActionType.CLICK:
+    if action_type == ActionType.CLICK:
        file_url = action["file_url"] if "file_url" in action else None
-            actions.append(
+        return ClickAction(
                ClickAction(
            element_id=element_id,
            reasoning=reasoning,
            file_url=file_url,
            download=action.get("download", False),
        )
            )
        elif action_type == ActionType.INPUT_TEXT:
            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
        elif action_type == ActionType.UPLOAD_FILE:
            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
-            actions.append(
+    if action_type == ActionType.INPUT_TEXT:
-                UploadFileAction(
+        return InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning)
    if action_type == ActionType.UPLOAD_FILE:
        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
        return UploadFileAction(
            element_id=element_id,
            file_url=action["file_url"],
            reasoning=reasoning,
        )
-            )
+
    # This action is not used in the current implementation. Click actions are used instead.
-        elif action_type == ActionType.DOWNLOAD_FILE:
+    if action_type == ActionType.DOWNLOAD_FILE:
-            actions.append(
+        return DownloadFileAction(
                DownloadFileAction(
            element_id=element_id,
            file_name=action["file_name"],
            reasoning=reasoning,
        )
-            )
+
-        elif action_type == ActionType.SELECT_OPTION:
+    if action_type == ActionType.SELECT_OPTION:
-            actions.append(
+        return SelectOptionAction(
                SelectOptionAction(
            element_id=element_id,
            option=SelectOption(
                label=action["option"]["label"],
@@ -205,36 +194,70 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
            ),
            reasoning=reasoning,
        )
-            )
+
-        elif action_type == ActionType.CHECKBOX:
+    if action_type == ActionType.CHECKBOX:
-            actions.append(
+        return CheckboxAction(
                CheckboxAction(
            element_id=element_id,
            is_checked=action["is_checked"],
            reasoning=reasoning,
        )
-            )
+
-        elif action_type == ActionType.WAIT:
+    if action_type == ActionType.WAIT:
-            actions.append(WaitAction(reasoning=reasoning))
+        return WaitAction(reasoning=reasoning)
-        elif action_type == ActionType.COMPLETE:
+
-            actions.append(
+    if action_type == ActionType.COMPLETE:
-                CompleteAction(
+        return CompleteAction(
            reasoning=reasoning,
-                    data_extraction_goal=task.data_extraction_goal,
+            data_extraction_goal=data_extraction_goal,
            errors=action["errors"] if "errors" in action else [],
        )
    if action_type == "null":
        return NullAction(reasoning=reasoning)
    if action_type == ActionType.SOLVE_CAPTCHA:
        return SolveCaptchaAction(reasoning=reasoning)
    raise UnsupportedActionType(action_type=action_type)
 def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
    actions: List[Action] = []
    for action in json_response:
        try:
            action_instance = parse_action(action=action, data_extraction_goal=task.data_extraction_goal)
            if isinstance(action_instance, TerminateAction):
                LOG.warning(
                    "Agent decided to terminate",
                    task_id=task.task_id,
                    llm_response=json_response,
                    reasoning=action_instance.reasoning,
                    actions=actions,
                )
-        elif action_type == "null":
+            actions.append(action_instance)
-            actions.append(NullAction(reasoning=reasoning))
+
-        elif action_type == ActionType.SOLVE_CAPTCHA:
+        except UnsupportedActionType:
            actions.append(SolveCaptchaAction(reasoning=reasoning))
        else:
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=action_type,
                raw_action=action,
                exc_info=True,
            )
        except ValidationError:
            LOG.error(
                "Invalid action",
                task_id=task.task_id,
                raw_action=action,
                exc_info=True,
            )
        except Exception:
            LOG.error(
                "Failed to marshal action",
                task_id=task.task_id,
                raw_action=action,
                exc_info=True,
            )
    return actions