From 6445fb93b026a2c17e4a3dbf075cf3642b9d6c7b Mon Sep 17 00:00:00 2001
From: LawyZheng <lawyzheng1106@gmail.com>
Date: Thu, 30 May 2024 09:23:58 +0800
Subject: [PATCH] skip invalid actions during parsing (#381)

---
 skyvern/exceptions.py             |   5 +
 skyvern/webeye/actions/actions.py | 213 +++++++++++++++++-------------
 2 files changed, 123 insertions(+), 95 deletions(-)

diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py
index 7477c262..873ccad3 100644
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -267,3 +267,8 @@ class UnknownElementTreeFormat(SkyvernException):
 class StepTerminationError(SkyvernException):
     def __init__(self, step_id: str, reason: str) -> None:
         super().__init__(f"Step {step_id} cannot be executed and task is terminated. Reason: {reason}")
+
+
+class UnsupportedActionType(SkyvernException):  # raised when an action type has no parser branch
+    def __init__(self, action_type: str) -> None:
+        super().__init__(f"Unsupported action type: {action_type}")
diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
index f94ce382..e8b208ff 100644
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -4,8 +4,9 @@ from typing import Any, Dict, List
 
 import structlog
 from deprecation import deprecated
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 
+from skyvern.exceptions import UnsupportedActionType
 from skyvern.forge.sdk.schemas.tasks import Task
 
 LOG = structlog.get_logger()
@@ -133,108 +134,130 @@ class CompleteAction(DecisiveAction):
     data_extraction_goal: str | None = None
 
 
+def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None) -> Action:
+    """Build one Action from a single raw action dict; data_extraction_goal is only used for CompleteAction.
+
+    Returns a NullAction when "action_type" is missing or None. Raises UnsupportedActionType when the
+    action type is not an ActionType name or has no branch below; a missing required key raises KeyError.
+    """
+    # "id" wins over "element_id"; element_id is None when neither key is present.
+    element_id = action.get("id", action.get("element_id"))
+    reasoning = action.get("reasoning")
+
+    if action.get("action_type") is None:
+        return NullAction(reasoning=reasoning)
+
+    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
+    try:
+        action_type = ActionType[action["action_type"].upper()]
+    except KeyError as e:
+        # An unknown enum name is an unsupported action type, not a missing-field error.
+        raise UnsupportedActionType(action_type=action["action_type"]) from e
+
+    if action_type == ActionType.TERMINATE:
+        return TerminateAction(reasoning=reasoning, errors=action.get("errors", []))
+
+    if action_type == ActionType.CLICK:
+        return ClickAction(
+            element_id=element_id,
+            reasoning=reasoning,
+            file_url=action.get("file_url"),
+            download=action.get("download", False),
+        )
+
+    if action_type == ActionType.INPUT_TEXT:
+        return InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning)
+
+    if action_type == ActionType.UPLOAD_FILE:
+        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
+        return UploadFileAction(
+            element_id=element_id,
+            file_url=action["file_url"],
+            reasoning=reasoning,
+        )
+
+    # This action is not used in the current implementation. Click actions are used instead.
+    if action_type == ActionType.DOWNLOAD_FILE:
+        return DownloadFileAction(
+            element_id=element_id,
+            file_name=action["file_name"],
+            reasoning=reasoning,
+        )
+
+    if action_type == ActionType.SELECT_OPTION:
+        return SelectOptionAction(
+            element_id=element_id,
+            option=SelectOption(
+                label=action["option"]["label"],
+                value=action["option"]["value"],
+                index=action["option"]["index"],
+            ),
+            reasoning=reasoning,
+        )
+
+    if action_type == ActionType.CHECKBOX:
+        return CheckboxAction(
+            element_id=element_id,
+            is_checked=action["is_checked"],
+            reasoning=reasoning,
+        )
+
+    if action_type == ActionType.WAIT:
+        return WaitAction(reasoning=reasoning)
+
+    if action_type == ActionType.COMPLETE:
+        return CompleteAction(
+            reasoning=reasoning,
+            data_extraction_goal=data_extraction_goal,
+            errors=action.get("errors", []),
+        )
+
+    if action_type == "null":  # NOTE(review): enum member compared to a str; confirm this can ever match
+        return NullAction(reasoning=reasoning)
+
+    if action_type == ActionType.SOLVE_CAPTCHA:
+        return SolveCaptchaAction(reasoning=reasoning)
+
+    raise UnsupportedActionType(action_type=action_type)
+
+
 def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
-    actions = []
+    actions: List[Action] = []  # only successfully parsed actions; failing entries are logged and skipped
     for action in json_response:
-        if "id" in action:
-            element_id = action["id"]
-        elif "element_id" in action:
-            element_id = action["element_id"]
-        else:
-            element_id = None
+        try:  # per-action guard: one bad entry must not abort parsing of the remaining ones
+            action_instance = parse_action(action=action, data_extraction_goal=task.data_extraction_goal)
+            if isinstance(action_instance, TerminateAction):  # log the decision; the action is still appended below
+                LOG.warning(
+                    "Agent decided to terminate",
+                    task_id=task.task_id,
+                    llm_response=json_response,
+                    reasoning=action_instance.reasoning,
+                    actions=actions,
+                )
+            actions.append(action_instance)
+
-        reasoning = action["reasoning"] if "reasoning" in action else None
-        if "action_type" not in action or action["action_type"] is None:
-            actions.append(NullAction(reasoning=reasoning))
-            continue
-        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
-        action_type = ActionType[action["action_type"].upper()]
-        if action_type == ActionType.TERMINATE:
-            LOG.warning(
-                "Agent decided to terminate",
-                task_id=task.task_id,
-                llm_response=json_response,
-                reasoning=reasoning,
-                actions=actions,
-            )
-            actions.append(
-                TerminateAction(
-                    reasoning=reasoning,
-                    errors=action["errors"] if "errors" in action else [],
-                )
-            )
-        elif action_type == ActionType.CLICK:
-            file_url = action["file_url"] if "file_url" in action else None
-            actions.append(
-                ClickAction(
-                    element_id=element_id,
-                    reasoning=reasoning,
-                    file_url=file_url,
-                    download=action.get("download", False),
-                )
-            )
-        elif action_type == ActionType.INPUT_TEXT:
-            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
-        elif action_type == ActionType.UPLOAD_FILE:
-            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
-
-            actions.append(
-                UploadFileAction(
-                    element_id=element_id,
-                    file_url=action["file_url"],
-                    reasoning=reasoning,
-                )
-            )
-        # This action is not used in the current implementation. Click actions are used instead.
-        elif action_type == ActionType.DOWNLOAD_FILE:
-            actions.append(
-                DownloadFileAction(
-                    element_id=element_id,
-                    file_name=action["file_name"],
-                    reasoning=reasoning,
-                )
-            )
-        elif action_type == ActionType.SELECT_OPTION:
-            actions.append(
-                SelectOptionAction(
-                    element_id=element_id,
-                    option=SelectOption(
-                        label=action["option"]["label"],
-                        value=action["option"]["value"],
-                        index=action["option"]["index"],
-                    ),
-                    reasoning=reasoning,
-                )
-            )
-        elif action_type == ActionType.CHECKBOX:
-            actions.append(
-                CheckboxAction(
-                    element_id=element_id,
-                    is_checked=action["is_checked"],
-                    reasoning=reasoning,
-                )
-            )
-        elif action_type == ActionType.WAIT:
-            actions.append(WaitAction(reasoning=reasoning))
-        elif action_type == ActionType.COMPLETE:
-            actions.append(
-                CompleteAction(
-                    reasoning=reasoning,
-                    data_extraction_goal=task.data_extraction_goal,
-                    errors=action["errors"] if "errors" in action else [],
-                )
-            )
-        elif action_type == "null":
-            actions.append(NullAction(reasoning=reasoning))
-        elif action_type == ActionType.SOLVE_CAPTCHA:
-            actions.append(SolveCaptchaAction(reasoning=reasoning))
-        else:
+        except UnsupportedActionType:  # raised by parse_action when no branch handles the action type
             LOG.error(
                 "Unsupported action type when parsing actions",
                 task_id=task.task_id,
-                action_type=action_type,
                 raw_action=action,
+                exc_info=True,
             )
+        except ValidationError:  # pydantic ValidationError, presumably from building the action model
+            LOG.error(
+                "Invalid action",
+                task_id=task.task_id,
+                raw_action=action,
+                exc_info=True,
+            )
+        except Exception:  # catch-all (e.g. KeyError for a missing required key): log and skip the action
+            LOG.error(
+                "Failed to marshal action",
+                task_id=task.task_id,
+                raw_action=action,
+                exc_info=True,
+            )
+
     return actions