new API to the workflow run page (#1400)

2024-12-17 17:17:18 -08:00
parent b8e2527ea0
commit 58413db172
8 changed files with 403 additions and 178 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -1,13 +1,10 @@
+from datetime import datetime
 from enum import StrEnum
-from typing import Annotated, Any, Dict, Type, TypeVar
+from typing import Annotated, Any, Type, TypeVar

 import structlog
 from litellm import ConfigDict
-from pydantic import BaseModel, Field, ValidationError
-
-from skyvern.exceptions import UnsupportedActionType
-from skyvern.forge.sdk.schemas.tasks import Task
-from skyvern.webeye.scraper.scraper import ScrapedPage
+from pydantic import BaseModel, Field

 LOG = structlog.get_logger()
 T = TypeVar("T", bound="Action")
@@ -119,6 +116,9 @@ class Action(BaseModel):
    option: SelectOption | None = None
    is_checked: bool | None = None

+    created_at: datetime | None = None
+    modified_at: datetime | None = None
+
    @classmethod
    def validate(cls: Type[T], value: Any) -> T:
        if isinstance(value, dict):
@@ -239,176 +239,6 @@ class CompleteAction(DecisiveAction):
    data_extraction_goal: str | None = None


-def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extraction_goal: str | None = None) -> Action:
-    if "id" in action:
-        element_id = action["id"]
-    elif "element_id" in action:
-        element_id = action["element_id"]
-    else:
-        element_id = None
-
-    skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None
-    skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None
-
-    reasoning = action["reasoning"] if "reasoning" in action else None
-    confidence_float = action["confidence_float"] if "confidence_float" in action else None
-    # TODO: currently action intention and response are only used for Q&A actions, like input_text
-    # When we start supporting click action, intention will be the reasoning for the click action (why take the action)
-    intention = action["user_detail_query"] if "user_detail_query" in action else None
-    response = action["user_detail_answer"] if "user_detail_answer" in action else None
-
-    base_action_dict = {
-        "element_id": element_id,
-        "skyvern_element_hash": skyvern_element_hash,
-        "skyvern_element_data": skyvern_element_data,
-        "reasoning": reasoning,
-        "confidence_float": confidence_float,
-        "intention": intention,
-        "response": response,
-    }
-
-    if "action_type" not in action or action["action_type"] is None:
-        return NullAction(**base_action_dict)
-
-    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
-    action_type = ActionType[action["action_type"].upper()]
-
-    if not action_type.is_web_action():
-        # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
-        # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
-        # set for non-web actions.
-        base_action_dict["element_id"] = None
-
-    if action_type == ActionType.TERMINATE:
-        return TerminateAction(**base_action_dict, errors=action["errors"] if "errors" in action else [])
-
-    if action_type == ActionType.CLICK:
-        file_url = action["file_url"] if "file_url" in action else None
-        return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False))
-
-    if action_type == ActionType.INPUT_TEXT:
-        return InputTextAction(**base_action_dict, text=action["text"])
-
-    if action_type == ActionType.UPLOAD_FILE:
-        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
-        return UploadFileAction(
-            **base_action_dict,
-            file_url=action["file_url"],
-        )
-
-    # This action is not used in the current implementation. Click actions are used instead.
-    if action_type == ActionType.DOWNLOAD_FILE:
-        return DownloadFileAction(**base_action_dict, file_name=action["file_name"])
-
-    if action_type == ActionType.SELECT_OPTION:
-        option = action["option"]
-        if option is None:
-            raise ValueError("SelectOptionAction requires an 'option' field")
-        label = option.get("label")
-        value = option.get("value")
-        index = option.get("index")
-        if label is None and value is None and index is None:
-            raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
-        return SelectOptionAction(
-            **base_action_dict,
-            option=SelectOption(
-                label=label,
-                value=value,
-                index=index,
-            ),
-        )
-
-    if action_type == ActionType.CHECKBOX:
-        return CheckboxAction(
-            **base_action_dict,
-            is_checked=action["is_checked"],
-        )
-
-    if action_type == ActionType.WAIT:
-        return WaitAction(**base_action_dict)
-
-    if action_type == ActionType.COMPLETE:
-        return CompleteAction(
-            **base_action_dict,
-            data_extraction_goal=data_extraction_goal,
-            errors=action["errors"] if "errors" in action else [],
-        )
-
-    if action_type == "null":
-        return NullAction(**base_action_dict)
-
-    if action_type == ActionType.SOLVE_CAPTCHA:
-        return SolveCaptchaAction(**base_action_dict)
-
-    raise UnsupportedActionType(action_type=action_type)
-
-
-def parse_actions(
-    task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]]
-) -> list[Action]:
-    actions: list[Action] = []
-    for idx, action in enumerate(json_response):
-        try:
-            action_instance = parse_action(
-                action=action, scraped_page=scraped_page, data_extraction_goal=task.data_extraction_goal
-            )
-            action_instance.organization_id = task.organization_id
-            action_instance.workflow_run_id = task.workflow_run_id
-            action_instance.task_id = task.task_id
-            action_instance.step_id = step_id
-            action_instance.step_order = step_order
-            action_instance.action_order = idx
-            if isinstance(action_instance, TerminateAction):
-                LOG.warning(
-                    "Agent decided to terminate",
-                    task_id=task.task_id,
-                    llm_response=json_response,
-                    reasoning=action_instance.reasoning,
-                    actions=actions,
-                )
-            actions.append(action_instance)
-
-        except UnsupportedActionType:
-            LOG.error(
-                "Unsupported action type when parsing actions",
-                task_id=task.task_id,
-                raw_action=action,
-                exc_info=True,
-            )
-        except (ValidationError, ValueError):
-            LOG.warning(
-                "Invalid action",
-                task_id=task.task_id,
-                raw_action=action,
-                exc_info=True,
-            )
-        except Exception:
-            LOG.error(
-                "Failed to marshal action",
-                task_id=task.task_id,
-                raw_action=action,
-                exc_info=True,
-            )
-
-    ############################ This part of code might not be needed ############################
-    # Reason #1. validation can be done in action handler but not in parser
-    # Reason #2. no need to validate whether the element_id has a hash.
-    # If there's no hash, we can fall back to normal operation
-    all_element_ids = [action.element_id for action in actions if action.element_id]
-    missing_element_ids = [
-        element_id for element_id in all_element_ids if element_id not in scraped_page.id_to_element_hash
-    ]
-    if missing_element_ids:
-        LOG.warning(
-            "Missing elements in scraped page",
-            task_id=task.task_id,
-            missing_element_ids=missing_element_ids,
-            all_element_ids=all_element_ids,
-        )
-    ############################ This part of code might not be needed ############################
-    return actions
-
-
 class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -0,0 +1,196 @@
+from typing import Any, Dict
+
+import structlog
+from pydantic import ValidationError
+
+from skyvern.exceptions import UnsupportedActionType
+from skyvern.forge.sdk.schemas.tasks import Task
+from skyvern.webeye.actions.actions import (
+    Action,
+    ActionType,
+    CheckboxAction,
+    ClickAction,
+    CompleteAction,
+    DownloadFileAction,
+    InputTextAction,
+    NullAction,
+    SelectOption,
+    SelectOptionAction,
+    SolveCaptchaAction,
+    TerminateAction,
+    UploadFileAction,
+    WaitAction,
+)
+from skyvern.webeye.scraper.scraper import ScrapedPage
+
+LOG = structlog.get_logger()
+
+
+def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extraction_goal: str | None = None) -> Action:
+    """
+    Build a typed ``Action`` from one raw action dict returned by the LLM.
+
+    Args:
+        action: Raw action dict. The element id comes from ``"id"`` or, failing that,
+            ``"element_id"``; ``"action_type"`` is upper-cased and looked up in ``ActionType``.
+        scraped_page: Supplies the element hash and element data for the element id through
+            its ``id_to_element_hash`` and ``id_to_element_dict`` maps.
+        data_extraction_goal: Only forwarded to ``CompleteAction``.
+
+    Returns:
+        The action object for the given type; a ``NullAction`` when ``"action_type"`` is
+        missing or ``None``.
+
+    Raises:
+        KeyError: ``"action_type"`` names no ``ActionType`` member, or a field the action type
+            requires (e.g. ``"text"`` for INPUT_TEXT) is missing.
+        ValueError: A SELECT_OPTION action has no ``"option"``, or the option carries none of
+            ``label`` / ``value`` / ``index``.
+        UnsupportedActionType: The ``ActionType`` member has no branch in this function.
+    """
+    # "id" wins over "element_id" when both keys are present.
+    element_id = action["id"] if "id" in action else action.get("element_id")
+    skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None
+    skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None
+
+    # TODO: currently action intention and response are only used for Q&A actions, like input_text
+    # When we start supporting click action, intention will be the reasoning for the click action (why take the action)
+    base_action_dict = {
+        "element_id": element_id,
+        "skyvern_element_hash": skyvern_element_hash,
+        "skyvern_element_data": skyvern_element_data,
+        "reasoning": action.get("reasoning"),
+        "confidence_float": action.get("confidence_float"),
+        "intention": action.get("user_detail_query"),
+        "response": action.get("user_detail_answer"),
+    }
+
+    if action.get("action_type") is None:
+        return NullAction(**base_action_dict)
+
+    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
+    action_type = ActionType[action["action_type"].upper()]
+
+    if not action_type.is_web_action():
+        # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
+        # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
+        # set for non-web actions.
+        base_action_dict["element_id"] = None
+
+    if action_type == ActionType.TERMINATE:
+        return TerminateAction(**base_action_dict, errors=action.get("errors", []))
+
+    if action_type == ActionType.CLICK:
+        return ClickAction(**base_action_dict, file_url=action.get("file_url"), download=action.get("download", False))
+
+    if action_type == ActionType.INPUT_TEXT:
+        return InputTextAction(**base_action_dict, text=action["text"])
+
+    if action_type == ActionType.UPLOAD_FILE:
+        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
+        return UploadFileAction(**base_action_dict, file_url=action["file_url"])
+
+    # This action is not used in the current implementation. Click actions are used instead.
+    if action_type == ActionType.DOWNLOAD_FILE:
+        return DownloadFileAction(**base_action_dict, file_name=action["file_name"])
+
+    if action_type == ActionType.SELECT_OPTION:
+        # `.get` so that a missing "option" key reaches the ValueError below instead of a KeyError.
+        option = action.get("option")
+        if option is None:
+            raise ValueError("SelectOptionAction requires an 'option' field")
+        label = option.get("label")
+        value = option.get("value")
+        index = option.get("index")
+        if label is None and value is None and index is None:
+            raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
+        return SelectOptionAction(**base_action_dict, option=SelectOption(label=label, value=value, index=index))
+
+    if action_type == ActionType.CHECKBOX:
+        return CheckboxAction(**base_action_dict, is_checked=action["is_checked"])
+
+    if action_type == ActionType.WAIT:
+        return WaitAction(**base_action_dict)
+
+    if action_type == ActionType.COMPLETE:
+        return CompleteAction(
+            **base_action_dict,
+            data_extraction_goal=data_extraction_goal,
+            errors=action.get("errors", []),
+        )
+
+    # NOTE(review): `action_type` is an `ActionType` member here, so this only matches if a member
+    # compares equal to the string "null" — confirm against the enum definition.
+    if action_type == "null":
+        return NullAction(**base_action_dict)
+
+    if action_type == ActionType.SOLVE_CAPTCHA:
+        return SolveCaptchaAction(**base_action_dict)
+
+    raise UnsupportedActionType(action_type=action_type)
+
+
+def parse_actions(
+    task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]]
+) -> list[Action]:
+    """
+    Turn the raw LLM action dicts for one step into a list of ``Action`` objects.
+
+    Each dict in ``json_response`` is handed to ``parse_action`` along with
+    ``task.data_extraction_goal``. A successfully parsed action is stamped with the task's
+    organization, workflow run and task ids, the given ``step_id`` and ``step_order``, and its
+    index in ``json_response`` as ``action_order``.
+
+    A dict that raises while being parsed or stamped is logged and skipped rather than
+    re-raised, so the returned list can be shorter than ``json_response``.
+
+    After the loop, element ids that are absent from ``scraped_page.id_to_element_hash`` are
+    reported in a warning; the actions that carry them are still returned.
+    """
+    parsed_actions: list[Action] = []
+    for position, raw_action in enumerate(json_response):
+        try:
+            parsed = parse_action(raw_action, scraped_page, task.data_extraction_goal)
+            # Tie the action to the task and step it belongs to.
+            parsed.organization_id = task.organization_id
+            parsed.workflow_run_id = task.workflow_run_id
+            parsed.task_id = task.task_id
+            parsed.step_id = step_id
+            parsed.step_order = step_order
+            parsed.action_order = position
+            if isinstance(parsed, TerminateAction):
+                # `parsed_actions` holds only the actions parsed before this one.
+                LOG.warning(
+                    "Agent decided to terminate",
+                    task_id=task.task_id,
+                    llm_response=json_response,
+                    reasoning=parsed.reasoning,
+                    actions=parsed_actions,
+                )
+            parsed_actions.append(parsed)
+
+        except Exception as error:
+            # Level and message depend on the failure type; the action is dropped either way.
+            if isinstance(error, UnsupportedActionType):
+                emit, message = LOG.error, "Unsupported action type when parsing actions"
+            elif isinstance(error, (ValidationError, ValueError)):
+                emit, message = LOG.warning, "Invalid action"
+            else:
+                emit, message = LOG.error, "Failed to marshal action"
+            emit(message, task_id=task.task_id, raw_action=raw_action, exc_info=True)
+
+    ############################ This part of code might not be needed ############################
+    # Reason #1. validation can be done in action handler but not in parser
+    # Reason #2. no need to validate whether the element_id has a hash.
+    # If there's no hash, we can fall back to normal operation
+    all_element_ids = [item.element_id for item in parsed_actions if item.element_id]
+    missing_element_ids = [eid for eid in all_element_ids if eid not in scraped_page.id_to_element_hash]
+    if missing_element_ids:
+        LOG.warning(
+            "Missing elements in scraped page",
+            task_id=task.task_id,
+            missing_element_ids=missing_element_ids,
+            all_element_ids=all_element_ids,
+        )
+    ############################ This part of code might not be needed ############################
+    return parsed_actions