# File: Dorod-Sky/skyvern/webeye/actions/actions.py
# 2024-04-04 19:09:19 -07:00
# 226 lines, 7.2 KiB, Python

import abc
from enum import StrEnum
from typing import Any, Dict, List
import structlog
from pydantic import BaseModel, Field
from skyvern.forge.sdk.schemas.tasks import Task
# Module-level structlog logger; parse_actions uses it to report terminate
# decisions and unsupported action types.
LOG = structlog.get_logger()
class ActionType(StrEnum):
CLICK = "click"
INPUT_TEXT = "input_text"
UPLOAD_FILE = "upload_file"
DOWNLOAD_FILE = "download_file"
SELECT_OPTION = "select_option"
CHECKBOX = "checkbox"
WAIT = "wait"
NULL_ACTION = "null_action"
SOLVE_CAPTCHA = "solve_captcha"
TERMINATE = "terminate"
COMPLETE = "complete"
# Note: Remember to update ActionTypeUnion with new actions
# Base model shared by every action in this module.
class Action(BaseModel):
    # Names the concrete kind of action; subclasses pin it with a default.
    action_type: ActionType

    # Optional free-text description of the action.
    description: str | None = None

    # Optional explanation for the action; parse_actions fills it from the
    # "reasoning" key of the LLM response when present.
    reasoning: str | None = None
# Base for actions that target a specific element.
class WebAction(Action, abc.ABC):
    # Identifier of the targeted element; parse_actions reads it from the
    # "id" key of the LLM response.
    element_id: int
# Error record attached to decisive (terminate/complete) actions.
class UserDefinedError(BaseModel):
    error_code: str
    reasoning: str

    # Required; validated to lie in the closed interval [0, 1].
    confidence_float: float = Field(..., ge=0, le=1)
# Base for actions that may carry a list of user-defined errors
# (TerminateAction and CompleteAction derive from it).
class DecisiveAction(Action, abc.ABC):
    # Defaults to an empty list when the LLM response carries no "errors".
    errors: list[UserDefinedError] = []
# Click on an element, optionally associated with a file URL.
class ClickAction(WebAction):
    action_type: ActionType = ActionType.CLICK

    # Optional; parse_actions copies it from the "file_url" key when present.
    file_url: str | None = None

    def __repr__(self) -> str:
        summary = f"element_id={self.element_id}, file_url={self.file_url}"
        return f"ClickAction({summary})"
# Type text into an element.
class InputTextAction(WebAction):
    action_type: ActionType = ActionType.INPUT_TEXT

    # The text to enter (required).
    text: str

    def __repr__(self) -> str:
        summary = f"element_id={self.element_id}, text={self.text}"
        return f"InputTextAction({summary})"
# Upload the file found at `file_url` through an element.
class UploadFileAction(WebAction):
    action_type: ActionType = ActionType.UPLOAD_FILE
    file_url: str

    # NOTE(review): presumably marks whether the target is a real file-input
    # tag — confirm against the action handler.
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        parts = (
            f"element_id={self.element_id}",
            f"file={self.file_url}",
            f"is_upload_file_tag={self.is_upload_file_tag}",
        )
        return f"UploadFileAction({', '.join(parts)})"
# Download a file via an element, under the given file name.
class DownloadFileAction(WebAction):
    action_type: ActionType = ActionType.DOWNLOAD_FILE
    file_name: str

    def __repr__(self) -> str:
        summary = f"element_id={self.element_id}, file_name={self.file_name}"
        return f"DownloadFileAction({summary})"
# Placeholder action; parse_actions emits it when the LLM response has no
# action type.
class NullAction(Action):
    action_type: ActionType = ActionType.NULL_ACTION
# Action of type SOLVE_CAPTCHA; carries no fields beyond the Action base.
class SolveCaptchaAction(Action):
    action_type: ActionType = ActionType.SOLVE_CAPTCHA
# One option of a select element, identified by label, value and/or index.
class SelectOption(BaseModel):
    # Each field may be None but has no default value.
    label: str | None
    value: str | None
    index: int | None

    def __repr__(self) -> str:
        parts = (
            f"label={self.label}",
            f"value={self.value}",
            f"index={self.index}",
        )
        return f"SelectOption({', '.join(parts)})"
# Choose an option in a select element.
class SelectOptionAction(WebAction):
    action_type: ActionType = ActionType.SELECT_OPTION

    # The option to pick.
    option: SelectOption

    def __repr__(self) -> str:
        summary = f"element_id={self.element_id}, option={self.option}"
        return f"SelectOptionAction({summary})"
###
# This action causes more harm than good.
# It frequently misbehaves or gets stuck in click loops.
# Treating checkbox actions as click actions seems to perform far more reliably.
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
# Checkbox action on an element, carrying an `is_checked` flag.
class CheckboxAction(WebAction):
    action_type: ActionType = ActionType.CHECKBOX

    # NOTE(review): presumably the desired checked state — confirm against
    # the action handler.
    is_checked: bool

    def __repr__(self) -> str:
        summary = f"element_id={self.element_id}, is_checked={self.is_checked}"
        return f"CheckboxAction({summary})"
# Action of type WAIT; carries no fields beyond the Action base.
class WaitAction(Action):
    action_type: ActionType = ActionType.WAIT
# Emitted when the agent decides to terminate; may carry user-defined errors
# through the DecisiveAction base.
class TerminateAction(DecisiveAction):
    action_type: ActionType = ActionType.TERMINATE
# Action of type COMPLETE; may carry user-defined errors through the
# DecisiveAction base.
class CompleteAction(DecisiveAction):
    action_type: ActionType = ActionType.COMPLETE

    # parse_actions copies this from the task's data_extraction_goal.
    data_extraction_goal: str | None = None
def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
    """
    Convert the raw action dicts returned by the LLM into typed Action objects.

    Each entry is dispatched on its "action_type", matched case-insensitively
    against the ActionType member names. An entry whose action type is
    missing, None, "null" or "null_action" becomes a NullAction. An entry
    whose action type is not recognised is logged and skipped.

    Args:
        task: The task being executed. Its task_id is attached to log events
            and its data_extraction_goal is copied onto CompleteAction.
        json_response: One dict per action. Element-targeting actions must
            carry an "id" key plus their own payload keys ("text",
            "file_url", "file_name", "option", "is_checked").

    Returns:
        The parsed actions, in the order they appear in json_response.

    Raises:
        KeyError: If an element-targeting action lacks "id" or one of its
            required payload keys.
    """
    actions: List[Action] = []
    for action in json_response:
        reasoning = action.get("reasoning")

        raw_action_type = action.get("action_type")
        if raw_action_type is None:
            actions.append(NullAction(reasoning=reasoning))
            continue

        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
        type_name = raw_action_type.upper()

        # "null" is not an ActionType member name, so it has to be handled
        # before the enum lookup or that lookup raises KeyError.
        if type_name == "NULL":
            actions.append(NullAction(reasoning=reasoning))
            continue

        try:
            action_type = ActionType[type_name]
        except KeyError:
            # An unknown type is reported and skipped instead of aborting the
            # whole batch with a KeyError.
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=raw_action_type,
                raw_action=action,
            )
            continue

        # NOTE: "id" is looked up only inside the element-targeting branches,
        # so actions that have no target element (wait, complete, terminate,
        # ...) no longer fail when the LLM omits it.
        if action_type == ActionType.TERMINATE:
            LOG.warning(
                "Agent decided to terminate",
                task_id=task.task_id,
                llm_response=json_response,
                reasoning=reasoning,
                actions=actions,
            )
            actions.append(TerminateAction(reasoning=reasoning, errors=action.get("errors", [])))
        elif action_type == ActionType.CLICK:
            actions.append(
                ClickAction(element_id=action["id"], reasoning=reasoning, file_url=action.get("file_url"))
            )
        elif action_type == ActionType.INPUT_TEXT:
            actions.append(InputTextAction(element_id=action["id"], text=action["text"], reasoning=reasoning))
        elif action_type == ActionType.UPLOAD_FILE:
            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
            actions.append(
                UploadFileAction(element_id=action["id"], file_url=action["file_url"], reasoning=reasoning)
            )
        elif action_type == ActionType.DOWNLOAD_FILE:
            actions.append(
                DownloadFileAction(element_id=action["id"], file_name=action["file_name"], reasoning=reasoning)
            )
        elif action_type == ActionType.SELECT_OPTION:
            option = action["option"]
            actions.append(
                SelectOptionAction(
                    element_id=action["id"],
                    option=SelectOption(
                        label=option["label"],
                        value=option["value"],
                        index=option["index"],
                    ),
                    reasoning=reasoning,
                )
            )
        elif action_type == ActionType.CHECKBOX:
            actions.append(
                CheckboxAction(element_id=action["id"], is_checked=action["is_checked"], reasoning=reasoning)
            )
        elif action_type == ActionType.WAIT:
            actions.append(WaitAction(reasoning=reasoning))
        elif action_type == ActionType.COMPLETE:
            actions.append(
                CompleteAction(
                    reasoning=reasoning,
                    data_extraction_goal=task.data_extraction_goal,
                    errors=action.get("errors", []),
                )
            )
        elif action_type == ActionType.NULL_ACTION:
            actions.append(NullAction(reasoning=reasoning))
        elif action_type == ActionType.SOLVE_CAPTCHA:
            actions.append(SolveCaptchaAction(reasoning=reasoning))
        else:
            # Safety net for ActionType members added without a branch here.
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=action_type,
                raw_action=action,
            )
    return actions
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
    1. JSON representation of what the user is seeing
    """

    # Either a single JSON object or a list of JSON objects.
    scraped_data: dict[str, Any] | list[dict[str, Any]]
# Union of every concrete action model defined in this module.
# Keep it in sync with ActionType whenever a new action is added.
# NOTE(review): presumably used to deserialize stored actions into the right
# subclass (see the link below) — confirm against callers. The member order
# is left untouched because pydantic union resolution may depend on it.
# https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
ActionTypeUnion = (
ClickAction
| InputTextAction
| UploadFileAction
| DownloadFileAction
| SelectOptionAction
| CheckboxAction
| WaitAction
| NullAction
| SolveCaptchaAction
| TerminateAction
| CompleteAction
)