Files
Dorod-Sky/skyvern/webeye/actions/actions.py

323 lines
9.8 KiB
Python
Raw Normal View History

from datetime import datetime
from enum import StrEnum
2025-04-30 18:42:44 +08:00
from typing import Annotated, Any, Literal, Type, TypeVar
import structlog
from litellm import ConfigDict
from pydantic import BaseModel, Field
# Module-level structlog logger.
LOG = structlog.get_logger()
# Type variable bound to Action; used in the signature of Action.validate.
T = TypeVar("T", bound="Action")
class ActionType(StrEnum):
CLICK = "click"
INPUT_TEXT = "input_text"
UPLOAD_FILE = "upload_file"
# This action is not used in the current implementation. Click actions are used instead."
2024-04-04 19:09:19 -07:00
DOWNLOAD_FILE = "download_file"
SELECT_OPTION = "select_option"
CHECKBOX = "checkbox"
WAIT = "wait"
NULL_ACTION = "null_action"
SOLVE_CAPTCHA = "solve_captcha"
TERMINATE = "terminate"
COMPLETE = "complete"
RELOAD_PAGE = "reload_page"
2025-01-24 16:21:26 +08:00
EXTRACT = "extract"
2025-04-30 18:42:44 +08:00
VERIFICATION_CODE = "verification_code"
2025-04-11 11:18:53 -07:00
SCROLL = "scroll"
KEYPRESS = "keypress"
TYPE = "type"
2025-04-13 00:22:46 -07:00
MOVE = "move"
DRAG = "drag"
2025-04-30 18:42:44 +08:00
LEFT_MOUSE = "left_mouse"
2025-01-24 16:21:26 +08:00
def is_web_action(self) -> bool:
return self in [
ActionType.CLICK,
ActionType.INPUT_TEXT,
ActionType.UPLOAD_FILE,
ActionType.DOWNLOAD_FILE,
ActionType.SELECT_OPTION,
ActionType.CHECKBOX,
]
class ActionStatus(StrEnum):
pending = "pending"
skipped = "skipped"
failed = "failed"
completed = "completed"
class UserDefinedError(BaseModel):
    """An error described by a code, a reasoning text and a confidence value."""

    error_code: str
    reasoning: str
    # Required; validated to lie within [0, 1].
    confidence_float: float = Field(..., ge=0, le=1)

    def __repr__(self) -> str:
        # The reasoning text takes the position a class name usually occupies.
        return f"{self.reasoning}(error_code={self.error_code}, confidence_float={self.confidence_float})"
class SelectOption(BaseModel):
    """An option identified by any of label, value or index; all are optional."""

    label: str | None = None
    value: str | None = None
    index: int | None = None

    def __repr__(self) -> str:
        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
class CompleteVerifyResult(BaseModel):
    """Outcome of a completion check: whether the user goal was achieved, plus supporting text."""

    user_goal_achieved: bool
    thoughts: str
    page_info: str | None = None

    def __repr__(self) -> str:
        # Use the real class name; the repr previously named a non-existent
        # "CompleteVerifyResponse" class.
        return f"CompleteVerifyResult(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, page_info={self.page_info})"
class InputOrSelectContext(BaseModel):
    """Optional context flags describing an input or select field."""

    intention: str | None = None
    field: str | None = None
    is_required: bool | None = None
    is_search_bar: bool | None = None  # don't trigger custom-selection logic when it's a search bar
    is_location_input: bool | None = None  # address input usually requires auto completion
    is_date_related: bool | None = None  # date picker mini agent requires some special logic

    def __repr__(self) -> str:
        # Every declared field is included; is_date_related used to be missing.
        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, is_date_related={self.is_date_related}, intention={self.intention})"
class Action(BaseModel):
    """Base model holding the fields shared by all action classes.

    It carries the action type and status, identifier and ordering fields,
    free-text fields (description, reasoning, intention, response), element
    references, and a set of optional fields that subclasses redeclare with
    stricter types or defaults.
    """

    # from_attributes allows validation from objects' attributes, not only dicts.
    # NOTE(review): ConfigDict is imported from litellm at the top of the file;
    # pydantic exports the same name - consider importing it from pydantic.
    model_config = ConfigDict(from_attributes=True)

    action_type: ActionType
    status: ActionStatus = ActionStatus.pending
    action_id: str | None = None
    source_action_id: str | None = None
    organization_id: str | None = None
    workflow_run_id: str | None = None
    task_id: str | None = None
    step_id: str | None = None
    step_order: int | None = None
    action_order: int | None = None

    confidence_float: float | None = None
    description: str | None = None
    reasoning: str | None = None
    intention: str | None = None
    response: str | None = None
    # Numeric element ids are coerced to strings.
    element_id: Annotated[str, Field(coerce_numbers_to_str=True)] | None = None
    skyvern_element_hash: str | None = None
    skyvern_element_data: dict[str, Any] | None = None
    tool_call_id: str | None = None

    # DecisiveAction (CompleteAction, TerminateAction) fields
    errors: list[UserDefinedError] | None = None
    data_extraction_goal: str | None = None

    # WebAction fields
    file_name: str | None = None
    file_url: str | None = None
    download: bool | None = None
    is_upload_file_tag: bool | None = None
    text: str | None = None
    option: SelectOption | None = None
    is_checked: bool | None = None
    verified: bool = False

    created_at: datetime | None = None
    modified_at: datetime | None = None

    @classmethod
    def validate(cls: Type[T], value: Any) -> T:
        """Build the concrete action model selected by ``value["action_type"]``.

        Args:
            value: A dict of action data. Its ``"action_type"`` entry may be an
                ``ActionType`` member or the equivalent plain string
                (e.g. ``"click"``).

        Returns:
            An instance of the action class matching the action type,
            produced by that class's ``model_validate``.

        Raises:
            ValueError: If ``value`` is not a dict, or the action type is
                unknown or has no action class in this module.
            KeyError: If ``value`` has no ``"action_type"`` key.
        """
        if not isinstance(value, dict):
            raise ValueError("Invalid action data")

        raw_action_type = value["action_type"]
        # Coerce to the enum so that plain strings are matched by value;
        # an identity check against enum members never matches a plain str.
        try:
            action_type = ActionType(raw_action_type)
        except ValueError:
            raise ValueError(f"Unsupported action type: {raw_action_type}") from None

        # Built at call time because the concrete classes are defined after Action.
        action_classes: dict[ActionType, type[Action]] = {
            ActionType.CLICK: ClickAction,
            ActionType.INPUT_TEXT: InputTextAction,
            ActionType.UPLOAD_FILE: UploadFileAction,
            ActionType.DOWNLOAD_FILE: DownloadFileAction,
            ActionType.NULL_ACTION: NullAction,
            ActionType.TERMINATE: TerminateAction,
            ActionType.COMPLETE: CompleteAction,
            ActionType.SELECT_OPTION: SelectOptionAction,
            ActionType.CHECKBOX: CheckboxAction,
            ActionType.WAIT: WaitAction,
            ActionType.SOLVE_CAPTCHA: SolveCaptchaAction,
            ActionType.RELOAD_PAGE: ReloadPageAction,
            # Action types with a class in this module that previously had no branch.
            ActionType.EXTRACT: ExtractAction,
            ActionType.SCROLL: ScrollAction,
            ActionType.KEYPRESS: KeypressAction,
            ActionType.MOVE: MoveAction,
            ActionType.DRAG: DragAction,
            ActionType.VERIFICATION_CODE: VerificationCodeAction,
            ActionType.LEFT_MOUSE: LeftMouseAction,
        }
        action_class = action_classes.get(action_type)
        if action_class is None:
            # e.g. ActionType.TYPE, which has no dedicated class here.
            raise ValueError(f"Unsupported action type: {action_type}")
        return action_class.model_validate(value)
class WebAction(Action):
    """Action for which ``element_id`` is required (numbers are coerced to str)."""

    element_id: Annotated[str, Field(coerce_numbers_to_str=True)]
class DecisiveAction(Action):
    """Action whose ``errors`` is always a list, empty by default."""

    errors: list[UserDefinedError] = []
# TODO: consider implementing this as a WebAction in the future
class ReloadPageAction(Action):
    """Action whose type defaults to RELOAD_PAGE; it adds no fields of its own."""

    action_type: ActionType = ActionType.RELOAD_PAGE
class ClickAction(WebAction):
    """Click action with optional ``x``/``y``, a ``button``, a ``repeat`` count and file/download fields."""

    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None
    download: bool = False

    x: int | None = None
    y: int | None = None
    button: str = "left"
    # normal click: 1, double click: 2, triple click: 3
    repeat: int = 1

    def __repr__(self) -> str:
        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download}, x={self.x}, y={self.y}, button={self.button}, tool_call_id={self.tool_call_id})"
class InputTextAction(WebAction):
    """Web action of type INPUT_TEXT; ``text`` is required."""

    action_type: ActionType = ActionType.INPUT_TEXT
    text: str

    def __repr__(self) -> str:
        return f"InputTextAction(element_id={self.element_id}, text={self.text}, tool_call_id={self.tool_call_id})"
class UploadFileAction(WebAction):
    """Web action of type UPLOAD_FILE; ``file_url`` is required and ``is_upload_file_tag`` defaults to True."""

    action_type: ActionType = ActionType.UPLOAD_FILE
    file_url: str
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
2024-11-10 16:24:13 -08:00
# this is a deprecated action type
2024-04-04 19:09:19 -07:00
class DownloadFileAction(WebAction):
    """Web action of type DOWNLOAD_FILE; ``file_name`` is required."""

    action_type: ActionType = ActionType.DOWNLOAD_FILE
    file_name: str

    def __repr__(self) -> str:
        return f"DownloadFileAction(element_id={self.element_id}, file_name={self.file_name})"
class NullAction(Action):
    """Action whose type defaults to NULL_ACTION; it adds no fields of its own."""

    action_type: ActionType = ActionType.NULL_ACTION
class SolveCaptchaAction(Action):
    """Action whose type defaults to SOLVE_CAPTCHA; it adds no fields of its own."""

    action_type: ActionType = ActionType.SOLVE_CAPTCHA
class SelectOptionAction(WebAction):
    """Web action of type SELECT_OPTION; ``option`` is required."""

    action_type: ActionType = ActionType.SELECT_OPTION
    option: SelectOption

    def __repr__(self) -> str:
        return f"SelectOptionAction(element_id={self.element_id}, option={self.option})"
###
# This action causes more harm than good.
# It frequently misbehaves or gets stuck in click loops.
# Treating checkbox actions as click actions seems to perform far more reliably.
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
class CheckboxAction(WebAction):
    """Web action of type CHECKBOX; ``is_checked`` is required."""

    action_type: ActionType = ActionType.CHECKBOX
    is_checked: bool

    def __repr__(self) -> str:
        return f"CheckboxAction(element_id={self.element_id}, is_checked={self.is_checked})"
class WaitAction(Action):
    """Action of type WAIT; ``seconds`` defaults to 20."""

    action_type: ActionType = ActionType.WAIT
    seconds: int = 20
class TerminateAction(DecisiveAction):
    """Decisive action whose type defaults to TERMINATE."""

    action_type: ActionType = ActionType.TERMINATE
class CompleteAction(DecisiveAction):
    """Decisive action whose type defaults to COMPLETE, with a ``verified`` flag and an optional extraction goal."""

    action_type: ActionType = ActionType.COMPLETE
    verified: bool = False
    data_extraction_goal: str | None = None
2025-01-24 16:21:26 +08:00
class ExtractAction(Action):
    """Action of type EXTRACT with an optional extraction goal and an optional schema dict."""

    action_type: ActionType = ActionType.EXTRACT
    data_extraction_goal: str | None = None
    data_extraction_schema: dict[str, Any] | None = None
2025-04-11 11:18:53 -07:00
class ScrollAction(Action):
    """Action of type SCROLL; ``scroll_x`` and ``scroll_y`` are required, ``x``/``y`` are optional."""

    action_type: ActionType = ActionType.SCROLL

    x: int | None = None
    y: int | None = None

    scroll_x: int
    scroll_y: int
class KeypressAction(Action):
    """Action of type KEYPRESS with a list of keys, a ``hold`` flag and a ``duration``."""

    action_type: ActionType = ActionType.KEYPRESS
    keys: list[str] = []

    hold: bool = False
    duration: int = 0
2025-04-11 11:18:53 -07:00
2025-04-13 00:22:46 -07:00
class MoveAction(Action):
    """Action of type MOVE; integer ``x`` and ``y`` are required."""

    action_type: ActionType = ActionType.MOVE
    x: int
    y: int
class DragAction(Action):
    """Action of type DRAG with optional start coordinates and a path of (int, int) pairs."""

    action_type: ActionType = ActionType.DRAG

    start_x: int | None = None
    start_y: int | None = None
    path: list[tuple[int, int]] = []
class VerificationCodeAction(Action):
    """Action of type VERIFICATION_CODE; ``verification_code`` is required."""

    action_type: ActionType = ActionType.VERIFICATION_CODE
    verification_code: str
2025-04-30 18:42:44 +08:00
class LeftMouseAction(Action):
    """Action of type LEFT_MOUSE; ``direction`` must be "down" or "up", ``x``/``y`` are optional."""

    action_type: ActionType = ActionType.LEFT_MOUSE
    direction: Literal["down", "up"]
    x: int | None = None
    y: int | None = None
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage.

    ``scraped_data`` holds the JSON representation of what the user is seeing;
    it is a required field that may be a dict, a list, a string or None.
    """

    scraped_data: dict[str, Any] | list | str | None