Files
Dorod-Sky/skyvern/webeye/actions/actions.py

355 lines
12 KiB
Python
Raw Normal View History

from datetime import datetime
from enum import StrEnum
2025-04-30 18:42:44 +08:00
from typing import Annotated, Any, Literal, Type, TypeVar
import structlog
from litellm import ConfigDict
from pydantic import BaseModel, Field
from skyvern.errors.errors import UserDefinedError
from skyvern.webeye.actions.action_types import ActionType
# Module-level structlog logger.
LOG = structlog.get_logger()
# Type variable bound to Action; used to annotate the Action.validate classmethod.
T = TypeVar("T", bound="Action")
class ActionStatus(StrEnum):
pending = "pending"
skipped = "skipped"
failed = "failed"
completed = "completed"
class SelectOption(BaseModel):
    """An option to pick in a select element, identified by label, value and/or index."""

    label: str | None = None
    value: str | None = None
    index: int | None = None

    def __repr__(self) -> str:
        """Return a compact representation listing label, value and index."""
        return (
            f"SelectOption(label={self.label}, "
            f"value={self.value}, "
            f"index={self.index})"
        )
class VerificationStatus(StrEnum):
"""Status of user goal verification."""
complete = "complete" # Goal achieved successfully
terminate = "terminate" # Goal cannot be achieved, stop trying
continue_step = "continue" # Goal not yet achieved, continue with more steps
class CompleteVerifyResult(BaseModel):
    """Result of verifying whether the user goal has been achieved.

    Two formats are supported: the explicit ``status`` field and the legacy
    boolean pair. Whenever ``status`` is set, the ``is_*`` properties and
    ``__repr__`` use it and ignore the legacy booleans.
    """

    # New field: explicit status with three options (used when experiment is enabled)
    status: VerificationStatus | None = None

    # Legacy fields: for backward compatibility (used when experiment is disabled)
    user_goal_achieved: bool = False
    should_terminate: bool = False

    thoughts: str
    page_info: str | None = None

    def __repr__(self) -> str:
        """Return a representation in the new format when ``status`` is set, else the legacy one."""
        if not self.status:
            return (
                f"CompleteVerifyResult(thoughts={self.thoughts}, "
                f"user_goal_achieved={self.user_goal_achieved}, "
                f"should_terminate={self.should_terminate}, "
                f"page_info={self.page_info})"
            )
        return (
            f"CompleteVerifyResult(status={self.status}, "
            f"thoughts={self.thoughts}, "
            f"page_info={self.page_info})"
        )

    @property
    def is_complete(self) -> bool:
        """True if goal was achieved (supports both new and legacy formats)."""
        if not self.status:
            return self.user_goal_achieved
        return self.status == VerificationStatus.complete

    @property
    def is_terminate(self) -> bool:
        """True if task should terminate (supports both new and legacy formats)."""
        if not self.status:
            return self.should_terminate
        return self.status == VerificationStatus.terminate

    @property
    def is_continue(self) -> bool:
        """True if task should continue (supports both new and legacy formats)."""
        if not self.status:
            # Legacy format: continue only when neither flag is set.
            return not (self.user_goal_achieved or self.should_terminate)
        return self.status == VerificationStatus.continue_step
class InputOrSelectContext(BaseModel):
    """Optional context describing the field targeted by an input or select action."""

    intention: str | None = None
    field: str | None = None
    is_required: bool | None = None
    # don't trigger custom-selection logic when it's a search bar
    is_search_bar: bool | None = None
    # address input usually requires auto completion
    is_location_input: bool | None = None
    # date picker mini agent requires some special logic
    is_date_related: bool | None = None
    date_format: str | None = None

    def __repr__(self) -> str:
        """Return a compact representation of the context.

        NOTE(review): ``is_date_related`` and ``date_format`` are not part of
        the output.
        """
        return (
            f"InputOrSelectContext(field={self.field}, "
            f"is_required={self.is_required}, "
            f"is_search_bar={self.is_search_bar}, "
            f"is_location_input={self.is_location_input}, "
            f"intention={self.intention})"
        )
class ClickContext(BaseModel):
    """Optional context attached to a click: a thought and a single-option-click flag."""

    thought: str | None = None
    single_option_click: bool | None = None
class Action(BaseModel):
    """Base model shared by every action class in this module.

    Besides the common bookkeeping fields, it declares (as optional) the
    fields that only some subclasses use; subclasses narrow those to required
    or give them concrete defaults.
    """

    model_config = ConfigDict(from_attributes=True)

    action_type: ActionType
    status: ActionStatus = ActionStatus.pending
    action_id: str | None = None
    source_action_id: str | None = None
    organization_id: str | None = None
    workflow_run_id: str | None = None
    task_id: str | None = None
    step_id: str | None = None
    step_order: int | None = None
    action_order: int | None = None
    confidence_float: float | None = None
    description: str | None = None
    reasoning: str | None = None
    intention: str | None = None
    response: str | None = None
    element_id: Annotated[str, Field(coerce_numbers_to_str=True)] | None = None
    skyvern_element_hash: str | None = None
    skyvern_element_data: dict[str, Any] | None = None
    tool_call_id: str | None = None
    xpath: str | None = None

    # DecisiveAction (CompleteAction, TerminateAction) fields
    errors: list[UserDefinedError] | None = None
    data_extraction_goal: str | None = None

    # WebAction fields
    file_name: str | None = None
    file_url: str | None = None
    download: bool | None = None
    is_upload_file_tag: bool | None = None
    text: str | None = None
    input_or_select_context: InputOrSelectContext | None = None
    option: SelectOption | None = None
    is_checked: bool | None = None
    verified: bool = False
    click_context: ClickContext | None = None

    # TOTP timing information for multi-field TOTP sequences
    totp_timing_info: dict[str, Any] | None = None

    created_at: datetime | None = None
    modified_at: datetime | None = None
    created_by: str | None = None

    @classmethod
    def validate(cls: Type[T], value: Any) -> T:
        """Build the concrete action model matching ``value["action_type"]``.

        Args:
            value: A dict holding the action data, including an
                ``"action_type"`` key.

        Returns:
            An instance of the action subclass registered for that type.

        Raises:
            ValueError: If ``value`` is not a dict or its action type is not
                one of the supported types listed below.
            KeyError: If ``value`` has no ``"action_type"`` key.
        """
        if not isinstance(value, dict):
            raise ValueError("Invalid action data")

        action_type = value["action_type"]

        # Looked up at call time because the subclasses are defined further
        # down in this module.
        supported = (
            (ActionType.CLICK, ClickAction),
            (ActionType.INPUT_TEXT, InputTextAction),
            (ActionType.UPLOAD_FILE, UploadFileAction),
            (ActionType.DOWNLOAD_FILE, DownloadFileAction),
            (ActionType.NULL_ACTION, NullAction),
            (ActionType.TERMINATE, TerminateAction),
            (ActionType.COMPLETE, CompleteAction),
            (ActionType.SELECT_OPTION, SelectOptionAction),
            (ActionType.CHECKBOX, CheckboxAction),
            (ActionType.WAIT, WaitAction),
            (ActionType.SOLVE_CAPTCHA, SolveCaptchaAction),
            (ActionType.RELOAD_PAGE, ReloadPageAction),
            (ActionType.GOTO_URL, GotoUrlAction),
            (ActionType.CLOSE_PAGE, ClosePageAction),
        )
        for known_type, model in supported:
            # Equality instead of identity: a dict coming from deserialized
            # data carries the action type as a plain string, which is never
            # the same object as the enum member.
            # NOTE(review): string matching relies on ActionType being a
            # str-based enum (e.g. StrEnum) - confirm in action_types.py.
            if action_type == known_type:
                return model.model_validate(value)

        raise ValueError(f"Unsupported action type: {action_type}")

    def get_xpath(self) -> str | None:
        """Return the action's xpath.

        Prefers the ``xpath`` field; otherwise falls back to the ``"xpath"``
        entry of ``skyvern_element_data``. Returns None when neither is
        available.
        """
        if self.xpath:
            return self.xpath
        if not self.skyvern_element_data:
            return None
        return self.skyvern_element_data.get("xpath")
class WebAction(Action):
    """Action that targets an element; ``element_id`` is required here."""

    # Numeric ids are coerced to strings.
    element_id: Annotated[str, Field(coerce_numbers_to_str=True)]
class DecisiveAction(Action):
    """Base for CompleteAction and TerminateAction; ``errors`` defaults to an empty list."""

    errors: list[UserDefinedError] = []
# TODO: consider implementing this as a WebAction in the future
class ReloadPageAction(Action):
    """Action whose type is fixed to ``ActionType.RELOAD_PAGE``."""

    action_type: ActionType = ActionType.RELOAD_PAGE
# TODO: right now, it's only enabled when there's a magic link during login
class ClosePageAction(Action):
    """Action whose type is fixed to ``ActionType.CLOSE_PAGE``."""

    action_type: ActionType = ActionType.CLOSE_PAGE
class ClickAction(WebAction):
    """Click on an element, with optional coordinates, button and repeat count."""

    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None
    download: bool = False
    x: int | None = None
    y: int | None = None
    button: str = "left"
    # normal click: 1, double click: 2, triple click: 3
    repeat: int = 1

    def __repr__(self) -> str:
        """Return a compact representation of the click (``repeat`` is not included)."""
        return (
            f"ClickAction(element_id={self.element_id}, "
            f"file_url={self.file_url}, "
            f"download={self.download}, "
            f"x={self.x}, y={self.y}, "
            f"button={self.button}, "
            f"tool_call_id={self.tool_call_id})"
        )
class InputTextAction(WebAction):
    """Type ``text`` into an element; ``text`` is required."""

    action_type: ActionType = ActionType.INPUT_TEXT
    text: str
    totp_code_required: bool = False

    def __repr__(self) -> str:
        """Return a compact representation including the input/select context."""
        return (
            f"InputTextAction(element_id={self.element_id}, "
            f"text={self.text}, "
            f"context={self.input_or_select_context}, "
            f"tool_call_id={self.tool_call_id})"
        )
class UploadFileAction(WebAction):
    """Upload the file at ``file_url`` to an element; ``file_url`` is required."""

    action_type: ActionType = ActionType.UPLOAD_FILE
    file_url: str
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        """Return a compact representation; ``file_url`` is shown under the ``file`` key."""
        return (
            f"UploadFileAction(element_id={self.element_id}, "
            f"file={self.file_url}, "
            f"is_upload_file_tag={self.is_upload_file_tag})"
        )
# this is a deprecated action type
class DownloadFileAction(WebAction):
    """Deprecated download action; ``file_name`` is required."""

    action_type: ActionType = ActionType.DOWNLOAD_FILE
    file_name: str

    def __repr__(self) -> str:
        """Return a compact representation with the element id and file name."""
        return (
            f"DownloadFileAction(element_id={self.element_id}, "
            f"file_name={self.file_name})"
        )
class NullAction(Action):
    """Action whose type is fixed to ``ActionType.NULL_ACTION``."""

    action_type: ActionType = ActionType.NULL_ACTION
class SolveCaptchaAction(Action):
    """Action whose type is fixed to ``ActionType.SOLVE_CAPTCHA``."""

    action_type: ActionType = ActionType.SOLVE_CAPTCHA
class SelectOptionAction(WebAction):
    """Select ``option`` in an element; ``option`` is required."""

    action_type: ActionType = ActionType.SELECT_OPTION
    option: SelectOption
    download: bool = False

    def __repr__(self) -> str:
        """Return a compact representation including the option and its context."""
        return (
            f"SelectOptionAction(element_id={self.element_id}, "
            f"option={self.option}, "
            f"context={self.input_or_select_context}, "
            f"download={self.download})"
        )
###
# This action causes more harm than it does good.
# It frequently misbehaves, or gets stuck in click loops.
# Treating checkbox actions as click actions seems to perform way more reliably.
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
class CheckboxAction(WebAction):
    """Set a checkbox element to the state given by ``is_checked`` (required)."""

    action_type: ActionType = ActionType.CHECKBOX
    is_checked: bool

    def __repr__(self) -> str:
        """Return a compact representation with the element id and target state."""
        return (
            f"CheckboxAction(element_id={self.element_id}, "
            f"is_checked={self.is_checked})"
        )
class WaitAction(Action):
    """Wait action; ``seconds`` defaults to 20."""

    action_type: ActionType = ActionType.WAIT
    seconds: int = 20
class TerminateAction(DecisiveAction):
    """Decisive action whose type is fixed to ``ActionType.TERMINATE``."""

    action_type: ActionType = ActionType.TERMINATE
class CompleteAction(DecisiveAction):
    """Decisive action whose type is fixed to ``ActionType.COMPLETE``."""

    action_type: ActionType = ActionType.COMPLETE
    verified: bool = False
    data_extraction_goal: str | None = None
class ExtractAction(Action):
    """Extraction action carrying an optional goal and an optional schema."""

    action_type: ActionType = ActionType.EXTRACT
    data_extraction_goal: str | None = None
    # The schema may be given as a dict, a list or a string.
    data_extraction_schema: dict[str, Any] | list | str | None = None
class ScrollAction(Action):
    """Scroll action; ``scroll_x`` and ``scroll_y`` are required, ``x``/``y`` optional."""

    action_type: ActionType = ActionType.SCROLL
    x: int | None = None
    y: int | None = None
    scroll_x: int
    scroll_y: int
class KeypressAction(Action):
    """Keypress action with a list of keys plus ``hold`` and ``duration`` settings."""

    action_type: ActionType = ActionType.KEYPRESS
    keys: list[str] = []
    hold: bool = False
    duration: int = 0
class GotoUrlAction(Action):
    """Navigate to ``url`` (required)."""

    action_type: ActionType = ActionType.GOTO_URL
    url: str
    # if True, shouldn't go to url directly when replaying the cache
    is_magic_link: bool = False
class MoveAction(Action):
    """Move action with required ``x`` and ``y`` coordinates."""

    action_type: ActionType = ActionType.MOVE
    x: int
    y: int
class DragAction(Action):
    """Drag action with an optional start point and a path of (int, int) pairs."""

    action_type: ActionType = ActionType.DRAG
    start_x: int | None = None
    start_y: int | None = None
    path: list[tuple[int, int]] = []
class VerificationCodeAction(Action):
    """Action carrying a required ``verification_code`` string."""

    action_type: ActionType = ActionType.VERIFICATION_CODE
    verification_code: str
class LeftMouseAction(Action):
    """Left mouse action with a required ``direction`` of "down" or "up"."""

    action_type: ActionType = ActionType.LEFT_MOUSE
    direction: Literal["down", "up"]
    x: int | None = None
    y: int | None = None
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
    1. JSON representation of what the user is seeing
    """

    # Required field (no default); None is an accepted value.
    scraped_data: dict[str, Any] | list | str | None