SDK: Support AI act feature (#3888)
This commit is contained in:
committed by
GitHub
parent
bdbabd5153
commit
ba99e9ea2c
@@ -7,6 +7,7 @@ from importlib import import_module
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from .types import (
|
||||
ActAction,
|
||||
Action,
|
||||
ActionBlock,
|
||||
ActionBlockDataSchema,
|
||||
@@ -271,6 +272,7 @@ if typing.TYPE_CHECKING:
|
||||
ScriptFileCreate,
|
||||
ScriptRunResponse,
|
||||
SdkAction,
|
||||
SdkAction_AiAct,
|
||||
SdkAction_AiClick,
|
||||
SdkAction_AiInputText,
|
||||
SdkAction_AiSelectOption,
|
||||
@@ -466,6 +468,7 @@ if typing.TYPE_CHECKING:
|
||||
from .environment import SkyvernEnvironment
|
||||
from .version import __version__
|
||||
_dynamic_imports: typing.Dict[str, str] = {
|
||||
"ActAction": ".types",
|
||||
"Action": ".types",
|
||||
"ActionBlock": ".types",
|
||||
"ActionBlockDataSchema": ".types",
|
||||
@@ -734,6 +737,7 @@ _dynamic_imports: typing.Dict[str, str] = {
|
||||
"ScriptFileCreate": ".types",
|
||||
"ScriptRunResponse": ".types",
|
||||
"SdkAction": ".types",
|
||||
"SdkAction_AiAct": ".types",
|
||||
"SdkAction_AiClick": ".types",
|
||||
"SdkAction_AiInputText": ".types",
|
||||
"SdkAction_AiSelectOption": ".types",
|
||||
@@ -952,6 +956,7 @@ def __dir__():
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ActAction",
|
||||
"Action",
|
||||
"ActionBlock",
|
||||
"ActionBlockDataSchema",
|
||||
@@ -1220,6 +1225,7 @@ __all__ = [
|
||||
"ScriptFileCreate",
|
||||
"ScriptRunResponse",
|
||||
"SdkAction",
|
||||
"SdkAction_AiAct",
|
||||
"SdkAction_AiClick",
|
||||
"SdkAction_AiInputText",
|
||||
"SdkAction_AiSelectOption",
|
||||
|
||||
@@ -6,6 +6,7 @@ import typing
|
||||
from importlib import import_module
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from .act_action import ActAction
|
||||
from .action import Action
|
||||
from .action_block import ActionBlock
|
||||
from .action_block_data_schema import ActionBlockDataSchema
|
||||
@@ -297,6 +298,7 @@ if typing.TYPE_CHECKING:
|
||||
from .script_run_response import ScriptRunResponse
|
||||
from .sdk_action import (
|
||||
SdkAction,
|
||||
SdkAction_AiAct,
|
||||
SdkAction_AiClick,
|
||||
SdkAction_AiInputText,
|
||||
SdkAction_AiSelectOption,
|
||||
@@ -505,6 +507,7 @@ if typing.TYPE_CHECKING:
|
||||
from .workflow_run_timeline_type import WorkflowRunTimelineType
|
||||
from .workflow_status import WorkflowStatus
|
||||
_dynamic_imports: typing.Dict[str, str] = {
|
||||
"ActAction": ".act_action",
|
||||
"Action": ".action",
|
||||
"ActionBlock": ".action_block",
|
||||
"ActionBlockDataSchema": ".action_block_data_schema",
|
||||
@@ -769,6 +772,7 @@ _dynamic_imports: typing.Dict[str, str] = {
|
||||
"ScriptFileCreate": ".script_file_create",
|
||||
"ScriptRunResponse": ".script_run_response",
|
||||
"SdkAction": ".sdk_action",
|
||||
"SdkAction_AiAct": ".sdk_action",
|
||||
"SdkAction_AiClick": ".sdk_action",
|
||||
"SdkAction_AiInputText": ".sdk_action",
|
||||
"SdkAction_AiSelectOption": ".sdk_action",
|
||||
@@ -982,6 +986,7 @@ def __dir__():
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ActAction",
|
||||
"Action",
|
||||
"ActionBlock",
|
||||
"ActionBlockDataSchema",
|
||||
@@ -1246,6 +1251,7 @@ __all__ = [
|
||||
"ScriptFileCreate",
|
||||
"ScriptRunResponse",
|
||||
"SdkAction",
|
||||
"SdkAction_AiAct",
|
||||
"SdkAction_AiClick",
|
||||
"SdkAction_AiInputText",
|
||||
"SdkAction_AiSelectOption",
|
||||
|
||||
@@ -68,6 +68,20 @@ class SdkAction_AiSelectOption(UniversalBaseModel):
|
||||
extra = pydantic.Extra.allow
|
||||
|
||||
|
||||
class SdkAction_AiAct(UniversalBaseModel):
    # Client-side variant of the SdkAction union, tagged by type == "ai_act".
    # Documented with comments rather than a docstring so the model's schema
    # description is left untouched.
    type: typing.Literal["ai_act"] = "ai_act"
    # Free-form description of what to do on the page; optional on the client.
    intention: typing.Optional[str] = None

    # Instances are immutable and keep unknown fields under both pydantic majors.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
    else:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True
|
||||
|
||||
|
||||
class SdkAction_Extract(UniversalBaseModel):
|
||||
type: typing.Literal["extract"] = "extract"
|
||||
prompt: typing.Optional[str] = None
|
||||
@@ -86,4 +100,6 @@ class SdkAction_Extract(UniversalBaseModel):
|
||||
extra = pydantic.Extra.allow
|
||||
|
||||
|
||||
SdkAction = typing.Union[SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract]
|
||||
SdkAction = typing.Union[
|
||||
SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_AiAct, SdkAction_Extract
|
||||
]
|
||||
|
||||
@@ -20,7 +20,9 @@ from skyvern.utils.prompt_engine import load_prompt_with_elements
|
||||
from skyvern.webeye.actions import handler_utils
|
||||
from skyvern.webeye.actions.actions import (
|
||||
ActionStatus,
|
||||
ClickAction,
|
||||
InputTextAction,
|
||||
SelectOptionAction,
|
||||
UploadFileAction,
|
||||
)
|
||||
from skyvern.webeye.actions.handler import (
|
||||
@@ -171,7 +173,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
||||
actions = parse_actions(
|
||||
task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
|
||||
)
|
||||
action = actions[0]
|
||||
action = cast(ClickAction, actions[0])
|
||||
result = await handle_click_action(action, self.page, self.scraped_page, task, step)
|
||||
if result and result[-1].success is False:
|
||||
raise Exception(result[-1].exception_message)
|
||||
@@ -452,7 +454,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
||||
task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
|
||||
)
|
||||
if actions:
|
||||
action = actions[0]
|
||||
action = cast(SelectOptionAction, actions[0])
|
||||
if not action.option:
|
||||
raise ValueError("SelectOptionAction requires an 'option' field")
|
||||
option_value = action.option.value or action.option.label or ""
|
||||
@@ -531,6 +533,132 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
||||
print(f"{'-' * 50}\n")
|
||||
return result
|
||||
|
||||
async def ai_act(
    self,
    prompt: str,
) -> None:
    """Perform one action on the page, chosen by AI from a natural language prompt.

    Works in two LLM passes: the first classifies ``prompt`` into an action
    type (CLICK, INPUT_TEXT, UPLOAD_FILE or SELECT_OPTION); the second
    generates the concrete action against a freshly scraped element tree,
    which is then dispatched to the matching action handler.

    The method is best-effort: a missing task/step, an unusable inference
    response, an unsupported action type, and any error raised while
    generating, parsing or executing the action are logged and the method
    returns without raising. Errors from the earlier steps (context/database
    lookup, the inference call itself, the page re-scrape and prompt
    rendering) are not caught here.

    Args:
        prompt: Natural language description of the action to perform.
    """
    context = skyvern_context.ensure_context()
    organization_id = context.organization_id
    task_id = context.task_id
    step_id = context.step_id

    task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
    step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None

    if not task or not step:
        LOG.warning("ai_act: missing task or step", task_id=task_id, step_id=step_id)
        return

    # First, infer the action type from the prompt
    infer_action_type_prompt = prompt_engine.load_prompt(
        template="infer-action-type",
        navigation_goal=prompt,
    )

    json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
        prompt=infer_action_type_prompt,
        prompt_name="infer-action-type",
        step=step,
        organization_id=organization_id,
    )

    # The LLM output is untrusted: validate its shape before indexing into it so a
    # malformed response is logged like every other failure instead of raising.
    if not isinstance(json_response, dict) or "inferred_actions" not in json_response:
        LOG.warning("ai_act: failed to infer action type", prompt=prompt, response=json_response)
        return

    inferred_actions = json_response["inferred_actions"]
    if not inferred_actions:
        error = json_response.get("error")
        LOG.warning("ai_act: no action type inferred", prompt=prompt, error=error)
        return

    if not isinstance(inferred_actions, list) or not isinstance(inferred_actions[0], dict):
        LOG.warning("ai_act: failed to infer action type", prompt=prompt, response=json_response)
        return

    # Only the first inferred action is acted upon.
    action_info = inferred_actions[0]
    action_type = action_info.get("action_type")
    confidence = action_info.get("confidence_float", 0.0)

    LOG.info(
        "ai_act: inferred action type",
        prompt=prompt,
        action_type=action_type,
        confidence=confidence,
        reasoning=action_info.get("reasoning"),
    )

    # Re-scrape so the element tree handed to the second pass reflects the current page.
    refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
    self.scraped_page = refreshed_page
    element_tree = refreshed_page.build_element_tree()

    # action type -> (prompt template, LLM handler attribute on `app`,
    #                 expected parsed action class, executor coroutine).
    # Handlers are stored by name so only the selected one is looked up on `app`.
    dispatch: dict[str, tuple[str, str, type, Any]] = {
        "CLICK": (
            "single-click-action",
            "SINGLE_CLICK_AGENT_LLM_API_HANDLER",
            ClickAction,
            handle_click_action,
        ),
        "INPUT_TEXT": (
            "single-input-action",
            "SINGLE_INPUT_AGENT_LLM_API_HANDLER",
            InputTextAction,
            handle_input_text_action,
        ),
        "UPLOAD_FILE": (
            "single-upload-action",
            "SINGLE_INPUT_AGENT_LLM_API_HANDLER",
            UploadFileAction,
            handle_upload_file_action,
        ),
        "SELECT_OPTION": (
            "single-select-action",
            "SELECT_AGENT_LLM_API_HANDLER",
            SelectOptionAction,
            handle_select_option_action,
        ),
    }

    # Non-string values (None, lists, ...) can never name a supported type.
    plan = dispatch.get(action_type) if isinstance(action_type, str) else None
    if plan is None:
        LOG.warning("ai_act: unknown action type", action_type=action_type, prompt=prompt)
        return

    template, handler_name, expected_cls, execute = plan
    llm_handler: Any = getattr(app, handler_name)

    single_action_prompt = prompt_engine.load_prompt(
        template=template,
        navigation_goal=prompt,
        navigation_payload_str=None,
        current_url=self.page.url,
        elements=element_tree,
        # Prefer the context's timezone; fall back to the machine's local timezone.
        local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
    )

    try:
        action_response = await llm_handler(
            prompt=single_action_prompt,
            prompt_name=template,
            step=step,
            organization_id=organization_id,
        )

        actions_json = action_response.get("actions", [])
        if not actions_json:
            LOG.warning("ai_act: no actions generated", prompt=prompt, action_type=action_type)
            return

        actions = parse_actions(task, step.step_id, step.order, refreshed_page, actions_json)
        if not actions:
            LOG.warning("ai_act: failed to parse actions", prompt=prompt, action_type=action_type)
            return

        # Only the first parsed action is executed.
        action = actions[0]

        # The second pass must produce the same kind of action the first pass inferred.
        if not isinstance(action, expected_cls):
            LOG.warning(
                "ai_act: action type mismatch",
                expected_type=action_type,
                actual_type=type(action).__name__,
                prompt=prompt,
            )
            return

        result = await execute(action, self.page, refreshed_page, task, step)

        # Raised only to reuse the logging path below: it is caught by the
        # except clause and never reaches the caller.
        if result and result[-1].success is False:
            raise Exception(result[-1].exception_message)

    except Exception:
        LOG.exception("ai_act: failed to execute action", action_type=action_type, prompt=prompt)
|
||||
|
||||
|
||||
async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any:
|
||||
"""
|
||||
|
||||
@@ -63,3 +63,10 @@ class SkyvernPageAi(Protocol):
|
||||
) -> dict[str, Any] | list | str | None:
|
||||
"""Extract information from the page using AI."""
|
||||
...
|
||||
|
||||
async def ai_act(self, prompt: str) -> None:
    """Carry out the action described by ``prompt`` on the page using AI.

    Args:
        prompt: Natural language description of the action to perform.
    """
    ...
|
||||
|
||||
@@ -174,6 +174,11 @@ async def run_sdk_action(
|
||||
data=action.data,
|
||||
timeout=action.timeout,
|
||||
)
|
||||
elif action.type == "ai_act":
|
||||
await page_ai.ai_act(
|
||||
prompt=action.intention,
|
||||
)
|
||||
result = None
|
||||
elif action.type == "extract":
|
||||
extract_result = await page_ai.ai_extract(
|
||||
prompt=action.prompt,
|
||||
|
||||
@@ -12,6 +12,7 @@ class SdkActionType(str, Enum):
|
||||
AI_CLICK = "ai_click"
|
||||
AI_INPUT_TEXT = "ai_input_text"
|
||||
AI_SELECT_OPTION = "ai_select_option"
|
||||
AI_ACT = "ai_act"
|
||||
EXTRACT = "extract"
|
||||
|
||||
|
||||
@@ -57,6 +58,14 @@ class SelectOptionAction(SdkActionBase):
|
||||
timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds")
|
||||
|
||||
|
||||
class ActAction(SdkActionBase):
    """AI act action parameters."""

    # Discriminator value that selects this variant of the SdkAction union.
    type: Literal["ai_act"] = "ai_act"
    # Defaults to an empty string, so a payload without `intention` still validates.
    intention: str = Field(default="", description="Natural language prompt for the action")
    # NOTE(review): the visible `run_sdk_action` branch for "ai_act" passes only
    # `intention` to `ai_act`; `data` does not appear to be forwarded — confirm.
    data: str | dict[str, Any] | None = Field(default=None, description="Additional context data")
|
||||
|
||||
|
||||
class ExtractAction(SdkActionBase):
|
||||
"""Extract data action parameters."""
|
||||
|
||||
@@ -70,7 +79,7 @@ class ExtractAction(SdkActionBase):
|
||||
|
||||
# Discriminated union of all action types
|
||||
SdkAction = Annotated[
|
||||
Union[ClickAction, InputTextAction, SelectOptionAction, ExtractAction],
|
||||
Union[ClickAction, InputTextAction, SelectOptionAction, ActAction, ExtractAction],
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
|
||||
|
||||
@@ -586,6 +586,23 @@ class SkyvernBrowserPage:
|
||||
"""
|
||||
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
|
||||
|
||||
async def act(self, prompt: str) -> None:
    """Ask the AI backend to carry out a single action described in plain language.

    Delegates to the page's AI implementation (``self._ai.ai_act``) and hands
    back whatever that call returns.

    Args:
        prompt: Natural language description of the action to perform.

    Examples:
        ```python
        # Simple action
        await page.act("Click the login button")
        ```
    """
    outcome = await self._ai.ai_act(prompt)
    return outcome
|
||||
|
||||
async def reload(self, **kwargs: Any) -> None:
|
||||
"""Reload the current page.
|
||||
|
||||
|
||||
@@ -2,7 +2,13 @@ from typing import TYPE_CHECKING, Any
|
||||
|
||||
from playwright.async_api import Page
|
||||
|
||||
from skyvern.client import SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract
|
||||
from skyvern.client import (
|
||||
SdkAction_AiAct,
|
||||
SdkAction_AiClick,
|
||||
SdkAction_AiInputText,
|
||||
SdkAction_AiSelectOption,
|
||||
SdkAction_Extract,
|
||||
)
|
||||
from skyvern.config import settings
|
||||
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
|
||||
|
||||
@@ -140,3 +146,21 @@ class SdkSkyvernPageAi(SkyvernPageAi):
|
||||
)
|
||||
self._browser.workflow_run_id = response.workflow_run_id
|
||||
return response.result if response.result else None
|
||||
|
||||
async def ai_act(self, prompt: str) -> None:
    """Run an AI "act" action for the current page through the SDK API.

    Ensures a server is available, submits an ``SdkAction_AiAct`` carrying
    ``prompt`` as its intention together with the browser's session, address
    and workflow run identifiers, then stores the workflow run id from the
    response back on the browser.

    Args:
        prompt: Natural language description of the action to perform.
    """
    await self._browser.sdk.ensure_has_server()

    browser = self._browser
    act_action = SdkAction_AiAct(intention=prompt)
    response = await browser.client.run_sdk_action(
        url=self._page.url,
        action=act_action,
        browser_session_id=browser.browser_session_id,
        browser_address=browser.browser_address,
        workflow_run_id=browser.workflow_run_id,
    )

    # Keep the run id returned by the server on the browser for subsequent calls.
    self._browser.workflow_run_id = response.workflow_run_id
|
||||
|
||||
Reference in New Issue
Block a user