SDK: Support AI act feature (#3888)

This commit is contained in:
Stanislav Novosad
2025-11-04 11:28:43 -07:00
committed by GitHub
parent bdbabd5153
commit ba99e9ea2c
10 changed files with 246 additions and 5 deletions

View File

@@ -9570,6 +9570,25 @@
"required": ["type"], "required": ["type"],
"title": "SelectOptionAction" "title": "SelectOptionAction"
}, },
"ActAction": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "ai_act",
"title": "Type",
"default": "ai_act"
},
"intention": {
"type": "string",
"title": "Prompt",
"description": "Natural language prompt for the action",
"default": ""
}
},
"required": ["type"],
"title": "ActAction"
},
"ExtractAction": { "ExtractAction": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -9660,6 +9679,9 @@
{ {
"$ref": "#/components/schemas/SelectOptionAction" "$ref": "#/components/schemas/SelectOptionAction"
}, },
{
"$ref": "#/components/schemas/ActAction"
},
{ {
"$ref": "#/components/schemas/ExtractAction" "$ref": "#/components/schemas/ExtractAction"
} }
@@ -9670,6 +9692,7 @@
"ai_click": "#/components/schemas/ClickAction", "ai_click": "#/components/schemas/ClickAction",
"ai_input_text": "#/components/schemas/InputTextAction", "ai_input_text": "#/components/schemas/InputTextAction",
"ai_select_option": "#/components/schemas/SelectOptionAction", "ai_select_option": "#/components/schemas/SelectOptionAction",
"ai_act": "#/components/schemas/ActAction",
"extract": "#/components/schemas/ExtractAction" "extract": "#/components/schemas/ExtractAction"
} }
}, },

View File

@@ -7,6 +7,7 @@ from importlib import import_module
if typing.TYPE_CHECKING: if typing.TYPE_CHECKING:
from .types import ( from .types import (
ActAction,
Action, Action,
ActionBlock, ActionBlock,
ActionBlockDataSchema, ActionBlockDataSchema,
@@ -271,6 +272,7 @@ if typing.TYPE_CHECKING:
ScriptFileCreate, ScriptFileCreate,
ScriptRunResponse, ScriptRunResponse,
SdkAction, SdkAction,
SdkAction_AiAct,
SdkAction_AiClick, SdkAction_AiClick,
SdkAction_AiInputText, SdkAction_AiInputText,
SdkAction_AiSelectOption, SdkAction_AiSelectOption,
@@ -466,6 +468,7 @@ if typing.TYPE_CHECKING:
from .environment import SkyvernEnvironment from .environment import SkyvernEnvironment
from .version import __version__ from .version import __version__
_dynamic_imports: typing.Dict[str, str] = { _dynamic_imports: typing.Dict[str, str] = {
"ActAction": ".types",
"Action": ".types", "Action": ".types",
"ActionBlock": ".types", "ActionBlock": ".types",
"ActionBlockDataSchema": ".types", "ActionBlockDataSchema": ".types",
@@ -734,6 +737,7 @@ _dynamic_imports: typing.Dict[str, str] = {
"ScriptFileCreate": ".types", "ScriptFileCreate": ".types",
"ScriptRunResponse": ".types", "ScriptRunResponse": ".types",
"SdkAction": ".types", "SdkAction": ".types",
"SdkAction_AiAct": ".types",
"SdkAction_AiClick": ".types", "SdkAction_AiClick": ".types",
"SdkAction_AiInputText": ".types", "SdkAction_AiInputText": ".types",
"SdkAction_AiSelectOption": ".types", "SdkAction_AiSelectOption": ".types",
@@ -952,6 +956,7 @@ def __dir__():
__all__ = [ __all__ = [
"ActAction",
"Action", "Action",
"ActionBlock", "ActionBlock",
"ActionBlockDataSchema", "ActionBlockDataSchema",
@@ -1220,6 +1225,7 @@ __all__ = [
"ScriptFileCreate", "ScriptFileCreate",
"ScriptRunResponse", "ScriptRunResponse",
"SdkAction", "SdkAction",
"SdkAction_AiAct",
"SdkAction_AiClick", "SdkAction_AiClick",
"SdkAction_AiInputText", "SdkAction_AiInputText",
"SdkAction_AiSelectOption", "SdkAction_AiSelectOption",

View File

@@ -6,6 +6,7 @@ import typing
from importlib import import_module from importlib import import_module
if typing.TYPE_CHECKING: if typing.TYPE_CHECKING:
from .act_action import ActAction
from .action import Action from .action import Action
from .action_block import ActionBlock from .action_block import ActionBlock
from .action_block_data_schema import ActionBlockDataSchema from .action_block_data_schema import ActionBlockDataSchema
@@ -297,6 +298,7 @@ if typing.TYPE_CHECKING:
from .script_run_response import ScriptRunResponse from .script_run_response import ScriptRunResponse
from .sdk_action import ( from .sdk_action import (
SdkAction, SdkAction,
SdkAction_AiAct,
SdkAction_AiClick, SdkAction_AiClick,
SdkAction_AiInputText, SdkAction_AiInputText,
SdkAction_AiSelectOption, SdkAction_AiSelectOption,
@@ -505,6 +507,7 @@ if typing.TYPE_CHECKING:
from .workflow_run_timeline_type import WorkflowRunTimelineType from .workflow_run_timeline_type import WorkflowRunTimelineType
from .workflow_status import WorkflowStatus from .workflow_status import WorkflowStatus
_dynamic_imports: typing.Dict[str, str] = { _dynamic_imports: typing.Dict[str, str] = {
"ActAction": ".act_action",
"Action": ".action", "Action": ".action",
"ActionBlock": ".action_block", "ActionBlock": ".action_block",
"ActionBlockDataSchema": ".action_block_data_schema", "ActionBlockDataSchema": ".action_block_data_schema",
@@ -769,6 +772,7 @@ _dynamic_imports: typing.Dict[str, str] = {
"ScriptFileCreate": ".script_file_create", "ScriptFileCreate": ".script_file_create",
"ScriptRunResponse": ".script_run_response", "ScriptRunResponse": ".script_run_response",
"SdkAction": ".sdk_action", "SdkAction": ".sdk_action",
"SdkAction_AiAct": ".sdk_action",
"SdkAction_AiClick": ".sdk_action", "SdkAction_AiClick": ".sdk_action",
"SdkAction_AiInputText": ".sdk_action", "SdkAction_AiInputText": ".sdk_action",
"SdkAction_AiSelectOption": ".sdk_action", "SdkAction_AiSelectOption": ".sdk_action",
@@ -982,6 +986,7 @@ def __dir__():
__all__ = [ __all__ = [
"ActAction",
"Action", "Action",
"ActionBlock", "ActionBlock",
"ActionBlockDataSchema", "ActionBlockDataSchema",
@@ -1246,6 +1251,7 @@ __all__ = [
"ScriptFileCreate", "ScriptFileCreate",
"ScriptRunResponse", "ScriptRunResponse",
"SdkAction", "SdkAction",
"SdkAction_AiAct",
"SdkAction_AiClick", "SdkAction_AiClick",
"SdkAction_AiInputText", "SdkAction_AiInputText",
"SdkAction_AiSelectOption", "SdkAction_AiSelectOption",

View File

@@ -68,6 +68,20 @@ class SdkAction_AiSelectOption(UniversalBaseModel):
extra = pydantic.Extra.allow extra = pydantic.Extra.allow
class SdkAction_AiAct(UniversalBaseModel):
    # Client-side "ai_act" member of the SdkAction union; the literal `type`
    # value is what distinguishes it from the other SdkAction_* variants.
    type: typing.Literal["ai_act"] = "ai_act"
    # Natural language instruction describing the action to perform.
    # NOTE(review): optional here (defaults to None) — confirm this matches
    # what the server-side model expects for an omitted intention.
    intention: typing.Optional[str] = None

    # Both configurations below make instances frozen and accept extra fields;
    # which one applies depends on the installed pydantic major version.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
class SdkAction_Extract(UniversalBaseModel): class SdkAction_Extract(UniversalBaseModel):
type: typing.Literal["extract"] = "extract" type: typing.Literal["extract"] = "extract"
prompt: typing.Optional[str] = None prompt: typing.Optional[str] = None
@@ -86,4 +100,6 @@ class SdkAction_Extract(UniversalBaseModel):
extra = pydantic.Extra.allow extra = pydantic.Extra.allow
SdkAction = typing.Union[SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract] SdkAction = typing.Union[
SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_AiAct, SdkAction_Extract
]

View File

@@ -20,7 +20,9 @@ from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions import handler_utils from skyvern.webeye.actions import handler_utils
from skyvern.webeye.actions.actions import ( from skyvern.webeye.actions.actions import (
ActionStatus, ActionStatus,
ClickAction,
InputTextAction, InputTextAction,
SelectOptionAction,
UploadFileAction, UploadFileAction,
) )
from skyvern.webeye.actions.handler import ( from skyvern.webeye.actions.handler import (
@@ -171,7 +173,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
actions = parse_actions( actions = parse_actions(
task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
) )
action = actions[0] action = cast(ClickAction, actions[0])
result = await handle_click_action(action, self.page, self.scraped_page, task, step) result = await handle_click_action(action, self.page, self.scraped_page, task, step)
if result and result[-1].success is False: if result and result[-1].success is False:
raise Exception(result[-1].exception_message) raise Exception(result[-1].exception_message)
@@ -452,7 +454,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
) )
if actions: if actions:
action = actions[0] action = cast(SelectOptionAction, actions[0])
if not action.option: if not action.option:
raise ValueError("SelectOptionAction requires an 'option' field") raise ValueError("SelectOptionAction requires an 'option' field")
option_value = action.option.value or action.option.label or "" option_value = action.option.value or action.option.label or ""
@@ -531,6 +533,132 @@ class RealSkyvernPageAi(SkyvernPageAi):
print(f"{'-' * 50}\n") print(f"{'-' * 50}\n")
return result return result
async def ai_act(
self,
prompt: str,
) -> None:
"""Perform an action on the page using AI based on a natural language prompt."""
context = skyvern_context.ensure_context()
organization_id = context.organization_id
task_id = context.task_id
step_id = context.step_id
task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
if not task or not step:
LOG.warning("ai_act: missing task or step", task_id=task_id, step_id=step_id)
return
# First, infer the action type from the prompt
infer_action_type_prompt = prompt_engine.load_prompt(
template="infer-action-type",
navigation_goal=prompt,
)
json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
prompt=infer_action_type_prompt,
prompt_name="infer-action-type",
step=step,
organization_id=organization_id,
)
if not json_response or "inferred_actions" not in json_response:
LOG.warning("ai_act: failed to infer action type", prompt=prompt, response=json_response)
return
inferred_actions = json_response.get("inferred_actions", [])
if not inferred_actions:
error = json_response.get("error")
LOG.warning("ai_act: no action type inferred", prompt=prompt, error=error)
return
action_info = inferred_actions[0]
action_type = action_info.get("action_type")
confidence = action_info.get("confidence_float", 0.0)
LOG.info(
"ai_act: inferred action type",
prompt=prompt,
action_type=action_type,
confidence=confidence,
reasoning=action_info.get("reasoning"),
)
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
self.scraped_page = refreshed_page
element_tree = refreshed_page.build_element_tree()
template: str
llm_handler: Any
if action_type == "CLICK":
template = "single-click-action"
llm_handler = app.SINGLE_CLICK_AGENT_LLM_API_HANDLER
elif action_type == "INPUT_TEXT":
template = "single-input-action"
llm_handler = app.SINGLE_INPUT_AGENT_LLM_API_HANDLER
elif action_type == "UPLOAD_FILE":
template = "single-upload-action"
llm_handler = app.SINGLE_INPUT_AGENT_LLM_API_HANDLER
elif action_type == "SELECT_OPTION":
template = "single-select-action"
llm_handler = app.SELECT_AGENT_LLM_API_HANDLER
else:
LOG.warning("ai_act: unknown action type", action_type=action_type, prompt=prompt)
return
single_action_prompt = prompt_engine.load_prompt(
template=template,
navigation_goal=prompt,
navigation_payload_str=None,
current_url=self.page.url,
elements=element_tree,
local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
)
try:
action_response = await llm_handler(
prompt=single_action_prompt,
prompt_name=template,
step=step,
organization_id=organization_id,
)
actions_json = action_response.get("actions", [])
if not actions_json:
LOG.warning("ai_act: no actions generated", prompt=prompt, action_type=action_type)
return
actions = parse_actions(task, step.step_id, step.order, refreshed_page, actions_json)
if not actions:
LOG.warning("ai_act: failed to parse actions", prompt=prompt, action_type=action_type)
return
action = actions[0]
if action_type == "CLICK" and isinstance(action, ClickAction):
result = await handle_click_action(action, self.page, refreshed_page, task, step)
elif action_type == "INPUT_TEXT" and isinstance(action, InputTextAction):
result = await handle_input_text_action(action, self.page, refreshed_page, task, step)
elif action_type == "UPLOAD_FILE" and isinstance(action, UploadFileAction):
result = await handle_upload_file_action(action, self.page, refreshed_page, task, step)
elif action_type == "SELECT_OPTION" and isinstance(action, SelectOptionAction):
result = await handle_select_option_action(action, self.page, refreshed_page, task, step)
else:
LOG.warning(
"ai_act: action type mismatch",
expected_type=action_type,
actual_type=type(action).__name__,
prompt=prompt,
)
return
if result and result[-1].success is False:
raise Exception(result[-1].exception_message)
except Exception:
LOG.exception("ai_act: failed to execute action", action_type=action_type, prompt=prompt)
async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any: async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any:
""" """

View File

@@ -63,3 +63,10 @@ class SkyvernPageAi(Protocol):
) -> dict[str, Any] | list | str | None: ) -> dict[str, Any] | list | str | None:
"""Extract information from the page using AI.""" """Extract information from the page using AI."""
... ...
async def ai_act(
    self,
    prompt: str,
) -> None:
    """Perform an action on the page using AI based on a natural language prompt.

    Args:
        prompt: Natural language description of the action to perform.

    Returns:
        None; the call reports no result value.
    """
    ...

View File

@@ -174,6 +174,11 @@ async def run_sdk_action(
data=action.data, data=action.data,
timeout=action.timeout, timeout=action.timeout,
) )
elif action.type == "ai_act":
await page_ai.ai_act(
prompt=action.intention,
)
result = None
elif action.type == "extract": elif action.type == "extract":
extract_result = await page_ai.ai_extract( extract_result = await page_ai.ai_extract(
prompt=action.prompt, prompt=action.prompt,

View File

@@ -12,6 +12,7 @@ class SdkActionType(str, Enum):
AI_CLICK = "ai_click" AI_CLICK = "ai_click"
AI_INPUT_TEXT = "ai_input_text" AI_INPUT_TEXT = "ai_input_text"
AI_SELECT_OPTION = "ai_select_option" AI_SELECT_OPTION = "ai_select_option"
AI_ACT = "ai_act"
EXTRACT = "extract" EXTRACT = "extract"
@@ -57,6 +58,14 @@ class SelectOptionAction(SdkActionBase):
timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds") timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds")
class ActAction(SdkActionBase):
    """AI act action parameters."""

    # Literal tag identifying this model; SdkAction is declared with
    # Field(discriminator="type"), so this value selects ActAction.
    type: Literal["ai_act"] = "ai_act"
    # Natural language instruction for the action; an empty string when omitted.
    intention: str = Field(default="", description="Natural language prompt for the action")
    # Optional extra context, either a string or a dict.
    # NOTE(review): the "ai_act" dispatch in run_sdk_action passes only
    # `intention` to page_ai.ai_act — confirm whether `data` should be forwarded.
    data: str | dict[str, Any] | None = Field(None, description="Additional context data")
class ExtractAction(SdkActionBase): class ExtractAction(SdkActionBase):
"""Extract data action parameters.""" """Extract data action parameters."""
@@ -70,7 +79,7 @@ class ExtractAction(SdkActionBase):
# Discriminated union of all action types # Discriminated union of all action types
SdkAction = Annotated[ SdkAction = Annotated[
Union[ClickAction, InputTextAction, SelectOptionAction, ExtractAction], Union[ClickAction, InputTextAction, SelectOptionAction, ActAction, ExtractAction],
Field(discriminator="type"), Field(discriminator="type"),
] ]

View File

@@ -586,6 +586,23 @@ class SkyvernBrowserPage:
""" """
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data) return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
async def act(
    self,
    prompt: str,
) -> None:
    """Perform an action on the page using AI based on a natural language prompt.

    This is a thin wrapper that delegates to ``self._ai.ai_act``.

    Args:
        prompt: Natural language description of the action to perform.

    Examples:
        ```python
        # Simple action
        await page.act("Click the login button")
        ```
    """
    return await self._ai.ai_act(prompt)
async def reload(self, **kwargs: Any) -> None: async def reload(self, **kwargs: Any) -> None:
"""Reload the current page. """Reload the current page.

View File

@@ -2,7 +2,13 @@ from typing import TYPE_CHECKING, Any
from playwright.async_api import Page from playwright.async_api import Page
from skyvern.client import SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract from skyvern.client import (
SdkAction_AiAct,
SdkAction_AiClick,
SdkAction_AiInputText,
SdkAction_AiSelectOption,
SdkAction_Extract,
)
from skyvern.config import settings from skyvern.config import settings
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
@@ -140,3 +146,21 @@ class SdkSkyvernPageAi(SkyvernPageAi):
) )
self._browser.workflow_run_id = response.workflow_run_id self._browser.workflow_run_id = response.workflow_run_id
return response.result if response.result else None return response.result if response.result else None
async def ai_act(
    self,
    prompt: str,
) -> None:
    """Run an AI-driven page action by calling the SDK action API.

    Ensures a server is available, submits an ``ai_act`` action carrying
    ``prompt`` as its intention for the current page URL and browser
    identifiers, and then stores the workflow run id returned by the API on
    the browser.

    Args:
        prompt: Natural language description of the action to perform.
    """
    browser = self._browser
    await browser.sdk.ensure_has_server()

    act_request = SdkAction_AiAct(intention=prompt)
    response = await browser.client.run_sdk_action(
        url=self._page.url,
        action=act_request,
        browser_session_id=browser.browser_session_id,
        browser_address=browser.browser_address,
        workflow_run_id=browser.workflow_run_id,
    )

    # Remember the run id from the response; it is sent as workflow_run_id
    # on the next call made through this browser.
    browser.workflow_run_id = response.workflow_run_id