diff --git a/fern/openapi/skyvern_openapi.json b/fern/openapi/skyvern_openapi.json index 938dfbb0..5004fe90 100644 --- a/fern/openapi/skyvern_openapi.json +++ b/fern/openapi/skyvern_openapi.json @@ -9570,6 +9570,25 @@ "required": ["type"], "title": "SelectOptionAction" }, + "ActAction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "ai_act", + "title": "Type", + "default": "ai_act" + }, + "intention": { + "type": "string", + "title": "Prompt", + "description": "Natural language prompt for the action", + "default": "" + } + }, + "required": ["type"], + "title": "ActAction" + }, "ExtractAction": { "type": "object", "properties": { @@ -9660,6 +9679,9 @@ { "$ref": "#/components/schemas/SelectOptionAction" }, + { + "$ref": "#/components/schemas/ActAction" + }, { "$ref": "#/components/schemas/ExtractAction" } @@ -9670,6 +9692,7 @@ "ai_click": "#/components/schemas/ClickAction", "ai_input_text": "#/components/schemas/InputTextAction", "ai_select_option": "#/components/schemas/SelectOptionAction", + "ai_act": "#/components/schemas/ActAction", "extract": "#/components/schemas/ExtractAction" } }, diff --git a/skyvern/client/__init__.py b/skyvern/client/__init__.py index b2b87372..626bf670 100644 --- a/skyvern/client/__init__.py +++ b/skyvern/client/__init__.py @@ -7,6 +7,7 @@ from importlib import import_module if typing.TYPE_CHECKING: from .types import ( + ActAction, Action, ActionBlock, ActionBlockDataSchema, @@ -271,6 +272,7 @@ if typing.TYPE_CHECKING: ScriptFileCreate, ScriptRunResponse, SdkAction, + SdkAction_AiAct, SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, @@ -466,6 +468,7 @@ if typing.TYPE_CHECKING: from .environment import SkyvernEnvironment from .version import __version__ _dynamic_imports: typing.Dict[str, str] = { + "ActAction": ".types", "Action": ".types", "ActionBlock": ".types", "ActionBlockDataSchema": ".types", @@ -734,6 +737,7 @@ _dynamic_imports: typing.Dict[str, str] = { "ScriptFileCreate": 
".types", "ScriptRunResponse": ".types", "SdkAction": ".types", + "SdkAction_AiAct": ".types", "SdkAction_AiClick": ".types", "SdkAction_AiInputText": ".types", "SdkAction_AiSelectOption": ".types", @@ -952,6 +956,7 @@ def __dir__(): __all__ = [ + "ActAction", "Action", "ActionBlock", "ActionBlockDataSchema", @@ -1220,6 +1225,7 @@ __all__ = [ "ScriptFileCreate", "ScriptRunResponse", "SdkAction", + "SdkAction_AiAct", "SdkAction_AiClick", "SdkAction_AiInputText", "SdkAction_AiSelectOption", diff --git a/skyvern/client/types/__init__.py b/skyvern/client/types/__init__.py index 0310da7b..874c5df0 100644 --- a/skyvern/client/types/__init__.py +++ b/skyvern/client/types/__init__.py @@ -6,6 +6,7 @@ import typing from importlib import import_module if typing.TYPE_CHECKING: + from .act_action import ActAction from .action import Action from .action_block import ActionBlock from .action_block_data_schema import ActionBlockDataSchema @@ -297,6 +298,7 @@ if typing.TYPE_CHECKING: from .script_run_response import ScriptRunResponse from .sdk_action import ( SdkAction, + SdkAction_AiAct, SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, @@ -505,6 +507,7 @@ if typing.TYPE_CHECKING: from .workflow_run_timeline_type import WorkflowRunTimelineType from .workflow_status import WorkflowStatus _dynamic_imports: typing.Dict[str, str] = { + "ActAction": ".act_action", "Action": ".action", "ActionBlock": ".action_block", "ActionBlockDataSchema": ".action_block_data_schema", @@ -769,6 +772,7 @@ _dynamic_imports: typing.Dict[str, str] = { "ScriptFileCreate": ".script_file_create", "ScriptRunResponse": ".script_run_response", "SdkAction": ".sdk_action", + "SdkAction_AiAct": ".sdk_action", "SdkAction_AiClick": ".sdk_action", "SdkAction_AiInputText": ".sdk_action", "SdkAction_AiSelectOption": ".sdk_action", @@ -982,6 +986,7 @@ def __dir__(): __all__ = [ + "ActAction", "Action", "ActionBlock", "ActionBlockDataSchema", @@ -1246,6 +1251,7 @@ __all__ = [ "ScriptFileCreate", 
"ScriptRunResponse", "SdkAction", + "SdkAction_AiAct", "SdkAction_AiClick", "SdkAction_AiInputText", "SdkAction_AiSelectOption", diff --git a/skyvern/client/types/sdk_action.py b/skyvern/client/types/sdk_action.py index ec0bd5ee..41a15e2b 100644 --- a/skyvern/client/types/sdk_action.py +++ b/skyvern/client/types/sdk_action.py @@ -68,6 +68,20 @@ class SdkAction_AiSelectOption(UniversalBaseModel): extra = pydantic.Extra.allow +class SdkAction_AiAct(UniversalBaseModel): + type: typing.Literal["ai_act"] = "ai_act" + intention: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + class SdkAction_Extract(UniversalBaseModel): type: typing.Literal["extract"] = "extract" prompt: typing.Optional[str] = None @@ -86,4 +100,6 @@ class SdkAction_Extract(UniversalBaseModel): extra = pydantic.Extra.allow -SdkAction = typing.Union[SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract] +SdkAction = typing.Union[ + SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_AiAct, SdkAction_Extract +] diff --git a/skyvern/core/script_generations/real_skyvern_page_ai.py b/skyvern/core/script_generations/real_skyvern_page_ai.py index d2bc76b0..6aa98f28 100644 --- a/skyvern/core/script_generations/real_skyvern_page_ai.py +++ b/skyvern/core/script_generations/real_skyvern_page_ai.py @@ -20,7 +20,9 @@ from skyvern.utils.prompt_engine import load_prompt_with_elements from skyvern.webeye.actions import handler_utils from skyvern.webeye.actions.actions import ( ActionStatus, + ClickAction, InputTextAction, + SelectOptionAction, UploadFileAction, ) from skyvern.webeye.actions.handler import ( @@ -171,7 +173,7 @@ class RealSkyvernPageAi(SkyvernPageAi): actions = parse_actions( task, step.step_id, step.order, 
self.scraped_page, json_response.get("actions", [])
                 )
-                action = actions[0]
+                action = cast(ClickAction, actions[0])
                 result = await handle_click_action(action, self.page, self.scraped_page, task, step)
                 if result and result[-1].success is False:
                     raise Exception(result[-1].exception_message)
@@ -452,7 +454,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
                     task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
                 )
                 if actions:
-                    action = actions[0]
+                    action = cast(SelectOptionAction, actions[0])
                     if not action.option:
                         raise ValueError("SelectOptionAction requires an 'option' field")
                     option_value = action.option.value or action.option.label or ""
@@ -531,6 +533,153 @@ class RealSkyvernPageAi(SkyvernPageAi):
             print(f"{'-' * 50}\n")
         return result
 
+    async def ai_act(
+        self,
+        prompt: str,
+    ) -> None:
+        """Perform an action on the page using AI based on a natural language prompt.
+
+        The prompt is first classified into a single action type (CLICK, INPUT_TEXT,
+        UPLOAD_FILE or SELECT_OPTION) using the "infer-action-type" prompt. The
+        scraped page is then regenerated and the matching single-action prompt is
+        used to generate one concrete action, which is executed on the page.
+
+        A warning is logged and the method returns without acting when the task or
+        step cannot be loaded from the current context, when no supported action
+        type is inferred, or when the LLM yields no action of the inferred type.
+
+        Args:
+            prompt: Natural language description of the action to perform.
+
+        Raises:
+            Exception: If generating or executing the action fails (the error is
+                logged, then re-raised), including when the action handler reports an
+                unsuccessful result.
+        """
+        context = skyvern_context.ensure_context()
+        organization_id = context.organization_id
+        task_id = context.task_id
+        step_id = context.step_id
+
+        task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
+        step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
+
+        if not task or not step:
+            LOG.warning("ai_act: missing task or step", task_id=task_id, step_id=step_id)
+            return
+
+        # First, infer the action type from the prompt
+        infer_action_type_prompt = prompt_engine.load_prompt(
+            template="infer-action-type",
+            navigation_goal=prompt,
+        )
+
+        json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER(
+            prompt=infer_action_type_prompt,
+            prompt_name="infer-action-type",
+            step=step,
+            organization_id=organization_id,
+        )
+
+        if not json_response or "inferred_actions" not in json_response:
+            LOG.warning("ai_act: failed to infer action type", prompt=prompt, response=json_response)
+            return
+
+        inferred_actions = json_response.get("inferred_actions", [])
+        if not inferred_actions:
+            error = json_response.get("error")
+            LOG.warning("ai_act: no action type inferred", prompt=prompt, error=error)
+            return
+
+        action_info = inferred_actions[0]
+        action_type = action_info.get("action_type")
+        confidence = action_info.get("confidence_float", 0.0)
+
+        LOG.info(
+            "ai_act: inferred action type",
+            prompt=prompt,
+            action_type=action_type,
+            confidence=confidence,
+            reasoning=action_info.get("reasoning"),
+        )
+
+        refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
+        self.scraped_page = refreshed_page
+        element_tree = refreshed_page.build_element_tree()
+
+        template: str
+        llm_handler: Any
+        if action_type == "CLICK":
+            template = "single-click-action"
+            llm_handler = app.SINGLE_CLICK_AGENT_LLM_API_HANDLER
+        elif action_type == "INPUT_TEXT":
+            template = "single-input-action"
+            llm_handler = app.SINGLE_INPUT_AGENT_LLM_API_HANDLER
+        elif action_type == "UPLOAD_FILE":
+            template = "single-upload-action"
+            llm_handler = app.SINGLE_INPUT_AGENT_LLM_API_HANDLER
+        elif action_type == "SELECT_OPTION":
+            template = "single-select-action"
+            llm_handler = app.SELECT_AGENT_LLM_API_HANDLER
+        else:
+            LOG.warning("ai_act: unknown action type", action_type=action_type, prompt=prompt)
+            return
+
+        single_action_prompt = prompt_engine.load_prompt(
+            template=template,
+            navigation_goal=prompt,
+            navigation_payload_str=None,
+            current_url=self.page.url,
+            elements=element_tree,
+            local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
+        )
+
+        try:
+            action_response = await llm_handler(
+                prompt=single_action_prompt,
+                prompt_name=template,
+                step=step,
+                organization_id=organization_id,
+            )
+
+            actions_json = action_response.get("actions", [])
+            if not actions_json:
+                LOG.warning("ai_act: no actions generated", prompt=prompt, action_type=action_type)
+                return
+
+            actions = parse_actions(task, step.step_id, step.order, refreshed_page, actions_json)
+            if not actions:
+                LOG.warning("ai_act: failed to parse actions", prompt=prompt, action_type=action_type)
+                return
+
+            action = actions[0]
+
+            if action_type == "CLICK" and isinstance(action, ClickAction):
+                result = await handle_click_action(action, self.page, refreshed_page, task, step)
+            elif action_type == "INPUT_TEXT" and isinstance(action, InputTextAction):
+                result = await handle_input_text_action(action, self.page, refreshed_page, task, step)
+            elif action_type == "UPLOAD_FILE" and isinstance(action, UploadFileAction):
+                result = await handle_upload_file_action(action, self.page, refreshed_page, task, step)
+            elif action_type == "SELECT_OPTION" and isinstance(action, SelectOptionAction):
+                result = await handle_select_option_action(action, self.page, refreshed_page, task, step)
+            else:
+                LOG.warning(
+                    "ai_act: action type mismatch",
+                    expected_type=action_type,
+                    actual_type=type(action).__name__,
+                    prompt=prompt,
+                )
+                return
+
+            if result and result[-1].success is False:
+                raise Exception(result[-1].exception_message)
+
+        except Exception:
+            # Log with context, then propagate: swallowing the error here would make a
+            # failed action indistinguishable from a successful one for the caller.
+            LOG.exception("ai_act: failed to execute action", action_type=action_type, prompt=prompt)
+            raise
+
 
 async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any:
     """
diff --git a/skyvern/core/script_generations/skyvern_page_ai.py b/skyvern/core/script_generations/skyvern_page_ai.py
index 0770b7ac..f10d635e 100644
--- a/skyvern/core/script_generations/skyvern_page_ai.py
+++ b/skyvern/core/script_generations/skyvern_page_ai.py
@@ -63,3 +63,10 @@ class SkyvernPageAi(Protocol):
     ) -> dict[str, Any] | list | str | None:
         """Extract information from the page using AI."""
         ...
+
+    async def ai_act(
+        self,
+        prompt: str,
+    ) -> None:
+        """Perform an action on the page using AI based on a natural language prompt."""
+        ...
diff --git a/skyvern/forge/sdk/routes/sdk.py b/skyvern/forge/sdk/routes/sdk.py
index c6a2afbe..22a6b64a 100644
--- a/skyvern/forge/sdk/routes/sdk.py
+++ b/skyvern/forge/sdk/routes/sdk.py
@@ -174,6 +174,12 @@ async def run_sdk_action(
             data=action.data,
             timeout=action.timeout,
         )
+    elif action.type == "ai_act":
+        # ai_act() returns None, so this action produces no result value.
+        await page_ai.ai_act(
+            prompt=action.intention,
+        )
+        result = None
     elif action.type == "extract":
         extract_result = await page_ai.ai_extract(
             prompt=action.prompt,
diff --git a/skyvern/forge/sdk/schemas/sdk_actions.py b/skyvern/forge/sdk/schemas/sdk_actions.py
index 3f835ef3..ab4c8a08 100644
--- a/skyvern/forge/sdk/schemas/sdk_actions.py
+++ b/skyvern/forge/sdk/schemas/sdk_actions.py
@@ -12,6 +12,7 @@ class SdkActionType(str, Enum):
     AI_CLICK = "ai_click"
     AI_INPUT_TEXT = "ai_input_text"
     AI_SELECT_OPTION = "ai_select_option"
+    AI_ACT = "ai_act"
     EXTRACT = "extract"
 
 
@@ -57,6 +58,20 @@ class SelectOptionAction(SdkActionBase):
     timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds")
 
 
+class ActAction(SdkActionBase):
+    """AI act action parameters.
+
+    `intention` carries the natural language prompt; the `ai_act` branch of
+    `run_sdk_action` passes it to `page_ai.ai_act` as `prompt`.
+    """
+
+    type: Literal["ai_act"] = "ai_act"
+    intention: str = Field(default="", description="Natural language prompt for the action")
+    # NOTE(review): `run_sdk_action` forwards only `intention` for this action type;
+    # `data` does not appear to be read there - confirm whether it is needed.
+    data: str | dict[str, Any] | None = Field(None, description="Additional context data")
+
+
 class ExtractAction(SdkActionBase):
     """Extract data action parameters."""
 
@@ -70,7 +85,7 @@ class ExtractAction(SdkActionBase):
 
 # Discriminated union of all action types
 SdkAction = Annotated[
-    Union[ClickAction, InputTextAction, SelectOptionAction, ExtractAction],
+    Union[ClickAction, InputTextAction, SelectOptionAction, ActAction, ExtractAction],
     Field(discriminator="type"),
 ]
 
diff --git a/skyvern/library/skyvern_browser_page.py b/skyvern/library/skyvern_browser_page.py
index c92f0b33..f7c0631c 100644
--- a/skyvern/library/skyvern_browser_page.py
+++ b/skyvern/library/skyvern_browser_page.py
@@ -586,6 +586,26 @@ class SkyvernBrowserPage:
         """
         return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
 
+    async def act(
+        self,
+        prompt: str,
+    ) -> None:
+        """Perform an action on the page using AI based on a natural language prompt.
+
+        The call is delegated to this page's AI backend (`self._ai.ai_act`) and
+        returns nothing.
+
+        Args:
+            prompt: Natural language description of the action to perform.
+
+        Examples:
+            ```python
+            # Simple action
+            await page.act("Click the login button")
+            ```
+        """
+        return await self._ai.ai_act(prompt)
+
     async def reload(self, **kwargs: Any) -> None:
         """Reload the current page.
 
diff --git a/skyvern/library/skyvern_browser_page_ai.py b/skyvern/library/skyvern_browser_page_ai.py
index 0b7aca8d..75749ee2 100644
--- a/skyvern/library/skyvern_browser_page_ai.py
+++ b/skyvern/library/skyvern_browser_page_ai.py
@@ -2,7 +2,13 @@ from typing import TYPE_CHECKING, Any
 
 from playwright.async_api import Page
 
-from skyvern.client import SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract
+from skyvern.client import (
+    SdkAction_AiAct,
+    SdkAction_AiClick,
+    SdkAction_AiInputText,
+    SdkAction_AiSelectOption,
+    SdkAction_Extract,
+)
 from skyvern.config import settings
 from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
 
@@ -140,3 +146,29 @@ class SdkSkyvernPageAi(SkyvernPageAi):
         )
         self._browser.workflow_run_id = response.workflow_run_id
         return response.result if response.result else None
+
+    async def ai_act(
+        self,
+        prompt: str,
+    ) -> None:
+        """Perform an action on the page using AI via API call.
+
+        Sends an `SdkAction_AiAct` whose `intention` is `prompt` through the client's
+        `run_sdk_action` call, then stores the response's `workflow_run_id` on the
+        browser object.
+
+        Args:
+            prompt: Natural language description of the action to perform.
+        """
+
+        await self._browser.sdk.ensure_has_server()
+        response = await self._browser.client.run_sdk_action(
+            url=self._page.url,
+            action=SdkAction_AiAct(
+                intention=prompt,
+            ),
+            browser_session_id=self._browser.browser_session_id,
+            browser_address=self._browser.browser_address,
+            workflow_run_id=self._browser.workflow_run_id,
+        )
+        self._browser.workflow_run_id = response.workflow_run_id