From 33ad4cfcd1d3b04c20cc3c4b265fd2f98e14242e Mon Sep 17 00:00:00 2001 From: Stanislav Novosad Date: Wed, 29 Oct 2025 11:54:57 -0600 Subject: [PATCH] SDK: support actions skeleton (#3817) Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- fern/docs.yml | 4 + fern/openapi/skyvern_openapi.json | 442 ++++++++++++++++++ skyvern/client/__init__.py | 45 ++ skyvern/client/client.py | 140 ++++++ skyvern/client/raw_client.py | 218 +++++++++ skyvern/client/types/__init__.py | 47 ++ skyvern/client/types/click_action.py | 38 ++ skyvern/client/types/click_action_data.py | 5 + skyvern/client/types/extract_action.py | 44 ++ skyvern/client/types/extract_action_data.py | 5 + .../types/extract_action_extract_schema.py | 7 + skyvern/client/types/input_text_action.py | 53 +++ .../client/types/input_text_action_data.py | 5 + .../client/types/run_sdk_action_response.py | 27 ++ skyvern/client/types/sdk_action.py | 89 ++++ skyvern/client/types/select_option_action.py | 43 ++ .../client/types/select_option_action_data.py | 5 + .../real_skyvern_page_ai.py | 441 +++++++++++++++++ .../core/script_generations/skyvern_page.py | 5 +- .../script_generations/skyvern_page_ai.py | 390 +--------------- skyvern/forge/sdk/routes/__init__.py | 1 + skyvern/forge/sdk/routes/sdk.py | 180 +++++++ skyvern/forge/sdk/schemas/sdk_actions.py | 100 ++++ skyvern/library/SdkSkyvernPageAi.py | 144 ++++++ skyvern/library/skyvern_browser.py | 19 +- skyvern/library/skyvern_browser_page.py | 203 ++++++-- 26 files changed, 2274 insertions(+), 426 deletions(-) create mode 100644 skyvern/client/types/click_action.py create mode 100644 skyvern/client/types/click_action_data.py create mode 100644 skyvern/client/types/extract_action.py create mode 100644 skyvern/client/types/extract_action_data.py create mode 100644 skyvern/client/types/extract_action_extract_schema.py create mode 100644 skyvern/client/types/input_text_action.py create mode 100644 skyvern/client/types/input_text_action_data.py create mode 100644 skyvern/client/types/run_sdk_action_response.py create mode 100644 skyvern/client/types/sdk_action.py create mode 100644 skyvern/client/types/select_option_action.py create mode 100644 skyvern/client/types/select_option_action_data.py create mode 100644 skyvern/core/script_generations/real_skyvern_page_ai.py create mode 100644 skyvern/forge/sdk/routes/sdk.py create mode 100644 skyvern/forge/sdk/schemas/sdk_actions.py create mode 100644 skyvern/library/SdkSkyvernPageAi.py diff --git a/fern/docs.yml b/fern/docs.yml index 3e608bb8..e1863704 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -204,6 +204,10 @@ navigation: - POST /v1/scripts/{script_id}/deploy - GET /v1/scripts/{script_id} - GET /v1/scripts + - section: SDK + hidden: true + contents: + - POST /v1/sdk/run_action - tab: blog - tab: community - tab: github diff --git a/fern/openapi/skyvern_openapi.json b/fern/openapi/skyvern_openapi.json index fee37273..b02ea907 100644 --- a/fern/openapi/skyvern_openapi.json +++ b/fern/openapi/skyvern_openapi.json @@ -2248,6 +2248,101 @@ } } } + }, + "/v1/sdk/run_action": { + "post": { + "tags": [ + "SDK" + ], + "summary": "Run an SDK action", + "description": "Execute a single SDK action with the specified parameters", + "operationId": "run_sdk_action_v1_sdk_run_action_post", + "parameters": [ + { + "name": "x-api-key", + "in": "header", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Skyvern API key for authentication. API key can be found at https://app.skyvern.com/settings.", + "title": "X-Api-Key" + }, + "description": "Skyvern API key for authentication. API key can be found at https://app.skyvern.com/settings." + }, + { + "name": "x-user-agent", + "in": "header", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "X-User-Agent" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunSdkActionRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successfully executed SDK action", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunSdkActionResponse" + } + } + } + }, + "403": { + "description": "Unauthorized - Invalid or missing authentication" + }, + "400": { + "description": "Invalid operation" + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + }, + "x-fern-sdk-method-name": "run_sdk_action", + "x-fern-examples": [ + { + "code-samples": [ + { + "sdk": "python", + "code": "from skyvern import Skyvern\nfrom skyvern.forge.sdk.schemas.sdk_actions import ClickAction\n\nskyvern = Skyvern(api_key=\"YOUR_API_KEY\")\nresponse = await skyvern.run_sdk_action(\n url=\"https://example.com\",\n browser_session_id=\"pbs_123\",\n action=ClickAction(\n selector=\"button\",\n intention=\"Click the submit button\"\n )\n)\nprint(response.workflow_run_id)\n" + } + ] + } + ] + } } }, "components": { @@ -9298,6 +9393,353 @@ ], "title": "RunStatus" }, + "ClickAction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "ai_click", + "title": "Type", + "default": "ai_click" + }, + "selector": { + "type": "string", + "title": "Selector", + "description": "CSS selector for the element", + "default": "" + }, + "intention": { + "type": "string", + "title": "Intention", + "description": "The intention or goal of the click", + "default": "" + }, + "data": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Data", + "description": "Additional context data" + }, + "timeout": { + "type": "number", + "title": "Timeout", + "description": "Timeout in milliseconds", + "default": 30000 + } + }, + "required": ["type"], + "title": "ClickAction" + }, + "InputTextAction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "ai_input_text", + "title": "Type", + "default": "ai_input_text" + }, + "selector": { + "type": "string", + "title": "Selector", + "description": "CSS selector for the element", + "default": "" + }, + "value": { + "type": "string", + "title": "Value", + "description": "Value to input", + "default": "" + }, + "intention": { + "type": "string", + "title": "Intention", + "description": "The intention or goal of the input", + "default": "" + }, + "data": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Data", + "description": "Additional context data" + }, + "totp_identifier": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Totp Identifier", + "description": "TOTP identifier for input_text actions" + }, + "totp_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Totp Url", + "description": "TOTP URL for input_text actions" + }, + "timeout": { + "type": "number", + "title": "Timeout", + "description": "Timeout in milliseconds", + "default": 30000 + } + }, + "required": ["type"], + "title": "InputTextAction" + }, + "SelectOptionAction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "ai_select_option", + "title": "Type", + "default": "ai_select_option" + }, + "selector": { + "type": "string", + "title": "Selector", + "description": "CSS selector for the element", + "default": "" + }, + "value": { + "type": "string", + "title": "Value", + "description": "Value to select", + "default": "" + }, + "intention": { + "type": "string", + "title": "Intention", + "description": "The intention or goal of the selection", + "default": "" + }, + "data": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Data", + "description": "Additional context data" + }, + "timeout": { + "type": "number", + "title": "Timeout", + "description": "Timeout in milliseconds", + "default": 30000 + } + }, + "required": ["type"], + "title": "SelectOptionAction" + }, + "ExtractAction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "extract", + "title": "Type", + "default": "extract" + }, + "prompt": { + "type": "string", + "title": "Prompt", + "description": "Extraction prompt", + "default": "" + }, + "extract_schema": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "array" + }, + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Extract Schema", + "description": "Schema for extraction" + }, + "error_code_mapping": { + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + { + "type": "null" + } + ], + "title": "Error Code Mapping", + "description": "Error code mapping for extraction" + }, + "intention": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Intention", + "description": "The intention or goal of the extraction" + }, + "data": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Data", + "description": "Additional context data" + } + }, + "required": ["type"], + "title": "ExtractAction" + }, + "SdkAction": { + "oneOf": [ + { + "$ref": "#/components/schemas/ClickAction" + }, + { + "$ref": "#/components/schemas/InputTextAction" + }, + { + "$ref": "#/components/schemas/SelectOptionAction" + }, + { + "$ref": "#/components/schemas/ExtractAction" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "ai_click": "#/components/schemas/ClickAction", + "ai_input_text": "#/components/schemas/InputTextAction", + "ai_select_option": "#/components/schemas/SelectOptionAction", + "extract": "#/components/schemas/ExtractAction" + } + }, + "title": "SdkAction" + }, + "RunSdkActionRequest": { + "type": "object", + "properties": { + "url": { + "type": "string", + "title": "Url", + "description": "The URL where the action should be executed" + }, + "browser_session_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Browser Session Id", + "description": "The browser session ID" + }, + "browser_address": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Browser Address", + "description": "The browser address" + }, + "workflow_run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Workflow Run Id", + "description": "Optional workflow run ID to continue an existing workflow run" + }, + "action": { + "$ref": "#/components/schemas/SdkAction", + "description": "The action to execute with its specific parameters" + } + }, + "required": ["url", "action"], + "title": "RunSdkActionRequest" + }, + "RunSdkActionResponse": { + "type": "object", + "properties": { + "workflow_run_id": { + "type": "string", + "title": "Workflow Run Id", + "description": "The workflow run ID used for this action" + }, + "result": { + "title": "Result", + "description": "The result from the action (e.g., selector, value, extracted data)" + } + }, + "required": ["workflow_run_id"], + "title": "RunSdkActionResponse" + }, "Script": { "properties": { "script_revision_id": { diff --git a/skyvern/client/__init__.py b/skyvern/client/__init__.py index d94ef537..8048b095 100644 --- a/skyvern/client/__init__.py +++ b/skyvern/client/__init__.py @@ -40,6 +40,8 @@ if typing.TYPE_CHECKING: BitwardenSensitiveInformationParameterYaml, BlockType, BrowserSessionResponse, + ClickAction, + ClickActionData, CodeBlock, CodeBlockParametersItem, CodeBlockParametersItem_AwsSecret, @@ -79,6 +81,9 @@ if typing.TYPE_CHECKING: CreditCardCredentialResponse, DownloadToS3Block, DownloadToS3BlockYaml, + ExtractAction, + ExtractActionData, + ExtractActionExtractSchema, ExtractionBlock, ExtractionBlockDataSchema, ExtractionBlockParametersItem, @@ -215,6 +220,8 @@ if typing.TYPE_CHECKING: HumanInteractionBlockParametersItem_Workflow, HumanInteractionBlockYaml, InputOrSelectContext, + InputTextAction, + InputTextActionData, LoginBlock, LoginBlockDataSchema, LoginBlockParametersItem, @@ -257,11 +264,19 @@ if typing.TYPE_CHECKING: PdfParserBlockYaml, ProxyLocation, RunEngine, + RunSdkActionResponse, RunStatus, Script, ScriptFileCreate, ScriptRunResponse, + SdkAction, + SdkAction_AiClick, + SdkAction_AiInputText, + SdkAction_AiSelectOption, + SdkAction_Extract, SelectOption, + SelectOptionAction, + SelectOptionActionData, SendEmailBlock, SendEmailBlockYaml, SkyvernForgeSdkSchemasCredentialsCredentialType, @@ -485,6 +500,8 @@ _dynamic_imports: typing.Dict[str, str] = { "BitwardenSensitiveInformationParameterYaml": ".types", "BlockType": ".types", "BrowserSessionResponse": ".types", + "ClickAction": ".types", + "ClickActionData": ".types", "CodeBlock": ".types", "CodeBlockParametersItem": ".types", "CodeBlockParametersItem_AwsSecret": ".types", @@ -524,6 +541,9 @@ _dynamic_imports: typing.Dict[str, str] = { "CreditCardCredentialResponse": ".types", "DownloadToS3Block": ".types", "DownloadToS3BlockYaml": ".types", + "ExtractAction": ".types", + "ExtractActionData": ".types", + "ExtractActionExtractSchema": ".types", "ExtractionBlock": ".types", "ExtractionBlockDataSchema": ".types", "ExtractionBlockParametersItem": ".types", @@ -661,6 +681,8 @@ _dynamic_imports: typing.Dict[str, str] = { "HumanInteractionBlockParametersItem_Workflow": ".types", "HumanInteractionBlockYaml": ".types", "InputOrSelectContext": ".types", + "InputTextAction": ".types", + "InputTextActionData": ".types", "LoginBlock": ".types", "LoginBlockDataSchema": ".types", "LoginBlockParametersItem": ".types", @@ -704,11 +726,19 @@ _dynamic_imports: typing.Dict[str, str] = { "PdfParserBlockYaml": ".types", "ProxyLocation": ".types", "RunEngine": ".types", + "RunSdkActionResponse": ".types", "RunStatus": ".types", "Script": ".types", "ScriptFileCreate": ".types", "ScriptRunResponse": ".types", + "SdkAction": ".types", + "SdkAction_AiClick": ".types", + "SdkAction_AiInputText": ".types", + "SdkAction_AiSelectOption": ".types", + "SdkAction_Extract": ".types", "SelectOption": ".types", + "SelectOptionAction": ".types", + "SelectOptionActionData": ".types", "SendEmailBlock": ".types", "SendEmailBlockYaml": ".types", "Skyvern": ".client", @@ -955,6 +985,8 @@ __all__ = [ "BitwardenSensitiveInformationParameterYaml", "BlockType", "BrowserSessionResponse", + "ClickAction", + "ClickActionData", "CodeBlock", "CodeBlockParametersItem", "CodeBlockParametersItem_AwsSecret", @@ -994,6 +1026,9 @@ __all__ = [ "CreditCardCredentialResponse", "DownloadToS3Block", "DownloadToS3BlockYaml", + "ExtractAction", + "ExtractActionData", + "ExtractActionExtractSchema", "ExtractionBlock", "ExtractionBlockDataSchema", "ExtractionBlockParametersItem", @@ -1131,6 +1166,8 @@ __all__ = [ "HumanInteractionBlockParametersItem_Workflow", "HumanInteractionBlockYaml", "InputOrSelectContext", + "InputTextAction", + "InputTextActionData", "LoginBlock", "LoginBlockDataSchema", "LoginBlockParametersItem", @@ -1174,11 +1211,19 @@ __all__ = [ "PdfParserBlockYaml", "ProxyLocation", "RunEngine", + "RunSdkActionResponse", "RunStatus", "Script", "ScriptFileCreate", "ScriptRunResponse", + "SdkAction", + "SdkAction_AiClick", + "SdkAction_AiInputText", + "SdkAction_AiSelectOption", + "SdkAction_Extract", "SelectOption", + "SelectOptionAction", + "SelectOptionActionData", "SendEmailBlock", "SendEmailBlockYaml", "Skyvern", diff --git a/skyvern/client/client.py b/skyvern/client/client.py index cfc95d85..a8307f4b 100644 --- a/skyvern/client/client.py +++ b/skyvern/client/client.py @@ -19,8 +19,10 @@ from .types.credential_response import CredentialResponse from .types.get_run_response import GetRunResponse from .types.proxy_location import ProxyLocation from .types.run_engine import RunEngine +from .types.run_sdk_action_response import RunSdkActionResponse from .types.script import Script from .types.script_file_create import ScriptFileCreate +from .types.sdk_action import SdkAction from .types.skyvern_forge_sdk_schemas_credentials_credential_type import SkyvernForgeSdkSchemasCredentialsCredentialType from .types.skyvern_schemas_run_blocks_credential_type import SkyvernSchemasRunBlocksCredentialType from .types.task_run_request_data_extraction_schema import TaskRunRequestDataExtractionSchema @@ -1494,6 +1496,71 @@ class Skyvern: _response = self._raw_client.deploy_script(script_id, files=files, request_options=request_options) return _response.data + def run_sdk_action( + self, + *, + url: str, + action: SdkAction, + user_agent: typing.Optional[str] = None, + browser_session_id: typing.Optional[str] = OMIT, + browser_address: typing.Optional[str] = OMIT, + workflow_run_id: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> RunSdkActionResponse: + """ + Execute a single SDK action with the specified parameters + + Parameters + ---------- + url : str + The URL where the action should be executed + + action : SdkAction + The action to execute with its specific parameters + + user_agent : typing.Optional[str] + + browser_session_id : typing.Optional[str] + The browser session ID + + browser_address : typing.Optional[str] + The browser address + + workflow_run_id : typing.Optional[str] + Optional workflow run ID to continue an existing workflow run + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + RunSdkActionResponse + Successfully executed SDK action + + Examples + -------- + from skyvern import SdkAction_AiClick, Skyvern + + client = Skyvern( + api_key="YOUR_API_KEY", + ) + client.run_sdk_action( + user_agent="x-user-agent", + url="url", + action=SdkAction_AiClick(), + ) + """ + _response = self._raw_client.run_sdk_action( + url=url, + action=action, + user_agent=user_agent, + browser_session_id=browser_session_id, + browser_address=browser_address, + workflow_run_id=workflow_run_id, + request_options=request_options, + ) + return _response.data + @property def scripts(self): if self._scripts is None: @@ -3174,6 +3241,79 @@ class AsyncSkyvern: _response = await self._raw_client.deploy_script(script_id, files=files, request_options=request_options) return _response.data + async def run_sdk_action( + self, + *, + url: str, + action: SdkAction, + user_agent: typing.Optional[str] = None, + browser_session_id: typing.Optional[str] = OMIT, + browser_address: typing.Optional[str] = OMIT, + workflow_run_id: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> RunSdkActionResponse: + """ + Execute a single SDK action with the specified parameters + + Parameters + ---------- + url : str + The URL where the action should be executed + + action : SdkAction + The action to execute with its specific parameters + + user_agent : typing.Optional[str] + + browser_session_id : typing.Optional[str] + The browser session ID + + browser_address : typing.Optional[str] + The browser address + + workflow_run_id : typing.Optional[str] + Optional workflow run ID to continue an existing workflow run + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + RunSdkActionResponse + Successfully executed SDK action + + Examples + -------- + import asyncio + + from skyvern import AsyncSkyvern, SdkAction_AiClick + + client = AsyncSkyvern( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.run_sdk_action( + user_agent="x-user-agent", + url="url", + action=SdkAction_AiClick(), + ) + + + asyncio.run(main()) + """ + _response = await self._raw_client.run_sdk_action( + url=url, + action=action, + user_agent=user_agent, + browser_session_id=browser_session_id, + browser_address=browser_address, + workflow_run_id=workflow_run_id, + request_options=request_options, + ) + return _response.data + @property def scripts(self): if self._scripts is None: diff --git a/skyvern/client/raw_client.py b/skyvern/client/raw_client.py index ea71b621..feb8b1ea 100644 --- a/skyvern/client/raw_client.py +++ b/skyvern/client/raw_client.py @@ -24,8 +24,10 @@ from .types.credential_response import CredentialResponse from .types.get_run_response import GetRunResponse from .types.proxy_location import ProxyLocation from .types.run_engine import RunEngine +from .types.run_sdk_action_response import RunSdkActionResponse from .types.script import Script from .types.script_file_create import ScriptFileCreate +from .types.sdk_action import SdkAction from .types.skyvern_forge_sdk_schemas_credentials_credential_type import SkyvernForgeSdkSchemasCredentialsCredentialType from .types.skyvern_schemas_run_blocks_credential_type import SkyvernSchemasRunBlocksCredentialType from .types.task_run_request_data_extraction_schema import TaskRunRequestDataExtractionSchema @@ -2052,6 +2054,114 @@ class RawSkyvern: raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) + def run_sdk_action( + self, + *, + url: str, + action: SdkAction, + user_agent: typing.Optional[str] = None, + browser_session_id: typing.Optional[str] = OMIT, + browser_address: typing.Optional[str] = OMIT, + workflow_run_id: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> HttpResponse[RunSdkActionResponse]: + """ + Execute a single SDK action with the specified parameters + + Parameters + ---------- + url : str + The URL where the action should be executed + + action : SdkAction + The action to execute with its specific parameters + + user_agent : typing.Optional[str] + + browser_session_id : typing.Optional[str] + The browser session ID + + browser_address : typing.Optional[str] + The browser address + + workflow_run_id : typing.Optional[str] + Optional workflow run ID to continue an existing workflow run + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + HttpResponse[RunSdkActionResponse] + Successfully executed SDK action + """ + _response = self._client_wrapper.httpx_client.request( + "v1/sdk/run_action", + method="POST", + json={ + "url": url, + "browser_session_id": browser_session_id, + "browser_address": browser_address, + "workflow_run_id": workflow_run_id, + "action": convert_and_respect_annotation_metadata( + object_=action, annotation=SdkAction, direction="write" + ), + }, + headers={ + "content-type": "application/json", + "x-user-agent": str(user_agent) if user_agent is not None else None, + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + RunSdkActionResponse, + parse_obj_as( + type_=RunSdkActionResponse, # type: ignore + object_=_response.json(), + ), + ) + return HttpResponse(response=_response, data=_data) + if _response.status_code == 400: + raise BadRequestError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 403: + raise ForbiddenError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) + class AsyncRawSkyvern: def __init__(self, *, client_wrapper: AsyncClientWrapper): @@ -4064,3 +4174,111 @@ class AsyncRawSkyvern: except JSONDecodeError: raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) + + async def run_sdk_action( + self, + *, + url: str, + action: SdkAction, + user_agent: typing.Optional[str] = None, + browser_session_id: typing.Optional[str] = OMIT, + browser_address: typing.Optional[str] = OMIT, + workflow_run_id: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> AsyncHttpResponse[RunSdkActionResponse]: + """ + Execute a single SDK action with the specified parameters + + Parameters + ---------- + url : str + The URL where the action should be executed + + action : SdkAction + The action to execute with its specific parameters + + user_agent : typing.Optional[str] + + browser_session_id : typing.Optional[str] + The browser session ID + + browser_address : typing.Optional[str] + The browser address + + workflow_run_id : typing.Optional[str] + Optional workflow run ID to continue an existing workflow run + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AsyncHttpResponse[RunSdkActionResponse] + Successfully executed SDK action + """ + _response = await self._client_wrapper.httpx_client.request( + "v1/sdk/run_action", + method="POST", + json={ + "url": url, + "browser_session_id": browser_session_id, + "browser_address": browser_address, + "workflow_run_id": workflow_run_id, + "action": convert_and_respect_annotation_metadata( + object_=action, annotation=SdkAction, direction="write" + ), + }, + headers={ + "content-type": "application/json", + "x-user-agent": str(user_agent) if user_agent is not None else None, + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + RunSdkActionResponse, + parse_obj_as( + type_=RunSdkActionResponse, # type: ignore + object_=_response.json(), + ), + ) + return AsyncHttpResponse(response=_response, data=_data) + if _response.status_code == 400: + raise BadRequestError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 403: + raise ForbiddenError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) diff --git a/skyvern/client/types/__init__.py b/skyvern/client/types/__init__.py index 363815d6..44001b53 100644 --- a/skyvern/client/types/__init__.py +++ b/skyvern/client/types/__init__.py @@ -41,6 +41,8 @@ if typing.TYPE_CHECKING: from .bitwarden_sensitive_information_parameter_yaml import BitwardenSensitiveInformationParameterYaml from .block_type import BlockType from .browser_session_response import BrowserSessionResponse + from .click_action import ClickAction + from .click_action_data import ClickActionData from .code_block import CodeBlock from .code_block_parameters_item import ( CodeBlockParametersItem, @@ -84,6 +86,9 @@ if typing.TYPE_CHECKING: from .credit_card_credential_response import CreditCardCredentialResponse from .download_to_s3block import DownloadToS3Block from .download_to_s3block_yaml import DownloadToS3BlockYaml + from .extract_action import ExtractAction + from .extract_action_data import ExtractActionData + from .extract_action_extract_schema import ExtractActionExtractSchema from .extraction_block import ExtractionBlock from .extraction_block_data_schema import ExtractionBlockDataSchema from .extraction_block_parameters_item import ( @@ -236,6 +241,8 @@ if typing.TYPE_CHECKING: ) from .human_interaction_block_yaml import HumanInteractionBlockYaml from .input_or_select_context import InputOrSelectContext + from .input_text_action import InputTextAction + from .input_text_action_data import InputTextActionData from .login_block import LoginBlock from .login_block_data_schema import LoginBlockDataSchema from .login_block_parameters_item import ( @@ -282,11 +289,21 @@ if typing.TYPE_CHECKING: from .pdf_parser_block_yaml import PdfParserBlockYaml from .proxy_location import ProxyLocation from .run_engine import RunEngine + from .run_sdk_action_response import RunSdkActionResponse from .run_status import RunStatus from .script import Script from .script_file_create import ScriptFileCreate from .script_run_response import ScriptRunResponse + from .sdk_action import ( + SdkAction, + SdkAction_AiClick, + SdkAction_AiInputText, + SdkAction_AiSelectOption, + SdkAction_Extract, + ) from .select_option import SelectOption + from .select_option_action import SelectOptionAction + from .select_option_action_data import SelectOptionActionData from .send_email_block import SendEmailBlock from .send_email_block_yaml import SendEmailBlockYaml from .skyvern_forge_sdk_schemas_credentials_credential_type import SkyvernForgeSdkSchemasCredentialsCredentialType @@ -520,6 +537,8 @@ _dynamic_imports: typing.Dict[str, str] = { "BitwardenSensitiveInformationParameterYaml": ".bitwarden_sensitive_information_parameter_yaml", "BlockType": ".block_type", "BrowserSessionResponse": ".browser_session_response", + "ClickAction": ".click_action", + "ClickActionData": ".click_action_data", "CodeBlock": ".code_block", "CodeBlockParametersItem": ".code_block_parameters_item", "CodeBlockParametersItem_AwsSecret": ".code_block_parameters_item", @@ -559,6 +578,9 @@ _dynamic_imports: typing.Dict[str, str] = { "CreditCardCredentialResponse": ".credit_card_credential_response", "DownloadToS3Block": ".download_to_s3block", "DownloadToS3BlockYaml": ".download_to_s3block_yaml", + "ExtractAction": ".extract_action", + "ExtractActionData": ".extract_action_data", + "ExtractActionExtractSchema": ".extract_action_extract_schema", "ExtractionBlock": ".extraction_block", "ExtractionBlockDataSchema": ".extraction_block_data_schema", "ExtractionBlockParametersItem": ".extraction_block_parameters_item", @@ -695,6 +717,8 @@ _dynamic_imports: typing.Dict[str, str] = { "HumanInteractionBlockParametersItem_Workflow": ".human_interaction_block_parameters_item", "HumanInteractionBlockYaml": ".human_interaction_block_yaml", "InputOrSelectContext": ".input_or_select_context", + "InputTextAction": ".input_text_action", + "InputTextActionData": ".input_text_action_data", "LoginBlock": ".login_block", "LoginBlockDataSchema": ".login_block_data_schema", "LoginBlockParametersItem": ".login_block_parameters_item", @@ -737,11 +761,19 @@ _dynamic_imports: typing.Dict[str, str] = { "PdfParserBlockYaml": ".pdf_parser_block_yaml", "ProxyLocation": ".proxy_location", "RunEngine": ".run_engine", + "RunSdkActionResponse": ".run_sdk_action_response", "RunStatus": ".run_status", "Script": ".script", "ScriptFileCreate": ".script_file_create", "ScriptRunResponse": ".script_run_response", + "SdkAction": ".sdk_action", + "SdkAction_AiClick": ".sdk_action", + "SdkAction_AiInputText": ".sdk_action", + "SdkAction_AiSelectOption": ".sdk_action", + "SdkAction_Extract": ".sdk_action", "SelectOption": ".select_option", + "SelectOptionAction": ".select_option_action", + "SelectOptionActionData": ".select_option_action_data", "SendEmailBlock": ".send_email_block", "SendEmailBlockYaml": ".send_email_block_yaml", "SkyvernForgeSdkSchemasCredentialsCredentialType": ".skyvern_forge_sdk_schemas_credentials_credential_type", @@ -981,6 +1013,8 @@ __all__ = [ "BitwardenSensitiveInformationParameterYaml", "BlockType", "BrowserSessionResponse", + "ClickAction", + "ClickActionData", "CodeBlock", "CodeBlockParametersItem", "CodeBlockParametersItem_AwsSecret", @@ -1020,6 +1054,9 @@ __all__ = [ "CreditCardCredentialResponse", "DownloadToS3Block", "DownloadToS3BlockYaml", + "ExtractAction", + "ExtractActionData", + "ExtractActionExtractSchema", "ExtractionBlock", "ExtractionBlockDataSchema", "ExtractionBlockParametersItem", @@ -1156,6 +1193,8 @@ __all__ = [ "HumanInteractionBlockParametersItem_Workflow", "HumanInteractionBlockYaml", "InputOrSelectContext", + "InputTextAction", + "InputTextActionData", "LoginBlock", "LoginBlockDataSchema", "LoginBlockParametersItem", @@ -1198,11 +1237,19 @@ __all__ = [ "PdfParserBlockYaml", "ProxyLocation", "RunEngine", + "RunSdkActionResponse", "RunStatus", "Script", "ScriptFileCreate", "ScriptRunResponse", + "SdkAction", + "SdkAction_AiClick", + "SdkAction_AiInputText", + "SdkAction_AiSelectOption", + "SdkAction_Extract", "SelectOption", + "SelectOptionAction", + "SelectOptionActionData", "SendEmailBlock", "SendEmailBlockYaml", "SkyvernForgeSdkSchemasCredentialsCredentialType", diff --git a/skyvern/client/types/click_action.py b/skyvern/client/types/click_action.py new file mode 100644 index 00000000..5cbcde5b --- /dev/null +++ b/skyvern/client/types/click_action.py @@ -0,0 +1,38 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel +from .click_action_data import ClickActionData + + +class ClickAction(UniversalBaseModel): + selector: typing.Optional[str] = pydantic.Field(default=None) + """ + CSS selector for the element + """ + + intention: typing.Optional[str] = pydantic.Field(default=None) + """ + The intention or goal of the click + """ + + data: typing.Optional[ClickActionData] = pydantic.Field(default=None) + """ + Additional context data + """ + + timeout: typing.Optional[float] = pydantic.Field(default=None) + """ + Timeout in milliseconds + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/click_action_data.py b/skyvern/client/types/click_action_data.py new file mode 100644 index 00000000..acab32a4 --- /dev/null +++ b/skyvern/client/types/click_action_data.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +ClickActionData = typing.Union[str, typing.Dict[str, typing.Optional[typing.Any]]] diff --git a/skyvern/client/types/extract_action.py b/skyvern/client/types/extract_action.py new file mode 100644 index 00000000..7a2730c6 --- /dev/null +++ b/skyvern/client/types/extract_action.py @@ -0,0 +1,44 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel +from .extract_action_data import ExtractActionData +from .extract_action_extract_schema import ExtractActionExtractSchema + + +class ExtractAction(UniversalBaseModel): + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Extraction prompt + """ + + extract_schema: typing.Optional[ExtractActionExtractSchema] = pydantic.Field(default=None) + """ + Schema for extraction + """ + + error_code_mapping: typing.Optional[typing.Dict[str, typing.Optional[str]]] = pydantic.Field(default=None) + """ + Error code mapping for extraction + """ + + intention: typing.Optional[str] = pydantic.Field(default=None) + """ + The intention or goal of the extraction + """ + + data: typing.Optional[ExtractActionData] = pydantic.Field(default=None) + """ + Additional context data + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/extract_action_data.py b/skyvern/client/types/extract_action_data.py new file mode 100644 index 00000000..65ed5301 --- /dev/null +++ b/skyvern/client/types/extract_action_data.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +ExtractActionData = typing.Union[str, typing.Dict[str, typing.Optional[typing.Any]]] diff --git a/skyvern/client/types/extract_action_extract_schema.py b/skyvern/client/types/extract_action_extract_schema.py new file mode 100644 index 00000000..96fdd56e --- /dev/null +++ b/skyvern/client/types/extract_action_extract_schema.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +ExtractActionExtractSchema = typing.Union[ + typing.Dict[str, typing.Optional[typing.Any]], typing.List[typing.Optional[typing.Any]], str +] diff --git a/skyvern/client/types/input_text_action.py b/skyvern/client/types/input_text_action.py new file mode 100644 index 00000000..2effef63 --- /dev/null +++ b/skyvern/client/types/input_text_action.py @@ -0,0 +1,53 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel +from .input_text_action_data import InputTextActionData + + +class InputTextAction(UniversalBaseModel): + selector: typing.Optional[str] = pydantic.Field(default=None) + """ + CSS selector for the element + """ + + value: typing.Optional[str] = pydantic.Field(default=None) + """ + Value to input + """ + + intention: typing.Optional[str] = pydantic.Field(default=None) + """ + The intention or goal of the input + """ + + data: typing.Optional[InputTextActionData] = pydantic.Field(default=None) + """ + Additional context data + """ + + totp_identifier: typing.Optional[str] = pydantic.Field(default=None) + """ + TOTP identifier for input_text actions + """ + + totp_url: typing.Optional[str] = pydantic.Field(default=None) + """ + TOTP URL for input_text actions + """ + + timeout: typing.Optional[float] = pydantic.Field(default=None) + """ + Timeout in milliseconds + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/input_text_action_data.py b/skyvern/client/types/input_text_action_data.py new file mode 100644 index 00000000..5cd89b25 --- /dev/null +++ b/skyvern/client/types/input_text_action_data.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +InputTextActionData = typing.Union[str, typing.Dict[str, typing.Optional[typing.Any]]] diff --git a/skyvern/client/types/run_sdk_action_response.py b/skyvern/client/types/run_sdk_action_response.py new file mode 100644 index 00000000..065f19a8 --- /dev/null +++ b/skyvern/client/types/run_sdk_action_response.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel + + +class RunSdkActionResponse(UniversalBaseModel): + workflow_run_id: str = pydantic.Field() + """ + The workflow run ID used for this action + """ + + result: typing.Optional[typing.Optional[typing.Any]] = pydantic.Field(default=None) + """ + The result from the action (e.g., selector, value, extracted data) + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/sdk_action.py b/skyvern/client/types/sdk_action.py new file mode 100644 index 00000000..ec0bd5ee --- /dev/null +++ b/skyvern/client/types/sdk_action.py @@ -0,0 +1,89 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel +from .click_action_data import ClickActionData +from .extract_action_data import ExtractActionData +from .extract_action_extract_schema import ExtractActionExtractSchema +from .input_text_action_data import InputTextActionData +from .select_option_action_data import SelectOptionActionData + + +class SdkAction_AiClick(UniversalBaseModel): + type: typing.Literal["ai_click"] = "ai_click" + selector: typing.Optional[str] = None + intention: typing.Optional[str] = None + data: typing.Optional[ClickActionData] = None + timeout: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class SdkAction_AiInputText(UniversalBaseModel): + type: typing.Literal["ai_input_text"] = "ai_input_text" + selector: typing.Optional[str] = None + value: typing.Optional[str] = None + intention: typing.Optional[str] = None + data: typing.Optional[InputTextActionData] = None + totp_identifier: typing.Optional[str] = None + totp_url: typing.Optional[str] = None + timeout: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class SdkAction_AiSelectOption(UniversalBaseModel): + type: typing.Literal["ai_select_option"] = "ai_select_option" + selector: typing.Optional[str] = None + value: typing.Optional[str] = None + intention: typing.Optional[str] = None + data: typing.Optional[SelectOptionActionData] = None + timeout: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class SdkAction_Extract(UniversalBaseModel): + type: typing.Literal["extract"] = "extract" + prompt: typing.Optional[str] = None + extract_schema: typing.Optional[ExtractActionExtractSchema] = None + error_code_mapping: typing.Optional[typing.Dict[str, typing.Optional[str]]] = None + intention: typing.Optional[str] = None + data: typing.Optional[ExtractActionData] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +SdkAction = typing.Union[SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract] diff --git a/skyvern/client/types/select_option_action.py b/skyvern/client/types/select_option_action.py new file mode 100644 index 00000000..35ca2485 --- /dev/null +++ b/skyvern/client/types/select_option_action.py @@ -0,0 +1,43 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel +from .select_option_action_data import SelectOptionActionData + + +class SelectOptionAction(UniversalBaseModel): + selector: typing.Optional[str] = pydantic.Field(default=None) + """ + CSS selector for the element + """ + + value: typing.Optional[str] = pydantic.Field(default=None) + """ + Value to select + """ + + intention: typing.Optional[str] = pydantic.Field(default=None) + """ + The intention or goal of the selection + """ + + data: typing.Optional[SelectOptionActionData] = pydantic.Field(default=None) + """ + Additional context data + """ + + timeout: typing.Optional[float] = pydantic.Field(default=None) + """ + Timeout in milliseconds + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/select_option_action_data.py b/skyvern/client/types/select_option_action_data.py new file mode 100644 index 00000000..1b2c35e9 --- /dev/null +++ b/skyvern/client/types/select_option_action_data.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SelectOptionActionData = typing.Union[str, typing.Dict[str, typing.Optional[typing.Any]]] diff --git a/skyvern/core/script_generations/real_skyvern_page_ai.py b/skyvern/core/script_generations/real_skyvern_page_ai.py new file mode 100644 index 00000000..44577ace --- /dev/null +++ b/skyvern/core/script_generations/real_skyvern_page_ai.py @@ -0,0 +1,441 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from typing import Any + +import structlog +from jinja2.sandbox import SandboxedEnvironment +from playwright.async_api import Page + +from skyvern.config import settings +from skyvern.constants import SPECIAL_FIELD_VERIFICATION_CODE +from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi +from skyvern.forge import app +from skyvern.forge.prompts import prompt_engine +from skyvern.forge.sdk.api.files import download_file +from skyvern.forge.sdk.core import skyvern_context +from skyvern.forge.sdk.schemas.totp_codes import OTPType +from skyvern.services.otp_service import poll_otp_value +from skyvern.utils.prompt_engine import load_prompt_with_elements +from skyvern.webeye.actions import handler_utils +from skyvern.webeye.actions.actions import ( + ActionStatus, + InputTextAction, +) +from skyvern.webeye.actions.handler import ( + handle_click_action, + handle_input_text_action, + handle_select_option_action, +) +from skyvern.webeye.actions.parse_actions import parse_actions +from skyvern.webeye.scraper.scraper import ScrapedPage + +jinja_sandbox_env = SandboxedEnvironment() + +LOG = structlog.get_logger() + +SELECT_OPTION_GOAL = """- The intention to select an option: {intention}. +- The overall goal that the user wants to achieve: {prompt}.""" + + +async def _get_element_id_by_selector(selector: str, page: Page) -> str | None: + locator = page.locator(selector) + element_id = await locator.get_attribute("unique_id") + return element_id + + +def _get_context_data(data: str | dict[str, Any] | None = None) -> dict[str, Any] | str | None: + context = skyvern_context.current() + global_context_data = context.script_run_parameters if context else None + if not data: + return global_context_data + result: dict[str, Any] | str | None + if isinstance(data, dict): + result = {k: v for k, v in data.items() if v} + if global_context_data: + result.update(global_context_data) + else: + global_context_data_str = json.dumps(global_context_data) if global_context_data else "" + result = f"{data}\n{global_context_data_str}" + return result + + +def _render_template_with_label(template: str, label: str | None = None) -> str: + template_data = {} + context = skyvern_context.current() + if context and context.workflow_run_id: + workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(context.workflow_run_id) + block_reference_data: dict[str, Any] = workflow_run_context.get_block_metadata(label) + template_data = workflow_run_context.values.copy() + if label in template_data: + current_value = template_data[label] + if isinstance(current_value, dict): + block_reference_data.update(current_value) + else: + LOG.warning( + f"Script service: Parameter {label} has a registered reference value, going to overwrite it by block metadata" + ) + + if label: + template_data[label] = block_reference_data + + # inject the forloop metadata as global variables + if "current_index" in block_reference_data: + template_data["current_index"] = block_reference_data["current_index"] + if "current_item" in block_reference_data: + template_data["current_item"] = block_reference_data["current_item"] + if "current_value" in block_reference_data: + template_data["current_value"] = block_reference_data["current_value"] + try: + return render_template(template, data=template_data) + except Exception: + LOG.exception("Failed to render template", template=template, data=template_data) + return template + + +def render_template(template: str, data: dict[str, Any] | None = None) -> str: + """ + Refer to Block.format_block_parameter_template_from_workflow_run_context + + TODO: complete this function so that block code shares the same template rendering logic + """ + template_data = data.copy() if data else {} + jinja_template = jinja_sandbox_env.from_string(template) + context = skyvern_context.current() + if context and context.workflow_run_id: + workflow_run_id = context.workflow_run_id + workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(workflow_run_id) + template_data.update(workflow_run_context.values) + if template in template_data: + return template_data[template] + + return jinja_template.render(template_data) + + +class RealSkyvernPageAi(SkyvernPageAi): + def __init__( + self, + scraped_page: ScrapedPage, + page: Page, + ): + self.scraped_page = scraped_page + self.page = page + self.current_label: str | None = None + + async def ai_click( + self, + selector: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Click an element using AI to locate it based on intention.""" + try: + # Build the element tree of the current page for the prompt + context = skyvern_context.ensure_context() + payload_str = _get_context_data(data) + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + element_tree = refreshed_page.build_element_tree() + single_click_prompt = prompt_engine.load_prompt( + template="single-click-action", + navigation_goal=intention, + navigation_payload_str=payload_str, + current_url=self.page.url, + elements=element_tree, + local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), + # user_context=getattr(context, "prompt", None), + ) + json_response = await app.SINGLE_CLICK_AGENT_LLM_API_HANDLER( + prompt=single_click_prompt, + prompt_name="single-click-action", + organization_id=context.organization_id, + ) + actions_json = json_response.get("actions", []) + if actions_json: + organization_id = context.organization_id if context else None + task_id = context.task_id if context else None + step_id = context.step_id if context else None + task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None + step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None + if organization_id and task and step: + actions = parse_actions( + task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) + ) + action = actions[0] + result = await handle_click_action(action, self.page, self.scraped_page, task, step) + if result and result[-1].success is False: + raise Exception(result[-1].exception_message) + xpath = action.get_xpath() + selector = f"xpath={xpath}" if xpath else selector + return selector + except Exception: + LOG.exception( + f"Failed to do ai click. Falling back to original selector={selector}, intention={intention}, data={data}" + ) + + if selector: + locator = self.page.locator(selector) + await locator.click(timeout=timeout) + return selector + + async def ai_input_text( + self, + selector: str, + value: str, + intention: str, + data: str | dict[str, Any] | None = None, + totp_identifier: str | None = None, + totp_url: str | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Input text into an element using AI to determine the value.""" + + context = skyvern_context.current() + value = value or "" + transformed_value = value + element_id: str | None = None + organization_id = context.organization_id if context else None + task_id = context.task_id if context else None + step_id = context.step_id if context else None + workflow_run_id = context.workflow_run_id if context else None + task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None + step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None + if intention: + try: + prompt = context.prompt if context else None + data = data or {} + if (totp_identifier or totp_url) and context and organization_id and task_id: + if totp_identifier: + totp_identifier = _render_template_with_label(totp_identifier, label=self.current_label) + if totp_url: + totp_url = _render_template_with_label(totp_url, label=self.current_label) + otp_value = await poll_otp_value( + organization_id=organization_id, + task_id=task_id, + workflow_run_id=workflow_run_id, + totp_identifier=totp_identifier, + totp_verification_url=totp_url, + ) + if otp_value and otp_value.get_otp_type() == OTPType.TOTP: + verification_code = otp_value.value + if isinstance(data, dict) and SPECIAL_FIELD_VERIFICATION_CODE not in data: + data[SPECIAL_FIELD_VERIFICATION_CODE] = verification_code + elif isinstance(data, str) and SPECIAL_FIELD_VERIFICATION_CODE not in data: + data = f"{data}\n" + str({SPECIAL_FIELD_VERIFICATION_CODE: verification_code}) + elif isinstance(data, list): + data.append({SPECIAL_FIELD_VERIFICATION_CODE: verification_code}) + else: + data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code} + + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + self.scraped_page = refreshed_page + # get the element_id by the selector + element_id = await _get_element_id_by_selector(selector, self.page) + script_generation_input_text_prompt = prompt_engine.load_prompt( + template="script-generation-input-text-generatiion", + intention=intention, + goal=prompt, + data=data, + ) + json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( + prompt=script_generation_input_text_prompt, + prompt_name="script-generation-input-text-generatiion", + organization_id=organization_id, + ) + value = json_response.get("answer", value) + except Exception: + LOG.exception(f"Failed to adapt value for input text action on selector={selector}, value={value}") + + if context and context.workflow_run_id: + transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, str(value)) + + if element_id and organization_id and task and step: + action = InputTextAction( + element_id=element_id, + text=value, + status=ActionStatus.pending, + organization_id=organization_id, + workflow_run_id=workflow_run_id, + task_id=task_id, + step_id=context.step_id if context else None, + reasoning=intention, + intention=intention, + response=value, + ) + result = await handle_input_text_action(action, self.page, self.scraped_page, task, step) + if result and result[-1].success is False: + raise Exception(result[-1].exception_message) + else: + locator = self.page.locator(selector) + await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout) + return value + + async def ai_upload_file( + self, + selector: str, + files: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Upload a file using AI to process the file URL.""" + + if intention: + try: + context = skyvern_context.current() + prompt = context.prompt if context else None + data = _get_context_data(data) + script_generation_file_url_prompt = prompt_engine.load_prompt( + template="script-generation-file-url-generation", + intention=intention, + data=data, + goal=prompt, + ) + json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( + prompt=script_generation_file_url_prompt, + prompt_name="script-generation-file-url-generation", + organization_id=context.organization_id if context else None, + ) + files = json_response.get("answer", files) + except Exception: + LOG.exception(f"Failed to adapt value for input text action on selector={selector}, file={files}") + if not files: + raise ValueError("file url must be provided") + file_path = await download_file(files) + locator = self.page.locator(selector) + await locator.set_input_files(file_path, timeout=timeout) + return files + + async def ai_select_option( + self, + selector: str, + value: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Select an option from a dropdown using AI.""" + + option_value = value or "" + context = skyvern_context.current() + if context and context.task_id and context.step_id and context.organization_id: + task = await app.DATABASE.get_task(context.task_id, organization_id=context.organization_id) + step = await app.DATABASE.get_step(context.step_id, organization_id=context.organization_id) + if intention and task and step: + try: + prompt = context.prompt if context else None + # data = _get_context_data(data) + data = data or {} + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + self.scraped_page = refreshed_page + element_tree = refreshed_page.build_element_tree() + merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt) + single_select_prompt = prompt_engine.load_prompt( + template="single-select-action", + navigation_payload_str=data, + navigation_goal=merged_goal, + current_url=self.page.url, + elements=element_tree, + local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), + ) + json_response = await app.SELECT_AGENT_LLM_API_HANDLER( + prompt=single_select_prompt, + prompt_name="single-select-action", + organization_id=context.organization_id if context else None, + ) + actions = parse_actions( + task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) + ) + if actions: + action = actions[0] + if not action.option: + raise ValueError("SelectOptionAction requires an 'option' field") + option_value = action.option.value or action.option.label or "" + await handle_select_option_action( + action=action, + page=self.page, + scraped_page=self.scraped_page, + task=task, + step=step, + ) + else: + LOG.exception( + f"Failed to parse actions for select option action on selector={selector}, value={value}" + ) + except Exception: + LOG.exception( + f"Failed to adapt value for select option action on selector={selector}, value={value}" + ) + else: + locator = self.page.locator(selector) + await locator.select_option(option_value, timeout=timeout) + return option_value + + async def ai_extract( + self, + prompt: str, + schema: dict[str, Any] | list | str | None = None, + error_code_mapping: dict[str, str] | None = None, + intention: str | None = None, + data: str | dict[str, Any] | None = None, + ) -> dict[str, Any] | list | str | None: + """Extract information from the page using AI.""" + + scraped_page_refreshed = await self.scraped_page.refresh() + context = skyvern_context.current() + tz_info = datetime.now(tz=timezone.utc).tzinfo + if context and context.tz_info: + tz_info = context.tz_info + prompt = _render_template_with_label(prompt, label=self.current_label) + extract_information_prompt = load_prompt_with_elements( + element_tree_builder=scraped_page_refreshed, + prompt_engine=prompt_engine, + template_name="extract-information", + html_need_skyvern_attrs=False, + data_extraction_goal=prompt, + extracted_information_schema=schema, + current_url=scraped_page_refreshed.url, + extracted_text=scraped_page_refreshed.extracted_text, + error_code_mapping_str=(json.dumps(error_code_mapping) if error_code_mapping else None), + local_datetime=datetime.now(tz_info).isoformat(), + ) + step = None + if context and context.organization_id and context.task_id and context.step_id: + step = await app.DATABASE.get_step( + step_id=context.step_id, + organization_id=context.organization_id, + ) + + result = await app.EXTRACTION_LLM_API_HANDLER( + prompt=extract_information_prompt, + step=step, + screenshots=scraped_page_refreshed.screenshots, + prompt_name="extract-information", + ) + if context and context.script_mode: + print(f"\n✨ 📊 Extracted Information:\n{'-' * 50}") + + try: + # Pretty print JSON if result is a dict/list + if isinstance(result, (dict, list)): + print(json.dumps(result, indent=2, ensure_ascii=False)) + else: + print(result) + except Exception: + print(result) + print(f"{'-' * 50}\n") + return result + + +async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any: + """ + Get the actual value of a parameter if it's a secret. If it's not a secret, return the parameter value as is. + + Just return the parameter value if the task isn't a workflow's task. + + This is only used for InputTextAction, UploadFileAction, and ClickAction (if it has a file_url). + """ + workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(workflow_run_id) + secret_value = workflow_run_context.get_original_secret_value_or_none(parameter) + return secret_value if secret_value is not None else parameter diff --git a/skyvern/core/script_generations/skyvern_page.py b/skyvern/core/script_generations/skyvern_page.py index 30cffc16..b3512571 100644 --- a/skyvern/core/script_generations/skyvern_page.py +++ b/skyvern/core/script_generations/skyvern_page.py @@ -10,7 +10,8 @@ import structlog from playwright.async_api import Page from skyvern.config import settings -from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi, render_template +from skyvern.core.script_generations.real_skyvern_page_ai import RealSkyvernPageAi, render_template +from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi from skyvern.exceptions import ScriptTerminationException, WorkflowRunNotFound from skyvern.forge import app from skyvern.forge.prompts import prompt_engine @@ -123,7 +124,7 @@ class SkyvernPage: ) -> SkyvernPage: scraped_page = await cls.create_scraped_page(browser_session_id=browser_session_id) page = await scraped_page._browser_state.must_get_working_page() - ai = SkyvernPageAi(scraped_page, page) + ai = RealSkyvernPageAi(scraped_page, page) return cls(scraped_page=scraped_page, page=page, ai=ai) @classmethod diff --git a/skyvern/core/script_generations/skyvern_page_ai.py b/skyvern/core/script_generations/skyvern_page_ai.py index 51949a9d..1b8c7c61 100644 --- a/skyvern/core/script_generations/skyvern_page_ai.py +++ b/skyvern/core/script_generations/skyvern_page_ai.py @@ -1,126 +1,12 @@ from __future__ import annotations -import json -from datetime import datetime, timezone -from typing import Any - -import structlog -from jinja2.sandbox import SandboxedEnvironment -from playwright.async_api import Page +from typing import Any, Protocol from skyvern.config import settings -from skyvern.constants import SPECIAL_FIELD_VERIFICATION_CODE -from skyvern.forge import app -from skyvern.forge.prompts import prompt_engine -from skyvern.forge.sdk.api.files import download_file -from skyvern.forge.sdk.core import skyvern_context -from skyvern.forge.sdk.schemas.totp_codes import OTPType -from skyvern.services.otp_service import poll_otp_value -from skyvern.utils.prompt_engine import load_prompt_with_elements -from skyvern.webeye.actions import handler_utils -from skyvern.webeye.actions.actions import ( - ActionStatus, - InputTextAction, -) -from skyvern.webeye.actions.handler import ( - handle_click_action, - handle_input_text_action, - handle_select_option_action, -) -from skyvern.webeye.actions.parse_actions import parse_actions -from skyvern.webeye.scraper.scraper import ScrapedPage - -jinja_sandbox_env = SandboxedEnvironment() - -LOG = structlog.get_logger() - -SELECT_OPTION_GOAL = """- The intention to select an option: {intention}. -- The overall goal that the user wants to achieve: {prompt}.""" -async def _get_element_id_by_selector(selector: str, page: Page) -> str | None: - locator = page.locator(selector) - element_id = await locator.get_attribute("unique_id") - return element_id - - -def _get_context_data(data: str | dict[str, Any] | None = None) -> dict[str, Any] | str | None: - context = skyvern_context.current() - global_context_data = context.script_run_parameters if context else None - if not data: - return global_context_data - result: dict[str, Any] | str | None - if isinstance(data, dict): - result = {k: v for k, v in data.items() if v} - if global_context_data: - result.update(global_context_data) - else: - global_context_data_str = json.dumps(global_context_data) if global_context_data else "" - result = f"{data}\n{global_context_data_str}" - return result - - -def _render_template_with_label(template: str, label: str | None = None) -> str: - template_data = {} - context = skyvern_context.current() - if context and context.workflow_run_id: - workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(context.workflow_run_id) - block_reference_data: dict[str, Any] = workflow_run_context.get_block_metadata(label) - template_data = workflow_run_context.values.copy() - if label in template_data: - current_value = template_data[label] - if isinstance(current_value, dict): - block_reference_data.update(current_value) - else: - LOG.warning( - f"Script service: Parameter {label} has a registered reference value, going to overwrite it by block metadata" - ) - - if label: - template_data[label] = block_reference_data - - # inject the forloop metadata as global variables - if "current_index" in block_reference_data: - template_data["current_index"] = block_reference_data["current_index"] - if "current_item" in block_reference_data: - template_data["current_item"] = block_reference_data["current_item"] - if "current_value" in block_reference_data: - template_data["current_value"] = block_reference_data["current_value"] - try: - return render_template(template, data=template_data) - except Exception: - LOG.exception("Failed to render template", template=template, data=template_data) - return template - - -def render_template(template: str, data: dict[str, Any] | None = None) -> str: - """ - Refer to Block.format_block_parameter_template_from_workflow_run_context - - TODO: complete this function so that block code shares the same template rendering logic - """ - template_data = data.copy() if data else {} - jinja_template = jinja_sandbox_env.from_string(template) - context = skyvern_context.current() - if context and context.workflow_run_id: - workflow_run_id = context.workflow_run_id - workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(workflow_run_id) - template_data.update(workflow_run_context.values) - if template in template_data: - return template_data[template] - - return jinja_template.render(template_data) - - -class SkyvernPageAi: - def __init__( - self, - scraped_page: ScrapedPage, - page: Page, - ): - self.scraped_page = scraped_page - self.page = page - self.current_label: str | None = None +class SkyvernPageAi(Protocol): + """Protocol defining the interface for AI-powered page interactions.""" async def ai_click( self, @@ -130,52 +16,7 @@ class SkyvernPageAi: timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, ) -> str: """Click an element using AI to locate it based on intention.""" - try: - # Build the element tree of the current page for the prompt - context = skyvern_context.ensure_context() - payload_str = _get_context_data(data) - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() - element_tree = refreshed_page.build_element_tree() - single_click_prompt = prompt_engine.load_prompt( - template="single-click-action", - navigation_goal=intention, - navigation_payload_str=payload_str, - current_url=self.page.url, - elements=element_tree, - local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), - # user_context=getattr(context, "prompt", None), - ) - json_response = await app.SINGLE_CLICK_AGENT_LLM_API_HANDLER( - prompt=single_click_prompt, - prompt_name="single-click-action", - organization_id=context.organization_id, - ) - actions_json = json_response.get("actions", []) - if actions_json: - organization_id = context.organization_id if context else None - task_id = context.task_id if context else None - step_id = context.step_id if context else None - task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None - step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None - if organization_id and task and step: - actions = parse_actions( - task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) - ) - action = actions[0] - result = await handle_click_action(action, self.page, self.scraped_page, task, step) - if result and result[-1].success is False: - raise Exception(result[-1].exception_message) - xpath = action.get_xpath() - selector = f"xpath={xpath}" if xpath else selector - return selector - except Exception: - LOG.exception( - f"Failed to do ai click. Falling back to original selector={selector}, intention={intention}, data={data}" - ) - - locator = self.page.locator(selector) - await locator.click(timeout=timeout) - return selector + ... async def ai_input_text( self, @@ -188,86 +29,7 @@ class SkyvernPageAi: timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, ) -> str: """Input text into an element using AI to determine the value.""" - - context = skyvern_context.current() - value = value or "" - transformed_value = value - element_id: str | None = None - organization_id = context.organization_id if context else None - task_id = context.task_id if context else None - step_id = context.step_id if context else None - workflow_run_id = context.workflow_run_id if context else None - task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None - step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None - if intention: - try: - prompt = context.prompt if context else None - data = data or {} - if (totp_identifier or totp_url) and context and organization_id and task_id: - if totp_identifier: - totp_identifier = _render_template_with_label(totp_identifier, label=self.current_label) - if totp_url: - totp_url = _render_template_with_label(totp_url, label=self.current_label) - otp_value = await poll_otp_value( - organization_id=organization_id, - task_id=task_id, - workflow_run_id=workflow_run_id, - totp_identifier=totp_identifier, - totp_verification_url=totp_url, - ) - if otp_value and otp_value.get_otp_type() == OTPType.TOTP: - verification_code = otp_value.value - if isinstance(data, dict) and SPECIAL_FIELD_VERIFICATION_CODE not in data: - data[SPECIAL_FIELD_VERIFICATION_CODE] = verification_code - elif isinstance(data, str) and SPECIAL_FIELD_VERIFICATION_CODE not in data: - data = f"{data}\n" + str({SPECIAL_FIELD_VERIFICATION_CODE: verification_code}) - elif isinstance(data, list): - data.append({SPECIAL_FIELD_VERIFICATION_CODE: verification_code}) - else: - data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code} - - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() - self.scraped_page = refreshed_page - # get the element_id by the selector - element_id = await _get_element_id_by_selector(selector, self.page) - script_generation_input_text_prompt = prompt_engine.load_prompt( - template="script-generation-input-text-generatiion", - intention=intention, - goal=prompt, - data=data, - ) - json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( - prompt=script_generation_input_text_prompt, - prompt_name="script-generation-input-text-generatiion", - organization_id=organization_id, - ) - value = json_response.get("answer", value) - except Exception: - LOG.exception(f"Failed to adapt value for input text action on selector={selector}, value={value}") - - if context and context.workflow_run_id: - transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, str(value)) - - if element_id and organization_id and task and step: - action = InputTextAction( - element_id=element_id, - text=value, - status=ActionStatus.pending, - organization_id=organization_id, - workflow_run_id=workflow_run_id, - task_id=task_id, - step_id=context.step_id if context else None, - reasoning=intention, - intention=intention, - response=value, - ) - result = await handle_input_text_action(action, self.page, self.scraped_page, task, step) - if result and result[-1].success is False: - raise Exception(result[-1].exception_message) - else: - locator = self.page.locator(selector) - await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout) - return value + ... async def ai_upload_file( self, @@ -278,32 +40,7 @@ class SkyvernPageAi: timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, ) -> str: """Upload a file using AI to process the file URL.""" - - if intention: - try: - context = skyvern_context.current() - prompt = context.prompt if context else None - data = _get_context_data(data) - script_generation_file_url_prompt = prompt_engine.load_prompt( - template="script-generation-file-url-generation", - intention=intention, - data=data, - goal=prompt, - ) - json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( - prompt=script_generation_file_url_prompt, - prompt_name="script-generation-file-url-generation", - organization_id=context.organization_id if context else None, - ) - files = json_response.get("answer", files) - except Exception: - LOG.exception(f"Failed to adapt value for input text action on selector={selector}, file={files}") - if not files: - raise ValueError("file url must be provided") - file_path = await download_file(files) - locator = self.page.locator(selector) - await locator.set_input_files(file_path, timeout=timeout) - return files + ... async def ai_select_option( self, @@ -314,61 +51,7 @@ class SkyvernPageAi: timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, ) -> str: """Select an option from a dropdown using AI.""" - - option_value = value or "" - context = skyvern_context.current() - if context and context.task_id and context.step_id and context.organization_id: - task = await app.DATABASE.get_task(context.task_id, organization_id=context.organization_id) - step = await app.DATABASE.get_step(context.step_id, organization_id=context.organization_id) - if intention and task and step: - try: - prompt = context.prompt if context else None - # data = _get_context_data(data) - data = data or {} - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() - self.scraped_page = refreshed_page - element_tree = refreshed_page.build_element_tree() - merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt) - single_select_prompt = prompt_engine.load_prompt( - template="single-select-action", - navigation_payload_str=data, - navigation_goal=merged_goal, - current_url=self.page.url, - elements=element_tree, - local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), - ) - json_response = await app.SELECT_AGENT_LLM_API_HANDLER( - prompt=single_select_prompt, - prompt_name="single-select-action", - organization_id=context.organization_id if context else None, - ) - actions = parse_actions( - task, step.step_id, step.order, self.scraped_page, json_response.get("actions", []) - ) - if actions: - action = actions[0] - if not action.option: - raise ValueError("SelectOptionAction requires an 'option' field") - option_value = action.option.value or action.option.label or "" - await handle_select_option_action( - action=action, - page=self.page, - scraped_page=self.scraped_page, - task=task, - step=step, - ) - else: - LOG.exception( - f"Failed to parse actions for select option action on selector={selector}, value={value}" - ) - except Exception: - LOG.exception( - f"Failed to adapt value for select option action on selector={selector}, value={value}" - ) - else: - locator = self.page.locator(selector) - await locator.select_option(option_value, timeout=timeout) - return option_value + ... async def ai_extract( self, @@ -379,61 +62,4 @@ class SkyvernPageAi: data: str | dict[str, Any] | None = None, ) -> dict[str, Any] | list | str | None: """Extract information from the page using AI.""" - - scraped_page_refreshed = await self.scraped_page.refresh() - context = skyvern_context.current() - tz_info = datetime.now(tz=timezone.utc).tzinfo - if context and context.tz_info: - tz_info = context.tz_info - prompt = _render_template_with_label(prompt, label=self.current_label) - extract_information_prompt = load_prompt_with_elements( - element_tree_builder=scraped_page_refreshed, - prompt_engine=prompt_engine, - template_name="extract-information", - html_need_skyvern_attrs=False, - data_extraction_goal=prompt, - extracted_information_schema=schema, - current_url=scraped_page_refreshed.url, - extracted_text=scraped_page_refreshed.extracted_text, - error_code_mapping_str=(json.dumps(error_code_mapping) if error_code_mapping else None), - local_datetime=datetime.now(tz_info).isoformat(), - ) - step = None - if context and context.organization_id and context.task_id and context.step_id: - step = await app.DATABASE.get_step( - step_id=context.step_id, - organization_id=context.organization_id, - ) - - result = await app.EXTRACTION_LLM_API_HANDLER( - prompt=extract_information_prompt, - step=step, - screenshots=scraped_page_refreshed.screenshots, - prompt_name="extract-information", - ) - if context and context.script_mode: - print(f"\n✨ 📊 Extracted Information:\n{'-' * 50}") - - try: - # Pretty print JSON if result is a dict/list - if isinstance(result, (dict, list)): - print(json.dumps(result, indent=2, ensure_ascii=False)) - else: - print(result) - except Exception: - print(result) - print(f"{'-' * 50}\n") - return result - - -async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any: - """ - Get the actual value of a parameter if it's a secret. If it's not a secret, return the parameter value as is. - - Just return the parameter value if the task isn't a workflow's task. - - This is only used for InputTextAction, UploadFileAction, and ClickAction (if it has a file_url). - """ - workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(workflow_run_id) - secret_value = workflow_run_context.get_original_secret_value_or_none(parameter) - return secret_value if secret_value is not None else parameter + ... diff --git a/skyvern/forge/sdk/routes/__init__.py b/skyvern/forge/sdk/routes/__init__.py index 53188c8b..662f0e61 100644 --- a/skyvern/forge/sdk/routes/__init__.py +++ b/skyvern/forge/sdk/routes/__init__.py @@ -5,6 +5,7 @@ from skyvern.forge.sdk.routes import debug_sessions # noqa: F401 from skyvern.forge.sdk.routes import pylon # noqa: F401 from skyvern.forge.sdk.routes import run_blocks # noqa: F401 from skyvern.forge.sdk.routes import scripts # noqa: F401 +from skyvern.forge.sdk.routes import sdk # noqa: F401 from skyvern.forge.sdk.routes import streaming # noqa: F401 from skyvern.forge.sdk.routes import streaming_messages # noqa: F401 from skyvern.forge.sdk.routes import streaming_vnc # noqa: F401 diff --git a/skyvern/forge/sdk/routes/sdk.py b/skyvern/forge/sdk/routes/sdk.py new file mode 100644 index 00000000..bfb9ecbb --- /dev/null +++ b/skyvern/forge/sdk/routes/sdk.py @@ -0,0 +1,180 @@ +import json + +import structlog +from fastapi import Depends, HTTPException, status + +from skyvern import SkyvernPage +from skyvern.core.script_generations.real_skyvern_page_ai import RealSkyvernPageAi +from skyvern.forge import app +from skyvern.forge.sdk.core import skyvern_context +from skyvern.forge.sdk.core.skyvern_context import SkyvernContext +from skyvern.forge.sdk.routes.routers import base_router +from skyvern.forge.sdk.schemas.organizations import Organization +from skyvern.forge.sdk.schemas.sdk_actions import ( + RunSdkActionRequest, + RunSdkActionResponse, +) +from skyvern.forge.sdk.services import org_auth_service +from skyvern.forge.sdk.workflow.models.workflow import ( + WorkflowRequestBody, + WorkflowRunStatus, +) +from skyvern.schemas.workflows import BlockType, WorkflowStatus + +LOG = structlog.get_logger() + + +@base_router.post( + "/sdk/run_action", + response_model=RunSdkActionResponse, + summary="Run an SDK action", + description="Execute a single SDK action with the specified parameters", + tags=["SDK"], + openapi_extra={ + "x-fern-sdk-method-name": "run_sdk_action", + }, +) +@base_router.post("/sdk/run_action/", include_in_schema=False) +async def run_sdk_action( + action_request: RunSdkActionRequest, + organization: Organization = Depends(org_auth_service.get_current_org), +) -> RunSdkActionResponse: + """Execute a single SDK action with the specified parameters.""" + LOG.info( + "Running SDK action", + organization_id=organization.organization_id, + action_type=action_request.action.type, + ) + + organization_id = organization.organization_id + browser_session_id = action_request.browser_session_id + browser_address = action_request.browser_address + action = action_request.action + + # Use existing workflow_run_id if provided, otherwise create a new one + if action_request.workflow_run_id: + workflow_run = await app.DATABASE.get_workflow_run( + workflow_run_id=action_request.workflow_run_id, + organization_id=organization_id, + ) + if not workflow_run: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Workflow run {action_request.workflow_run_id} not found", + ) + workflow = await app.DATABASE.get_workflow( + workflow_id=workflow_run.workflow_id, + organization_id=organization_id, + ) + if not workflow: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Workflow {workflow_run.workflow_id} not found", + ) + else: + workflow = await app.WORKFLOW_SERVICE.create_empty_workflow( + organization, + title="SDK Workflow", + status=WorkflowStatus.auto_generated, + ) + workflow_run = await app.WORKFLOW_SERVICE.setup_workflow_run( + request_id=None, + workflow_request=WorkflowRequestBody( + browser_session_id=browser_session_id, + browser_address=browser_address, + ), + workflow_permanent_id=workflow.workflow_permanent_id, + organization=organization, + version=None, + ) + workflow_run = await app.DATABASE.update_workflow_run( + workflow_run_id=workflow_run.workflow_run_id, + status=WorkflowRunStatus.completed, + ) + + task = await app.DATABASE.create_task( + organization_id=organization_id, + url=action_request.url, + navigation_goal=None, + navigation_payload=None, + data_extraction_goal=None, + title=f"SDK Action Task: {action_request.action.type}", + workflow_run_id=workflow_run.workflow_run_id, + browser_session_id=browser_session_id, + browser_address=browser_address, + ) + + step = await app.DATABASE.create_step( + task.task_id, + order=0, + retry_index=0, + organization_id=organization.organization_id, + ) + + await app.DATABASE.create_workflow_run_block( + workflow_run_id=workflow_run.workflow_run_id, + organization_id=organization_id, + block_type=BlockType.ACTION, + task_id=task.task_id, + ) + + context = skyvern_context.ensure_context() + skyvern_context.set( + SkyvernContext( + request_id=context.request_id, + organization_id=task.organization_id, + task_id=task.task_id, + step_id=step.step_id, + browser_session_id=browser_session_id, + max_screenshot_scrolls=task.max_screenshot_scrolls, + workflow_id=workflow.workflow_id, + workflow_run_id=workflow_run.workflow_run_id, + ) + ) + result = None + try: + scraped_page = await SkyvernPage.create_scraped_page(browser_session_id=browser_session_id) + page = await scraped_page._browser_state.must_get_working_page() + page_ai = RealSkyvernPageAi(scraped_page, page) + + if action.type == "ai_click": + result = await page_ai.ai_click( + selector=action.selector, + intention=action.intention, + data=action.data, + timeout=action.timeout, + ) + elif action.type == "ai_input_text": + result = await page_ai.ai_input_text( + selector=action.selector, + value=action.value, + intention=action.intention, + data=action.data, + totp_identifier=action.totp_identifier, + totp_url=action.totp_url, + timeout=action.timeout, + ) + elif action.type == "ai_select_option": + result = await page_ai.ai_select_option( + selector=action.selector, + value=action.value, + intention=action.intention, + data=action.data, + timeout=action.timeout, + ) + elif action.type == "extract": + extract_result = await page_ai.ai_extract( + prompt=action.prompt, + schema=action.extract_schema, + error_code_mapping=action.error_code_mapping, + intention=action.intention, + data=action.data, + ) + result = json.dumps(extract_result) + finally: + skyvern_context.reset() + + return RunSdkActionResponse( + workflow_run_id=workflow_run.workflow_run_id, + result=result, + ) diff --git a/skyvern/forge/sdk/schemas/sdk_actions.py b/skyvern/forge/sdk/schemas/sdk_actions.py new file mode 100644 index 00000000..cdf3b302 --- /dev/null +++ b/skyvern/forge/sdk/schemas/sdk_actions.py @@ -0,0 +1,100 @@ +from enum import Enum +from typing import Annotated, Any, Literal, Union + +from pydantic import BaseModel, Field + +from skyvern.config import settings + + +class SdkActionType(str, Enum): + """Enum for SDK action types that can be executed.""" + + AI_CLICK = "ai_click" + AI_INPUT_TEXT = "ai_input_text" + AI_SELECT_OPTION = "ai_select_option" + EXTRACT = "extract" + + +# Base action class +class SdkActionBase(BaseModel): + """Base class for SDK actions.""" + + type: str = Field(..., description="The type of action") + + +# Specific action types +class ClickAction(SdkActionBase): + """Click action parameters.""" + + type: Literal["ai_click"] = "ai_click" + selector: str = Field(default="", description="CSS selector for the element") + intention: str = Field(default="", description="The intention or goal of the click") + data: str | dict[str, Any] | None = Field(None, description="Additional context data") + timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds") + + +class InputTextAction(SdkActionBase): + """Input text action parameters.""" + + type: Literal["ai_input_text"] = "ai_input_text" + selector: str = Field(default="", description="CSS selector for the element") + value: str = Field(default="", description="Value to input") + intention: str = Field(default="", description="The intention or goal of the input") + data: str | dict[str, Any] | None = Field(None, description="Additional context data") + totp_identifier: str | None = Field(None, description="TOTP identifier for input_text actions") + totp_url: str | None = Field(None, description="TOTP URL for input_text actions") + timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds") + + +class SelectOptionAction(SdkActionBase): + """Select option action parameters.""" + + type: Literal["ai_select_option"] = "ai_select_option" + selector: str = Field(default="", description="CSS selector for the element") + value: str = Field(default="", description="Value to select") + intention: str = Field(default="", description="The intention or goal of the selection") + data: str | dict[str, Any] | None = Field(None, description="Additional context data") + timeout: float = Field(default=settings.BROWSER_ACTION_TIMEOUT_MS, description="Timeout in milliseconds") + + +class ExtractAction(SdkActionBase): + """Extract data action parameters.""" + + type: Literal["extract"] = "extract" + prompt: str = Field(default="", description="Extraction prompt") + extract_schema: dict[str, Any] | list | str | None = Field(None, description="Schema for extraction") + error_code_mapping: dict[str, str] | None = Field(None, description="Error code mapping for extraction") + intention: str | None = Field(None, description="The intention or goal of the extraction") + data: str | dict[str, Any] | None = Field(None, description="Additional context data") + + +# Discriminated union of all action types +SdkAction = Annotated[ + Union[ClickAction, InputTextAction, SelectOptionAction, ExtractAction], + Field(discriminator="type"), +] + + +class RunActionResponse(BaseModel): + """Response from running an action.""" + + workflow_run_id: str = Field(..., description="The workflow run ID used for this action") + + +class RunSdkActionRequest(BaseModel): + """Request to run a single SDK action.""" + + url: str = Field(..., description="The URL where the action should be executed") + browser_session_id: str | None = Field(None, description="The browser session ID") + browser_address: str | None = Field(None, description="The browser address") + workflow_run_id: str | None = Field( + None, description="Optional workflow run ID to continue an existing workflow run" + ) + action: SdkAction = Field(..., description="The action to execute with its specific parameters") + + +class RunSdkActionResponse(BaseModel): + """Response from running an SDK action.""" + + workflow_run_id: str = Field(..., description="The workflow run ID used for this action") + result: Any | None = Field(None, description="The result from the action (e.g., selector, value, extracted data)") diff --git a/skyvern/library/SdkSkyvernPageAi.py b/skyvern/library/SdkSkyvernPageAi.py new file mode 100644 index 00000000..c3a41298 --- /dev/null +++ b/skyvern/library/SdkSkyvernPageAi.py @@ -0,0 +1,144 @@ +from typing import TYPE_CHECKING, Any + +from playwright.async_api import Page + +from skyvern.config import settings +from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi +from skyvern.forge.sdk.schemas.sdk_actions import ( + ClickAction, + ExtractAction, + InputTextAction, + SelectOptionAction, +) + +if TYPE_CHECKING: + from skyvern.library.skyvern_browser import SkyvernBrowser + + +class SdkSkyvernPageAi(SkyvernPageAi): + """Implementation of SkyvernPageAi that makes API calls to the server.""" + + def __init__( + self, + browser: "SkyvernBrowser", + page: Page, + ): + self._browser = browser + self._page = page + + async def ai_click( + self, + selector: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Click an element using AI via API call.""" + + action = ClickAction( + selector=selector, + intention=intention, + data=data, + timeout=timeout, + ) + response = await self._browser.client.run_sdk_action( + url=self._page.url, + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, + workflow_run_id=self._browser.workflow_run_id, + action=action, + ) + self._browser.workflow_run_id = response.workflow_run_id + return response.result if response.result else selector + + async def ai_input_text( + self, + selector: str, + value: str, + intention: str, + data: str | dict[str, Any] | None = None, + totp_identifier: str | None = None, + totp_url: str | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Input text into an element using AI via API call.""" + + response = await self._browser.client.run_sdk_action( + url=self._page.url, + action=InputTextAction( + selector=selector, + value=value, + intention=intention, + data=data, + totp_identifier=totp_identifier, + totp_url=totp_url, + timeout=timeout, + ), + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, + workflow_run_id=self._browser.workflow_run_id, + ) + self._browser.workflow_run_id = response.workflow_run_id + return response.result if response.result else value + + async def ai_select_option( + self, + selector: str, + value: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Select an option from a dropdown using AI via API call.""" + + response = await self._browser.client.run_sdk_action( + url=self._page.url, + action=SelectOptionAction( + selector=selector, + value=value, + intention=intention, + data=data, + timeout=timeout, + ), + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, + workflow_run_id=self._browser.workflow_run_id, + ) + self._browser.workflow_run_id = response.workflow_run_id + return response.result if response.result else value + + async def ai_upload_file( + self, + selector: str, + files: str, + intention: str, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + raise NotImplementedError("Upload is not supported yet") + + async def ai_extract( + self, + prompt: str, + schema: dict[str, Any] | list | str | None = None, + error_code_mapping: dict[str, str] | None = None, + intention: str | None = None, + data: str | dict[str, Any] | None = None, + ) -> dict[str, Any] | list | str | None: + """Extract information from the page using AI via API call.""" + + response = await self._browser.client.run_sdk_action( + url=self._page.url, + action=ExtractAction( + prompt=prompt, + extract_schema=schema, + error_code_mapping=error_code_mapping, + intention=intention, + data=data, + ), + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, + workflow_run_id=self._browser.workflow_run_id, + ) + self._browser.workflow_run_id = response.workflow_run_id + return response.result if response.result else None diff --git a/skyvern/library/skyvern_browser.py b/skyvern/library/skyvern_browser.py index 8c763b5c..f78686dd 100644 --- a/skyvern/library/skyvern_browser.py +++ b/skyvern/library/skyvern_browser.py @@ -1,7 +1,7 @@ from playwright.async_api import BrowserContext, Page from skyvern.client import AsyncSkyvern -from skyvern.library.skyvern_browser_page import SkyvernBrowserPage, SkyvernPageRun +from skyvern.library.skyvern_browser_page import SkyvernBrowserPage class SkyvernBrowser: @@ -44,6 +44,20 @@ class SkyvernBrowser: self._browser_address = browser_address self._client = client + self.workflow_run_id: None | str = None + + @property + def browser_session_id(self) -> str | None: + return self._browser_session_id + + @property + def browser_address(self) -> str | None: + return self._browser_address + + @property + def client(self) -> AsyncSkyvern: + return self._client + async def get_working_page(self) -> SkyvernBrowserPage: """Get the most recent page or create a new one if none exists. @@ -73,5 +87,4 @@ class SkyvernBrowser: return await self._create_skyvern_page(page) async def _create_skyvern_page(self, page: Page) -> SkyvernBrowserPage: - page_ai = SkyvernPageRun(page, self._browser_session_id, self._browser_address, self._client) - return SkyvernBrowserPage(page, page_ai) + return SkyvernBrowserPage(self, page) diff --git a/skyvern/library/skyvern_browser_page.py b/skyvern/library/skyvern_browser_page.py index 8179e942..44785742 100644 --- a/skyvern/library/skyvern_browser_page.py +++ b/skyvern/library/skyvern_browser_page.py @@ -1,11 +1,18 @@ import asyncio -from typing import Any +from typing import TYPE_CHECKING, Any from playwright.async_api import Page -from skyvern.client import AsyncSkyvern, GetRunResponse +from skyvern.client import GetRunResponse from skyvern.client.types.workflow_run_response import WorkflowRunResponse +from skyvern.config import settings from skyvern.library.constants import DEFAULT_AGENT_HEARTBEAT_INTERVAL, DEFAULT_AGENT_TIMEOUT +from skyvern.library.SdkSkyvernPageAi import SdkSkyvernPageAi +from skyvern.webeye.actions import handler_utils + +if TYPE_CHECKING: + from skyvern.library.skyvern_browser import SkyvernBrowser + from skyvern.schemas.run_blocks import CredentialType from skyvern.schemas.runs import RunEngine, RunStatus, TaskRunResponse @@ -18,13 +25,9 @@ class SkyvernPageRun: and pre-defined workflows with automatic waiting for completion. """ - def __init__( - self, page: Page, browser_session_id: str | None, browser_address: str | None, client: AsyncSkyvern - ) -> None: + def __init__(self, browser: "SkyvernBrowser", page: Page) -> None: + self._browser = browser self._page = page - self._browser_session_id = browser_session_id - self._browser_address = browser_address - self._client = client async def run_task( self, @@ -63,7 +66,7 @@ class SkyvernPageRun: TaskRunResponse containing the task execution results. """ - task_run = await self._client.run_task( + task_run = await self._browser.client.run_task( prompt=prompt, engine=engine, model=model, @@ -75,8 +78,8 @@ class SkyvernPageRun: error_code_mapping=error_code_mapping, data_extraction_schema=data_extraction_schema, max_steps=max_steps, - browser_session_id=self._browser_session_id, - browser_address=self._browser_address, + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, user_agent=user_agent, ) @@ -121,7 +124,7 @@ class SkyvernPageRun: WorkflowRunResponse containing the login workflow execution results. """ - workflow_run = await self._client.login( + workflow_run = await self._browser.client.login( credential_type=credential_type, url=url or self._get_page_url(), credential_id=credential_id, @@ -133,8 +136,8 @@ class SkyvernPageRun: webhook_url=webhook_url, totp_identifier=totp_identifier, totp_url=totp_url, - browser_session_id=self._browser_session_id, - browser_address=self._browser_address, + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, extra_http_headers=extra_http_headers, ) @@ -167,7 +170,7 @@ class SkyvernPageRun: Returns: WorkflowRunResponse containing the workflow execution results. """ - workflow_run = await self._client.run_workflow( + workflow_run = await self._browser.client.run_workflow( workflow_id=workflow_id, parameters=parameters, template=template, @@ -175,8 +178,8 @@ class SkyvernPageRun: webhook_url=webhook_url, totp_url=totp_url, totp_identifier=totp_identifier, - browser_session_id=self._browser_session_id, - browser_address=self._browser_address, + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, ) workflow_run = await self._wait_for_run_completion(workflow_run.run_id, timeout) @@ -185,7 +188,7 @@ class SkyvernPageRun: async def _wait_for_run_completion(self, run_id: str, timeout: float) -> GetRunResponse: async with asyncio.timeout(timeout): while True: - task_run = await self._client.get_run(run_id) + task_run = await self._browser.client.get_run(run_id) if RunStatus(task_run.status).is_final(): break await asyncio.sleep(DEFAULT_AGENT_HEARTBEAT_INTERVAL) @@ -221,28 +224,150 @@ class SkyvernBrowserPage: run: SkyvernPageRun instance for executing AI-powered tasks and workflows. """ - def __init__(self, page: Page, run: SkyvernPageRun): - self.run = run - self._playwright_page = page + def __init__(self, browser: "SkyvernBrowser", page: Page): + self._browser = browser + self._page = page + self._ai = SdkSkyvernPageAi(browser, page) + self.run = SkyvernPageRun(browser, page) - async def click(self, selector: str, **kwargs: Any) -> None: - """Click an element matching the selector. + async def click( + self, + *, + selector: str | None = None, + intention: str | None = None, + ai: str | None = "fallback", + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str | None: + """Click an element identified by ``selector``. - Args: - selector: A selector to search for an element to click. - **kwargs: Additional options like timeout, force, position, etc. + When ``intention`` and ``data`` are provided a new click action is + generated via the ``single-click-action`` prompt. The model returns a + fresh "xpath=..." selector based on the current DOM and the updated data for this run. + The browser then clicks the element using this newly generated xpath selector. + + If the prompt generation or parsing fails for any reason we fall back to + clicking the originally supplied ``selector``. """ - await self._playwright_page.click(selector, **kwargs) - async def fill(self, selector: str, value: str, **kwargs: Any) -> None: - """Fill an input field with the given value. + if ai == "fallback": + # try to click the element with the original selector first + error_to_raise = None + if selector: + try: + locator = self._page.locator(selector) + await locator.click(timeout=timeout) + return selector + except Exception as e: + error_to_raise = e - Args: - selector: A selector to search for an element to fill. - value: Value to fill for the input field. - **kwargs: Additional options like timeout, force, no_wait_after, etc. + # if the original selector doesn't work, try to click the element with the ai generated selector + if intention: + return await self._ai.ai_click( + selector=selector or "", + intention=intention, + data=data, + timeout=timeout, + ) + if error_to_raise: + raise error_to_raise + else: + return selector + elif ai == "proactive": + if intention: + return await self._ai.ai_click( + selector=selector or "", + intention=intention, + data=data, + timeout=timeout, + ) + + if selector: + locator = self._page.locator(selector) + await locator.click(timeout=timeout) + return selector + + async def _input_text( + self, + selector: str, + value: str, + ai: str | None = "fallback", + intention: str | None = None, + data: str | dict[str, Any] | None = None, + totp_identifier: str | None = None, + totp_url: str | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + ) -> str: + """Input text into an element identified by ``selector``. + + When ``intention`` and ``data`` are provided a new input text action is + generated via the `script-generation-input-text-generation` prompt. The model returns a + fresh text based on the current DOM and the updated data for this run. + The browser then inputs the text using this newly generated text. + + If the prompt generation or parsing fails for any reason we fall back to + inputting the originally supplied ``value``. """ - await self._playwright_page.fill(selector, value, **kwargs) + + # format the text with the actual value of the parameter if it's a secret when running a workflow + if ai == "fallback": + error_to_raise = None + try: + locator = self._page.locator(selector) + await handler_utils.input_sequentially(locator, value, timeout=timeout) + return value + except Exception as e: + error_to_raise = e + + if intention: + return await self._ai.ai_input_text( + selector=selector, + value=value, + intention=intention, + data=data, + totp_identifier=totp_identifier, + totp_url=totp_url, + timeout=timeout, + ) + if error_to_raise: + raise error_to_raise + else: + return value + elif ai == "proactive" and intention: + return await self._ai.ai_input_text( + selector=selector, + value=value, + intention=intention, + data=data, + totp_identifier=totp_identifier, + totp_url=totp_url, + timeout=timeout, + ) + locator = self._page.locator(selector) + await handler_utils.input_sequentially(locator, value, timeout=timeout) + return value + + async def fill( + self, + selector: str, + value: str, + ai: str | None = "fallback", + intention: str | None = None, + data: str | dict[str, Any] | None = None, + timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS, + totp_identifier: str | None = None, + totp_url: str | None = None, + ) -> str: + return await self._input_text( + selector=selector, + value=value, + ai=ai, + intention=intention, + data=data, + timeout=timeout, + totp_identifier=totp_identifier, + totp_url=totp_url, + ) async def goto(self, url: str, **kwargs: Any) -> None: """Navigate to the given URL. @@ -251,7 +376,7 @@ class SkyvernBrowserPage: url: URL to navigate page to. **kwargs: Additional options like timeout, wait_until, referer, etc. """ - await self._playwright_page.goto(url, **kwargs) + await self._page.goto(url, **kwargs) async def type(self, selector: str, text: str, **kwargs: Any) -> None: """Type text into an element character by character. @@ -261,7 +386,7 @@ class SkyvernBrowserPage: text: Text to type into the element. **kwargs: Additional options like delay, timeout, no_wait_after, etc. """ - await self._playwright_page.type(selector, text, **kwargs) + await self._page.type(selector, text, **kwargs) async def select_option(self, selector: str, value: Any = None, **kwargs: Any) -> list[str]: """Select option(s) in a