diff --git a/fern/openapi/skyvern_openapi.json b/fern/openapi/skyvern_openapi.json index 854073ea..461fb1e5 100644 --- a/fern/openapi/skyvern_openapi.json +++ b/fern/openapi/skyvern_openapi.json @@ -6420,6 +6420,25 @@ "title": "ExtractAction", "description": "Extract data action parameters." }, + "LocateElementAction": { + "properties": { + "type": { + "type": "string", + "const": "locate_element", + "title": "Type", + "default": "locate_element" + }, + "prompt": { + "type": "string", + "title": "Prompt", + "description": "Natural language prompt to locate an element", + "default": "" + } + }, + "type": "object", + "title": "LocateElementAction", + "description": "Locate element action parameters." + }, "ExtractionBlock": { "properties": { "label": { @@ -10915,6 +10934,9 @@ }, { "$ref": "#/components/schemas/ExtractAction" + }, + { + "$ref": "#/components/schemas/LocateElementAction" } ], "title": "Action", @@ -10927,7 +10949,8 @@ "ai_input_text": "#/components/schemas/InputTextAction", "ai_select_option": "#/components/schemas/SelectOptionAction", "ai_upload_file": "#/components/schemas/UploadFileAction", - "extract": "#/components/schemas/ExtractAction" + "extract": "#/components/schemas/ExtractAction", + "locate_element": "#/components/schemas/LocateElementAction" } } } diff --git a/skyvern/client/__init__.py b/skyvern/client/__init__.py index 22c29e84..919047a6 100644 --- a/skyvern/client/__init__.py +++ b/skyvern/client/__init__.py @@ -224,9 +224,11 @@ if typing.TYPE_CHECKING: HumanInteractionBlockParametersItem_Output, HumanInteractionBlockParametersItem_Workflow, HumanInteractionBlockYaml, + ImprovePromptResponse, InputOrSelectContext, InputTextAction, InputTextActionData, + LocateElementAction, LoginBlock, LoginBlockDataSchema, LoginBlockParametersItem, @@ -276,6 +278,7 @@ if typing.TYPE_CHECKING: RunSdkActionRequestAction_AiSelectOption, RunSdkActionRequestAction_AiUploadFile, RunSdkActionRequestAction_Extract, + RunSdkActionRequestAction_LocateElement, 
RunSdkActionResponse, RunStatus, Script, @@ -469,7 +472,7 @@ if typing.TYPE_CHECKING: WorkflowStatus, ) from .errors import BadRequestError, ForbiddenError, NotFoundError, UnprocessableEntityError - from . import browser_profiles, scripts, workflows + from . import browser_profiles, prompts, scripts, workflows from .client import AsyncSkyvern, Skyvern from .environment import SkyvernEnvironment from .version import __version__ @@ -694,9 +697,11 @@ _dynamic_imports: typing.Dict[str, str] = { "HumanInteractionBlockParametersItem_Output": ".types", "HumanInteractionBlockParametersItem_Workflow": ".types", "HumanInteractionBlockYaml": ".types", + "ImprovePromptResponse": ".types", "InputOrSelectContext": ".types", "InputTextAction": ".types", "InputTextActionData": ".types", + "LocateElementAction": ".types", "LoginBlock": ".types", "LoginBlockDataSchema": ".types", "LoginBlockParametersItem": ".types", @@ -747,6 +752,7 @@ _dynamic_imports: typing.Dict[str, str] = { "RunSdkActionRequestAction_AiSelectOption": ".types", "RunSdkActionRequestAction_AiUploadFile": ".types", "RunSdkActionRequestAction_Extract": ".types", + "RunSdkActionRequestAction_LocateElement": ".types", "RunSdkActionResponse": ".types", "RunStatus": ".types", "Script": ".types", @@ -943,6 +949,7 @@ _dynamic_imports: typing.Dict[str, str] = { "WorkflowStatus": ".types", "__version__": ".version", "browser_profiles": ".browser_profiles", + "prompts": ".prompts", "scripts": ".scripts", "workflows": ".workflows", } @@ -1190,9 +1197,11 @@ __all__ = [ "HumanInteractionBlockParametersItem_Output", "HumanInteractionBlockParametersItem_Workflow", "HumanInteractionBlockYaml", + "ImprovePromptResponse", "InputOrSelectContext", "InputTextAction", "InputTextActionData", + "LocateElementAction", "LoginBlock", "LoginBlockDataSchema", "LoginBlockParametersItem", @@ -1243,6 +1252,7 @@ __all__ = [ "RunSdkActionRequestAction_AiSelectOption", "RunSdkActionRequestAction_AiUploadFile", "RunSdkActionRequestAction_Extract", + 
"RunSdkActionRequestAction_LocateElement", "RunSdkActionResponse", "RunStatus", "Script", @@ -1439,6 +1449,7 @@ __all__ = [ "WorkflowStatus", "__version__", "browser_profiles", + "prompts", "scripts", "workflows", ] diff --git a/skyvern/client/client.py b/skyvern/client/client.py index 521af1c9..21f50118 100644 --- a/skyvern/client/client.py +++ b/skyvern/client/client.py @@ -36,6 +36,7 @@ from .types.workflow_status import WorkflowStatus if typing.TYPE_CHECKING: from .browser_profiles.client import AsyncBrowserProfilesClient, BrowserProfilesClient + from .prompts.client import AsyncPromptsClient, PromptsClient from .scripts.client import AsyncScriptsClient, ScriptsClient from .workflows.client import AsyncWorkflowsClient, WorkflowsClient # this is used as the default value for optional parameters @@ -110,6 +111,7 @@ class Skyvern: self._raw_client = RawSkyvern(client_wrapper=self._client_wrapper) self._workflows: typing.Optional[WorkflowsClient] = None self._browser_profiles: typing.Optional[BrowserProfilesClient] = None + self._prompts: typing.Optional[PromptsClient] = None self._scripts: typing.Optional[ScriptsClient] = None @property @@ -1240,6 +1242,7 @@ class Skyvern: totp_identifier: typing.Optional[str] = OMIT, totp_url: typing.Optional[str] = OMIT, browser_session_id: typing.Optional[str] = OMIT, + browser_profile_id: typing.Optional[str] = OMIT, browser_address: typing.Optional[str] = OMIT, extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT, max_screenshot_scrolling_times: typing.Optional[int] = OMIT, @@ -1283,6 +1286,9 @@ class Skyvern: browser_session_id : typing.Optional[str] ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456` + browser_profile_id : typing.Optional[str] + ID of a browser profile to reuse for this run + browser_address : typing.Optional[str] The CDP address for the task. 
@@ -1347,6 +1353,7 @@ class Skyvern: totp_identifier=totp_identifier, totp_url=totp_url, browser_session_id=browser_session_id, + browser_profile_id=browser_profile_id, browser_address=browser_address, extra_http_headers=extra_http_headers, max_screenshot_scrolling_times=max_screenshot_scrolling_times, @@ -1601,6 +1608,14 @@ class Skyvern: self._browser_profiles = BrowserProfilesClient(client_wrapper=self._client_wrapper) return self._browser_profiles + @property + def prompts(self): + if self._prompts is None: + from .prompts.client import PromptsClient # noqa: E402 + + self._prompts = PromptsClient(client_wrapper=self._client_wrapper) + return self._prompts + @property def scripts(self): if self._scripts is None: @@ -1678,6 +1693,7 @@ class AsyncSkyvern: self._raw_client = AsyncRawSkyvern(client_wrapper=self._client_wrapper) self._workflows: typing.Optional[AsyncWorkflowsClient] = None self._browser_profiles: typing.Optional[AsyncBrowserProfilesClient] = None + self._prompts: typing.Optional[AsyncPromptsClient] = None self._scripts: typing.Optional[AsyncScriptsClient] = None @property @@ -2982,6 +2998,7 @@ class AsyncSkyvern: totp_identifier: typing.Optional[str] = OMIT, totp_url: typing.Optional[str] = OMIT, browser_session_id: typing.Optional[str] = OMIT, + browser_profile_id: typing.Optional[str] = OMIT, browser_address: typing.Optional[str] = OMIT, extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT, max_screenshot_scrolling_times: typing.Optional[int] = OMIT, @@ -3025,6 +3042,9 @@ class AsyncSkyvern: browser_session_id : typing.Optional[str] ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456` + browser_profile_id : typing.Optional[str] + ID of a browser profile to reuse for this run + browser_address : typing.Optional[str] The CDP address for the task. 
@@ -3097,6 +3117,7 @@ class AsyncSkyvern: totp_identifier=totp_identifier, totp_url=totp_url, browser_session_id=browser_session_id, + browser_profile_id=browser_profile_id, browser_address=browser_address, extra_http_headers=extra_http_headers, max_screenshot_scrolling_times=max_screenshot_scrolling_times, @@ -3391,6 +3412,14 @@ class AsyncSkyvern: self._browser_profiles = AsyncBrowserProfilesClient(client_wrapper=self._client_wrapper) return self._browser_profiles + @property + def prompts(self): + if self._prompts is None: + from .prompts.client import AsyncPromptsClient # noqa: E402 + + self._prompts = AsyncPromptsClient(client_wrapper=self._client_wrapper) + return self._prompts + @property def scripts(self): if self._scripts is None: diff --git a/skyvern/client/prompts/__init__.py b/skyvern/client/prompts/__init__.py new file mode 100644 index 00000000..5cde0202 --- /dev/null +++ b/skyvern/client/prompts/__init__.py @@ -0,0 +1,4 @@ +# This file was auto-generated by Fern from our API Definition. + +# isort: skip_file + diff --git a/skyvern/client/prompts/client.py b/skyvern/client/prompts/client.py new file mode 100644 index 00000000..333a06a1 --- /dev/null +++ b/skyvern/client/prompts/client.py @@ -0,0 +1,145 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.request_options import RequestOptions +from ..types.improve_prompt_response import ImprovePromptResponse +from .raw_client import AsyncRawPromptsClient, RawPromptsClient + +# this is used as the default value for optional parameters +OMIT = typing.cast(typing.Any, ...) + + +class PromptsClient: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._raw_client = RawPromptsClient(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> RawPromptsClient: + """ + Retrieves a raw implementation of this client that returns raw responses. 
+ + Returns + ------- + RawPromptsClient + """ + return self._raw_client + + def improve_prompt( + self, + *, + use_case: str, + prompt: str, + context: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> ImprovePromptResponse: + """ + Improve a prompt based on a specific use-case + + Parameters + ---------- + use_case : str + The use-case for prompt improvement + + prompt : str + The original prompt to improve + + context : typing.Optional[str] + Additional context about the user's needs + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + ImprovePromptResponse + Successful Response + + Examples + -------- + from skyvern import Skyvern + + client = Skyvern( + api_key="YOUR_API_KEY", + ) + client.prompts.improve_prompt( + use_case="use-case", + prompt="prompt", + ) + """ + _response = self._raw_client.improve_prompt( + use_case=use_case, prompt=prompt, context=context, request_options=request_options + ) + return _response.data + + +class AsyncPromptsClient: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._raw_client = AsyncRawPromptsClient(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> AsyncRawPromptsClient: + """ + Retrieves a raw implementation of this client that returns raw responses. + + Returns + ------- + AsyncRawPromptsClient + """ + return self._raw_client + + async def improve_prompt( + self, + *, + use_case: str, + prompt: str, + context: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> ImprovePromptResponse: + """ + Improve a prompt based on a specific use-case + + Parameters + ---------- + use_case : str + The use-case for prompt improvement + + prompt : str + The original prompt to improve + + context : typing.Optional[str] + Additional context about the user's needs + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
+ + Returns + ------- + ImprovePromptResponse + Successful Response + + Examples + -------- + import asyncio + + from skyvern import AsyncSkyvern + + client = AsyncSkyvern( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.prompts.improve_prompt( + use_case="use-case", + prompt="prompt", + ) + + + asyncio.run(main()) + """ + _response = await self._raw_client.improve_prompt( + use_case=use_case, prompt=prompt, context=context, request_options=request_options + ) + return _response.data diff --git a/skyvern/client/prompts/raw_client.py b/skyvern/client/prompts/raw_client.py new file mode 100644 index 00000000..0eda599b --- /dev/null +++ b/skyvern/client/prompts/raw_client.py @@ -0,0 +1,169 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing +from json.decoder import JSONDecodeError + +from ..core.api_error import ApiError +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.http_response import AsyncHttpResponse, HttpResponse +from ..core.pydantic_utilities import parse_obj_as +from ..core.request_options import RequestOptions +from ..errors.unprocessable_entity_error import UnprocessableEntityError +from ..types.improve_prompt_response import ImprovePromptResponse + +# this is used as the default value for optional parameters +OMIT = typing.cast(typing.Any, ...) 
+ + +class RawPromptsClient: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._client_wrapper = client_wrapper + + def improve_prompt( + self, + *, + use_case: str, + prompt: str, + context: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> HttpResponse[ImprovePromptResponse]: + """ + Improve a prompt based on a specific use-case + + Parameters + ---------- + use_case : str + The use-case for prompt improvement + + prompt : str + The original prompt to improve + + context : typing.Optional[str] + Additional context about the user's needs + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + HttpResponse[ImprovePromptResponse] + Successful Response + """ + _response = self._client_wrapper.httpx_client.request( + "v1/prompts/improve", + method="POST", + params={ + "use-case": use_case, + }, + json={ + "context": context, + "prompt": prompt, + }, + headers={ + "content-type": "application/json", + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + ImprovePromptResponse, + parse_obj_as( + type_=ImprovePromptResponse, # type: ignore + object_=_response.json(), + ), + ) + return HttpResponse(response=_response, data=_data) + if _response.status_code == 422: + raise UnprocessableEntityError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) + + +class AsyncRawPromptsClient: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._client_wrapper = 
client_wrapper + + async def improve_prompt( + self, + *, + use_case: str, + prompt: str, + context: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> AsyncHttpResponse[ImprovePromptResponse]: + """ + Improve a prompt based on a specific use-case + + Parameters + ---------- + use_case : str + The use-case for prompt improvement + + prompt : str + The original prompt to improve + + context : typing.Optional[str] + Additional context about the user's needs + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AsyncHttpResponse[ImprovePromptResponse] + Successful Response + """ + _response = await self._client_wrapper.httpx_client.request( + "v1/prompts/improve", + method="POST", + params={ + "use-case": use_case, + }, + json={ + "context": context, + "prompt": prompt, + }, + headers={ + "content-type": "application/json", + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + ImprovePromptResponse, + parse_obj_as( + type_=ImprovePromptResponse, # type: ignore + object_=_response.json(), + ), + ) + return AsyncHttpResponse(response=_response, data=_data) + if _response.status_code == 422: + raise UnprocessableEntityError( + headers=dict(_response.headers), + body=typing.cast( + typing.Optional[typing.Any], + parse_obj_as( + type_=typing.Optional[typing.Any], # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) diff --git a/skyvern/client/raw_client.py b/skyvern/client/raw_client.py index 7a437ab1..f8a197a6 100644 --- a/skyvern/client/raw_client.py +++ b/skyvern/client/raw_client.py @@ -1683,6 +1683,7 @@ class RawSkyvern: 
totp_identifier: typing.Optional[str] = OMIT, totp_url: typing.Optional[str] = OMIT, browser_session_id: typing.Optional[str] = OMIT, + browser_profile_id: typing.Optional[str] = OMIT, browser_address: typing.Optional[str] = OMIT, extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT, max_screenshot_scrolling_times: typing.Optional[int] = OMIT, @@ -1726,6 +1727,9 @@ class RawSkyvern: browser_session_id : typing.Optional[str] ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456` + browser_profile_id : typing.Optional[str] + ID of a browser profile to reuse for this run + browser_address : typing.Optional[str] The CDP address for the task. @@ -1782,6 +1786,7 @@ class RawSkyvern: "totp_identifier": totp_identifier, "totp_url": totp_url, "browser_session_id": browser_session_id, + "browser_profile_id": browser_profile_id, "browser_address": browser_address, "extra_http_headers": extra_http_headers, "max_screenshot_scrolling_times": max_screenshot_scrolling_times, @@ -3799,6 +3804,7 @@ class AsyncRawSkyvern: totp_identifier: typing.Optional[str] = OMIT, totp_url: typing.Optional[str] = OMIT, browser_session_id: typing.Optional[str] = OMIT, + browser_profile_id: typing.Optional[str] = OMIT, browser_address: typing.Optional[str] = OMIT, extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT, max_screenshot_scrolling_times: typing.Optional[int] = OMIT, @@ -3842,6 +3848,9 @@ class AsyncRawSkyvern: browser_session_id : typing.Optional[str] ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456` + browser_profile_id : typing.Optional[str] + ID of a browser profile to reuse for this run + browser_address : typing.Optional[str] The CDP address for the task. 
@@ -3898,6 +3907,7 @@ class AsyncRawSkyvern: "totp_identifier": totp_identifier, "totp_url": totp_url, "browser_session_id": browser_session_id, + "browser_profile_id": browser_profile_id, "browser_address": browser_address, "extra_http_headers": extra_http_headers, "max_screenshot_scrolling_times": max_screenshot_scrolling_times, diff --git a/skyvern/client/types/__init__.py b/skyvern/client/types/__init__.py index 7967124c..d8c37c62 100644 --- a/skyvern/client/types/__init__.py +++ b/skyvern/client/types/__init__.py @@ -245,9 +245,11 @@ if typing.TYPE_CHECKING: HumanInteractionBlockParametersItem_Workflow, ) from .human_interaction_block_yaml import HumanInteractionBlockYaml + from .improve_prompt_response import ImprovePromptResponse from .input_or_select_context import InputOrSelectContext from .input_text_action import InputTextAction from .input_text_action_data import InputTextActionData + from .locate_element_action import LocateElementAction from .login_block import LoginBlock from .login_block_data_schema import LoginBlockDataSchema from .login_block_parameters_item import ( @@ -302,6 +304,7 @@ if typing.TYPE_CHECKING: RunSdkActionRequestAction_AiSelectOption, RunSdkActionRequestAction_AiUploadFile, RunSdkActionRequestAction_Extract, + RunSdkActionRequestAction_LocateElement, ) from .run_sdk_action_response import RunSdkActionResponse from .run_status import RunStatus @@ -730,9 +733,11 @@ _dynamic_imports: typing.Dict[str, str] = { "HumanInteractionBlockParametersItem_Output": ".human_interaction_block_parameters_item", "HumanInteractionBlockParametersItem_Workflow": ".human_interaction_block_parameters_item", "HumanInteractionBlockYaml": ".human_interaction_block_yaml", + "ImprovePromptResponse": ".improve_prompt_response", "InputOrSelectContext": ".input_or_select_context", "InputTextAction": ".input_text_action", "InputTextActionData": ".input_text_action_data", + "LocateElementAction": ".locate_element_action", "LoginBlock": ".login_block", 
"LoginBlockDataSchema": ".login_block_data_schema", "LoginBlockParametersItem": ".login_block_parameters_item", @@ -782,6 +787,7 @@ _dynamic_imports: typing.Dict[str, str] = { "RunSdkActionRequestAction_AiSelectOption": ".run_sdk_action_request_action", "RunSdkActionRequestAction_AiUploadFile": ".run_sdk_action_request_action", "RunSdkActionRequestAction_Extract": ".run_sdk_action_request_action", + "RunSdkActionRequestAction_LocateElement": ".run_sdk_action_request_action", "RunSdkActionResponse": ".run_sdk_action_response", "RunStatus": ".run_status", "Script": ".script", @@ -1215,9 +1221,11 @@ __all__ = [ "HumanInteractionBlockParametersItem_Output", "HumanInteractionBlockParametersItem_Workflow", "HumanInteractionBlockYaml", + "ImprovePromptResponse", "InputOrSelectContext", "InputTextAction", "InputTextActionData", + "LocateElementAction", "LoginBlock", "LoginBlockDataSchema", "LoginBlockParametersItem", @@ -1267,6 +1275,7 @@ __all__ = [ "RunSdkActionRequestAction_AiSelectOption", "RunSdkActionRequestAction_AiUploadFile", "RunSdkActionRequestAction_Extract", + "RunSdkActionRequestAction_LocateElement", "RunSdkActionResponse", "RunStatus", "Script", diff --git a/skyvern/client/types/improve_prompt_response.py b/skyvern/client/types/improve_prompt_response.py new file mode 100644 index 00000000..546094a5 --- /dev/null +++ b/skyvern/client/types/improve_prompt_response.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel + + +class ImprovePromptResponse(UniversalBaseModel): + error: typing.Optional[str] = pydantic.Field(default=None) + """ + Error message if prompt improvement failed + """ + + improved: str = pydantic.Field() + """ + The improved version of the prompt + """ + + original: str = pydantic.Field() + """ + The original prompt provided for improvement + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/locate_element_action.py b/skyvern/client/types/locate_element_action.py new file mode 100644 index 00000000..fa366112 --- /dev/null +++ b/skyvern/client/types/locate_element_action.py @@ -0,0 +1,26 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel + + +class LocateElementAction(UniversalBaseModel): + """ + Locate element action parameters. 
+ """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Natural language prompt to locate an element + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/skyvern/client/types/run_sdk_action_request_action.py b/skyvern/client/types/run_sdk_action_request_action.py index b9834c11..cc540568 100644 --- a/skyvern/client/types/run_sdk_action_request_action.py +++ b/skyvern/client/types/run_sdk_action_request_action.py @@ -145,6 +145,24 @@ class RunSdkActionRequestAction_Extract(UniversalBaseModel): extra = pydantic.Extra.allow +class RunSdkActionRequestAction_LocateElement(UniversalBaseModel): + """ + The action to execute with its specific parameters + """ + + type: typing.Literal["locate_element"] = "locate_element" + prompt: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + RunSdkActionRequestAction = typing.Union[ RunSdkActionRequestAction_AiAct, RunSdkActionRequestAction_AiClick, @@ -152,4 +170,5 @@ RunSdkActionRequestAction = typing.Union[ RunSdkActionRequestAction_AiSelectOption, RunSdkActionRequestAction_AiUploadFile, RunSdkActionRequestAction_Extract, + RunSdkActionRequestAction_LocateElement, ] diff --git a/skyvern/core/script_generations/real_skyvern_page_ai.py b/skyvern/core/script_generations/real_skyvern_page_ai.py index a4d0beb4..b26ca822 100644 --- a/skyvern/core/script_generations/real_skyvern_page_ai.py +++ b/skyvern/core/script_generations/real_skyvern_page_ai.py @@ -550,6 +550,87 @@ class RealSkyvernPageAi(SkyvernPageAi): print(f"{'-' * 50}\n") return result + async def ai_locate_element( + 
self, + prompt: str, + ) -> str | None: + """Locate an element on the page using AI and return its XPath selector. + + Args: + prompt: Natural language description of the element to locate (e.g., 'find "download invoices" button') + + Returns: + XPath selector string (e.g., 'xpath=//button[@id="download"]') or None if not found + """ + scraped_page_refreshed = await self.scraped_page.refresh() + context = skyvern_context.ensure_context() + + prompt_rendered = _render_template_with_label(prompt, label=self.current_label) + + locate_element_prompt = load_prompt_with_elements( + element_tree_builder=scraped_page_refreshed, + prompt_engine=prompt_engine, + template_name="single-locate-element", + html_need_skyvern_attrs=True, + data_extraction_goal=prompt_rendered, + current_url=scraped_page_refreshed.url, + local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), + ) + + step = None + if context.organization_id and context.task_id and context.step_id: + step = await app.DATABASE.get_step( + step_id=context.step_id, + organization_id=context.organization_id, + ) + + result = await app.EXTRACTION_LLM_API_HANDLER( + prompt=locate_element_prompt, + step=step, + screenshots=scraped_page_refreshed.screenshots, + prompt_name="single-locate-element", + ) + + if not result or not isinstance(result, dict): + LOG.error( + "AI locate element failed - invalid result", + result=result, + result_type=type(result).__name__, + prompt=prompt_rendered, + ) + return None + + element_id = result.get("element_id", None) + confidence = result.get("confidence_float", 0.0) + + xpath: str | None = None + if element_id: + skyvern_element_data = scraped_page_refreshed.id_to_element_dict.get(element_id) + if skyvern_element_data and "xpath" in skyvern_element_data: + xpath = skyvern_element_data.get("xpath") + + if not xpath: + xpath = result.get("xpath", None) + + if not xpath: + LOG.error( + "AI locate element failed - no xpath in element data", + 
element_id=element_id, + result=result, + prompt=prompt_rendered, + ) + return None + + LOG.info( + "AI locate element result", + element_id=element_id, + xpath=xpath, + confidence=confidence, + prompt=prompt_rendered, + ) + + return xpath + async def ai_act( self, prompt: str, diff --git a/skyvern/core/script_generations/skyvern_page.py b/skyvern/core/script_generations/skyvern_page.py index 3ec22c05..1e2fb5ab 100644 --- a/skyvern/core/script_generations/skyvern_page.py +++ b/skyvern/core/script_generations/skyvern_page.py @@ -6,12 +6,13 @@ from dataclasses import dataclass from typing import Any, Callable, Literal, overload import structlog -from playwright.async_api import Page +from playwright.async_api import Locator, Page from skyvern.config import settings from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi from skyvern.forge.sdk.api.files import download_file from skyvern.forge.sdk.core import skyvern_context +from skyvern.library.ai_locator import AILocator from skyvern.webeye.actions import handler_utils from skyvern.webeye.actions.action_types import ActionType @@ -683,6 +684,120 @@ class SkyvernPage(Page): data = kwargs.pop("data", None) return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data) + @overload + def locator( + self, + selector: str, + *, + prompt: str | None = None, + ai: str | None = "fallback", + **kwargs: Any, + ) -> Locator: ... + + @overload + def locator( + self, + *, + prompt: str, + ai: str | None = "fallback", + **kwargs: Any, + ) -> Locator: ... + + def locator( + self, + selector: str | None = None, + *, + prompt: str | None = None, + ai: str | None = "fallback", + **kwargs: Any, + ) -> Locator: + """Get a Playwright locator using a CSS selector, AI-powered prompt, or both. + + This method extends Playwright's locator() with AI capabilities. 
It supports three modes: + - **Selector-based**: Get locator using CSS selector (standard Playwright behavior) + - **AI-powered**: Use natural language to describe the element (returns lazy AILocator) + - **Fallback mode** (default): Try the selector first, fall back to AI if it fails + + The AI-powered locator is lazy - it only calls ai_locate_element when you actually + use the locator (e.g., when you call .click(), .fill(), etc.). Note that using this + AI locator lookup with prompt only works for elements you can interact with on the page. + + Args: + selector: CSS selector for the target element. + prompt: Natural language description of which element to locate. + ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI. + **kwargs: All Playwright locator parameters (has_text, has, etc.) + + Returns: + A Playwright Locator object (or AILocator proxy that acts like one). + + Examples: + ```python + # Standard Playwright usage - selector only + download_button = page.locator("#download-btn") + await download_button.click() + + # AI-powered - prompt only (returns lazy _AILocator) + download_button = page.locator(prompt='find "download invoices" button') + await download_button.click() # AI resolves XPath here + + # Fallback mode - try selector first, use AI if it fails + download_button = page.locator("#download-btn", prompt='find "download invoices" button') + await download_button.click() + + # With Playwright parameters + submit_button = page.locator(prompt="find submit button", has_text="Submit") + await submit_button.click() + ``` + """ + if not selector and not prompt: + raise ValueError("Missing input: pass a selector and/or a prompt.") + + context = skyvern_context.current() + if context and context.ai_mode_override: + ai = context.ai_mode_override + + if ai == "fallback": + if selector and prompt: + # Try selector first, then AI + return AILocator( + self.page, + self._ai, + prompt, + selector=selector, + selector_kwargs=kwargs, + 
try_selector_first=True, + ) + + if selector: + return self.page.locator(selector, **kwargs) + + if prompt: + return AILocator( + self.page, + self._ai, + prompt, + selector=None, + selector_kwargs=kwargs, + ) + + elif ai == "proactive": + if prompt: + # Try AI first, then selector + return AILocator( + self.page, + self._ai, + prompt, + selector=selector, + selector_kwargs=kwargs, + try_selector_first=False, + ) + + if selector: + return self.page.locator(selector, **kwargs) + + raise ValueError("Selector is required but was not provided") + @action_wrap(ActionType.VERIFICATION_CODE) async def verification_code(self, prompt: str | None = None) -> None: return diff --git a/skyvern/core/script_generations/skyvern_page_ai.py b/skyvern/core/script_generations/skyvern_page_ai.py index 08a35f14..bf3b0e7b 100644 --- a/skyvern/core/script_generations/skyvern_page_ai.py +++ b/skyvern/core/script_generations/skyvern_page_ai.py @@ -71,3 +71,10 @@ class SkyvernPageAi(Protocol): ) -> None: """Perform an action on the page using AI based on a natural language prompt.""" ... + + async def ai_locate_element( + self, + prompt: str, + ) -> str | None: + """Locate an element on the page using AI and return its XPath selector.""" + ... diff --git a/skyvern/forge/prompts/skyvern/single-locate-element.j2 b/skyvern/forge/prompts/skyvern/single-locate-element.j2 new file mode 100644 index 00000000..4284b71f --- /dev/null +++ b/skyvern/forge/prompts/skyvern/single-locate-element.j2 @@ -0,0 +1,32 @@ +You are here to help the user locate a specific element on a web page and return its element ID. Use the user's description, the content of the elements parsed from the page, the screenshots of the page, and the current URL to identify the correct element. + +Each actionable element is tagged with an ID. Only select elements provided in the HTML elements list - do not imagine any new elements. + +MAKE SURE YOU OUTPUT VALID JSON. 
No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. + +Reply in JSON format with the following keys: +{ + "thoughts": str, // Think step by step. Explain what information and visual cues help you identify the correct element. Reference specific attributes, text content, position, or visual characteristics you see. + "element_id": str, // The ID of the element from the HTML elements list. This must be one of the IDs from the elements provided below, or the ID of the nearest ancestor that has an ID and contains the element. + "xpath": str, // A fallback XPath selector for the element. This will be used if the element_id cannot be found in the page data. Provide a complete, valid XPath (e.g., "//button[@id='submit']" or "//input[@name='username']"). + "confidence_float": float // Your confidence that this is the correct element. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. +} + +User's element description (what element to locate): +``` +{{ data_extraction_goal }} +``` + +The URL of the page you're on right now is `{{ current_url }}`. 
+ +HTML elements from `{{ current_url }}`: +``` +{{ elements }} +``` + +Text extracted from the webpage: {{ extracted_text }} + +Current datetime, ISO format: +``` +{{ local_datetime }} +``` diff --git a/skyvern/forge/sdk/routes/sdk.py b/skyvern/forge/sdk/routes/sdk.py index c9632e4a..91e18c9a 100644 --- a/skyvern/forge/sdk/routes/sdk.py +++ b/skyvern/forge/sdk/routes/sdk.py @@ -199,6 +199,11 @@ async def run_sdk_action( data=action.data, ) result = extract_result + elif action.type == "locate_element": + xpath_result = await page_ai.ai_locate_element( + prompt=action.prompt, + ) + result = xpath_result finally: skyvern_context.reset() diff --git a/skyvern/forge/sdk/schemas/sdk_actions.py b/skyvern/forge/sdk/schemas/sdk_actions.py index e83b6cc6..f8bcc819 100644 --- a/skyvern/forge/sdk/schemas/sdk_actions.py +++ b/skyvern/forge/sdk/schemas/sdk_actions.py @@ -15,6 +15,7 @@ class SdkActionType(str, Enum): AI_UPLOAD_FILE = "ai_upload_file" AI_ACT = "ai_act" EXTRACT = "extract" + LOCATE_ELEMENT = "locate_element" # Base action class @@ -137,9 +138,30 @@ class ExtractAction(SdkActionBase): return self.data if isinstance(self.data, dict) else None +class LocateElementAction(SdkActionBase): + """Locate element action parameters.""" + + type: Literal["locate_element"] = "locate_element" + prompt: str = Field(default="", description="Natural language prompt to locate an element") + + def get_navigation_goal(self) -> str | None: + return self.prompt + + def get_navigation_payload(self) -> dict[str, Any] | None: + return None + + # Discriminated union of all action types SdkAction = Annotated[ - Union[ClickAction, InputTextAction, SelectOptionAction, UploadFileAction, ActAction, ExtractAction], + Union[ + ClickAction, + InputTextAction, + SelectOptionAction, + UploadFileAction, + ActAction, + ExtractAction, + LocateElementAction, + ], Field(discriminator="type"), ] diff --git a/skyvern/library/ai_locator.py b/skyvern/library/ai_locator.py new file mode 100644 index 
00000000..7d5d5f62 --- /dev/null +++ b/skyvern/library/ai_locator.py @@ -0,0 +1,123 @@ +from __future__ import annotations + +from typing import Any, Callable + +from playwright.async_api import Locator, Page + +from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi + +LOCATOR_CHAIN_METHODS = { + "nth", + "first", + "last", + "locator", + "filter", + "and_", + "or_", + "frame_locator", + "get_by_alt_text", + "get_by_label", + "get_by_placeholder", + "get_by_role", + "get_by_test_id", + "get_by_text", + "get_by_title", +} + + +class AILocator(Locator): + """A lazy proxy that acts like a Playwright Locator but resolves XPath via AI on first use. + + This class defers the AI call until an actual Playwright method is invoked, + allowing the locator to be created synchronously while the AI resolution happens asynchronously. + + Supports fallback to a selector if AI resolution fails. + """ + + def __init__( + self, + page: Page, + page_ai: SkyvernPageAi, + prompt: str, + selector: str | None = None, + selector_kwargs: dict[str, Any] | None = None, + try_selector_first: bool = True, + parent_resolver: Callable[[], Any] | None = None, + ): + super().__init__(page) + self._page = page + self._page_ai = page_ai + self._prompt = prompt + self._selector = selector + self._selector_kwargs = selector_kwargs or {} + self._resolved_locator: Locator | None = None + self._try_selector_first = try_selector_first + + # For chaining: store a resolver function that returns the final Locator + self._parent_resolver = parent_resolver + + async def _resolve(self) -> Locator: + if self._resolved_locator is None: + if self._parent_resolver: + self._resolved_locator = await self._parent_resolver() + else: + if self._try_selector_first and self._selector: + try: + selector_locator = self._page.locator(self._selector, **self._selector_kwargs) + count = await selector_locator.count() + if count > 0: + self._resolved_locator = selector_locator + return self._resolved_locator + 
except Exception: + # Selector failed, will try AI below + pass + + try: + xpath = await self._page_ai.ai_locate_element(prompt=self._prompt) + if not xpath: + raise ValueError(f"AI failed to locate element with prompt: {self._prompt}") + + self._resolved_locator = self._page.locator( + xpath if xpath.startswith(("xpath=", "css=", "text=", "role=", "id=")) else f"xpath={xpath}" + ) + except Exception as e: + if self._selector and not self._try_selector_first: + self._resolved_locator = self._page.locator(self._selector, **self._selector_kwargs) + else: + raise e + + return self._resolved_locator + + def __getattribute__(self, name: str) -> Any: + if name.startswith("_"): + return object.__getattribute__(self, name) + + # Locator chaining method + if name in LOCATOR_CHAIN_METHODS: + + def locator_chain_wrapper(*args: Any, **kwargs: Any) -> AILocator: + async def resolver() -> Locator: + parent_locator = await self._resolve() + method = getattr(parent_locator, name) + return method(*args, **kwargs) + + return AILocator( + page=self._page, + page_ai=self._page_ai, + prompt=self._prompt, + selector=self._selector, + selector_kwargs=self._selector_kwargs, + try_selector_first=self._try_selector_first, + parent_resolver=resolver, + ) + + return locator_chain_wrapper + + # For all other methods (async actions like click, fill, etc.) 
+ async def async_method_wrapper(*args: Any, **kwargs: Any) -> Any: + locator = await self._resolve() + method = getattr(locator, name) + result = method(*args, **kwargs) + return await result + + return async_method_wrapper diff --git a/skyvern/library/skyvern_browser_page_ai.py b/skyvern/library/skyvern_browser_page_ai.py index 829ab62c..0615341e 100644 --- a/skyvern/library/skyvern_browser_page_ai.py +++ b/skyvern/library/skyvern_browser_page_ai.py @@ -10,6 +10,7 @@ from skyvern.client import ( RunSdkActionRequestAction_AiSelectOption, RunSdkActionRequestAction_AiUploadFile, RunSdkActionRequestAction_Extract, + RunSdkActionRequestAction_LocateElement, ) from skyvern.config import settings from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi @@ -192,3 +193,35 @@ class SdkSkyvernPageAi(SkyvernPageAi): workflow_run_id=self._browser.workflow_run_id, ) self._browser.workflow_run_id = response.workflow_run_id + + async def ai_locate_element( + self, + prompt: str, + ) -> str | None: + """Locate an element on the page using AI and return its XPath selector via API call. + + Args: + prompt: Natural language description of the element to locate (e.g., 'find "download invoices" button') + + Returns: + XPath selector string (e.g., 'xpath=//button[@id="download"]') or None if not found + """ + + LOG.info("AI locate element", prompt=prompt, workflow_run_id=self._browser.workflow_run_id) + + response = await self._browser.skyvern.run_sdk_action( + url=self._page.url, + action=RunSdkActionRequestAction_LocateElement( + prompt=prompt, + ), + browser_session_id=self._browser.browser_session_id, + browser_address=self._browser.browser_address, + workflow_run_id=self._browser.workflow_run_id, + ) + self._browser.workflow_run_id = response.workflow_run_id + + # Return the XPath result directly + if response.result and isinstance(response.result, str): + return response.result + + return None