SDK: Prompt-based locator (#4027)

This commit is contained in:
Stanislav Novosad
2025-11-21 19:13:42 -07:00
committed by GitHub
parent 90f51bcacb
commit 8fb46ef1ca
19 changed files with 899 additions and 4 deletions

View File

@@ -6420,6 +6420,25 @@
"title": "ExtractAction",
"description": "Extract data action parameters."
},
"LocateElementAction": {
"properties": {
"type": {
"type": "string",
"const": "locate_element",
"title": "Type",
"default": "locate_element"
},
"prompt": {
"type": "string",
"title": "Prompt",
"description": "Natural language prompt to locate an element",
"default": ""
}
},
"type": "object",
"title": "LocateElementAction",
"description": "Locate element action parameters."
},
"ExtractionBlock": {
"properties": {
"label": {
@@ -10915,6 +10934,9 @@
},
{
"$ref": "#/components/schemas/ExtractAction"
},
{
"$ref": "#/components/schemas/LocateElementAction"
}
],
"title": "Action",
@@ -10927,7 +10949,8 @@
"ai_input_text": "#/components/schemas/InputTextAction",
"ai_select_option": "#/components/schemas/SelectOptionAction",
"ai_upload_file": "#/components/schemas/UploadFileAction",
"extract": "#/components/schemas/ExtractAction"
"extract": "#/components/schemas/ExtractAction",
"locate_element": "#/components/schemas/LocateElementAction"
}
}
}

View File

@@ -224,9 +224,11 @@ if typing.TYPE_CHECKING:
HumanInteractionBlockParametersItem_Output,
HumanInteractionBlockParametersItem_Workflow,
HumanInteractionBlockYaml,
ImprovePromptResponse,
InputOrSelectContext,
InputTextAction,
InputTextActionData,
LocateElementAction,
LoginBlock,
LoginBlockDataSchema,
LoginBlockParametersItem,
@@ -276,6 +278,7 @@ if typing.TYPE_CHECKING:
RunSdkActionRequestAction_AiSelectOption,
RunSdkActionRequestAction_AiUploadFile,
RunSdkActionRequestAction_Extract,
RunSdkActionRequestAction_LocateElement,
RunSdkActionResponse,
RunStatus,
Script,
@@ -469,7 +472,7 @@ if typing.TYPE_CHECKING:
WorkflowStatus,
)
from .errors import BadRequestError, ForbiddenError, NotFoundError, UnprocessableEntityError
from . import browser_profiles, scripts, workflows
from . import browser_profiles, prompts, scripts, workflows
from .client import AsyncSkyvern, Skyvern
from .environment import SkyvernEnvironment
from .version import __version__
@@ -694,9 +697,11 @@ _dynamic_imports: typing.Dict[str, str] = {
"HumanInteractionBlockParametersItem_Output": ".types",
"HumanInteractionBlockParametersItem_Workflow": ".types",
"HumanInteractionBlockYaml": ".types",
"ImprovePromptResponse": ".types",
"InputOrSelectContext": ".types",
"InputTextAction": ".types",
"InputTextActionData": ".types",
"LocateElementAction": ".types",
"LoginBlock": ".types",
"LoginBlockDataSchema": ".types",
"LoginBlockParametersItem": ".types",
@@ -747,6 +752,7 @@ _dynamic_imports: typing.Dict[str, str] = {
"RunSdkActionRequestAction_AiSelectOption": ".types",
"RunSdkActionRequestAction_AiUploadFile": ".types",
"RunSdkActionRequestAction_Extract": ".types",
"RunSdkActionRequestAction_LocateElement": ".types",
"RunSdkActionResponse": ".types",
"RunStatus": ".types",
"Script": ".types",
@@ -943,6 +949,7 @@ _dynamic_imports: typing.Dict[str, str] = {
"WorkflowStatus": ".types",
"__version__": ".version",
"browser_profiles": ".browser_profiles",
"prompts": ".prompts",
"scripts": ".scripts",
"workflows": ".workflows",
}
@@ -1190,9 +1197,11 @@ __all__ = [
"HumanInteractionBlockParametersItem_Output",
"HumanInteractionBlockParametersItem_Workflow",
"HumanInteractionBlockYaml",
"ImprovePromptResponse",
"InputOrSelectContext",
"InputTextAction",
"InputTextActionData",
"LocateElementAction",
"LoginBlock",
"LoginBlockDataSchema",
"LoginBlockParametersItem",
@@ -1243,6 +1252,7 @@ __all__ = [
"RunSdkActionRequestAction_AiSelectOption",
"RunSdkActionRequestAction_AiUploadFile",
"RunSdkActionRequestAction_Extract",
"RunSdkActionRequestAction_LocateElement",
"RunSdkActionResponse",
"RunStatus",
"Script",
@@ -1439,6 +1449,7 @@ __all__ = [
"WorkflowStatus",
"__version__",
"browser_profiles",
"prompts",
"scripts",
"workflows",
]

View File

@@ -36,6 +36,7 @@ from .types.workflow_status import WorkflowStatus
if typing.TYPE_CHECKING:
from .browser_profiles.client import AsyncBrowserProfilesClient, BrowserProfilesClient
from .prompts.client import AsyncPromptsClient, PromptsClient
from .scripts.client import AsyncScriptsClient, ScriptsClient
from .workflows.client import AsyncWorkflowsClient, WorkflowsClient
# this is used as the default value for optional parameters
@@ -110,6 +111,7 @@ class Skyvern:
self._raw_client = RawSkyvern(client_wrapper=self._client_wrapper)
self._workflows: typing.Optional[WorkflowsClient] = None
self._browser_profiles: typing.Optional[BrowserProfilesClient] = None
self._prompts: typing.Optional[PromptsClient] = None
self._scripts: typing.Optional[ScriptsClient] = None
@property
@@ -1240,6 +1242,7 @@ class Skyvern:
totp_identifier: typing.Optional[str] = OMIT,
totp_url: typing.Optional[str] = OMIT,
browser_session_id: typing.Optional[str] = OMIT,
browser_profile_id: typing.Optional[str] = OMIT,
browser_address: typing.Optional[str] = OMIT,
extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT,
max_screenshot_scrolling_times: typing.Optional[int] = OMIT,
@@ -1283,6 +1286,9 @@ class Skyvern:
browser_session_id : typing.Optional[str]
ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456`
browser_profile_id : typing.Optional[str]
ID of a browser profile to reuse for this run
browser_address : typing.Optional[str]
The CDP address for the task.
@@ -1347,6 +1353,7 @@ class Skyvern:
totp_identifier=totp_identifier,
totp_url=totp_url,
browser_session_id=browser_session_id,
browser_profile_id=browser_profile_id,
browser_address=browser_address,
extra_http_headers=extra_http_headers,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
@@ -1601,6 +1608,14 @@ class Skyvern:
self._browser_profiles = BrowserProfilesClient(client_wrapper=self._client_wrapper)
return self._browser_profiles
@property
def prompts(self):
    """Return the ``PromptsClient`` sub-client, creating it on first access.

    The client module is imported lazily and the instance is cached on
    ``self._prompts`` so later accesses reuse the same object.
    """
    cached_client = self._prompts
    if cached_client is not None:
        return cached_client

    from .prompts.client import PromptsClient  # noqa: E402

    cached_client = PromptsClient(client_wrapper=self._client_wrapper)
    self._prompts = cached_client
    return cached_client
@property
def scripts(self):
if self._scripts is None:
@@ -1678,6 +1693,7 @@ class AsyncSkyvern:
self._raw_client = AsyncRawSkyvern(client_wrapper=self._client_wrapper)
self._workflows: typing.Optional[AsyncWorkflowsClient] = None
self._browser_profiles: typing.Optional[AsyncBrowserProfilesClient] = None
self._prompts: typing.Optional[AsyncPromptsClient] = None
self._scripts: typing.Optional[AsyncScriptsClient] = None
@property
@@ -2982,6 +2998,7 @@ class AsyncSkyvern:
totp_identifier: typing.Optional[str] = OMIT,
totp_url: typing.Optional[str] = OMIT,
browser_session_id: typing.Optional[str] = OMIT,
browser_profile_id: typing.Optional[str] = OMIT,
browser_address: typing.Optional[str] = OMIT,
extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT,
max_screenshot_scrolling_times: typing.Optional[int] = OMIT,
@@ -3025,6 +3042,9 @@ class AsyncSkyvern:
browser_session_id : typing.Optional[str]
ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456`
browser_profile_id : typing.Optional[str]
ID of a browser profile to reuse for this run
browser_address : typing.Optional[str]
The CDP address for the task.
@@ -3097,6 +3117,7 @@ class AsyncSkyvern:
totp_identifier=totp_identifier,
totp_url=totp_url,
browser_session_id=browser_session_id,
browser_profile_id=browser_profile_id,
browser_address=browser_address,
extra_http_headers=extra_http_headers,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
@@ -3391,6 +3412,14 @@ class AsyncSkyvern:
self._browser_profiles = AsyncBrowserProfilesClient(client_wrapper=self._client_wrapper)
return self._browser_profiles
@property
def prompts(self):
    """Return the ``AsyncPromptsClient`` sub-client, creating it on first access.

    The client module is imported lazily and the instance is cached on
    ``self._prompts`` so later accesses reuse the same object.
    """
    cached_client = self._prompts
    if cached_client is not None:
        return cached_client

    from .prompts.client import AsyncPromptsClient  # noqa: E402

    cached_client = AsyncPromptsClient(client_wrapper=self._client_wrapper)
    self._prompts = cached_client
    return cached_client
@property
def scripts(self):
if self._scripts is None:

View File

@@ -0,0 +1,4 @@
# This file was auto-generated by Fern from our API Definition.
# isort: skip_file

View File

@@ -0,0 +1,145 @@
# This file was auto-generated by Fern from our API Definition.
import typing
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
from ..core.request_options import RequestOptions
from ..types.improve_prompt_response import ImprovePromptResponse
from .raw_client import AsyncRawPromptsClient, RawPromptsClient
# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)
class PromptsClient:
    """Synchronous prompts client; every call is delegated to a ``RawPromptsClient``."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._raw_client = RawPromptsClient(client_wrapper=client_wrapper)

    @property
    def with_raw_response(self) -> RawPromptsClient:
        """
        Access the underlying raw client, whose methods return raw responses.

        Returns
        -------
        RawPromptsClient
        """
        return self._raw_client

    def improve_prompt(
        self,
        *,
        use_case: str,
        prompt: str,
        context: typing.Optional[str] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> ImprovePromptResponse:
        """
        Improve a prompt for a given use-case and return the parsed response body.

        Parameters
        ----------
        use_case : str
            The use-case for prompt improvement

        prompt : str
            The original prompt to improve

        context : typing.Optional[str]
            Additional context about the user's needs

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        ImprovePromptResponse
            Successful Response

        Examples
        --------
        from skyvern import Skyvern

        client = Skyvern(
            api_key="YOUR_API_KEY",
        )
        client.prompts.improve_prompt(
            use_case="use-case",
            prompt="prompt",
        )
        """
        raw_result = self._raw_client.improve_prompt(
            use_case=use_case,
            prompt=prompt,
            context=context,
            request_options=request_options,
        )
        # Only the parsed payload is exposed here; use with_raw_response for the full response.
        return raw_result.data
class AsyncPromptsClient:
    """Asynchronous prompts client; every call is delegated to an ``AsyncRawPromptsClient``."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._raw_client = AsyncRawPromptsClient(client_wrapper=client_wrapper)

    @property
    def with_raw_response(self) -> AsyncRawPromptsClient:
        """
        Access the underlying raw client, whose methods return raw responses.

        Returns
        -------
        AsyncRawPromptsClient
        """
        return self._raw_client

    async def improve_prompt(
        self,
        *,
        use_case: str,
        prompt: str,
        context: typing.Optional[str] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> ImprovePromptResponse:
        """
        Improve a prompt for a given use-case and return the parsed response body.

        Parameters
        ----------
        use_case : str
            The use-case for prompt improvement

        prompt : str
            The original prompt to improve

        context : typing.Optional[str]
            Additional context about the user's needs

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        ImprovePromptResponse
            Successful Response

        Examples
        --------
        import asyncio

        from skyvern import AsyncSkyvern

        client = AsyncSkyvern(
            api_key="YOUR_API_KEY",
        )


        async def main() -> None:
            await client.prompts.improve_prompt(
                use_case="use-case",
                prompt="prompt",
            )


        asyncio.run(main())
        """
        raw_result = await self._raw_client.improve_prompt(
            use_case=use_case,
            prompt=prompt,
            context=context,
            request_options=request_options,
        )
        # Only the parsed payload is exposed here; use with_raw_response for the full response.
        return raw_result.data

View File

@@ -0,0 +1,169 @@
# This file was auto-generated by Fern from our API Definition.
import typing
from json.decoder import JSONDecodeError
from ..core.api_error import ApiError
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
from ..core.http_response import AsyncHttpResponse, HttpResponse
from ..core.pydantic_utilities import parse_obj_as
from ..core.request_options import RequestOptions
from ..errors.unprocessable_entity_error import UnprocessableEntityError
from ..types.improve_prompt_response import ImprovePromptResponse
# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)
class RawPromptsClient:
    """Synchronous low-level prompts client that returns ``HttpResponse`` wrappers."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def improve_prompt(
        self,
        *,
        use_case: str,
        prompt: str,
        context: typing.Optional[str] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> HttpResponse[ImprovePromptResponse]:
        """
        POST to ``v1/prompts/improve`` and wrap the parsed result with the raw response.

        Parameters
        ----------
        use_case : str
            The use-case for prompt improvement

        prompt : str
            The original prompt to improve

        context : typing.Optional[str]
            Additional context about the user's needs

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        HttpResponse[ImprovePromptResponse]
            Successful Response

        Raises
        ------
        UnprocessableEntityError
            When the server answers with status 422.
        ApiError
            For any other non-2xx status, or when the body is not valid JSON.
        """
        http_response = self._client_wrapper.httpx_client.request(
            "v1/prompts/improve",
            method="POST",
            params={"use-case": use_case},
            json={"context": context, "prompt": prompt},
            headers={"content-type": "application/json"},
            request_options=request_options,
            omit=OMIT,
        )
        status = http_response.status_code

        # Every outcome needs the decoded body, so decode once; an undecodable
        # body is reported as an ApiError carrying the raw text.
        try:
            payload = http_response.json()
        except JSONDecodeError:
            raise ApiError(status_code=status, headers=dict(http_response.headers), body=http_response.text)

        if 200 <= status < 300:
            parsed = typing.cast(
                ImprovePromptResponse,
                parse_obj_as(
                    type_=ImprovePromptResponse,  # type: ignore
                    object_=payload,
                ),
            )
            return HttpResponse(response=http_response, data=parsed)

        if status == 422:
            raise UnprocessableEntityError(
                headers=dict(http_response.headers),
                body=typing.cast(
                    typing.Optional[typing.Any],
                    parse_obj_as(
                        type_=typing.Optional[typing.Any],  # type: ignore
                        object_=payload,
                    ),
                ),
            )

        raise ApiError(status_code=status, headers=dict(http_response.headers), body=payload)
class AsyncRawPromptsClient:
    """Asynchronous low-level prompts client that returns ``AsyncHttpResponse`` wrappers."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    async def improve_prompt(
        self,
        *,
        use_case: str,
        prompt: str,
        context: typing.Optional[str] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> AsyncHttpResponse[ImprovePromptResponse]:
        """
        POST to ``v1/prompts/improve`` and wrap the parsed result with the raw response.

        Parameters
        ----------
        use_case : str
            The use-case for prompt improvement

        prompt : str
            The original prompt to improve

        context : typing.Optional[str]
            Additional context about the user's needs

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncHttpResponse[ImprovePromptResponse]
            Successful Response

        Raises
        ------
        UnprocessableEntityError
            When the server answers with status 422.
        ApiError
            For any other non-2xx status, or when the body is not valid JSON.
        """
        http_response = await self._client_wrapper.httpx_client.request(
            "v1/prompts/improve",
            method="POST",
            params={"use-case": use_case},
            json={"context": context, "prompt": prompt},
            headers={"content-type": "application/json"},
            request_options=request_options,
            omit=OMIT,
        )
        status = http_response.status_code

        # Every outcome needs the decoded body, so decode once; an undecodable
        # body is reported as an ApiError carrying the raw text.
        try:
            payload = http_response.json()
        except JSONDecodeError:
            raise ApiError(status_code=status, headers=dict(http_response.headers), body=http_response.text)

        if 200 <= status < 300:
            parsed = typing.cast(
                ImprovePromptResponse,
                parse_obj_as(
                    type_=ImprovePromptResponse,  # type: ignore
                    object_=payload,
                ),
            )
            return AsyncHttpResponse(response=http_response, data=parsed)

        if status == 422:
            raise UnprocessableEntityError(
                headers=dict(http_response.headers),
                body=typing.cast(
                    typing.Optional[typing.Any],
                    parse_obj_as(
                        type_=typing.Optional[typing.Any],  # type: ignore
                        object_=payload,
                    ),
                ),
            )

        raise ApiError(status_code=status, headers=dict(http_response.headers), body=payload)

View File

@@ -1683,6 +1683,7 @@ class RawSkyvern:
totp_identifier: typing.Optional[str] = OMIT,
totp_url: typing.Optional[str] = OMIT,
browser_session_id: typing.Optional[str] = OMIT,
browser_profile_id: typing.Optional[str] = OMIT,
browser_address: typing.Optional[str] = OMIT,
extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT,
max_screenshot_scrolling_times: typing.Optional[int] = OMIT,
@@ -1726,6 +1727,9 @@ class RawSkyvern:
browser_session_id : typing.Optional[str]
ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456`
browser_profile_id : typing.Optional[str]
ID of a browser profile to reuse for this run
browser_address : typing.Optional[str]
The CDP address for the task.
@@ -1782,6 +1786,7 @@ class RawSkyvern:
"totp_identifier": totp_identifier,
"totp_url": totp_url,
"browser_session_id": browser_session_id,
"browser_profile_id": browser_profile_id,
"browser_address": browser_address,
"extra_http_headers": extra_http_headers,
"max_screenshot_scrolling_times": max_screenshot_scrolling_times,
@@ -3799,6 +3804,7 @@ class AsyncRawSkyvern:
totp_identifier: typing.Optional[str] = OMIT,
totp_url: typing.Optional[str] = OMIT,
browser_session_id: typing.Optional[str] = OMIT,
browser_profile_id: typing.Optional[str] = OMIT,
browser_address: typing.Optional[str] = OMIT,
extra_http_headers: typing.Optional[typing.Dict[str, typing.Optional[str]]] = OMIT,
max_screenshot_scrolling_times: typing.Optional[int] = OMIT,
@@ -3842,6 +3848,9 @@ class AsyncRawSkyvern:
browser_session_id : typing.Optional[str]
ID of the browser session to use, which is prefixed by `pbs_` e.g. `pbs_123456`
browser_profile_id : typing.Optional[str]
ID of a browser profile to reuse for this run
browser_address : typing.Optional[str]
The CDP address for the task.
@@ -3898,6 +3907,7 @@ class AsyncRawSkyvern:
"totp_identifier": totp_identifier,
"totp_url": totp_url,
"browser_session_id": browser_session_id,
"browser_profile_id": browser_profile_id,
"browser_address": browser_address,
"extra_http_headers": extra_http_headers,
"max_screenshot_scrolling_times": max_screenshot_scrolling_times,

View File

@@ -245,9 +245,11 @@ if typing.TYPE_CHECKING:
HumanInteractionBlockParametersItem_Workflow,
)
from .human_interaction_block_yaml import HumanInteractionBlockYaml
from .improve_prompt_response import ImprovePromptResponse
from .input_or_select_context import InputOrSelectContext
from .input_text_action import InputTextAction
from .input_text_action_data import InputTextActionData
from .locate_element_action import LocateElementAction
from .login_block import LoginBlock
from .login_block_data_schema import LoginBlockDataSchema
from .login_block_parameters_item import (
@@ -302,6 +304,7 @@ if typing.TYPE_CHECKING:
RunSdkActionRequestAction_AiSelectOption,
RunSdkActionRequestAction_AiUploadFile,
RunSdkActionRequestAction_Extract,
RunSdkActionRequestAction_LocateElement,
)
from .run_sdk_action_response import RunSdkActionResponse
from .run_status import RunStatus
@@ -730,9 +733,11 @@ _dynamic_imports: typing.Dict[str, str] = {
"HumanInteractionBlockParametersItem_Output": ".human_interaction_block_parameters_item",
"HumanInteractionBlockParametersItem_Workflow": ".human_interaction_block_parameters_item",
"HumanInteractionBlockYaml": ".human_interaction_block_yaml",
"ImprovePromptResponse": ".improve_prompt_response",
"InputOrSelectContext": ".input_or_select_context",
"InputTextAction": ".input_text_action",
"InputTextActionData": ".input_text_action_data",
"LocateElementAction": ".locate_element_action",
"LoginBlock": ".login_block",
"LoginBlockDataSchema": ".login_block_data_schema",
"LoginBlockParametersItem": ".login_block_parameters_item",
@@ -782,6 +787,7 @@ _dynamic_imports: typing.Dict[str, str] = {
"RunSdkActionRequestAction_AiSelectOption": ".run_sdk_action_request_action",
"RunSdkActionRequestAction_AiUploadFile": ".run_sdk_action_request_action",
"RunSdkActionRequestAction_Extract": ".run_sdk_action_request_action",
"RunSdkActionRequestAction_LocateElement": ".run_sdk_action_request_action",
"RunSdkActionResponse": ".run_sdk_action_response",
"RunStatus": ".run_status",
"Script": ".script",
@@ -1215,9 +1221,11 @@ __all__ = [
"HumanInteractionBlockParametersItem_Output",
"HumanInteractionBlockParametersItem_Workflow",
"HumanInteractionBlockYaml",
"ImprovePromptResponse",
"InputOrSelectContext",
"InputTextAction",
"InputTextActionData",
"LocateElementAction",
"LoginBlock",
"LoginBlockDataSchema",
"LoginBlockParametersItem",
@@ -1267,6 +1275,7 @@ __all__ = [
"RunSdkActionRequestAction_AiSelectOption",
"RunSdkActionRequestAction_AiUploadFile",
"RunSdkActionRequestAction_Extract",
"RunSdkActionRequestAction_LocateElement",
"RunSdkActionResponse",
"RunStatus",
"Script",

View File

@@ -0,0 +1,32 @@
# This file was auto-generated by Fern from our API Definition.
import typing
import pydantic
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
class ImprovePromptResponse(UniversalBaseModel):
    """Response model for prompt improvement: the original prompt, the improved prompt and an optional error."""

    # Optional; defaults to None when the field is absent.
    error: typing.Optional[str] = pydantic.Field(default=None)
    """
    Error message if prompt improvement failed
    """

    # Required field (no default).
    improved: str = pydantic.Field()
    """
    The improved version of the prompt
    """

    # Required field (no default).
    original: str = pydantic.Field()
    """
    The original prompt provided for improvement
    """

    # Both branches configure the same thing for the installed pydantic major
    # version: unknown fields are allowed and instances are frozen.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow

View File

@@ -0,0 +1,26 @@
# This file was auto-generated by Fern from our API Definition.
import typing
import pydantic
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
class LocateElementAction(UniversalBaseModel):
    """
    Locate element action parameters.
    """

    # Optional in this SDK model; defaults to None when not supplied.
    prompt: typing.Optional[str] = pydantic.Field(default=None)
    """
    Natural language prompt to locate an element
    """

    # Both branches configure the same thing for the installed pydantic major
    # version: unknown fields are allowed and instances are frozen.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow

View File

@@ -145,6 +145,24 @@ class RunSdkActionRequestAction_Extract(UniversalBaseModel):
extra = pydantic.Extra.allow
class RunSdkActionRequestAction_LocateElement(UniversalBaseModel):
    """
    The action to execute with its specific parameters
    """

    # Fixed tag value identifying this variant as the "locate_element" action.
    type: typing.Literal["locate_element"] = "locate_element"
    # Natural-language description of the element to locate; optional, defaults to None.
    prompt: typing.Optional[str] = None

    # Both branches configure the same thing for the installed pydantic major
    # version: unknown fields are allowed and instances are frozen.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
RunSdkActionRequestAction = typing.Union[
RunSdkActionRequestAction_AiAct,
RunSdkActionRequestAction_AiClick,
@@ -152,4 +170,5 @@ RunSdkActionRequestAction = typing.Union[
RunSdkActionRequestAction_AiSelectOption,
RunSdkActionRequestAction_AiUploadFile,
RunSdkActionRequestAction_Extract,
RunSdkActionRequestAction_LocateElement,
]

View File

@@ -550,6 +550,87 @@ class RealSkyvernPageAi(SkyvernPageAi):
print(f"{'-' * 50}\n")
return result
async def ai_locate_element(
    self,
    prompt: str,
) -> str | None:
    """Ask the extraction LLM which page element matches ``prompt`` and return its XPath.

    The page is re-scraped, the ``single-locate-element`` prompt is rendered
    against the refreshed element tree, and the LLM reply is resolved to an
    XPath: first via the scraped element whose id the LLM returned, then via
    the ``xpath`` key of the reply itself.

    Args:
        prompt: Natural language description of the element to locate
            (e.g., 'find "download invoices" button'). It is rendered with
            the current label before being sent to the LLM.

    Returns:
        The XPath string of the located element, or None when the LLM reply is
        empty / not a dict, or when no XPath could be determined from it.
    """
    page_snapshot = await self.scraped_page.refresh()
    ctx = skyvern_context.ensure_context()

    rendered_prompt = _render_template_with_label(prompt, label=self.current_label)

    # Prefer the timezone stored on the context; otherwise use the local one.
    timezone = ctx.tz_info or datetime.now().astimezone().tzinfo

    # NOTE(review): the single-locate-element template also references
    # `extracted_text`, which is not passed here - confirm whether
    # load_prompt_with_elements supplies it or it renders empty.
    llm_prompt = load_prompt_with_elements(
        element_tree_builder=page_snapshot,
        prompt_engine=prompt_engine,
        template_name="single-locate-element",
        html_need_skyvern_attrs=True,
        data_extraction_goal=rendered_prompt,
        current_url=page_snapshot.url,
        local_datetime=datetime.now(timezone).isoformat(),
    )

    # A step is only looked up when the context identifies organization, task and step.
    has_step_context = ctx.organization_id and ctx.task_id and ctx.step_id
    step = (
        await app.DATABASE.get_step(
            step_id=ctx.step_id,
            organization_id=ctx.organization_id,
        )
        if has_step_context
        else None
    )

    llm_reply = await app.EXTRACTION_LLM_API_HANDLER(
        prompt=llm_prompt,
        step=step,
        screenshots=page_snapshot.screenshots,
        prompt_name="single-locate-element",
    )

    if not (llm_reply and isinstance(llm_reply, dict)):
        LOG.error(
            "AI locate element failed - invalid result",
            result=llm_reply,
            result_type=type(llm_reply).__name__,
            prompt=rendered_prompt,
        )
        return None

    element_id = llm_reply.get("element_id", None)
    confidence = llm_reply.get("confidence_float", 0.0)

    # First choice: the XPath recorded for the scraped element the LLM picked.
    xpath: str | None = None
    if element_id:
        element_data = page_snapshot.id_to_element_dict.get(element_id)
        if element_data and "xpath" in element_data:
            xpath = element_data.get("xpath")

    # Second choice: the XPath the LLM wrote into its reply.
    xpath = xpath or llm_reply.get("xpath", None)

    if xpath:
        LOG.info(
            "AI locate element result",
            element_id=element_id,
            xpath=xpath,
            confidence=confidence,
            prompt=rendered_prompt,
        )
        return xpath

    LOG.error(
        "AI locate element failed - no xpath in element data",
        element_id=element_id,
        result=llm_reply,
        prompt=rendered_prompt,
    )
    return None
async def ai_act(
self,
prompt: str,

View File

@@ -6,12 +6,13 @@ from dataclasses import dataclass
from typing import Any, Callable, Literal, overload
import structlog
from playwright.async_api import Page
from playwright.async_api import Locator, Page
from skyvern.config import settings
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
from skyvern.forge.sdk.api.files import download_file
from skyvern.forge.sdk.core import skyvern_context
from skyvern.library.ai_locator import AILocator
from skyvern.webeye.actions import handler_utils
from skyvern.webeye.actions.action_types import ActionType
@@ -683,6 +684,120 @@ class SkyvernPage(Page):
data = kwargs.pop("data", None)
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
@overload
def locator(
    self,
    selector: str,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator: ...

@overload
def locator(
    self,
    *,
    prompt: str,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator: ...

def locator(
    self,
    selector: str | None = None,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator:
    """Build a locator from a selector, a natural-language prompt, or both.

    Extends Playwright's ``locator()`` with AI lookup. Depending on the inputs
    and the ``ai`` mode the result is either a plain Playwright locator or a
    lazy ``AILocator``:

    - ``ai="fallback"`` (default): with selector and prompt, the AILocator is
      configured to try the selector first; with only a prompt it carries no
      selector; with only a selector a plain locator is returned.
    - ``ai="proactive"``: with a prompt, the AILocator is configured not to try
      the selector first; without a prompt a plain locator is returned.
    - any other mode: only the selector is used.

    If the current Skyvern context defines ``ai_mode_override``, that value
    replaces ``ai``.

    Args:
        selector: Selector for the target element.
        prompt: Natural language description of which element to locate.
        ai: AI behavior mode, "fallback" by default.
        **kwargs: Playwright locator parameters (has_text, has, etc.); passed
            to ``page.locator`` or handed to the AILocator as selector kwargs.

    Returns:
        A Playwright Locator, or an AILocator acting as one.

    Raises:
        ValueError: If neither selector nor prompt is given, or if no selector
            is available when the mode does not use the prompt.

    Examples:
        ```python
        # Selector only - standard Playwright usage
        download_button = page.locator("#download-btn")
        await download_button.click()

        # Prompt only - returns a lazy AILocator
        download_button = page.locator(prompt='find "download invoices" button')
        await download_button.click()

        # Selector plus prompt in the default fallback mode
        download_button = page.locator("#download-btn", prompt='find "download invoices" button')
        await download_button.click()

        # With Playwright parameters
        submit_button = page.locator(prompt="find submit button", has_text="Submit")
        await submit_button.click()
        ```
    """
    if not (selector or prompt):
        raise ValueError("Missing input: pass a selector and/or a prompt.")

    # A mode override on the active context wins over the caller's choice.
    active_context = skyvern_context.current()
    if active_context and active_context.ai_mode_override:
        ai = active_context.ai_mode_override

    if prompt:
        if ai == "fallback":
            # Selector (when present) is tried first, AI second.
            return AILocator(
                self.page,
                self._ai,
                prompt,
                selector=selector or None,
                selector_kwargs=kwargs,
                try_selector_first=True,
            )
        if ai == "proactive":
            # AI is tried first, the selector second.
            return AILocator(
                self.page,
                self._ai,
                prompt,
                selector=selector,
                selector_kwargs=kwargs,
                try_selector_first=False,
            )

    # No usable prompt for this mode: plain Playwright lookup.
    if selector:
        return self.page.locator(selector, **kwargs)

    raise ValueError("Selector is required but was not provided")
@action_wrap(ActionType.VERIFICATION_CODE)
async def verification_code(self, prompt: str | None = None) -> None:
return

View File

@@ -71,3 +71,10 @@ class SkyvernPageAi(Protocol):
) -> None:
"""Perform an action on the page using AI based on a natural language prompt."""
...
async def ai_locate_element(
    self,
    prompt: str,
) -> str | None:
    """Locate an element on the page using AI and return its XPath selector.

    Args:
        prompt: Natural language description of the element to locate.

    Returns:
        The XPath selector as a string, or None (presumably when no element
        could be located - confirm against the implementations).
    """
    ...

View File

@@ -0,0 +1,32 @@
You are here to help the user locate a specific element on a web page and return its element ID. Use the user's description, the content of the elements parsed from the page, the screenshots of the page, and the current URL to identify the correct element.
Each actionable element is tagged with an ID. Only select elements provided in the HTML elements list - do not imagine any new elements.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"thoughts": str, // Think step by step. Explain what information and visual cues help you identify the correct element. Reference specific attributes, text content, position, or visual characteristics you see.
"element_id": str, // The ID of the element from the HTML elements list. This must be one of the IDs from the elements provided below, or the ID of the nearest parent element that contains the target element.
"xpath": str, // A fallback XPath selector for the element. This will be used if the element_id cannot be found in the page data. Provide a complete, valid XPath (e.g., "//button[@id='submit']" or "//input[@name='username']").
"confidence_float": float // Your confidence that this is the correct element. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence.
}
User's element description (what element to locate):
```
{{ data_extraction_goal }}
```
The URL of the page you're on right now is `{{ current_url }}`.
HTML elements from `{{ current_url }}`:
```
{{ elements }}
```
Text extracted from the webpage: {{ extracted_text }}
Current datetime, ISO format:
```
{{ local_datetime }}
```

View File

@@ -199,6 +199,11 @@ async def run_sdk_action(
data=action.data,
)
result = extract_result
elif action.type == "locate_element":
xpath_result = await page_ai.ai_locate_element(
prompt=action.prompt,
)
result = xpath_result
finally:
skyvern_context.reset()

View File

@@ -15,6 +15,7 @@ class SdkActionType(str, Enum):
AI_UPLOAD_FILE = "ai_upload_file"
AI_ACT = "ai_act"
EXTRACT = "extract"
LOCATE_ELEMENT = "locate_element"
# Base action class
@@ -137,9 +138,30 @@ class ExtractAction(SdkActionBase):
return self.data if isinstance(self.data, dict) else None
class LocateElementAction(SdkActionBase):
    """Locate element action parameters."""

    # Literal tag used as the discriminator value for this action type.
    type: Literal["locate_element"] = "locate_element"
    # Defaults to an empty string, so the field may be omitted.
    prompt: str = Field(default="", description="Natural language prompt to locate an element")

    def get_navigation_goal(self) -> str | None:
        """Return the locate prompt as this action's navigation goal."""
        return self.prompt

    def get_navigation_payload(self) -> dict[str, Any] | None:
        """Return None: this action carries no navigation payload."""
        return None
# Discriminated union of all action types
SdkAction = Annotated[
Union[ClickAction, InputTextAction, SelectOptionAction, UploadFileAction, ActAction, ExtractAction],
Union[
ClickAction,
InputTextAction,
SelectOptionAction,
UploadFileAction,
ActAction,
ExtractAction,
LocateElementAction,
],
Field(discriminator="type"),
]

View File

@@ -0,0 +1,123 @@
from __future__ import annotations

import inspect
from typing import Any, Callable

from playwright.async_api import Locator, Page

from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
# Locator attribute names that AILocator.__getattribute__ treats as chaining
# operations: accessing one of these yields a wrapper that builds another lazy
# AILocator instead of an awaitable action wrapper.
LOCATOR_CHAIN_METHODS = {
    # Positional narrowing
    "first",
    "last",
    "nth",
    # Sub-locators and combinators
    "locator",
    "frame_locator",
    "filter",
    "and_",
    "or_",
    # get_by_* lookups
    "get_by_alt_text",
    "get_by_label",
    "get_by_placeholder",
    "get_by_role",
    "get_by_test_id",
    "get_by_text",
    "get_by_title",
}
class AILocator(Locator):
    """A lazy proxy that acts like a Playwright Locator but resolves XPath via AI on first use.

    This class defers the AI call until an actual Playwright method is invoked,
    allowing the locator to be created synchronously while the AI resolution happens asynchronously.

    Supports fallback to a selector if AI resolution fails.

    Every public attribute access is intercepted by ``__getattribute__``:

    * names in ``LOCATOR_CHAIN_METHODS`` return a callable that builds a new,
      still-unresolved ``AILocator`` (so they are invoked as methods here, e.g.
      ``locator.first()``, even where the underlying Locator exposes a property);
    * every other name returns an async wrapper that resolves the locator and
      then forwards to the attribute of the same name on it.
    """

    def __init__(
        self,
        page: Page,
        page_ai: SkyvernPageAi,
        prompt: str,
        selector: str | None = None,
        selector_kwargs: dict[str, Any] | None = None,
        try_selector_first: bool = True,
        parent_resolver: Callable[[], Any] | None = None,
    ):
        """Store everything needed to resolve the locator later.

        Args:
            page: Page used to build the concrete locator.
            page_ai: Object whose ``ai_locate_element`` turns the prompt into a selector.
            prompt: Natural language description of the element.
            selector: Optional explicit selector used before or after the AI attempt.
            selector_kwargs: Extra keyword arguments forwarded to ``page.locator`` with ``selector``.
            try_selector_first: When true, ``selector`` is tried before the AI;
                when false, ``selector`` is only used if the AI attempt fails.
            parent_resolver: For chained locators, an async callable returning the
                final Locator; when set it replaces selector/AI resolution entirely.
        """
        super().__init__(page)
        self._page = page
        self._page_ai = page_ai
        self._prompt = prompt
        self._selector = selector
        self._selector_kwargs = selector_kwargs or {}
        self._resolved_locator: Locator | None = None
        self._try_selector_first = try_selector_first
        # For chaining: store a resolver function that returns the final Locator
        self._parent_resolver = parent_resolver

    async def _resolve(self) -> Locator:
        """Return the concrete Locator, resolving and caching it on first call.

        Raises:
            ValueError: The AI returned no selector and no fallback applies.
            Exception: Any error from the AI call is re-raised unchanged when no
                fallback applies.
        """
        if self._resolved_locator is not None:
            return self._resolved_locator

        if self._parent_resolver:
            self._resolved_locator = await self._parent_resolver()
            return self._resolved_locator

        if self._try_selector_first and self._selector:
            try:
                selector_locator = self._page.locator(self._selector, **self._selector_kwargs)
                if await selector_locator.count() > 0:
                    self._resolved_locator = selector_locator
                    return self._resolved_locator
            except Exception:
                # Deliberate best effort: a failing selector is not fatal,
                # the AI lookup below is the next attempt.
                pass

        try:
            xpath = await self._page_ai.ai_locate_element(prompt=self._prompt)
            if not xpath:
                raise ValueError(f"AI failed to locate element with prompt: {self._prompt}")
            # Results without a recognised engine prefix are treated as XPath.
            self._resolved_locator = self._page.locator(
                xpath if xpath.startswith(("xpath=", "css=", "text=", "role=", "id=")) else f"xpath={xpath}"
            )
        except Exception:
            # The selector is only a post-AI fallback when it was not already tried first.
            if self._selector and not self._try_selector_first:
                self._resolved_locator = self._page.locator(self._selector, **self._selector_kwargs)
            else:
                raise  # bare raise keeps the original traceback intact

        return self._resolved_locator

    def __getattribute__(self, name: str) -> Any:
        """Route attribute access to the lazy chain or action wrappers.

        Names starting with an underscore are looked up normally so the proxy's
        own state (and dunder machinery) stays reachable.
        """
        if name.startswith("_"):
            return object.__getattribute__(self, name)

        # Locator chaining method
        if name in LOCATOR_CHAIN_METHODS:

            def locator_chain_wrapper(*args: Any, **kwargs: Any) -> AILocator:
                async def resolver() -> Locator:
                    parent_locator = await self._resolve()
                    member = getattr(parent_locator, name)
                    # `first` / `last` are properties on Playwright's Locator, so
                    # getattr already yields the narrowed Locator; calling it
                    # would raise TypeError.
                    if not callable(member):
                        return member
                    return member(*args, **kwargs)

                return AILocator(
                    page=self._page,
                    page_ai=self._page_ai,
                    prompt=self._prompt,
                    selector=self._selector,
                    selector_kwargs=self._selector_kwargs,
                    try_selector_first=self._try_selector_first,
                    parent_resolver=resolver,
                )

            return locator_chain_wrapper

        # For all other methods (async actions like click, fill, etc.)
        async def async_method_wrapper(*args: Any, **kwargs: Any) -> Any:
            locator = await self._resolve()
            member = getattr(locator, name)
            # Plain (non-callable) attributes are returned as-is instead of being called.
            if not callable(member):
                return member
            result = member(*args, **kwargs)
            # Only await what is actually awaitable; synchronous results pass through.
            if inspect.isawaitable(result):
                return await result
            return result

        return async_method_wrapper

View File

@@ -10,6 +10,7 @@ from skyvern.client import (
RunSdkActionRequestAction_AiSelectOption,
RunSdkActionRequestAction_AiUploadFile,
RunSdkActionRequestAction_Extract,
RunSdkActionRequestAction_LocateElement,
)
from skyvern.config import settings
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
@@ -192,3 +193,35 @@ class SdkSkyvernPageAi(SkyvernPageAi):
workflow_run_id=self._browser.workflow_run_id,
)
self._browser.workflow_run_id = response.workflow_run_id
async def ai_locate_element(
    self,
    prompt: str,
) -> str | None:
    """Ask the Skyvern API to locate an element and return the selector it reports.

    The request is sent as a ``locate_element`` SDK action for the current page
    URL, and the workflow run id on the browser is updated from the response.

    Args:
        prompt: Natural language description of the element to locate (e.g., 'find "download invoices" button')

    Returns:
        The response's result when it is a non-empty string (e.g., 'xpath=//button[@id="download"]'),
        otherwise None.
    """
    browser = self._browser
    LOG.info("AI locate element", prompt=prompt, workflow_run_id=browser.workflow_run_id)

    api_response = await browser.skyvern.run_sdk_action(
        url=self._page.url,
        action=RunSdkActionRequestAction_LocateElement(prompt=prompt),
        browser_session_id=browser.browser_session_id,
        browser_address=browser.browser_address,
        workflow_run_id=browser.workflow_run_id,
    )
    # Keep later SDK calls attached to the workflow run reported by the server.
    browser.workflow_run_id = api_response.workflow_run_id

    located = api_response.result
    if not located or not isinstance(located, str):
        return None
    return located