SDK: Prompt-based locator (#4027)

This commit is contained in:
Stanislav Novosad
2025-11-21 19:13:42 -07:00
committed by GitHub
parent 90f51bcacb
commit 8fb46ef1ca
19 changed files with 899 additions and 4 deletions

View File

@@ -550,6 +550,87 @@ class RealSkyvernPageAi(SkyvernPageAi):
print(f"{'-' * 50}\n")
return result
async def ai_locate_element(
    self,
    prompt: str,
) -> str | None:
    """Ask the extraction LLM to find one element and return its XPath.

    The scraped page is refreshed, the prompt is rendered with the current
    label, and the "single-locate-element" template is sent to the LLM
    together with the refreshed page's screenshots. The XPath is taken from
    the scraped element whose id the LLM returned; if that lookup yields
    nothing, the "xpath" key of the LLM result itself is used instead.

    Args:
        prompt: Natural language description of the element to locate
            (e.g., 'find "download invoices" button').

    Returns:
        The XPath selector string for the located element, or None when the
        LLM result is empty / not a dict or when no XPath could be resolved.
    """
    page_snapshot = await self.scraped_page.refresh()
    ctx = skyvern_context.ensure_context()
    rendered_prompt = _render_template_with_label(prompt, label=self.current_label)

    llm_prompt = load_prompt_with_elements(
        element_tree_builder=page_snapshot,
        prompt_engine=prompt_engine,
        template_name="single-locate-element",
        html_need_skyvern_attrs=True,
        data_extraction_goal=rendered_prompt,
        current_url=page_snapshot.url,
        # Use the context's timezone when set, otherwise the local one.
        local_datetime=datetime.now(ctx.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
    )

    # A step is only looked up when the context identifies org, task and step.
    step = (
        await app.DATABASE.get_step(
            step_id=ctx.step_id,
            organization_id=ctx.organization_id,
        )
        if ctx.organization_id and ctx.task_id and ctx.step_id
        else None
    )

    llm_result = await app.EXTRACTION_LLM_API_HANDLER(
        prompt=llm_prompt,
        step=step,
        screenshots=page_snapshot.screenshots,
        prompt_name="single-locate-element",
    )

    if not (llm_result and isinstance(llm_result, dict)):
        LOG.error(
            "AI locate element failed - invalid result",
            result=llm_result,
            result_type=type(llm_result).__name__,
            prompt=rendered_prompt,
        )
        return None

    element_id = llm_result.get("element_id", None)
    confidence = llm_result.get("confidence_float", 0.0)

    # Prefer the XPath recorded for the scraped element with that id ...
    element_data = page_snapshot.id_to_element_dict.get(element_id) if element_id else None
    xpath: str | None = element_data.get("xpath") if element_data and "xpath" in element_data else None
    # ... and fall back to an XPath supplied directly in the LLM result.
    xpath = xpath or llm_result.get("xpath", None)

    if not xpath:
        LOG.error(
            "AI locate element failed - no xpath in element data",
            element_id=element_id,
            result=llm_result,
            prompt=rendered_prompt,
        )
        return None

    LOG.info(
        "AI locate element result",
        element_id=element_id,
        xpath=xpath,
        confidence=confidence,
        prompt=rendered_prompt,
    )
    return xpath
async def ai_act(
self,
prompt: str,

View File

@@ -6,12 +6,13 @@ from dataclasses import dataclass
from typing import Any, Callable, Literal, overload
import structlog
from playwright.async_api import Page
from playwright.async_api import Locator, Page
from skyvern.config import settings
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
from skyvern.forge.sdk.api.files import download_file
from skyvern.forge.sdk.core import skyvern_context
from skyvern.library.ai_locator import AILocator
from skyvern.webeye.actions import handler_utils
from skyvern.webeye.actions.action_types import ActionType
@@ -683,6 +684,120 @@ class SkyvernPage(Page):
data = kwargs.pop("data", None)
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
@overload
def locator(
    self,
    selector: str,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator: ...


@overload
def locator(
    self,
    *,
    prompt: str,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator: ...


def locator(
    self,
    selector: str | None = None,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    **kwargs: Any,
) -> Locator:
    """Get a Playwright locator using a selector, an AI-powered prompt, or both.

    This method extends Playwright's locator() with AI capabilities:

    - **Selector only**: returns the plain Playwright locator for the selector.
    - **Prompt only**: returns a lazy AILocator driven by the natural language prompt.
    - **Selector and prompt**: returns an AILocator that combines both; the
      ``ai`` mode decides which one is tried first.

    The AI-powered locator is lazy - it only calls ai_locate_element when you actually
    use the locator (e.g., when you call .click(), .fill(), etc.). Note that using this
    AI locator lookup with prompt only works for elements you can interact with on the page.

    Args:
        selector: Selector for the target element.
        prompt: Natural language description of which element to locate.
        ai: AI behavior mode. "fallback" (default) tries the selector first, then AI.
            "proactive" tries AI first, then the selector. With any other value
            (including None) the prompt is not used and a selector is required.
            If the current Skyvern context sets ``ai_mode_override``, that value
            replaces this argument.
        **kwargs: All Playwright locator parameters (has_text, has, etc.)

    Returns:
        A Playwright Locator object (or AILocator proxy that acts like one).

    Raises:
        ValueError: If neither ``selector`` nor ``prompt`` is given, or if the
            effective ``ai`` mode cannot use the prompt and no selector was given.

    Examples:
        ```python
        # Standard Playwright usage - selector only
        download_button = page.locator("#download-btn")
        await download_button.click()

        # AI-powered - prompt only (returns lazy AILocator)
        download_button = page.locator(prompt='find "download invoices" button')
        await download_button.click()  # AI resolves XPath here

        # Fallback mode - try selector first, use AI if it fails
        download_button = page.locator("#download-btn", prompt='find "download invoices" button')
        await download_button.click()

        # With Playwright parameters
        submit_button = page.locator(prompt="find submit button", has_text="Submit")
        await submit_button.click()
        ```
    """
    if not selector and not prompt:
        raise ValueError("Missing input: pass a selector and/or a prompt.")

    # A context-level override takes precedence over the caller's ai mode.
    context = skyvern_context.current()
    if context and context.ai_mode_override:
        ai = context.ai_mode_override

    if ai == "fallback":
        if selector and prompt:
            # Try selector first, then AI
            return AILocator(
                self.page,
                self._ai,
                prompt,
                selector=selector,
                selector_kwargs=kwargs,
                try_selector_first=True,
            )
        if prompt:
            return AILocator(
                self.page,
                self._ai,
                prompt,
                selector=None,
                selector_kwargs=kwargs,
            )
    elif ai == "proactive" and prompt:
        # Try AI first, then selector
        return AILocator(
            self.page,
            self._ai,
            prompt,
            selector=selector,
            selector_kwargs=kwargs,
            try_selector_first=False,
        )

    # Reached in every mode (including ai=None or an unrecognised value) when
    # the prompt was not used: a selector alone behaves like plain Playwright.
    if selector:
        return self.page.locator(selector, **kwargs)

    raise ValueError("Selector is required but was not provided")
@action_wrap(ActionType.VERIFICATION_CODE)
async def verification_code(self, prompt: str | None = None) -> None:
    """Verification-code action whose own body does nothing.

    The body returns immediately; any effect comes from the ``action_wrap``
    decorator applied with ``ActionType.VERIFICATION_CODE``.
    NOTE(review): what the decorator does with ``prompt`` is not visible
    here - confirm against ``action_wrap``.

    Args:
        prompt: Optional natural language prompt; unused by the body itself.
    """
    return None

View File

@@ -71,3 +71,10 @@ class SkyvernPageAi(Protocol):
) -> None:
"""Perform an action on the page using AI based on a natural language prompt."""
...
async def ai_locate_element(
self,
prompt: str,
) -> str | None:
"""Locate an element on the page using AI and return its XPath selector."""
...