SkyvernBrowserPage extends SkyvernPage (#3921)

This commit is contained in:
Stanislav Novosad
2025-11-07 10:58:22 -07:00
committed by GitHub
parent 926a5da13e
commit a9c3d692ff
6 changed files with 304 additions and 492 deletions

View File

@@ -134,11 +134,11 @@ class RealSkyvernPageAi(SkyvernPageAi):
async def ai_click(
self,
selector: str,
selector: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str:
) -> str | None:
"""Click an element using AI to locate it based on intention."""
try:
# Build the element tree of the current page for the prompt
@@ -193,7 +193,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
async def ai_input_text(
self,
selector: str | None,
value: str,
value: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
totp_identifier: str | None = None,
@@ -419,8 +419,8 @@ class RealSkyvernPageAi(SkyvernPageAi):
async def ai_select_option(
self,
selector: str,
value: str,
selector: str | None,
value: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,

View File

@@ -4,7 +4,7 @@ import asyncio
import copy
from dataclasses import dataclass
from enum import StrEnum
from typing import Any, Callable, Literal
from typing import Any, Callable, Literal, overload
import structlog
from playwright.async_api import Page
@@ -95,42 +95,94 @@ class SkyvernPage:
await self.page.goto(url, timeout=timeout)
######### Public Interfaces #########
@action_wrap(ActionType.CLICK)
@overload
async def click(
self,
selector: str,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
) -> str:
"""Click an element identified by ``selector``.
**kwargs: Any,
) -> str | None: ...
When ``prompt`` and ``data`` are provided a new click action is
generated via the ``single-click-action`` prompt. The model returns a
fresh "xpath=..." selector based on the current DOM and the updated data for this run.
The browser then clicks the element using this newly generated xpath selector.
@overload
async def click(
self,
*,
prompt: str,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
**kwargs: Any,
) -> str | None: ...
If the prompt generation or parsing fails for any reason we fall back to
clicking the originally supplied ``selector``.
@action_wrap(ActionType.CLICK)
async def click(
self,
selector: str | None = None,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
**kwargs: Any,
) -> str | None:
"""Click an element using a CSS selector, AI-powered prompt matching, or both.
This method supports three modes:
- **Selector-based**: Click the element matching the CSS selector
- **AI-powered**: Use natural language to describe which element to click
- **Fallback mode** (default): Try the selector first, fall back to AI if it fails
Args:
selector: CSS selector for the target element.
prompt: Natural language description of which element to click.
ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
data: Additional context data for AI processing.
timeout: Maximum time to wait for the click action in milliseconds.
Returns:
The selector string that was successfully used to click the element, or None.
Examples:
```python
# Click using a CSS selector
await page.click("#open-invoice-button")
# Click using AI with natural language
await page.click(prompt="Click on the 'Open Invoice' button")
# Try selector first, fall back to AI if selector fails
await page.click("#open-invoice-button", prompt="Click on the 'Open Invoice' button")
```
"""
# Backward compatibility
if intention is not None and prompt is None:
prompt = intention
if not selector and not prompt:
raise ValueError("Missing input: pass a selector and/or a prompt.")
context = skyvern_context.current()
if context and context.ai_mode_override:
ai = context.ai_mode_override
if ai == "fallback":
# try to click the element with the original selector first
error_to_raise = None
try:
locator = self.page.locator(selector)
await locator.click(timeout=timeout)
return selector
except Exception as e:
error_to_raise = e
if selector:
try:
locator = self.page.locator(selector)
await locator.click(timeout=timeout, **kwargs)
return selector
except Exception as e:
error_to_raise = e
selector = None
# if the original selector doesn't work, try to click the element with the ai generated selector
if prompt:
@@ -152,30 +204,104 @@ class SkyvernPage:
data=data,
timeout=timeout,
)
locator = self.page.locator(selector)
await locator.click(timeout=timeout)
if selector:
locator = self.page.locator(selector, **kwargs)
await locator.click(timeout=timeout)
return selector
@overload
async def fill(
self,
selector: str,
value: str,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
totp_identifier: str | None = None,
totp_url: str | None = None,
intention: str | None = None, # backward compatibility
) -> str: ...
@overload
async def fill(
self,
*,
prompt: str,
value: str | None = None,
selector: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
totp_identifier: str | None = None,
totp_url: str | None = None,
intention: str | None = None, # backward compatibility
) -> str: ...
@action_wrap(ActionType.INPUT_TEXT)
async def fill(
self,
selector: str | None,
value: str,
ai: str | None = "fallback",
selector: str | None = None,
value: str | None = None,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
totp_identifier: str | None = None,
totp_url: str | None = None,
intention: str | None = None, # backward compatibility
) -> str:
"""Fill an input field using a CSS selector, AI-powered prompt matching, or both.
This method supports three modes:
- **Selector-based**: Fill the input field with a value using CSS selector
- **AI-powered**: Use natural language prompt (AI extracts value from prompt)
- **Fallback mode** (default): Try the selector first, fall back to AI if it fails
Args:
selector: CSS selector for the target input element.
value: The text value to input into the field.
prompt: Natural language description of which field to fill and what value.
ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
data: Additional context data for AI processing.
timeout: Maximum time to wait for the fill action in milliseconds.
totp_identifier: TOTP identifier for time-based one-time password fields.
totp_url: URL to fetch TOTP codes from for authentication.
Returns:
The value that was successfully filled into the field.
Examples:
```python
# Fill using selector and value (both positional)
await page.fill("#email-input", "user@example.com")
# Fill using AI with natural language (prompt only)
await page.fill(prompt="Fill 'user@example.com' in the email address field")
# Try selector first, fall back to AI if selector fails
await page.fill(
"#email-input",
"user@example.com",
prompt="Fill the email address with user@example.com"
)
```
"""
# Backward compatibility
if intention is not None and prompt is None:
prompt = intention
if not selector and not prompt:
raise ValueError("Missing input: pass a selector and/or a prompt.")
return await self._input_text(
selector=selector,
value=value,
value=value or "",
ai=ai,
intention=prompt,
data=data,
@@ -201,6 +327,9 @@ class SkyvernPage:
if intention is not None and prompt is None:
prompt = intention
if not selector and not prompt:
raise ValueError("Missing input: pass a selector and/or a prompt.")
return await self._input_text(
selector=selector,
value=value,
@@ -225,7 +354,7 @@ class SkyvernPage:
) -> str:
"""Input text into an element identified by ``selector``.
When ``prompt`` and ``data`` are provided a new input text action is
When ``intention`` and ``data`` are provided a new input text action is
generated via the `script-generation-input-text-generation` prompt. The model returns a
fresh text based on the current DOM and the updated data for this run.
The browser then inputs the text using this newly generated text.
@@ -248,6 +377,7 @@ class SkyvernPage:
return value
except Exception as e:
error_to_raise = e
selector = None
if intention:
return await self._ai.ai_input_text(
@@ -296,6 +426,9 @@ class SkyvernPage:
if intention is not None and prompt is None:
prompt = intention
if not selector and not prompt:
raise ValueError("Missing input: pass a selector and/or a prompt.")
context = skyvern_context.current()
if context and context.ai_mode_override:
ai = context.ai_mode_override
@@ -308,6 +441,7 @@ class SkyvernPage:
await locator.set_input_files(file_path)
except Exception as e:
error_to_raise = e
selector = None
if prompt:
return await self._ai.ai_upload_file(
@@ -338,34 +472,104 @@ class SkyvernPage:
await locator.set_input_files(file_path, timeout=timeout)
return files
@action_wrap(ActionType.SELECT_OPTION)
@overload
async def select_option(
self,
selector: str,
value: str | None = None,
label: str | None = None,
ai: str | None = "fallback",
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
) -> str:
**kwargs: Any,
) -> str | None: ...
@overload
async def select_option(
self,
*,
prompt: str,
value: str | None = None,
selector: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
**kwargs: Any,
) -> str | None: ...
@action_wrap(ActionType.SELECT_OPTION)
async def select_option(
self,
selector: str | None = None,
value: str | None = None,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
intention: str | None = None, # backward compatibility
**kwargs: Any,
) -> str | None:
"""Select an option from a dropdown using a CSS selector, AI-powered prompt matching, or both.
This method supports three modes:
- **Selector-based**: Select the option with a value using CSS selector
- **AI-powered**: Use natural language prompt (AI extracts value from prompt)
- **Fallback mode** (default): Try the selector first, fall back to AI if it fails
Args:
selector: CSS selector for the target select/dropdown element.
value: The option value to select.
prompt: Natural language description of which option to select.
ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
data: Additional context data for AI processing.
timeout: Maximum time to wait for the select action in milliseconds.
Returns:
The value that was successfully selected.
Examples:
```python
# Select using selector and value (both positional)
await page.select_option("#country", "us")
# Select using AI with natural language (prompt only)
await page.select_option(prompt="Select 'United States' from the country dropdown")
# Try selector first, fall back to AI if selector fails
await page.select_option(
"#country",
"us",
prompt="Select United States from country"
)
```
"""
# Backward compatibility
if intention is not None and prompt is None:
prompt = intention
if not selector and not prompt:
raise ValueError("Missing input: pass a selector and/or a prompt.")
context = skyvern_context.current()
if context and context.ai_mode_override:
ai = context.ai_mode_override
value = value or ""
if ai == "fallback":
error_to_raise = None
try:
locator = self.page.locator(selector)
await locator.select_option(value, timeout=timeout)
return value
except Exception as e:
error_to_raise = e
if selector:
try:
locator = self.page.locator(selector)
await locator.select_option(value, timeout=timeout, **kwargs)
return value
except Exception as e:
error_to_raise = e
selector = None
if prompt:
return await self._ai.ai_select_option(
selector=selector,
@@ -386,8 +590,9 @@ class SkyvernPage:
data=data,
timeout=timeout,
)
locator = self.page.locator(selector)
await locator.select_option(value, timeout=timeout)
if selector:
locator = self.page.locator(selector)
await locator.select_option(value, timeout=timeout, **kwargs)
return value
@action_wrap(ActionType.WAIT)
@@ -445,6 +650,35 @@ class SkyvernPage:
intention: str | None = None,
data: str | dict[str, Any] | None = None,
) -> dict[str, Any] | list | str | None:
"""Extract structured data from the page using AI.
Args:
prompt: Natural language description of what data to extract.
schema: JSON Schema defining the structure of data to extract.
error_code_mapping: Mapping of error codes to custom error messages.
intention: Additional context about the extraction intent.
data: Additional context data for AI processing.
Returns:
Extracted data matching the provided schema, or None if extraction fails.
Examples:
```python
# Extract structured data with JSON Schema
result = await page.extract(
prompt="Extract product information",
schema={
"type": "object",
"properties": {
"name": {"type": "string", "description": "Product name"},
"price": {"type": "number", "description": "Product price"}
},
"required": ["name", "price"]
}
)
# Returns: {"name": "...", "price": 29.99}
```
"""
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
@action_wrap(ActionType.VERIFICATION_CODE)

View File

@@ -10,18 +10,18 @@ class SkyvernPageAi(Protocol):
async def ai_click(
self,
selector: str,
selector: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str:
) -> str | None:
"""Click an element using AI to locate it based on intention."""
...
async def ai_input_text(
self,
selector: str | None,
value: str,
value: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
totp_identifier: str | None = None,
@@ -44,8 +44,8 @@ class SkyvernPageAi(Protocol):
async def ai_select_option(
self,
selector: str,
value: str,
selector: str | None,
value: str | None,
intention: str,
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,