SDK: support select_option and extract (#3850)

This commit is contained in:
Stanislav Novosad
2025-10-30 09:05:20 -06:00
committed by GitHub
parent ac069838c7
commit af9a5f31e4
21 changed files with 774 additions and 124 deletions

View File

@@ -2,14 +2,9 @@ from typing import TYPE_CHECKING, Any
from playwright.async_api import Page
from skyvern.client import SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract
from skyvern.config import settings
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
from skyvern.forge.sdk.schemas.sdk_actions import (
ClickAction,
ExtractAction,
InputTextAction,
SelectOptionAction,
)
if TYPE_CHECKING:
from skyvern.library.skyvern_browser import SkyvernBrowser
@@ -35,18 +30,17 @@ class SdkSkyvernPageAi(SkyvernPageAi):
) -> str:
"""Click an element using AI via API call."""
action = ClickAction(
selector=selector,
intention=intention,
data=data,
timeout=timeout,
)
response = await self._browser.client.run_sdk_action(
url=self._page.url,
browser_session_id=self._browser.browser_session_id,
browser_address=self._browser.browser_address,
workflow_run_id=self._browser.workflow_run_id,
action=action,
action=SdkAction_AiClick(
selector=selector,
intention=intention,
data=data,
timeout=timeout,
),
)
self._browser.workflow_run_id = response.workflow_run_id
return response.result if response.result else selector
@@ -65,7 +59,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):
response = await self._browser.client.run_sdk_action(
url=self._page.url,
action=InputTextAction(
action=SdkAction_AiInputText(
selector=selector,
value=value,
intention=intention,
@@ -93,7 +87,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):
response = await self._browser.client.run_sdk_action(
url=self._page.url,
action=SelectOptionAction(
action=SdkAction_AiSelectOption(
selector=selector,
value=value,
intention=intention,
@@ -129,7 +123,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):
response = await self._browser.client.run_sdk_action(
url=self._page.url,
action=ExtractAction(
action=SdkAction_Extract(
prompt=prompt,
extract_schema=schema,
error_code_mapping=error_code_mapping,

View File

@@ -1,5 +1,5 @@
import asyncio
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, overload
from playwright.async_api import Page
@@ -230,24 +230,64 @@ class SkyvernBrowserPage:
self._ai = SdkSkyvernPageAi(browser, page)
self.run = SkyvernPageRun(browser, page)
@overload
async def click(
self,
selector: str,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str | None: ...
@overload
async def click(
self,
*,
prompt: str,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str | None: ...
async def click(
self,
selector: str | None = None,
intention: str | None = None,
*,
prompt: str | None = None,
ai: str | None = "fallback",
data: str | dict[str, Any] | None = None,
timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str | None:
"""Click an element identified by ``selector``.
"""Click an element using a CSS selector, AI-powered prompt matching, or both.
When ``intention`` and ``data`` are provided a new click action is
generated via the ``single-click-action`` prompt. The model returns a
fresh "xpath=..." selector based on the current DOM and the updated data for this run.
The browser then clicks the element using this newly generated xpath selector.
This method supports three modes:
- **Selector-based**: Click the element matching the CSS selector
- **AI-powered**: Use natural language to describe which element to click
- **Fallback mode** (default): Try the selector first, fall back to AI if it fails
If the prompt generation or parsing fails for any reason we fall back to
clicking the originally supplied ``selector``.
Args:
selector: CSS selector for the target element.
prompt: Natural language description of which element to click.
ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
data: Additional context data for AI processing.
timeout: Maximum time to wait for the click action in milliseconds.
Returns:
The selector string that was successfully used to click the element, or None.
Examples:
```python
# Click using a CSS selector
await page.click("#open-invoice-button")
# Click using AI with natural language
await page.click(prompt="Click on the 'Open Invoice' button")
# Try selector first, fall back to AI if selector fails
await page.click("#open-invoice-button", prompt="Click on the 'Open Invoice' button")
```
"""
if ai == "fallback":
@@ -262,10 +302,10 @@ class SkyvernBrowserPage:
error_to_raise = e
# if the original selector doesn't work, try to click the element with the ai generated selector
if intention:
if prompt:
return await self._ai.ai_click(
selector=selector or "",
intention=intention,
intention=prompt,
data=data,
timeout=timeout,
)
@@ -274,10 +314,10 @@ class SkyvernBrowserPage:
else:
return selector
elif ai == "proactive":
if intention:
if prompt:
return await self._ai.ai_click(
selector=selector or "",
intention=intention,
intention=prompt,
data=data,
timeout=timeout,
)
@@ -287,6 +327,244 @@ class SkyvernBrowserPage:
await locator.click(timeout=timeout)
return selector
@overload
async def fill(
    self,
    selector: str,
    value: str,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    totp_identifier: str | None = None,
    totp_url: str | None = None,
) -> str: ...

@overload
async def fill(
    self,
    *,
    prompt: str,
    value: str | None = None,
    selector: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    totp_identifier: str | None = None,
    totp_url: str | None = None,
) -> str: ...

async def fill(
    self,
    selector: str | None = None,
    value: str | None = None,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    totp_identifier: str | None = None,
    totp_url: str | None = None,
) -> str:
    """Fill an input field located by CSS selector, by an AI prompt, or by both.

    Two call shapes are typed via the overloads above:

    - ``fill(selector, value, ...)`` -- selector and value given positionally,
      optionally with a ``prompt`` to fall back on.
    - ``fill(prompt=..., ...)`` -- prompt-driven; ``selector`` and ``value``
      are optional keywords.

    All of the work is delegated to ``_input_text``; this method only
    normalises the optional arguments and renames ``prompt`` to the
    ``intention`` keyword that helper expects.

    Args:
        selector: CSS selector for the target input element.
        value: Text to put into the field.
        prompt: Natural language description of which field to fill and with what.
        ai: AI behavior mode. The default ``"fallback"`` is documented as
            "try the selector first, then AI"; the actual handling lives in
            ``_input_text``.
        data: Additional context data for AI processing.
        timeout: Maximum time to wait for the fill action, in milliseconds.
        totp_identifier: TOTP identifier for one-time-password fields.
        totp_url: URL to fetch TOTP codes from.

    Returns:
        Whatever ``_input_text`` returns (annotated as the filled value).

    Examples:
        ```python
        # Selector and value, both positional
        await page.fill("#email-input", "user@example.com")

        # Prompt only
        await page.fill(prompt="Fill 'user@example.com' in the email address field")

        # Selector first, AI prompt as the fallback
        await page.fill(
            "#email-input",
            "user@example.com",
            prompt="Fill the email address with user@example.com"
        )
        ```
    """
    # ``_input_text`` is called with plain strings, so an omitted (or empty)
    # selector/value is forwarded as "".
    target_selector = selector or ""
    text = value or ""
    return await self._input_text(
        selector=target_selector,
        value=text,
        ai=ai,
        intention=prompt,
        data=data,
        timeout=timeout,
        totp_identifier=totp_identifier,
        totp_url=totp_url,
    )
async def goto(self, url: str, **kwargs: Any) -> None:
    """Navigate the wrapped page to ``url``.

    Args:
        url: Address to load.
        **kwargs: Passed through unchanged to the underlying page's ``goto``
            (e.g. timeout, wait_until, referer).
    """
    page = self._page
    await page.goto(url, **kwargs)
async def type(self, selector: str, text: str, **kwargs: Any) -> None:
    """Type ``text`` into the element matched by ``selector``, character by character.

    Args:
        selector: Selector used to find the element to type into.
        text: Text to type.
        **kwargs: Passed through unchanged to the underlying page's ``type``
            (e.g. delay, timeout, no_wait_after).
    """
    page = self._page
    await page.type(selector, text, **kwargs)
@overload
async def select_option(
    self,
    selector: str,
    value: str,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str: ...

@overload
async def select_option(
    self,
    *,
    prompt: str,
    value: str | None = None,
    selector: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str: ...

async def select_option(
    self,
    selector: str | None = None,
    value: str | None = None,
    *,
    prompt: str | None = None,
    ai: str | None = "fallback",
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> str:
    """Choose a dropdown option by CSS selector, by an AI prompt, or by both.

    Behaviour per ``ai`` mode:

    - ``"fallback"`` (default): when ``selector`` is given, select through the
      page locator first. If that raises and a ``prompt`` is available, hand
      over to the AI helper; without a prompt the locator error is re-raised.
      With neither selector nor prompt, nothing is selected and the
      (normalised) value is returned as-is.
    - ``"proactive"`` with a ``prompt``: go straight to the AI helper.
    - anything else (including ``"proactive"`` without a prompt): select
      through the locator only, and only if ``selector`` is given.

    Args:
        selector: CSS selector for the target select/dropdown element.
        value: Option value to select; ``None`` is treated as ``""``.
        prompt: Natural language description of which option to select.
        ai: AI behavior mode, see above.
        data: Additional context data for AI processing.
        timeout: Maximum time to wait for the select action, in milliseconds.

    Returns:
        The (normalised) ``value`` on the locator path, or whatever
        ``ai_select_option`` returns on the AI path.

    Raises:
        Exception: the locator error, in fallback mode, when the selector
            fails and no prompt was supplied; any error from the locator in
            the non-AI modes.

    Examples:
        ```python
        # Selector and value, both positional
        await page.select_option("#country", "us")

        # Prompt only
        await page.select_option(prompt="Select 'United States' from the country dropdown")

        # Selector first, AI prompt as the fallback
        await page.select_option(
            "#country",
            "us",
            prompt="Select United States from country"
        )
        ```
    """
    chosen = value or ""

    async def _via_locator() -> None:
        # Plain Playwright path: resolve the locator and pick the option.
        dropdown = self._page.locator(selector)
        await dropdown.select_option(chosen, timeout=timeout)

    async def _via_ai() -> str:
        # AI path: the helper always receives a string selector, so None becomes "".
        return await self._ai.ai_select_option(
            selector=selector or "",
            value=chosen,
            intention=prompt,
            data=data,
            timeout=timeout,
        )

    if ai == "proactive" and prompt:
        return await _via_ai()

    if ai != "fallback":
        # No AI involvement: use the selector if there is one.
        if selector:
            await _via_locator()
        return chosen

    # Fallback mode: selector first, AI second, original error last.
    selector_error = None
    if selector:
        try:
            await _via_locator()
        except Exception as exc:
            selector_error = exc
        else:
            return chosen

    if prompt:
        return await _via_ai()
    if selector_error:
        raise selector_error
    return chosen
async def extract(
self,
prompt: str,
schema: dict[str, Any] | list | str | None = None,
error_code_mapping: dict[str, str] | None = None,
intention: str | None = None,
data: str | dict[str, Any] | None = None,
) -> dict[str, Any] | list | str | None:
return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
async def reload(self, **kwargs: Any) -> None:
    """Reload the wrapped page.

    Args:
        **kwargs: Passed through unchanged to the underlying page's ``reload``
            (e.g. timeout, wait_until).
    """
    page = self._page
    await page.reload(**kwargs)
async def screenshot(self, **kwargs: Any) -> bytes:
    """Capture a screenshot of the wrapped page.

    Args:
        **kwargs: Passed through unchanged to the underlying page's
            ``screenshot`` (e.g. path, full_page, clip, type, quality).

    Returns:
        The value returned by the underlying page's ``screenshot`` call
        (the image bytes).
    """
    image = await self._page.screenshot(**kwargs)
    return image
async def _input_text(
self,
selector: str,
@@ -346,76 +624,3 @@ class SkyvernBrowserPage:
locator = self._page.locator(selector)
await handler_utils.input_sequentially(locator, value, timeout=timeout)
return value
async def fill(
    self,
    selector: str,
    value: str,
    ai: str | None = "fallback",
    intention: str | None = None,
    data: str | dict[str, Any] | None = None,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    totp_identifier: str | None = None,
    totp_url: str | None = None,
) -> str:
    """Fill an input field by forwarding every argument to ``_input_text``.

    Args:
        selector: Selector for the target input element.
        value: Text to put into the field.
        ai: AI behavior mode, interpreted by ``_input_text``.
        intention: Description of the intended action, for the AI path.
        data: Additional context data for AI processing.
        timeout: Maximum time to wait for the action, in milliseconds.
        totp_identifier: TOTP identifier for one-time-password fields.
        totp_url: URL to fetch TOTP codes from.

    Returns:
        Whatever ``_input_text`` returns.
    """
    # Collect the keyword arguments once, then forward them unchanged.
    forwarded = {
        "selector": selector,
        "value": value,
        "ai": ai,
        "intention": intention,
        "data": data,
        "timeout": timeout,
        "totp_identifier": totp_identifier,
        "totp_url": totp_url,
    }
    return await self._input_text(**forwarded)
async def goto(self, url: str, **kwargs: Any) -> None:
    """Load ``url`` in the wrapped page.

    Args:
        url: Address to navigate to.
        **kwargs: Forwarded as-is to the underlying page's ``goto``
            (e.g. timeout, wait_until, referer).
    """
    navigate = self._page.goto
    await navigate(url, **kwargs)
async def type(self, selector: str, text: str, **kwargs: Any) -> None:
    """Send ``text`` to the element matched by ``selector`` one character at a time.

    Args:
        selector: Selector used to find the element to type into.
        text: Text to type.
        **kwargs: Forwarded as-is to the underlying page's ``type``
            (e.g. delay, timeout, no_wait_after).
    """
    type_into = self._page.type
    await type_into(selector, text, **kwargs)
async def select_option(self, selector: str, value: Any = None, **kwargs: Any) -> list[str]:
    """Select one or more options in a ``<select>`` element.

    Args:
        selector: Selector used to find the select element.
        value: Option value(s) to select: a string, a list of strings, or a
            dict with value/label/index.
        **kwargs: Forwarded as-is to the underlying page's ``select_option``
            (e.g. timeout, force, no_wait_after).

    Returns:
        The list returned by the underlying page: the option values that
        were selected.
    """
    selected = await self._page.select_option(selector, value, **kwargs)
    return selected
async def reload(self, **kwargs: Any) -> None:
    """Refresh the wrapped page.

    Args:
        **kwargs: Forwarded as-is to the underlying page's ``reload``
            (e.g. timeout, wait_until).
    """
    refresh = self._page.reload
    await refresh(**kwargs)
async def screenshot(self, **kwargs: Any) -> bytes:
    """Take a screenshot of the wrapped page.

    Args:
        **kwargs: Forwarded as-is to the underlying page's ``screenshot``
            (e.g. path, full_page, clip, type, quality).

    Returns:
        The value returned by the underlying page's ``screenshot`` call
        (the image bytes).
    """
    captured = await self._page.screenshot(**kwargs)
    return captured