SDK: support select_option and extract (#3850)

2025-10-30 09:05:20 -06:00
parent ac069838c7
commit af9a5f31e4
21 changed files with 774 additions and 124 deletions
--- a/skyvern/library/SdkSkyvernPageAi.py
+++ b/skyvern/library/SdkSkyvernPageAi.py
@@ -2,14 +2,9 @@ from typing import TYPE_CHECKING, Any

 from playwright.async_api import Page

+from skyvern.client import SdkAction_AiClick, SdkAction_AiInputText, SdkAction_AiSelectOption, SdkAction_Extract
 from skyvern.config import settings
 from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
-from skyvern.forge.sdk.schemas.sdk_actions import (
-    ClickAction,
-    ExtractAction,
-    InputTextAction,
-    SelectOptionAction,
-)

 if TYPE_CHECKING:
    from skyvern.library.skyvern_browser import SkyvernBrowser
@@ -35,18 +30,17 @@ class SdkSkyvernPageAi(SkyvernPageAi):
    ) -> str:
        """Click an element using AI via API call."""

-        action = ClickAction(
-            selector=selector,
-            intention=intention,
-            data=data,
-            timeout=timeout,
-        )
        response = await self._browser.client.run_sdk_action(
            url=self._page.url,
            browser_session_id=self._browser.browser_session_id,
            browser_address=self._browser.browser_address,
            workflow_run_id=self._browser.workflow_run_id,
-            action=action,
+            action=SdkAction_AiClick(
+                selector=selector,
+                intention=intention,
+                data=data,
+                timeout=timeout,
+            ),
        )
        self._browser.workflow_run_id = response.workflow_run_id
        return response.result if response.result else selector
@@ -65,7 +59,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):

        response = await self._browser.client.run_sdk_action(
            url=self._page.url,
-            action=InputTextAction(
+            action=SdkAction_AiInputText(
                selector=selector,
                value=value,
                intention=intention,
@@ -93,7 +87,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):

        response = await self._browser.client.run_sdk_action(
            url=self._page.url,
-            action=SelectOptionAction(
+            action=SdkAction_AiSelectOption(
                selector=selector,
                value=value,
                intention=intention,
@@ -129,7 +123,7 @@ class SdkSkyvernPageAi(SkyvernPageAi):

        response = await self._browser.client.run_sdk_action(
            url=self._page.url,
-            action=ExtractAction(
+            action=SdkAction_Extract(
                prompt=prompt,
                extract_schema=schema,
                error_code_mapping=error_code_mapping,
--- a/skyvern/library/skyvern_browser_page.py
+++ b/skyvern/library/skyvern_browser_page.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, overload

 from playwright.async_api import Page

@@ -230,24 +230,64 @@ class SkyvernBrowserPage:
        self._ai = SdkSkyvernPageAi(browser, page)
        self.run = SkyvernPageRun(browser, page)

+    @overload
+    async def click(
+        self,
+        selector: str,
+        *,
+        prompt: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+    ) -> str | None: ...
+
+    @overload
    async def click(
        self,
        *,
+        prompt: str,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+    ) -> str | None: ...
+
+    async def click(
+        self,
        selector: str | None = None,
-        intention: str | None = None,
+        *,
+        prompt: str | None = None,
        ai: str | None = "fallback",
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str | None:
-        """Click an element identified by ``selector``.
+        """Click an element using a CSS selector, AI-powered prompt matching, or both.

-        When ``intention`` and ``data`` are provided a new click action is
-        generated via the ``single-click-action`` prompt.  The model returns a
-        fresh "xpath=..." selector based on the current DOM and the updated data for this run.
-        The browser then clicks the element using this newly generated xpath selector.
+        This method supports three modes:
+        - **Selector-based**: Click the element matching the CSS selector
+        - **AI-powered**: Use natural language to describe which element to click
+        - **Fallback mode** (default): Try the selector first, fall back to AI if it fails

-        If the prompt generation or parsing fails for any reason we fall back to
-        clicking the originally supplied ``selector``.
+        Args:
+            selector: CSS selector for the target element.
+            prompt: Natural language description of which element to click.
+            ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
+            data: Additional context data for AI processing.
+            timeout: Maximum time to wait for the click action in milliseconds.
+
+        Returns:
+            The selector string that was successfully used to click the element, or None.
+
+        Examples:
+            ```python
+            # Click using a CSS selector
+            await page.click("#open-invoice-button")
+
+            # Click using AI with natural language
+            await page.click(prompt="Click on the 'Open Invoice' button")
+
+            # Try selector first, fall back to AI if selector fails
+            await page.click("#open-invoice-button", prompt="Click on the 'Open Invoice' button")
+            ```
        """

        if ai == "fallback":
@@ -262,10 +302,10 @@ class SkyvernBrowserPage:
                    error_to_raise = e

            # if the original selector doesn't work, try to click the element with the ai generated selector
-            if intention:
+            if prompt:
                return await self._ai.ai_click(
                    selector=selector or "",
-                    intention=intention,
+                    intention=prompt,
                    data=data,
                    timeout=timeout,
                )
@@ -274,10 +314,10 @@ class SkyvernBrowserPage:
            else:
                return selector
        elif ai == "proactive":
-            if intention:
+            if prompt:
                return await self._ai.ai_click(
                    selector=selector or "",
-                    intention=intention,
+                    intention=prompt,
                    data=data,
                    timeout=timeout,
                )
@@ -287,6 +327,244 @@ class SkyvernBrowserPage:
            await locator.click(timeout=timeout)
        return selector

+    @overload
+    async def fill(
+        self,
+        selector: str,
+        value: str,
+        *,
+        prompt: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+        totp_identifier: str | None = None,
+        totp_url: str | None = None,
+    ) -> str: ...
+
+    @overload
+    async def fill(
+        self,
+        *,
+        prompt: str,
+        value: str | None = None,
+        selector: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+        totp_identifier: str | None = None,
+        totp_url: str | None = None,
+    ) -> str: ...
+
+    async def fill(
+        self,
+        selector: str | None = None,
+        value: str | None = None,
+        *,
+        prompt: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+        totp_identifier: str | None = None,
+        totp_url: str | None = None,
+    ) -> str:
+        """Fill an input field using a CSS selector, AI-powered prompt matching, or both.
+
+        This method supports three modes:
+        - **Selector-based**: Fill the input field with a value using CSS selector
+        - **AI-powered**: Use natural language prompt (AI extracts value from prompt)
+        - **Fallback mode** (default): Try the selector first, fall back to AI if it fails
+
+        Args:
+            selector: CSS selector for the target input element.
+            value: The text value to input into the field.
+            prompt: Natural language description of which field to fill and what value.
+            ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
+            data: Additional context data for AI processing.
+            timeout: Maximum time to wait for the fill action in milliseconds.
+            totp_identifier: TOTP identifier for time-based one-time password fields.
+            totp_url: URL to fetch TOTP codes from for authentication.
+
+        Returns:
+            The value that was successfully filled into the field.
+
+        Examples:
+            ```python
+            # Fill using selector and value (both positional)
+            await page.fill("#email-input", "user@example.com")
+
+            # Fill using AI with natural language (prompt only)
+            await page.fill(prompt="Fill 'user@example.com' in the email address field")
+
+            # Try selector first, fall back to AI if selector fails
+            await page.fill(
+                "#email-input",
+                "user@example.com",
+                prompt="Fill the email address with user@example.com"
+            )
+            ```
+        """
+        return await self._input_text(
+            selector=selector or "",
+            value=value or "",
+            ai=ai,
+            intention=prompt,
+            data=data,
+            timeout=timeout,
+            totp_identifier=totp_identifier,
+            totp_url=totp_url,
+        )
+
+    async def goto(self, url: str, **kwargs: Any) -> None:
+        """Navigate to the given URL.
+
+        Args:
+            url: URL to navigate page to.
+            **kwargs: Additional options like timeout, wait_until, referer, etc.
+        """
+        await self._page.goto(url, **kwargs)
+
+    async def type(self, selector: str, text: str, **kwargs: Any) -> None:
+        """Type text into an element character by character.
+
+        Args:
+            selector: A selector to search for an element to type into.
+            text: Text to type into the element.
+            **kwargs: Additional options like delay, timeout, no_wait_after, etc.
+        """
+        await self._page.type(selector, text, **kwargs)
+
+    @overload
+    async def select_option(
+        self,
+        selector: str,
+        value: str,
+        *,
+        prompt: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+    ) -> str: ...
+
+    @overload
+    async def select_option(
+        self,
+        *,
+        prompt: str,
+        value: str | None = None,
+        selector: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+    ) -> str: ...
+
+    async def select_option(
+        self,
+        selector: str | None = None,
+        value: str | None = None,
+        *,
+        prompt: str | None = None,
+        ai: str | None = "fallback",
+        data: str | dict[str, Any] | None = None,
+        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
+    ) -> str:
+        """Select an option from a dropdown using a CSS selector, AI-powered prompt matching, or both.
+
+        This method supports three modes:
+        - **Selector-based**: Select the option with a value using CSS selector
+        - **AI-powered**: Use natural language prompt (AI extracts value from prompt)
+        - **Fallback mode** (default): Try the selector first, fall back to AI if it fails
+
+        Args:
+            selector: CSS selector for the target select/dropdown element.
+            value: The option value to select.
+            prompt: Natural language description of which option to select.
+            ai: AI behavior mode. Defaults to "fallback" which tries selector first, then AI.
+            data: Additional context data for AI processing.
+            timeout: Maximum time to wait for the select action in milliseconds.
+
+        Returns:
+            The value that was successfully selected.
+
+        Examples:
+            ```python
+            # Select using selector and value (both positional)
+            await page.select_option("#country", "us")
+
+            # Select using AI with natural language (prompt only)
+            await page.select_option(prompt="Select 'United States' from the country dropdown")
+
+            # Try selector first, fall back to AI if selector fails
+            await page.select_option(
+                "#country",
+                "us",
+                prompt="Select United States from country"
+            )
+            ```
+        """
+        value = value or ""
+        if ai == "fallback":
+            error_to_raise = None
+            if selector:
+                try:
+                    locator = self._page.locator(selector)
+                    await locator.select_option(value, timeout=timeout)
+                    return value
+                except Exception as e:
+                    error_to_raise = e
+            if prompt:
+                return await self._ai.ai_select_option(
+                    selector=selector or "",
+                    value=value,
+                    intention=prompt,
+                    data=data,
+                    timeout=timeout,
+                )
+            if error_to_raise:
+                raise error_to_raise
+            else:
+                return value
+        elif ai == "proactive" and prompt:
+            return await self._ai.ai_select_option(
+                selector=selector or "",
+                value=value,
+                intention=prompt,
+                data=data,
+                timeout=timeout,
+            )
+        if selector:
+            locator = self._page.locator(selector)
+            await locator.select_option(value, timeout=timeout)
+        return value
+
+    async def extract(
+        self,
+        prompt: str,
+        schema: dict[str, Any] | list | str | None = None,
+        error_code_mapping: dict[str, str] | None = None,
+        intention: str | None = None,
+        data: str | dict[str, Any] | None = None,
+    ) -> dict[str, Any] | list | str | None:
+        return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
+
+    async def reload(self, **kwargs: Any) -> None:
+        """Reload the current page.
+
+        Args:
+            **kwargs: Additional options like timeout, wait_until, etc.
+        """
+        await self._page.reload(**kwargs)
+
+    async def screenshot(self, **kwargs: Any) -> bytes:
+        """Take a screenshot of the page.
+
+        Args:
+            **kwargs: Additional options like path, full_page, clip, type, quality, etc.
+
+        Returns:
+            bytes: The screenshot as bytes (unless path is specified, then saves to file).
+        """
+        return await self._page.screenshot(**kwargs)
+
    async def _input_text(
        self,
        selector: str,
@@ -346,76 +624,3 @@ class SkyvernBrowserPage:
        locator = self._page.locator(selector)
        await handler_utils.input_sequentially(locator, value, timeout=timeout)
        return value
-
-    async def fill(
-        self,
-        selector: str,
-        value: str,
-        ai: str | None = "fallback",
-        intention: str | None = None,
-        data: str | dict[str, Any] | None = None,
-        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
-        totp_identifier: str | None = None,
-        totp_url: str | None = None,
-    ) -> str:
-        return await self._input_text(
-            selector=selector,
-            value=value,
-            ai=ai,
-            intention=intention,
-            data=data,
-            timeout=timeout,
-            totp_identifier=totp_identifier,
-            totp_url=totp_url,
-        )
-
-    async def goto(self, url: str, **kwargs: Any) -> None:
-        """Navigate to the given URL.
-
-        Args:
-            url: URL to navigate page to.
-            **kwargs: Additional options like timeout, wait_until, referer, etc.
-        """
-        await self._page.goto(url, **kwargs)
-
-    async def type(self, selector: str, text: str, **kwargs: Any) -> None:
-        """Type text into an element character by character.
-
-        Args:
-            selector: A selector to search for an element to type into.
-            text: Text to type into the element.
-            **kwargs: Additional options like delay, timeout, no_wait_after, etc.
-        """
-        await self._page.type(selector, text, **kwargs)
-
-    async def select_option(self, selector: str, value: Any = None, **kwargs: Any) -> list[str]:
-        """Select option(s) in a <select> element.
-
-        Args:
-            selector: A selector to search for a select element.
-            value: Option value(s) to select. Can be a string, list of strings, or dict with value/label/index.
-            **kwargs: Additional options like timeout, force, no_wait_after, etc.
-
-        Returns:
-            List of option values that have been successfully selected.
-        """
-        return await self._page.select_option(selector, value, **kwargs)
-
-    async def reload(self, **kwargs: Any) -> None:
-        """Reload the current page.
-
-        Args:
-            **kwargs: Additional options like timeout, wait_until, etc.
-        """
-        await self._page.reload(**kwargs)
-
-    async def screenshot(self, **kwargs: Any) -> bytes:
-        """Take a screenshot of the page.
-
-        Args:
-            **kwargs: Additional options like path, full_page, clip, type, quality, etc.
-
-        Returns:
-            bytes: The screenshot as bytes (unless path is specified, then saves to file).
-        """
-        return await self._page.screenshot(**kwargs)