SDK: text prompt (#4214)

2025-12-05 18:13:25 -07:00
parent 0f495f458e
commit b7d08fe906
9 changed files with 156 additions and 0 deletions
--- a/skyvern/client/init.py
+++ b/skyvern/client/init.py
@@ -289,6 +289,7 @@ if typing.TYPE_CHECKING:
        RunSdkActionRequestAction_AiUploadFile,
        RunSdkActionRequestAction_Extract,
        RunSdkActionRequestAction_LocateElement,
        RunSdkActionRequestAction_Prompt,
        RunSdkActionResponse,
        RunStatus,
        Script,
@@ -780,6 +781,7 @@ _dynamic_imports: typing.Dict[str, str] = {
    "RunSdkActionRequestAction_AiUploadFile": ".types",
    "RunSdkActionRequestAction_Extract": ".types",
    "RunSdkActionRequestAction_LocateElement": ".types",
    "RunSdkActionRequestAction_Prompt": ".types",
    "RunSdkActionResponse": ".types",
    "RunStatus": ".types",
    "Script": ".types",
@@ -1295,6 +1297,7 @@ __all__ = [
    "RunSdkActionRequestAction_AiUploadFile",
    "RunSdkActionRequestAction_Extract",
    "RunSdkActionRequestAction_LocateElement",
    "RunSdkActionRequestAction_Prompt",
    "RunSdkActionResponse",
    "RunStatus",
    "Script",
--- a/skyvern/client/types/init.py
+++ b/skyvern/client/types/init.py
@@ -314,6 +314,7 @@ if typing.TYPE_CHECKING:
        RunSdkActionRequestAction_AiUploadFile,
        RunSdkActionRequestAction_Extract,
        RunSdkActionRequestAction_LocateElement,
        RunSdkActionRequestAction_Prompt,
    )
    from .run_sdk_action_response import RunSdkActionResponse
    from .run_status import RunStatus
@@ -813,6 +814,7 @@ _dynamic_imports: typing.Dict[str, str] = {
    "RunSdkActionRequestAction_AiUploadFile": ".run_sdk_action_request_action",
    "RunSdkActionRequestAction_Extract": ".run_sdk_action_request_action",
    "RunSdkActionRequestAction_LocateElement": ".run_sdk_action_request_action",
    "RunSdkActionRequestAction_Prompt": ".run_sdk_action_request_action",
    "RunSdkActionResponse": ".run_sdk_action_response",
    "RunStatus": ".run_status",
    "Script": ".script",
@@ -1317,6 +1319,7 @@ __all__ = [
    "RunSdkActionRequestAction_AiUploadFile",
    "RunSdkActionRequestAction_Extract",
    "RunSdkActionRequestAction_LocateElement",
    "RunSdkActionRequestAction_Prompt",
    "RunSdkActionResponse",
    "RunStatus",
    "Script",
--- a/skyvern/client/types/run_sdk_action_request_action.py
+++ b/skyvern/client/types/run_sdk_action_request_action.py
@@ -163,6 +163,26 @@ class RunSdkActionRequestAction_LocateElement(UniversalBaseModel):
            extra = pydantic.Extra.allow
 class RunSdkActionRequestAction_Prompt(UniversalBaseModel):
    """
    The action to execute with its specific parameters
    """
    type: typing.Literal["prompt"] = "prompt"
    prompt: str
    schema: typing.Optional[typing.Dict[str, typing.Any]] = None
    model: typing.Optional[typing.Dict[str, typing.Any]] = None
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:
        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
 RunSdkActionRequestAction = typing.Union[
    RunSdkActionRequestAction_AiAct,
    RunSdkActionRequestAction_AiClick,
@@ -171,4 +191,5 @@ RunSdkActionRequestAction = typing.Union[
    RunSdkActionRequestAction_AiUploadFile,
    RunSdkActionRequestAction_Extract,
    RunSdkActionRequestAction_LocateElement,
    RunSdkActionRequestAction_Prompt,
 ]
--- a/skyvern/core/script_generations/real_skyvern_page_ai.py
+++ b/skyvern/core/script_generations/real_skyvern_page_ai.py
@@ -17,6 +17,7 @@ from skyvern.forge.sdk.api.files import validate_download_url
 from skyvern.forge.sdk.api.llm.schema_validator import validate_and_fill_extraction_result
 from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.schemas.totp_codes import OTPType
 from skyvern.services import script_service
 from skyvern.services.otp_service import poll_otp_value
 from skyvern.utils.prompt_engine import load_prompt_with_elements
 from skyvern.webeye.actions import handler_utils
@@ -644,6 +645,20 @@ class RealSkyvernPageAi(SkyvernPageAi):
        return xpath
    async def ai_prompt(
        self,
        prompt: str,
        schema: dict[str, Any] | None = None,
        model: dict[str, Any] | None = None,
    ) -> dict[str, Any] | list | str | None:
        """Send a prompt to the LLM and get a response based on the provided schema."""
        result = await script_service.prompt(
            prompt=prompt,
            schema=schema,
            model=model,
        )
        return result
    async def ai_act(
        self,
        prompt: str,
--- a/skyvern/core/script_generations/skyvern_page.py
+++ b/skyvern/core/script_generations/skyvern_page.py
@@ -684,6 +684,56 @@ class SkyvernPage(Page):
        data = kwargs.pop("data", None)
        return await self._ai.ai_extract(prompt, schema, error_code_mapping, intention, data)
    async def prompt(
        self,
        prompt: str,
        schema: dict[str, Any] | None = None,
        model: dict[str, Any] | str | None = None,
    ) -> dict[str, Any] | list | str | None:
        """Send a prompt to the LLM and get a response based on the provided schema.
        This method allows you to interact with the LLM directly without requiring page context.
        It's useful for making decisions, generating text, or processing information using AI.
        Args:
            prompt: The prompt to send to the LLM
            schema: Optional JSON schema to structure the response. If provided, the LLM response
                   will be validated against this schema.
            model: Optional model configuration. Can be either:
                   - A dict with model configuration (e.g., {"model_name": "gemini-2.5-flash-lite", "max_tokens": 2048})
                   - A string with just the model name (e.g., "gemini-2.5-flash-lite")
        Returns:
            LLM response structured according to the schema if provided, or unstructured response otherwise.
        Examples:
            ```python
            # Simple unstructured prompt
            response = await page.prompt("What is 2 + 2?")
            # Returns: {'llm_response': '2 + 2 equals 4.'}
            # Structured prompt with schema
            response = await page.prompt(
                "What is 2 + 2?",
                schema={
                    "type": "object",
                    "properties": {
                        "result_number": {"type": "int"},
                        "confidence": {"type": "number", "minimum": 0, "maximum": 1}
                    }
                }
            )
            # Returns: {'result_number': 4, 'confidence': 1}
            ```
        """
        normalized_model: dict[str, Any] | None = None
        if isinstance(model, str):
            normalized_model = {"model_name": model}
        elif model is not None:
            normalized_model = model
        return await self._ai.ai_prompt(prompt=prompt, schema=schema, model=normalized_model)
    @overload
    def locator(
        self,
--- a/skyvern/core/script_generations/skyvern_page_ai.py
+++ b/skyvern/core/script_generations/skyvern_page_ai.py
@@ -78,3 +78,12 @@ class SkyvernPageAi(Protocol):
    ) -> str | None:
        """Locate an element on the page using AI and return its XPath selector."""
        ...
    async def ai_prompt(
        self,
        prompt: str,
        schema: dict[str, Any] | None = None,
        model: dict[str, Any] | None = None,
    ) -> dict[str, Any] | list | str | None:
        """Send a prompt to the LLM and get a response based on the provided schema."""
        ...
--- a/skyvern/forge/sdk/routes/sdk.py
+++ b/skyvern/forge/sdk/routes/sdk.py
@@ -205,6 +205,13 @@ async def run_sdk_action(
                prompt=action.prompt,
            )
            result = xpath_result
        elif action.type == "prompt":
            prompt_result = await page_ai.ai_prompt(
                prompt=action.prompt,
                schema=action.schema,
                model=action.model,
            )
            result = prompt_result
        await app.DATABASE.update_task(
            task_id=task.task_id,
            organization_id=organization_id,
--- a/skyvern/forge/sdk/schemas/sdk_actions.py
+++ b/skyvern/forge/sdk/schemas/sdk_actions.py
@@ -16,6 +16,7 @@ class SdkActionType(str, Enum):
    AI_ACT = "ai_act"
    EXTRACT = "extract"
    LOCATE_ELEMENT = "locate_element"
    PROMPT = "prompt"
 # Base action class
@@ -151,6 +152,21 @@ class LocateElementAction(SdkActionBase):
        return None
 class PromptAction(SdkActionBase):
    """Prompt action parameters."""
    type: Literal["prompt"] = "prompt"
    prompt: str = Field(..., description="The prompt to send to the LLM")
    schema: dict[str, Any] | None = Field(None, description="Optional JSON schema to structure the response")
    model: dict[str, Any] | None = Field(None, description="Optional model configuration")
    def get_navigation_goal(self) -> str | None:
        return self.prompt
    def get_navigation_payload(self) -> dict[str, Any] | None:
        return None
 # Discriminated union of all action types
 SdkAction = Annotated[
    Union[
@@ -161,6 +177,7 @@ SdkAction = Annotated[
        ActAction,
        ExtractAction,
        LocateElementAction,
        PromptAction,
    ],
    Field(discriminator="type"),
 ]
--- a/skyvern/library/skyvern_browser_page_ai.py
+++ b/skyvern/library/skyvern_browser_page_ai.py
@@ -11,6 +11,7 @@ from skyvern.client import (
    RunSdkActionRequestAction_AiUploadFile,
    RunSdkActionRequestAction_Extract,
    RunSdkActionRequestAction_LocateElement,
    RunSdkActionRequestAction_Prompt,
 )
 from skyvern.config import settings
 from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
@@ -225,3 +226,33 @@ class SdkSkyvernPageAi(SkyvernPageAi):
            return response.result
        return None
    async def ai_prompt(
        self,
        prompt: str,
        schema: dict[str, Any] | None = None,
        model: dict[str, Any] | None = None,
    ) -> dict[str, Any] | list | str | None:
        """Send a prompt to the LLM and get a response based on the provided schema via API call."""
        LOG.info(
            "AI prompt",
            prompt=prompt,
            model=model,
            workflow_run_id=self._browser.workflow_run_id,
        )
        response = await self._browser.skyvern.run_sdk_action(
            url=self._page.url,
            action=RunSdkActionRequestAction_Prompt(
                prompt=prompt,
                schema=schema,
                model=model,
            ),
            browser_session_id=self._browser.browser_session_id,
            browser_address=self._browser.browser_address,
            workflow_run_id=self._browser.workflow_run_id,
        )
        self._browser.workflow_run_id = response.workflow_run_id
        return response.result if response.result is not None else None