SKY-7801/7802: MCP Foundation + Hybrid Browser Tools (selector + AI intent) (#4660)

This commit is contained in:
Marc Kelechava
2026-02-07 02:33:13 -08:00
committed by GitHub
parent cb7225c6e6
commit 4f1bf25768
12 changed files with 1826 additions and 6 deletions

View File

@@ -0,0 +1,43 @@
"""Shared core layer for Skyvern CLI and MCP tools.
This package provides reusable primitives that both MCP tools and CLI commands
import from, preventing logic duplication across interfaces.
"""
from .artifacts import get_artifact_dir, save_artifact
from .client import get_skyvern
from .result import Artifact, BrowserContext, ErrorCode, Timer, make_error, make_result
from .session_manager import (
BrowserNotAvailableError,
SessionState,
browser_session,
get_current_session,
get_page,
no_browser_error,
resolve_browser,
set_current_session,
)
__all__ = [
# client.py
"get_skyvern",
# result.py
"Artifact",
"BrowserContext",
"ErrorCode",
"Timer",
"make_error",
"make_result",
# artifacts.py
"get_artifact_dir",
"save_artifact",
# session_manager.py
"BrowserNotAvailableError",
"SessionState",
"browser_session",
"get_current_session",
"get_page",
"no_browser_error",
"resolve_browser",
"set_current_session",
]

View File

@@ -0,0 +1,29 @@
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from .result import Artifact
def get_artifact_dir(session_id: str | None = None, run_id: str | None = None) -> Path:
    """Compute the artifact directory for a session or run.

    The result is ``~/.skyvern/artifacts/<UTC date as YYYY-MM-DD>/<leaf>`` where
    ``<leaf>`` is ``session_id`` when it is non-empty, otherwise ``run_id`` when
    that is non-empty, otherwise the literal ``"anonymous"``.

    Only the path is computed; nothing is created on disk here.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # Empty strings are treated the same as None, so an `or` chain picks the leaf.
    leaf = session_id or run_id or "anonymous"
    return Path.home() / ".skyvern" / "artifacts" / today / leaf
def save_artifact(
    content: bytes,
    kind: str,
    filename: str,
    mime: str,
    session_id: str | None = None,
) -> Artifact:
    """Persist ``content`` under the artifact directory and describe the file.

    Args:
        content: Raw bytes to write.
        kind: Artifact category label stored on the returned record.
        filename: File name created inside the artifact directory.
        mime: MIME type stored on the returned record.
        session_id: Forwarded to ``get_artifact_dir`` to pick the directory.

    Returns:
        An ``Artifact`` carrying the kind, the written path (as a string),
        the MIME type and the byte count of ``content``.
    """
    artifact_dir = get_artifact_dir(session_id)
    # The dated/session directory may not exist yet on first use.
    artifact_dir.mkdir(parents=True, exist_ok=True)

    destination = artifact_dir / filename
    destination.write_bytes(content)

    size = len(content)
    return Artifact(kind=kind, path=str(destination), mime=mime, bytes=size)

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
import os
from contextvars import ContextVar
from skyvern.client import SkyvernEnvironment
from skyvern.config import settings
from skyvern.library.skyvern import Skyvern
_skyvern_instance: ContextVar[Skyvern | None] = ContextVar("skyvern_instance", default=None)
def get_skyvern() -> Skyvern:
    """Return the Skyvern client stored in the context variable, creating it on first use.

    The API key and base URL are read from ``settings`` first and from the
    ``SKYVERN_API_KEY`` / ``SKYVERN_BASE_URL`` environment variables second.
    With an API key a cloud-environment client is built; without one
    ``Skyvern.local()`` is used. The new client is stored in
    ``_skyvern_instance`` before being returned.
    """
    cached = _skyvern_instance.get()
    if cached is not None:
        return cached

    api_key = settings.SKYVERN_API_KEY or os.environ.get("SKYVERN_API_KEY")
    base_url = settings.SKYVERN_BASE_URL or os.environ.get("SKYVERN_BASE_URL")

    if not api_key:
        client = Skyvern.local()
    else:
        client = Skyvern(
            api_key=api_key,
            environment=SkyvernEnvironment.CLOUD,
            base_url=base_url,
        )

    _skyvern_instance.set(client)
    return client

106
skyvern/cli/core/result.py Normal file
View File

@@ -0,0 +1,106 @@
from __future__ import annotations
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
class ErrorCode:
    """String constants used as the ``code`` of error payloads.

    Each constant's value equals its own name, so the codes can be compared
    and serialized as plain strings.
    """

    # Browser/session availability.
    NO_ACTIVE_BROWSER = "NO_ACTIVE_BROWSER"
    BROWSER_NOT_FOUND = "BROWSER_NOT_FOUND"

    # Failures while acting on the page.
    SELECTOR_NOT_FOUND = "SELECTOR_NOT_FOUND"
    ACTION_FAILED = "ACTION_FAILED"
    AI_FALLBACK_FAILED = "AI_FALLBACK_FAILED"

    # Generic failure categories.
    SDK_ERROR = "SDK_ERROR"
    TIMEOUT = "TIMEOUT"
    INVALID_INPUT = "INVALID_INPUT"
@dataclass
class Artifact:
    """Record describing a saved artifact file.

    ``created_at`` defaults to the current UTC time in ISO-8601 form, taken
    when the instance is constructed.
    """

    kind: str
    path: str
    mime: str
    bytes: int
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain dict, keyed by field name in declaration order."""
        field_names = ("kind", "path", "mime", "bytes", "created_at")
        return {name: getattr(self, name) for name in field_names}
@dataclass
class BrowserContext:
    """Describes which browser a result refers to.

    ``mode`` is a free-form string; the values used in this codebase's visible
    callers are ``"none"``, ``"local"``, ``"cdp"`` and ``"cloud_session"``.
    ``session_id`` and ``cdp_url`` are optional identifiers for the connection.
    """

    mode: str
    session_id: str | None = None
    cdp_url: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the three fields as a plain dict (``None`` values included)."""
        serialized: dict[str, Any] = {}
        for name in ("mode", "session_id", "cdp_url"):
            serialized[name] = getattr(self, name)
        return serialized
def make_result(
    action: str,
    *,
    ok: bool = True,
    browser_context: BrowserContext | None = None,
    data: dict[str, Any] | None = None,
    artifacts: list[Artifact] | None = None,
    timing_ms: dict[str, int] | None = None,
    warnings: list[str] | None = None,
    error: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the uniform result envelope returned by tools.

    Args:
        action: Name of the tool/action that produced the result.
        ok: Whether the action succeeded.
        browser_context: Browser the result refers to; a ``mode="none"``
            context is substituted when omitted.
        data: Action-specific payload, passed through unchanged.
        artifacts: Artifacts to serialize via ``to_dict()``.
        timing_ms: Timing marks; an empty dict is substituted when omitted.
        warnings: Warning messages; an empty list is substituted when omitted.
        error: Error payload, passed through unchanged.

    Returns:
        A dict with the keys ``ok``, ``action``, ``browser_context``, ``data``,
        ``artifacts``, ``timing_ms``, ``warnings`` and ``error``.
    """
    context = browser_context if browser_context else BrowserContext(mode="none")
    serialized_artifacts = [item.to_dict() for item in artifacts] if artifacts else []

    envelope: dict[str, Any] = {
        "ok": ok,
        "action": action,
        "browser_context": context.to_dict(),
        "data": data,
        "artifacts": serialized_artifacts,
        "timing_ms": timing_ms if timing_ms else {},
        "warnings": warnings if warnings else [],
        "error": error,
    }
    return envelope
def make_error(
    code: str,
    message: str,
    hint: str,
    details: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build an error payload.

    Args:
        code: Error category string.
        message: Description of what went wrong.
        hint: Suggestion for how to recover.
        details: Optional extra data; an empty dict is substituted when
            omitted or empty.

    Returns:
        A dict with the keys ``code``, ``message``, ``hint`` and ``details``.
    """
    payload: dict[str, Any] = dict(code=code, message=message, hint=hint)
    payload["details"] = details if details else {}
    return payload
class Timer:
    """Context manager that records named elapsed-time marks in milliseconds.

    Usage::

        with Timer() as timer:
            do_work()
            timer.mark("sdk")
        timer.timing_ms  # {"sdk": ..., "total": ...}

    All marks are whole milliseconds measured from ``__enter__`` using
    ``time.perf_counter()``.
    """

    def __init__(self) -> None:
        self._start: float = 0
        self._marks: dict[str, int] = {}
        # True only between __enter__ and __exit__.
        self._running: bool = False

    def __enter__(self) -> Timer:
        self._start = time.perf_counter()
        self._running = True
        return self

    def __exit__(self, *args: Any) -> None:
        # Freeze the final "total"; exceptions are not suppressed (returns None).
        self._marks["total"] = self._elapsed_ms()
        self._running = False

    def _elapsed_ms(self) -> int:
        """Whole milliseconds elapsed since the timer was started."""
        return int((time.perf_counter() - self._start) * 1000)

    def mark(self, name: str) -> None:
        """Record the elapsed time so far under ``name`` (overwrites an existing mark)."""
        self._marks[name] = self._elapsed_ms()

    @property
    def timing_ms(self) -> dict[str, int]:
        """Return a copy of the recorded marks.

        Callers that return from inside the ``with`` body read this property
        before ``__exit__`` has stored ``"total"``. While the timer is still
        running, a provisional ``"total"`` (elapsed time so far) is therefore
        included so that those results carry a total as well. After exit the
        frozen value recorded by ``__exit__`` is returned.
        """
        snapshot = self._marks.copy()
        if self._running:
            snapshot["total"] = self._elapsed_ms()
        return snapshot

View File

@@ -0,0 +1,153 @@
from __future__ import annotations
from contextlib import asynccontextmanager
from contextvars import ContextVar
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, AsyncIterator
from .client import get_skyvern
from .result import BrowserContext, ErrorCode, make_error
if TYPE_CHECKING:
from skyvern.library.skyvern_browser import SkyvernBrowser
from skyvern.library.skyvern_browser_page import SkyvernBrowserPage
@dataclass
class SessionState:
    """Mutable state held for the current browser session.

    A default-constructed instance represents "no browser connected":
    ``browser`` and ``context`` are both ``None``.
    """

    # Connected browser handle, or None when nothing is connected.
    browser: SkyvernBrowser | None = None
    # Descriptor of how `browser` was obtained (mode / session id / CDP URL).
    context: BrowserContext | None = None
    # Per-instance list (fresh for every SessionState); presumably filled by
    # console-capture tooling outside this module — confirm against callers.
    console_messages: list[dict[str, Any]] = field(default_factory=list)
    # Bookkeeping flags; not read or written anywhere in this module.
    tracing_active: bool = False
    har_enabled: bool = False
_current_session: ContextVar[SessionState | None] = ContextVar("mcp_session", default=None)
def get_current_session() -> SessionState:
    """Return the session state stored in the context variable.

    When nothing has been stored yet, an empty ``SessionState`` is created,
    stored in ``_current_session`` and returned, so callers never receive
    ``None``.
    """
    existing = _current_session.get()
    if existing is not None:
        return existing

    fresh = SessionState()
    _current_session.set(fresh)
    return fresh
def set_current_session(state: SessionState) -> None:
    """Store ``state`` as the current session in the ``_current_session`` context variable."""
    _current_session.set(state)
async def resolve_browser(
    session_id: str | None = None,
    cdp_url: str | None = None,
    local: bool = False,
    create_session: bool = False,
    timeout: int | None = None,
    headless: bool = False,
) -> tuple[SkyvernBrowser, BrowserContext]:
    """Resolve browser from parameters or current session.

    The first truthy option wins, checked in this order: ``session_id``
    (connect to a cloud session), ``cdp_url`` (connect over CDP), ``local``
    (launch a local browser, honoring ``headless``), ``create_session``
    (launch a cloud browser, honoring ``timeout``). A browser obtained this way
    replaces the current session state. With no option set, the browser already
    stored in the current session is returned.

    Note: For MCP tools, sessions are stored in ContextVar and persist across tool calls.
    Cleanup is done via explicit skyvern_session_close() call. For scripts that need
    guaranteed cleanup, use the browser_session() context manager instead.

    Raises:
        BrowserNotAvailableError: No option was given and the current session
            has no browser/context.
        Exception: Any error raised while connecting or launching is re-raised
            after the current session has been reset to an empty state.
    """
    skyvern = get_skyvern()
    current = get_current_session()
    browser: SkyvernBrowser | None = None

    def _activate(active: SkyvernBrowser, ctx: BrowserContext) -> tuple[SkyvernBrowser, BrowserContext]:
        # Record the freshly obtained browser as the current session.
        set_current_session(SessionState(browser=active, context=ctx))
        return active, ctx

    try:
        if session_id:
            browser = await skyvern.connect_to_cloud_browser_session(session_id)
            return _activate(browser, BrowserContext(mode="cloud_session", session_id=session_id))

        if cdp_url:
            browser = await skyvern.connect_to_browser_over_cdp(cdp_url)
            return _activate(browser, BrowserContext(mode="cdp", cdp_url=cdp_url))

        if local:
            browser = await skyvern.launch_local_browser(headless=headless)
            return _activate(browser, BrowserContext(mode="local"))

        if create_session:
            browser = await skyvern.launch_cloud_browser(timeout=timeout)
            return _activate(
                browser,
                BrowserContext(mode="cloud_session", session_id=browser.browser_session_id),
            )
    except Exception:
        # Only a browser obtained in THIS call is closed; close errors are
        # ignored so the original exception is the one that propagates.
        if browser is not None:
            try:
                await browser.close()
            except Exception:
                pass
        set_current_session(SessionState())
        raise

    # No explicit option: fall back to whatever the current session holds.
    if current.browser is not None and current.context is not None:
        return current.browser, current.context
    raise BrowserNotAvailableError()
async def get_page(
    session_id: str | None = None,
    cdp_url: str | None = None,
) -> tuple[SkyvernBrowserPage, BrowserContext]:
    """Get the working page from the current or specified browser session.

    The browser is obtained through ``resolve_browser`` (so
    ``BrowserNotAvailableError`` propagates when none is available) and its
    working page is returned together with the browser context.
    """
    browser, ctx = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
    working_page = await browser.get_working_page()
    return working_page, ctx
@asynccontextmanager
async def browser_session(
    session_id: str | None = None,
    cdp_url: str | None = None,
    local: bool = False,
    timeout: int | None = None,
    headless: bool = False,
) -> AsyncIterator[tuple[SkyvernBrowser, BrowserContext]]:
    """Context manager for browser sessions with guaranteed cleanup.

    Use this in scripts that need guaranteed resource cleanup on error.
    MCP tools use resolve_browser() directly since sessions persist across calls.

    When none of ``session_id``, ``cdp_url`` or ``local`` is given, a new cloud
    session is requested. On exit the browser is closed (close errors are
    ignored) and the current session state is reset to empty.

    Example:
        async with browser_session(local=True) as (browser, ctx):
            page = await browser.get_working_page()
            await page.goto("https://example.com")
        # Browser is automatically closed on exit or exception
    """
    has_explicit_target = bool(session_id or cdp_url or local)
    browser, ctx = await resolve_browser(
        session_id=session_id,
        cdp_url=cdp_url,
        local=local,
        create_session=not has_explicit_target,
        timeout=timeout,
        headless=headless,
    )
    try:
        yield browser, ctx
    finally:
        try:
            await browser.close()
        except Exception:
            pass  # Best effort cleanup
        set_current_session(SessionState())
class BrowserNotAvailableError(Exception):
    """Signals that no browser session could be resolved.

    Raised when no connection option was supplied and the current session
    holds no browser.
    """
def no_browser_error() -> dict[str, Any]:
    """Return the standard error payload for "no browser session available".

    The payload uses the ``NO_ACTIVE_BROWSER`` code and a hint pointing at the
    ways to obtain a session.
    """
    message = "No browser session available"
    hint = "Create a session with skyvern_session_create, provide session_id, or cdp_url"
    return make_error(ErrorCode.NO_ACTIVE_BROWSER, message, hint)

View File

@@ -0,0 +1,201 @@
"""Skyvern MCP Tools.
This module provides MCP (Model Context Protocol) tools for browser automation.
Tools are registered with FastMCP and can be used by AI assistants like Claude.
"""
from fastmcp import FastMCP
from .browser import (
skyvern_act,
skyvern_click,
skyvern_evaluate,
skyvern_extract,
skyvern_navigate,
skyvern_press_key,
skyvern_run_task,
skyvern_screenshot,
skyvern_scroll,
skyvern_select_option,
skyvern_type,
skyvern_validate,
skyvern_wait,
)
from .session import (
skyvern_session_close,
skyvern_session_connect,
skyvern_session_create,
skyvern_session_get,
skyvern_session_list,
)
mcp = FastMCP(
"Skyvern",
instructions="""Use Skyvern tools whenever the task involves visiting, browsing, or interacting with ANY website or web application.
## When to Use These Tools
Reach for Skyvern tools when the user asks you to:
- Visit, browse, or interact with ANY website or web application
- Extract data from web pages (prices, listings, articles, tables, search results, etc.)
- Fill out forms, log in, sign up, or complete web-based workflows
- Check the current state of a web page or verify something on a site
- Do anything you would otherwise attempt with requests, beautifulsoup, selenium, or playwright
- Access website data where you are unsure whether an API endpoint exists
DO NOT try to scrape websites by guessing API endpoints or writing HTTP requests.
Instead, use skyvern_navigate + skyvern_extract to get real data from actual pages.
These tools give you a real browser — use them instead of writing scraping code.
## Examples
| User says | Use |
|-----------|-----|
| "Go to amazon.com" | skyvern_navigate |
| "What's on this page?" | skyvern_screenshot |
| "Get all product prices" | skyvern_extract |
| "Click the login button" | skyvern_act or skyvern_click |
| "Fill out this form" | skyvern_act |
| "Log in and buy the first item" | skyvern_run_task |
| "Is checkout complete?" | skyvern_validate |
| "Write a script to do this" | Skyvern SDK (see below) |
## Writing Scripts and Code
When asked to write an automation script, use the Skyvern Python SDK with the **hybrid xpath+prompt
pattern** for production-quality scripts. The hybrid form tries the xpath/selector first (fast,
deterministic) and falls back to AI if the selector breaks — this is the recommended pattern.
from skyvern import Skyvern
skyvern = Skyvern(api_key="YOUR_API_KEY")
browser = await skyvern.launch_cloud_browser()
page = await browser.get_working_page()
await page.goto("https://example.com")
# BEST: hybrid selector+prompt — fast deterministic selector with AI fallback
await page.click("xpath=//button[@id='submit']", prompt="the Submit button")
await page.fill("xpath=//input[@name='email']", "user@example.com", prompt="email input field")
# OK for exploration, but prefer hybrid for production scripts:
await page.click(prompt="the Submit button")
data = await page.extract("Get all product names and prices")
To get xpaths for hybrid calls, use precision tools (skyvern_click, skyvern_type) during exploration.
The `resolved_selector` field in responses gives you the xpath the AI resolved to. Use it in scripts:
explore: skyvern_click(intent="Submit button") → response includes resolved_selector="xpath=//button[@id='submit']"
script: await page.click("xpath=//button[@id='submit']", prompt="Submit button")
IMPORTANT: NEVER import from skyvern.cli.mcp_tools — those are internal server modules.
The public SDK is: from skyvern import Skyvern
## Recommended Workflow
1. **Connect** — Create or connect to a browser session
2. **Explore** — Navigate pages, take screenshots, extract data with AI
3. **Build** — Capture selectors and data schemas to construct deterministic workflows
4. **Test** — Validate workflows via skyvern_run_task
## Primary Tools (use these first)
These are the tools you should reach for by default:
- **skyvern_act** — Execute actions from natural language: "log in with test@example.com", "add the first item to cart". Best for exploration and testing flows.
- **skyvern_extract** — Pull structured data from any page with natural language + optional JSON Schema. THE differentiator over raw Playwright.
- **skyvern_validate** — Assert page conditions with AI: "is the user logged in?", "does the cart have 3 items?"
- **skyvern_run_task** — Delegate a full multi-step task to an autonomous AI agent with observability. Use for end-to-end task execution.
- **skyvern_navigate** — Go to a URL. Always the first step after connecting.
- **skyvern_screenshot** — See what's on the page. Essential for understanding page state.
- **skyvern_evaluate** — Run JavaScript to read DOM state, get URLs, or check values.
## Precision Tools (for debugging and exact control)
Use these when the primary tools aren't specific enough, or when you need deterministic
selector-based actions (e.g., replaying a known flow):
- **skyvern_click** — Click a specific element by selector or AI intent
- **skyvern_type** — Type into a specific input field by selector or AI intent
- **skyvern_scroll** — Scroll the page or an element into view
- **skyvern_select_option** — Select a dropdown option by selector or AI intent
- **skyvern_press_key** — Press a keyboard key (Enter, Tab, Escape, etc.)
- **skyvern_wait** — Wait for a condition, element, or time delay
## Tool Modes (precision tools)
Precision tools support three modes. When unsure, use `intent`.
1. **Intent mode** — AI-powered element finding:
`skyvern_click(intent="the blue Submit button")`
2. **Hybrid mode** — tries selector first, AI fallback:
`skyvern_click(selector="#submit-btn", intent="the Submit button")`
3. **Selector mode** — deterministic CSS/XPath targeting:
`skyvern_click(selector="#submit-btn")`
## Replay Story: From Exploration to Production Scripts
When you use precision tools (skyvern_click, skyvern_type, etc.) with intent mode, the response
includes `resolved_selector` — the xpath/CSS the AI found. Capture these to build hybrid scripts.
**The hybrid pattern** is the recommended default for SDK scripts:
await page.click("xpath=//button[@id='submit']", prompt="the Submit button")
It tries the selector first (fast, no AI cost), then falls back to AI if the selector breaks.
**Workflow for generating scripts:**
1. Explore: Use skyvern_click(intent="Submit button") during interactive exploration
2. Capture: Note the `resolved_selector` from the response (e.g., "xpath=//button[@id='submit']")
3. Script: Write `page.click("xpath=//button[@id='submit']", prompt="Submit button")`
The `sdk_equivalent` field in each tool response shows the correct hybrid call to use in scripts.
Always prefer hybrid xpath+prompt over prompt-only in generated scripts.
Note: Currently only skyvern_click returns resolved_selector. Support for skyvern_type and
skyvern_select_option is planned (SKY-7905). For those tools, use the selector you provided
as input, or fall back to prompt-only until SKY-7905 ships.
## Getting Started
Create a session with skyvern_session_create, then use browser tools to interact with pages.
""",
)
# Register every tool with the FastMCP server. The tuple order is the
# registration order: session management first, then the primary AI-powered
# tools, then the precision (selector/intent) primitives.
for _tool in (
    # -- Session management --
    skyvern_session_create,
    skyvern_session_close,
    skyvern_session_list,
    skyvern_session_get,
    skyvern_session_connect,
    # -- Primary tools (AI-powered exploration + observation) --
    skyvern_act,
    skyvern_extract,
    skyvern_validate,
    skyvern_run_task,
    skyvern_navigate,
    skyvern_screenshot,
    skyvern_evaluate,
    # -- Precision tools (selector/intent-based browser primitives) --
    skyvern_click,
    skyvern_type,
    skyvern_scroll,
    skyvern_select_option,
    skyvern_press_key,
    skyvern_wait,
):
    mcp.tool()(_tool)
del _tool  # keep the loop variable out of the module namespace
__all__ = [
"mcp",
# Session
"skyvern_session_create",
"skyvern_session_close",
"skyvern_session_list",
"skyvern_session_get",
"skyvern_session_connect",
# Primary (AI-powered)
"skyvern_act",
"skyvern_extract",
"skyvern_validate",
"skyvern_run_task",
"skyvern_navigate",
"skyvern_screenshot",
"skyvern_evaluate",
# Precision (selector/intent browser primitives)
"skyvern_click",
"skyvern_type",
"skyvern_scroll",
"skyvern_select_option",
"skyvern_press_key",
"skyvern_wait",
]

View File

@@ -0,0 +1,11 @@
"""Skyvern HTTP API client accessor.
Workflow tools import from here to get the API client without pulling in
browser/Playwright dependencies.
"""
from __future__ import annotations
from skyvern.cli.core.client import get_skyvern
__all__ = ["get_skyvern"]

View File

@@ -0,0 +1,20 @@
"""Backward-compatible re-exports from skyvern.cli.core.
MCP tools import from here; the canonical implementations live in core/.
"""
from __future__ import annotations
from skyvern.cli.core.artifacts import get_artifact_dir, save_artifact
from skyvern.cli.core.result import Artifact, BrowserContext, ErrorCode, Timer, make_error, make_result
__all__ = [
"Artifact",
"BrowserContext",
"ErrorCode",
"Timer",
"get_artifact_dir",
"make_error",
"make_result",
"save_artifact",
]

View File

@@ -0,0 +1,30 @@
"""Backward-compatible re-exports from skyvern.cli.core.
MCP tools import from here; the canonical implementations live in core/.
"""
from __future__ import annotations
from skyvern.cli.core.client import get_skyvern
from skyvern.cli.core.session_manager import (
BrowserNotAvailableError,
SessionState,
browser_session,
get_current_session,
get_page,
no_browser_error,
resolve_browser,
set_current_session,
)
__all__ = [
"BrowserNotAvailableError",
"SessionState",
"browser_session",
"get_current_session",
"get_page",
"get_skyvern",
"no_browser_error",
"resolve_browser",
"set_current_session",
]

View File

@@ -0,0 +1,939 @@
from __future__ import annotations
import asyncio
import base64
import json
from datetime import datetime, timezone
from typing import Annotated, Any
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from pydantic import Field
from ._common import (
ErrorCode,
Timer,
make_error,
make_result,
save_artifact,
)
from ._session import BrowserNotAvailableError, get_page, no_browser_error
def _resolve_ai_mode(
selector: str | None,
intent: str | None,
) -> tuple[str | None, str | None]:
"""Determine AI mode from selector/intent combination.
Returns (ai_mode, error_code) — if error_code is set, the call should fail.
"""
if intent and not selector:
return "proactive", None
if intent and selector:
return "fallback", None
if selector and not intent:
return None, None
return None, "INVALID_INPUT"
async def skyvern_navigate(
    url: Annotated[str, "The URL to navigate to"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=120000)] = 30000,
    wait_until: Annotated[
        str | None, Field(description="Wait condition: load, domcontentloaded, networkidle, commit")
    ] = None,
) -> dict[str, Any]:
    """Open a website in the browser. Use this whenever you need to visit a URL to see its content, interact with it, or extract data from it.

    Returns the final URL (after redirects) and page title.
    """
    # Reject unknown wait conditions before touching the browser.
    if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"):
        return make_result(
            "skyvern_navigate",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                f"Invalid wait_until: {wait_until}",
                "Use load, domcontentloaded, networkidle, or commit",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_navigate", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            await page.goto(url, timeout=timeout, wait_until=wait_until)
            timer.mark("sdk")
            final_url = page.url
            title = await page.title()
        except Exception as e:
            # Any navigation failure is reported as a structured result, not raised.
            return make_result(
                "skyvern_navigate",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the URL is valid and accessible"),
            )

    # json.dumps yields a double-quoted, properly escaped literal, so a URL
    # containing quotes or backslashes still produces valid Python in the hint.
    sdk_equivalent = f"await page.goto({json.dumps(url, ensure_ascii=False)})"
    return make_result(
        "skyvern_navigate",
        browser_context=ctx,
        data={"url": final_url, "title": title, "sdk_equivalent": sdk_equivalent},
        timing_ms=timer.timing_ms,
    )
async def skyvern_click(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of the element to click (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None,
    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
    button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None,
    click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None,
) -> dict[str, Any]:
    """Click an element on the page. Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation.

    Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting,
    or both for resilience (tries selector first, falls back to AI).
    """

    def _lit(value: str) -> str:
        # Render a value as a double-quoted, escaped string literal so the
        # generated sdk_equivalent stays valid Python even when selectors or
        # intents contain quotes or backslashes (e.g. //a[text()="Log in"]).
        return json.dumps(value, ensure_ascii=False)

    if button is not None and button not in ("left", "right", "middle"):
        return make_result(
            "skyvern_click",
            ok=False,
            error=make_error(ErrorCode.INVALID_INPUT, f"Invalid button: {button}", "Use left, right, or middle"),
        )

    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
        return make_result(
            "skyvern_click",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or both",
                "Use intent='describe what to click' for AI-powered clicking, or selector='#css-selector' for precise targeting",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_click", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            # Only forward optional click arguments the caller actually set.
            kwargs: dict[str, Any] = {"timeout": timeout}
            if button:
                kwargs["button"] = button
            if click_count is not None:
                kwargs["click_count"] = click_count

            if ai_mode is not None:
                resolved = await page.click(selector=selector, prompt=intent, ai=ai_mode, **kwargs)  # type: ignore[arg-type]
            else:
                assert selector is not None
                resolved = await page.click(selector=selector, **kwargs)
            timer.mark("sdk")
        except PlaywrightTimeoutError as e:
            return make_result(
                "skyvern_click",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(
                    ErrorCode.SELECTOR_NOT_FOUND,
                    str(e),
                    "Verify the selector matches an element on the page, or use intent for AI-powered finding",
                ),
            )
        except Exception as e:
            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
            return make_result(
                "skyvern_click",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(
                    code,
                    str(e),
                    "The element may be hidden, disabled, or intercepted by another element",
                ),
            )

    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode}
    # Report the selector the click resolved to only when it differs from the input.
    if resolved and resolved != selector:
        data["resolved_selector"] = resolved

    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts.
    # resolved_selector already contains the "xpath=" prefix (e.g. "xpath=//button[@id='x']"),
    # so pass it directly as the selector positional arg.
    resolved_sel = resolved if resolved and resolved != selector else selector
    if resolved_sel and intent:
        data["sdk_equivalent"] = f"await page.click({_lit(resolved_sel)}, prompt={_lit(intent)})"
    elif ai_mode:
        assert intent is not None  # ai_mode is only set when an intent was given
        data["sdk_equivalent"] = f"await page.click(prompt={_lit(intent)})"
    elif selector:
        data["sdk_equivalent"] = f"await page.click({_lit(selector)})"

    return make_result(
        "skyvern_click",
        browser_context=ctx,
        data=data,
        timing_ms=timer.timing_ms,
    )
async def skyvern_type(
    text: Annotated[str, "Text to type into the element"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of the input field (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None,
    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
    clear: Annotated[bool, Field(description="Clear existing content before typing")] = True,
    delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None,
) -> dict[str, Any]:
    """Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation.

    Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting,
    or both for resilience (tries selector first, falls back to AI). Clears existing content by default.
    """

    def _lit(value: str) -> str:
        # Render a value as a double-quoted, escaped string literal so the
        # generated sdk_equivalent stays valid Python even when the text,
        # selector or intent contains quotes, backslashes or newlines.
        return json.dumps(value, ensure_ascii=False)

    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
        return make_result(
            "skyvern_type",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or both",
                "Use intent='describe the input field' for AI-powered targeting, or selector='#css-selector' for precise targeting",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_type", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            if clear:
                # fill() replaces the field's content; `delay` is not used on this path.
                if ai_mode is not None:
                    await page.fill(selector=selector, value=text, prompt=intent, ai=ai_mode, timeout=timeout)  # type: ignore[arg-type]
                else:
                    assert selector is not None
                    await page.fill(selector, text, timeout=timeout)
            else:
                # type() appends keystroke by keystroke, optionally with a delay.
                kwargs: dict[str, Any] = {"timeout": timeout}
                if delay is not None:
                    kwargs["delay"] = delay
                if ai_mode is not None:
                    loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)  # type: ignore[arg-type]
                    await loc.type(text, **kwargs)
                else:
                    assert selector is not None
                    await page.type(selector, text, **kwargs)
            timer.mark("sdk")
        except PlaywrightTimeoutError as e:
            return make_result(
                "skyvern_type",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(
                    ErrorCode.SELECTOR_NOT_FOUND,
                    str(e),
                    "Verify the selector matches an editable element, or use intent for AI-powered finding",
                ),
            )
        except Exception as e:
            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
            return make_result(
                "skyvern_type",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(
                    code,
                    str(e),
                    "The element may not be editable or may be hidden",
                ),
            )

    # NOTE: The SDK fill() returns the typed value, not a resolved selector.
    # Unlike click(), we cannot return resolved_selector here. SKY-7905 will
    # update the SDK to return element metadata from all action methods.
    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode, "text_length": len(text)}

    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts
    # NOTE(review): sdk_equivalent echoes the typed text verbatim while `data`
    # itself only reports text_length — confirm this is acceptable for
    # sensitive input such as passwords.
    if selector and intent:
        data["sdk_equivalent"] = f"await page.fill({_lit(selector)}, {_lit(text)}, prompt={_lit(intent)})"
    elif ai_mode:
        assert intent is not None  # ai_mode is only set when an intent was given
        data["sdk_equivalent"] = f"await page.fill(prompt={_lit(intent)}, value={_lit(text)})"
    elif selector:
        data["sdk_equivalent"] = f"await page.fill({_lit(selector)}, {_lit(text)})"

    return make_result(
        "skyvern_type",
        browser_context=ctx,
        data=data,
        timing_ms=timer.timing_ms,
    )
async def skyvern_screenshot(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    full_page: Annotated[bool, Field(description="Capture full scrollable page")] = False,
    selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None,
    inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False,
) -> dict[str, Any]:
    """See what's currently on the page. Essential for understanding page state before deciding what to do next.

    By default saves to ~/.skyvern/artifacts/ and returns the file path.
    Set inline=true to get base64 data directly (increases token usage).
    """
    action = "skyvern_screenshot"

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(action, ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            # A selector narrows the capture to one element; `full_page` only
            # applies to whole-page captures.
            if not selector:
                screenshot_bytes = await page.screenshot(full_page=full_page)
            else:
                element = page.locator(selector)
                screenshot_bytes = await element.screenshot()
            timer.mark("sdk")
        except Exception as e:
            failure = make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the page or element is visible")
            return make_result(
                action,
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=failure,
            )

    if not inline:
        # Default mode: write the PNG under the artifact directory, named with
        # the current UTC time down to microseconds.
        ts = datetime.now(timezone.utc).strftime("%H%M%S_%f")
        artifact = save_artifact(
            screenshot_bytes,
            kind="screenshot",
            filename=f"screenshot_{ts}.png",
            mime="image/png",
            session_id=ctx.session_id,
        )
        return make_result(
            action,
            browser_context=ctx,
            data={"path": artifact.path, "sdk_equivalent": "await page.screenshot(path='screenshot.png')"},
            artifacts=[artifact],
            timing_ms=timer.timing_ms,
        )

    # Inline mode: return the image itself as base64 text, with a warning.
    encoded = base64.b64encode(screenshot_bytes).decode("utf-8")
    inline_payload: dict[str, Any] = {
        "inline": True,
        "data": encoded,
        "mime": "image/png",
        "bytes": len(screenshot_bytes),
        "sdk_equivalent": "await page.screenshot()",
    }
    return make_result(
        action,
        browser_context=ctx,
        data=inline_payload,
        timing_ms=timer.timing_ms,
        warnings=["Inline mode increases token usage"],
    )
async def skyvern_scroll(
    direction: Annotated[str, Field(description="Direction: up, down, left, right")],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    amount: Annotated[int | None, Field(description="Pixels to scroll (default 500)")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of element to scroll into view (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector of scrollable element")] = None,
) -> dict[str, Any]:
    """Scroll the page or use AI to scroll a specific element into view.

    Use `intent` to scroll an AI-located element into view (with or without selector for hybrid fallback).
    Without intent, scrolls the page or a selector-targeted container by pixel amount.
    """
    # `direction` is only consulted for pixel scrolling; the intent path scrolls
    # an element into view and therefore skips this validation.
    valid_directions = ("up", "down", "left", "right")
    if not intent and direction not in valid_directions:
        return make_result(
            "skyvern_scroll",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT, f"Invalid direction: {direction}", "Use up, down, left, or right"
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_scroll", ok=False, error=no_browser_error())

    if intent:
        # With a selector the locator is asked to use AI as a fallback;
        # without one the AI lookup is the primary strategy.
        ai_mode = "fallback" if selector else "proactive"
        with Timer() as timer:
            try:
                loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)
                await loc.scroll_into_view_if_needed()
                timer.mark("sdk")
            except Exception as e:
                code = ErrorCode.AI_FALLBACK_FAILED if ai_mode == "fallback" else ErrorCode.ACTION_FAILED
                return make_result(
                    "skyvern_scroll",
                    ok=False,
                    browser_context=ctx,
                    timing_ms=timer.timing_ms,
                    error=make_error(code, str(e), "Could not find element to scroll into view"),
                )
        return make_result(
            "skyvern_scroll",
            browser_context=ctx,
            data={
                "direction": "into_view",
                "intent": intent,
                "ai_mode": ai_mode,
                "sdk_equivalent": (
                    f'await page.locator("{selector}", prompt="{intent}").scroll_into_view_if_needed()'
                    if selector
                    else f'await page.locator(prompt="{intent}").scroll_into_view_if_needed()'
                ),
            },
            timing_ms=timer.timing_ms,
        )

    # Pixel scrolling: a missing (or zero) amount falls back to 500 pixels.
    pixels = amount or 500
    direction_map = {
        "up": (0, -pixels),
        "down": (0, pixels),
        "left": (-pixels, 0),
        "right": (pixels, 0),
    }
    dx, dy = direction_map[direction]

    with Timer() as timer:
        try:
            if selector:
                await page.locator(selector).evaluate(f"el => el.scrollBy({dx}, {dy})")
            else:
                await page.evaluate(f"window.scrollBy({dx}, {dy})")
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_scroll",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Scroll action failed"),
            )

    # Report the call that was actually executed: a selector-targeted scroll
    # acts on the element, not on the window.
    if selector:
        sdk_eq = f'await page.locator("{selector}").evaluate("el => el.scrollBy({dx}, {dy})")'
    else:
        sdk_eq = f'await page.evaluate("window.scrollBy({dx}, {dy})")'

    return make_result(
        "skyvern_scroll",
        browser_context=ctx,
        data={
            "direction": direction,
            "pixels": pixels,
            "sdk_equivalent": sdk_eq,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_select_option(
    value: Annotated[str, "Value to select"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None,
    selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None,
    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
    by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False,
) -> dict[str, Any]:
    """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision.

    Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting,
    or both for resilience (tries selector first, falls back to AI).
    """
    # Reject the call before touching the browser when neither targeting input
    # is usable; the helper's own error detail is replaced by a fixed message.
    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
        return make_result(
            "skyvern_select_option",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or both",
                "Use intent='describe the dropdown' for AI-powered selection, or selector='#css-selector' for precise targeting",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_select_option", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            if ai_mode is not None:
                # AI paths: pass value= directly -- the AI interprets the text
                # regardless of whether it represents a value or label.
                await page.select_option(selector=selector, value=value, prompt=intent, ai=ai_mode, timeout=timeout)  # type: ignore[arg-type]
            else:
                assert selector is not None
                if by_label:
                    # Bypass SkyvernPage to avoid value="" coercion conflicting with label kwarg.
                    await page.page.locator(selector).select_option(label=value, timeout=timeout)
                else:
                    await page.select_option(selector, value=value, timeout=timeout)
            timer.mark("sdk")
        except Exception as e:
            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
            return make_result(
                "skyvern_select_option",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(code, str(e), "Check selector and available options"),
            )

    # NOTE: The SDK select_option() returns the selected value, not a resolved
    # selector. Unlike click(), we cannot return resolved_selector here.
    # SKY-7905 will update the SDK to return element metadata from all action methods.
    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode, "value": value}

    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts
    if selector and intent:
        data["sdk_equivalent"] = f'await page.select_option("{selector}", value="{value}", prompt="{intent}")'
    elif ai_mode:
        data["sdk_equivalent"] = f'await page.select_option(prompt="{intent}", value="{value}")'
    elif selector:
        if by_label:
            # Mirror the label-based call made above; reporting value= here
            # would describe a different selection than the one performed.
            data["sdk_equivalent"] = f'await page.page.locator("{selector}").select_option(label="{value}")'
        else:
            data["sdk_equivalent"] = f'await page.select_option("{selector}", value="{value}")'

    return make_result(
        "skyvern_select_option",
        browser_context=ctx,
        data=data,
        timing_ms=timer.timing_ms,
    )
async def skyvern_press_key(
    key: Annotated[str, "Key to press (e.g., Enter, Tab, Escape, ArrowDown)"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of element to focus first (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector to focus first")] = None,
) -> dict[str, Any]:
    """Press a keyboard key -- Enter, Tab, Escape, arrow keys, shortcuts, etc.

    Use `intent` or `selector` to focus a specific element before pressing.
    Without either, presses the key on the currently focused element.
    """
    tool_name = "skyvern_press_key"

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    with Timer() as stopwatch:
        try:
            if not (intent or selector):
                # No target given: send the key to whatever currently has focus.
                await page.keyboard.press(key)
            else:
                # The helper's error slot is ignored: at least one of
                # selector/intent is truthy on this branch.
                ai_mode, _ = _resolve_ai_mode(selector, intent)
                if ai_mode is None:
                    assert selector is not None
                    target = page.locator(selector)
                else:
                    target = page.locator(selector=selector, prompt=intent, ai=ai_mode)  # type: ignore[arg-type]
                await target.press(key)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(exc), "Check key name is valid"),
            )

    # Describe the receiver of .press() first, then append the shared call suffix.
    if intent:
        if selector:
            receiver = f'page.locator("{selector}", prompt="{intent}")'
        else:
            receiver = f'page.locator(prompt="{intent}")'
    elif selector:
        receiver = f'page.locator("{selector}")'
    else:
        receiver = "page.keyboard"
    sdk_eq = f'await {receiver}.press("{key}")'

    payload = {
        "key": key,
        "selector": selector,
        "intent": intent,
        "sdk_equivalent": sdk_eq,
    }
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_wait(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    time_ms: Annotated[int | None, Field(description="Time to wait in milliseconds")] = None,
    intent: Annotated[str | None, Field(description="Natural language condition to wait for (uses AI polling)")] = None,
    selector: Annotated[str | None, Field(description="CSS selector to wait for")] = None,
    state: Annotated[str | None, Field(description="Element state: visible, hidden, attached, detached")] = "visible",
    timeout: Annotated[int, Field(description="Max wait time in milliseconds", ge=1000, le=120000)] = 30000,
    poll_interval_ms: Annotated[
        int, Field(description="Polling interval for intent-based waits in ms", ge=500, le=10000)
    ] = 5000,
) -> dict[str, Any]:
    """Wait for a condition, element, or time delay before proceeding. Use intent for AI-powered condition checking.

    Use `intent` to poll with AI validation (e.g., "wait until the loading spinner disappears").
    Use `selector` to wait for an element state. Use `time_ms` for a simple delay.
    """
    valid_states = ("visible", "hidden", "attached", "detached")
    if state is not None and state not in valid_states:
        return make_result(
            "skyvern_wait",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                f"Invalid state: {state}",
                "Use visible, hidden, attached, or detached",
            ),
        )

    if time_ms is None and not selector and not intent:
        return make_result(
            "skyvern_wait",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or time_ms",
                "Use intent='condition to wait for' for AI-powered waiting, selector='#element' for element visibility, or time_ms=5000 for a delay",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_wait", ok=False, error=no_browser_error())

    # Precedence when several inputs are given: time_ms, then intent, then selector.
    with Timer() as timer:
        try:
            if time_ms is not None:
                await page.wait_for_timeout(time_ms)
                waited_for = "time"
            elif intent:
                loop = asyncio.get_running_loop()
                deadline = loop.time() + timeout / 1000
                last_error: Exception | None = None
                while True:
                    # A failing validate() call counts as "not met yet"; the
                    # error is only surfaced if the deadline passes.
                    try:
                        result = await page.validate(intent)
                        last_error = None
                    except Exception as poll_err:
                        result = False
                        last_error = poll_err
                    if result:
                        break
                    if loop.time() >= deadline:
                        code = ErrorCode.SDK_ERROR if last_error else ErrorCode.TIMEOUT
                        msg = str(last_error) if last_error else f"Condition not met within {timeout}ms: {intent}"
                        return make_result(
                            "skyvern_wait",
                            ok=False,
                            browser_context=ctx,
                            timing_ms=timer.timing_ms,
                            error=make_error(
                                code,
                                msg,
                                "Increase timeout or check that the condition can be satisfied",
                            ),
                        )
                    # Never sleep past the deadline: a full poll interval here
                    # could overrun `timeout` by up to poll_interval_ms.
                    remaining_ms = max((deadline - loop.time()) * 1000, 0)
                    await page.wait_for_timeout(min(poll_interval_ms, remaining_ms))
                waited_for = "intent"
            elif selector:
                await page.wait_for_selector(selector, state=state, timeout=timeout)
                waited_for = "selector"
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_wait",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.TIMEOUT, str(e), "Condition was not met within timeout"),
            )

    sdk_eq = ""
    if waited_for == "time":
        sdk_eq = f"await page.wait_for_timeout({time_ms})"
    elif waited_for == "intent":
        sdk_eq = f'await page.validate("{intent}")'
    elif waited_for == "selector":
        if state is None or state == "visible":
            sdk_eq = f'await page.wait_for_selector("{selector}")'
        else:
            # A non-default state changes what is waited for, so it must be
            # part of the reported equivalent call.
            sdk_eq = f'await page.wait_for_selector("{selector}", state="{state}")'

    return make_result(
        "skyvern_wait",
        browser_context=ctx,
        data={"waited_for": waited_for, "sdk_equivalent": sdk_eq},
        timing_ms=timer.timing_ms,
    )
async def skyvern_evaluate(
    expression: Annotated[str, "JavaScript expression to evaluate"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Run JavaScript on the page to read DOM state, get URLs, check values, or discover CSS selectors for faster subsequent actions.

    Security: This executes arbitrary JS in the page context. Only use with trusted expressions.
    """
    tool_name = "skyvern_evaluate"

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    with Timer() as stopwatch:
        try:
            js_value = await page.evaluate(expression)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(exc), "Check JavaScript syntax"),
            )

    # Only the first 80 characters of the expression are echoed back in the hint.
    snippet = expression[:80]
    payload = {"result": js_value, "sdk_equivalent": f'await page.evaluate("{snippet}")'}
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
# ---------------------------------------------------------------------------
# AI Differentiator Tools
# ---------------------------------------------------------------------------
async def skyvern_extract(
    prompt: Annotated[str, "Natural language description of what data to extract from the page"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    schema: Annotated[
        str | None, Field(description="JSON Schema string defining the expected output structure")
    ] = None,
) -> dict[str, Any]:
    """Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language.

    Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
    """
    tool_name = "skyvern_extract"

    # Decode the schema first so malformed JSON is rejected before a browser is resolved.
    schema_obj: dict[str, Any] | None = None
    if schema is not None:
        try:
            schema_obj = json.loads(schema)
        except (json.JSONDecodeError, TypeError) as exc:
            return make_result(
                tool_name,
                ok=False,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid JSON schema: {exc}",
                    "Provide schema as a valid JSON string",
                ),
            )

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    with Timer() as stopwatch:
        try:
            extracted = await page.extract(prompt=prompt, schema=schema_obj)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR, str(exc), "Check that the page has loaded and the prompt is clear"
                ),
            )

    payload = {"extracted": extracted, "sdk_equivalent": f'await page.extract(prompt="{prompt}")'}
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_validate(
    prompt: Annotated[str, "Validation condition to check (e.g., 'the login form is visible')"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'

    Returns whether the described condition is true or false.
    """
    tool_name = "skyvern_validate"

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    with Timer() as stopwatch:
        try:
            verdict = await page.validate(prompt)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR, str(exc), "Check that the page has loaded and the prompt is clear"
                ),
            )

    # The validation outcome is returned as data; a false verdict is not an error.
    payload = {"prompt": prompt, "valid": verdict, "sdk_equivalent": f'await page.validate("{prompt}")'}
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_act(
    prompt: Annotated[str, "Natural language instruction for the action to perform (e.g., 'close the cookie banner')"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task.

    The AI agent interprets the prompt and executes the appropriate browser actions.
    For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead.
    """
    tool_name = "skyvern_act"

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    with Timer() as stopwatch:
        try:
            # The return value of act() is not used; finishing without an
            # exception is what gets reported as completed.
            await page.act(prompt)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Simplify the prompt or break the task into steps"),
            )

    payload = {"prompt": prompt, "completed": True, "sdk_equivalent": f'await page.act("{prompt}")'}
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_run_task(
    prompt: Annotated[str, "Natural language description of the task to automate"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    url: Annotated[
        str | None, Field(description="URL to navigate to before running (uses current page if omitted)")
    ] = None,
    data_extraction_schema: Annotated[
        str | None, Field(description="JSON Schema string defining what data to extract")
    ] = None,
    max_steps: Annotated[int | None, Field(description="Maximum number of agent steps")] = None,
    timeout_seconds: Annotated[
        int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800)
    ] = 180,
) -> dict[str, Any]:
    """Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end.

    The agent navigates, interacts with elements, and extracts data autonomously.
    For simple single-step actions, use skyvern_act instead.
    """
    tool_name = "skyvern_run_task"

    try:
        page, browser_ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result(tool_name, ok=False, error=no_browser_error())

    # The schema is decoded after the browser is resolved, so a malformed
    # schema error still carries the browser context.
    schema_obj: dict[str, Any] | str | None = None
    if data_extraction_schema is not None:
        try:
            schema_obj = json.loads(data_extraction_schema)
        except (json.JSONDecodeError, TypeError) as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid data_extraction_schema JSON: {exc}",
                    "Provide schema as a valid JSON string",
                ),
            )

    with Timer() as stopwatch:
        try:
            run = await page.agent.run_task(
                prompt=prompt,
                url=url,
                data_extraction_schema=schema_obj,
                max_steps=max_steps,
                timeout=timeout_seconds,
            )
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                browser_context=browser_ctx,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Check the prompt, URL, and timeout settings"),
            )

    # Forward the run's fields as-is; a run's own failure_reason is reported
    # inside data rather than turning the tool result into an error.
    payload = {
        "run_id": run.run_id,
        "status": run.status,
        "output": run.output,
        "failure_reason": run.failure_reason,
        "recording_url": run.recording_url,
        "app_url": run.app_url,
        "sdk_equivalent": f'await page.agent.run_task(prompt="{prompt}")',
    }
    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )

View File

@@ -0,0 +1,261 @@
from __future__ import annotations
from typing import Annotated, Any
from pydantic import Field
from skyvern.schemas.runs import ProxyLocation
from ._common import BrowserContext, ErrorCode, Timer, make_error, make_result
from ._session import (
SessionState,
get_current_session,
get_skyvern,
resolve_browser,
set_current_session,
)
async def skyvern_session_create(
    timeout: Annotated[int | None, Field(description="Session timeout in minutes (5-1440)")] = 60,
    proxy_location: Annotated[str | None, Field(description="Proxy location: RESIDENTIAL, US, etc.")] = None,
    local: Annotated[bool, Field(description="Launch local browser instead of cloud")] = False,
    headless: Annotated[bool, Field(description="Run local browser in headless mode")] = False,
) -> dict[str, Any]:
    """Create a new browser session to start interacting with websites. Creates a cloud browser by default.

    Use local=true for a local Chromium instance.
    The session persists across tool calls until explicitly closed.
    """
    with Timer() as timer:
        try:
            skyvern = get_skyvern()

            if local:
                # Local browsers have no cloud session ID; timeout and
                # proxy_location are not used on this path.
                browser = await skyvern.launch_local_browser(headless=headless)
                ctx = BrowserContext(mode="local")
                set_current_session(SessionState(browser=browser, context=ctx))
                timer.mark("sdk")
                return make_result(
                    "skyvern_session_create",
                    browser_context=ctx,
                    data={"local": True, "headless": headless},
                    timing_ms=timer.timing_ms,
                )

            proxy = None
            if proxy_location:
                # Convert separately so a bad proxy name is reported as invalid
                # input instead of falling into the ValueError handler below,
                # whose hint is about a missing API key.
                try:
                    proxy = ProxyLocation(proxy_location)
                except ValueError:
                    return make_result(
                        "skyvern_session_create",
                        ok=False,
                        timing_ms=timer.timing_ms,
                        error=make_error(
                            ErrorCode.INVALID_INPUT,
                            f"Invalid proxy_location: {proxy_location}",
                            "Use a supported ProxyLocation value (e.g. RESIDENTIAL)",
                        ),
                    )

            browser = await skyvern.launch_cloud_browser(timeout=timeout, proxy_location=proxy)
            ctx = BrowserContext(mode="cloud_session", session_id=browser.browser_session_id)
            set_current_session(SessionState(browser=browser, context=ctx))
            timer.mark("sdk")
        except ValueError as e:
            return make_result(
                "skyvern_session_create",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR,
                    str(e),
                    "Cloud sessions require SKYVERN_API_KEY. Check your environment.",
                ),
            )
        except Exception as e:
            return make_result(
                "skyvern_session_create",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Failed to create browser session"),
            )

    return make_result(
        "skyvern_session_create",
        browser_context=ctx,
        data={
            "session_id": browser.browser_session_id,
            "timeout_minutes": timeout,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_session_close(
    session_id: Annotated[str | None, Field(description="Session ID to close (uses current if not specified)")] = None,
) -> dict[str, Any]:
    """Close a browser session when you're done. Frees cloud resources.

    Closes the specified session or the current active session.
    """
    tool_name = "skyvern_session_close"
    active = get_current_session()

    with Timer() as stopwatch:
        try:
            if session_id:
                # Explicit ID: close through the client, and forget the active
                # session only when it is the one that was just closed.
                client = get_skyvern()
                await client.close_browser_session(session_id)
                active_ctx = active.context
                if active_ctx and active_ctx.session_id == session_id:
                    set_current_session(SessionState())
                stopwatch.mark("sdk")
                return make_result(
                    tool_name,
                    data={"session_id": session_id, "closed": True},
                    timing_ms=stopwatch.timing_ms,
                )

            if active.browser is None:
                return make_result(
                    tool_name,
                    ok=False,
                    error=make_error(
                        ErrorCode.NO_ACTIVE_BROWSER,
                        "No active session to close",
                        "Provide a session_id or create a session first",
                    ),
                )

            # No ID given: close the active browser handle directly. The ID is
            # captured before the session state is reset.
            if active.context:
                closed_id = active.context.session_id
            else:
                closed_id = None
            await active.browser.close()
            set_current_session(SessionState())
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Failed to close session"),
            )

    return make_result(
        tool_name,
        data={"session_id": closed_id, "closed": True},
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_session_list() -> dict[str, Any]:
    """List all active browser sessions. Use to find available sessions to connect to."""
    tool_name = "skyvern_session_list"

    def _summarize(s: Any) -> dict[str, Any]:
        # A session is reported as available when it is not attached to a
        # runnable and has a browser address.
        started = s.started_at.isoformat() if s.started_at else None
        return {
            "session_id": s.browser_session_id,
            "status": s.status,
            "started_at": started,
            "timeout": s.timeout,
            "runnable_id": s.runnable_id,
            "available": s.runnable_id is None and s.browser_address is not None,
        }

    with Timer() as stopwatch:
        try:
            client = get_skyvern()
            sessions = await client.get_browser_sessions()
            stopwatch.mark("sdk")
            # Built inside the try so a malformed session object is reported
            # as a tool error rather than raised.
            session_data = [_summarize(s) for s in sessions]
        except ValueError as exc:
            return make_result(
                tool_name,
                ok=False,
                timing_ms=stopwatch.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR,
                    str(exc),
                    "Listing sessions requires SKYVERN_API_KEY",
                ),
            )
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                timing_ms=stopwatch.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Failed to list sessions"),
            )

    active = get_current_session()
    if active.context:
        current_id = active.context.session_id
    else:
        current_id = None

    payload = {
        "sessions": session_data,
        "count": len(session_data),
        "current_session_id": current_id,
    }
    return make_result(
        tool_name,
        data=payload,
        timing_ms=stopwatch.timing_ms,
    )
async def skyvern_session_get(
    session_id: Annotated[str, "Browser session ID to get details for"],
) -> dict[str, Any]:
    """Get details about a specific browser session -- status, timeout, availability."""
    with Timer() as timer:
        try:
            skyvern = get_skyvern()
            session = await skyvern.get_browser_session(session_id)
            timer.mark("sdk")
        except Exception as e:
            # Any lookup failure is reported as BROWSER_NOT_FOUND.
            return make_result(
                "skyvern_session_get",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.BROWSER_NOT_FOUND, str(e), "Check the session ID is correct"),
            )

    current = get_current_session()
    # Coerce to a real bool: a bare `context and ...` yields None when there is
    # no current context, which would leak into the result as null.
    is_current = bool(current.context and current.context.session_id == session_id)

    return make_result(
        "skyvern_session_get",
        # Browser context is attached only when the queried session is the active one.
        browser_context=BrowserContext(mode="cloud_session", session_id=session_id) if is_current else None,
        data={
            "session_id": session.browser_session_id,
            "status": session.status,
            "started_at": session.started_at.isoformat() if session.started_at else None,
            "completed_at": session.completed_at.isoformat() if session.completed_at else None,
            "timeout": session.timeout,
            "runnable_id": session.runnable_id,
            "is_current": is_current,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_session_connect(
    session_id: Annotated[str | None, Field(description="Cloud session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Connect to an existing browser -- a cloud session by ID or any browser via CDP URL.

    Use this to resume work in a previously created session or attach to an external browser.
    """
    tool_name = "skyvern_session_connect"

    # At least one way of identifying the target browser is required.
    if not (session_id or cdp_url):
        return make_result(
            tool_name,
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide session_id or cdp_url",
                "Specify which browser to connect to",
            ),
        )

    with Timer() as stopwatch:
        try:
            # Only the returned context is reported; the browser handle is not used here.
            _browser, browser_ctx = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
            stopwatch.mark("sdk")
        except Exception as exc:
            return make_result(
                tool_name,
                ok=False,
                timing_ms=stopwatch.timing_ms,
                error=make_error(
                    ErrorCode.BROWSER_NOT_FOUND, str(exc), "Check the session ID or CDP URL is valid"
                ),
            )

    return make_result(
        tool_name,
        browser_context=browser_ctx,
        data={"connected": True},
        timing_ms=stopwatch.timing_ms,
    )

View File

@@ -10,11 +10,11 @@ import psutil
import typer
import uvicorn
from dotenv import load_dotenv, set_key
from mcp.server.fastmcp import FastMCP
from rich.panel import Panel
from rich.prompt import Confirm
from skyvern.cli.console import console
from skyvern.cli.mcp_tools import mcp # Uses standalone fastmcp (v2.x)
from skyvern.cli.utils import start_services
from skyvern.client import SkyvernEnvironment
from skyvern.config import settings
@@ -27,8 +27,6 @@ from skyvern.utils.env_paths import resolve_backend_env_path, resolve_frontend_e
run_app = typer.Typer(help="Commands to run Skyvern services such as the API server or UI.")
mcp = FastMCP("Skyvern")
@mcp.tool()
async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]:
@@ -53,12 +51,9 @@ async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]:
res = await skyvern_agent.run_task(prompt=prompt, url=url, user_agent="skyvern-mcp", wait_for_completion=True)
output = res.model_dump()["output"]
# Primary: use app_url from API response (handles both task and workflow run IDs correctly)
if res.app_url:
task_url = res.app_url
else:
# Fallback when app_url is not available (e.g., older API versions)
# Determine route based on run_id prefix: 'wr_' for workflows, otherwise tasks
if res.run_id and res.run_id.startswith("wr_"):
task_url = f"{settings.SKYVERN_APP_URL.rstrip('/')}/runs/{res.run_id}/overview"
else: