From 4f1bf2576831d03779c2c5af021b07a5079c31b1 Mon Sep 17 00:00:00 2001 From: Marc Kelechava Date: Sat, 7 Feb 2026 02:33:13 -0800 Subject: [PATCH] SKY-7801/7802: MCP Foundation + Hybrid Browser Tools (selector + AI intent) (#4660) --- skyvern/cli/core/__init__.py | 43 ++ skyvern/cli/core/artifacts.py | 29 + skyvern/cli/core/client.py | 32 + skyvern/cli/core/result.py | 106 ++++ skyvern/cli/core/session_manager.py | 153 +++++ skyvern/cli/mcp_tools/__init__.py | 201 ++++++ skyvern/cli/mcp_tools/_client.py | 11 + skyvern/cli/mcp_tools/_common.py | 20 + skyvern/cli/mcp_tools/_session.py | 30 + skyvern/cli/mcp_tools/browser.py | 939 ++++++++++++++++++++++++++++ skyvern/cli/mcp_tools/session.py | 261 ++++++++ skyvern/cli/run_commands.py | 7 +- 12 files changed, 1826 insertions(+), 6 deletions(-) create mode 100644 skyvern/cli/core/__init__.py create mode 100644 skyvern/cli/core/artifacts.py create mode 100644 skyvern/cli/core/client.py create mode 100644 skyvern/cli/core/result.py create mode 100644 skyvern/cli/core/session_manager.py create mode 100644 skyvern/cli/mcp_tools/__init__.py create mode 100644 skyvern/cli/mcp_tools/_client.py create mode 100644 skyvern/cli/mcp_tools/_common.py create mode 100644 skyvern/cli/mcp_tools/_session.py create mode 100644 skyvern/cli/mcp_tools/browser.py create mode 100644 skyvern/cli/mcp_tools/session.py diff --git a/skyvern/cli/core/__init__.py b/skyvern/cli/core/__init__.py new file mode 100644 index 00000000..5139da9a --- /dev/null +++ b/skyvern/cli/core/__init__.py @@ -0,0 +1,43 @@ +"""Shared core layer for Skyvern CLI and MCP tools. + +This package provides reusable primitives that both MCP tools and CLI commands +import from, preventing logic duplication across interfaces. 
def _safe_path_component(value: str, fallback: str = "anonymous") -> str:
    """Reduce *value* to a single path component that cannot escape its parent.

    Only the final component is kept, so ``"../../etc"`` becomes ``"etc"`` and an
    absolute path cannot reset the join. Values that collapse to nothing
    (``""``, ``"."``, ``".."``, ``"/"``) map to *fallback*.
    """
    name = Path(value).name
    return name if name not in ("", ".", "..") else fallback


def get_artifact_dir(session_id: str | None = None, run_id: str | None = None) -> Path:
    """Return the directory where artifacts for a session or run are stored.

    The layout is ``~/.skyvern/artifacts/<UTC date>/<owner>`` where ``<owner>`` is
    ``session_id`` if given, else ``run_id``, else ``"anonymous"``. The directory
    is not created here.

    Args:
        session_id: Browser session identifier; takes precedence over ``run_id``.
        run_id: Run identifier used when no session id is available.

    Returns:
        The (possibly non-existent) artifact directory path.
    """
    base = Path.home() / ".skyvern" / "artifacts" / datetime.now(timezone.utc).strftime("%Y-%m-%d")
    owner = session_id or run_id
    if not owner:
        return base / "anonymous"
    # Identifiers originate from tool arguments; never let them traverse out of ``base``.
    return base / _safe_path_component(owner)


def save_artifact(
    content: bytes,
    kind: str,
    filename: str,
    mime: str,
    session_id: str | None = None,
    run_id: str | None = None,
) -> Artifact:
    """Write *content* into the artifact directory and describe it as an ``Artifact``.

    Args:
        content: Raw bytes to persist.
        kind: Artifact category recorded on the result (e.g. ``"screenshot"``).
        filename: File name inside the artifact directory; any directory part is dropped.
        mime: MIME type recorded on the result.
        session_id: Session whose directory receives the file.
        run_id: Run whose directory receives the file when no session id is given.

    Returns:
        An ``Artifact`` with the written path and byte count.

    Raises:
        ValueError: If *filename* does not contain a usable file name.
        OSError: If the directory cannot be created or the file cannot be written.
    """
    safe_name = Path(filename).name
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid artifact filename: {filename!r}")

    dir_path = get_artifact_dir(session_id, run_id)
    dir_path.mkdir(parents=True, exist_ok=True)
    file_path = dir_path / safe_name
    file_path.write_bytes(content)
    return Artifact(kind=kind, path=str(file_path), mime=mime, bytes=len(content))
# ---------------------------------------------------------------------------
# skyvern/cli/core/client.py
# ---------------------------------------------------------------------------

# Client cached per execution context; None until get_skyvern() is first called.
_skyvern_instance: ContextVar[Skyvern | None] = ContextVar("skyvern_instance", default=None)


def get_skyvern() -> Skyvern:
    """Get or create a Skyvern client instance.

    The instance stored in the ``skyvern_instance`` context variable is returned
    when present. Otherwise the API key and base URL are read from ``settings``
    (falling back to the ``SKYVERN_API_KEY`` / ``SKYVERN_BASE_URL`` environment
    variables): with an API key a cloud client is built, without one
    ``Skyvern.local()`` is used. The new instance is cached before returning.
    """
    cached = _skyvern_instance.get()
    if cached is not None:
        return cached

    api_key = settings.SKYVERN_API_KEY or os.environ.get("SKYVERN_API_KEY")
    base_url = settings.SKYVERN_BASE_URL or os.environ.get("SKYVERN_BASE_URL")

    if api_key:
        instance = Skyvern(
            api_key=api_key,
            environment=SkyvernEnvironment.CLOUD,
            base_url=base_url,
        )
    else:
        instance = Skyvern.local()

    _skyvern_instance.set(instance)
    return instance


# ---------------------------------------------------------------------------
# skyvern/cli/core/result.py
# ---------------------------------------------------------------------------


class ErrorCode:
    """String constants used as the ``code`` of structured tool errors."""

    NO_ACTIVE_BROWSER = "NO_ACTIVE_BROWSER"
    BROWSER_NOT_FOUND = "BROWSER_NOT_FOUND"
    SELECTOR_NOT_FOUND = "SELECTOR_NOT_FOUND"
    ACTION_FAILED = "ACTION_FAILED"
    AI_FALLBACK_FAILED = "AI_FALLBACK_FAILED"
    SDK_ERROR = "SDK_ERROR"
    TIMEOUT = "TIMEOUT"
    INVALID_INPUT = "INVALID_INPUT"


@dataclass
class Artifact:
    """A file produced by a tool call (for example a saved screenshot)."""

    kind: str
    path: str
    mime: str
    bytes: int
    # ISO-8601 UTC timestamp taken when the dataclass instance is created.
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def to_dict(self) -> dict[str, Any]:
        """Return the artifact as a plain dict for inclusion in a tool result."""
        return {
            "kind": self.kind,
            "path": self.path,
            "mime": self.mime,
            "bytes": self.bytes,
            "created_at": self.created_at,
        }


@dataclass
class BrowserContext:
    """Describes which browser a tool call ran against."""

    mode: str
    session_id: str | None = None
    cdp_url: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the context as a plain dict for inclusion in a tool result."""
        return {
            "mode": self.mode,
            "session_id": self.session_id,
            "cdp_url": self.cdp_url,
        }


def make_result(
    action: str,
    *,
    ok: bool = True,
    browser_context: BrowserContext | None = None,
    data: dict[str, Any] | None = None,
    artifacts: list[Artifact] | None = None,
    timing_ms: dict[str, int] | None = None,
    warnings: list[str] | None = None,
    error: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the uniform result envelope returned by every tool.

    Missing optional parts are normalised so consumers always see the same keys:
    ``browser_context`` defaults to mode ``"none"``, ``artifacts`` and
    ``warnings`` to empty lists, and ``timing_ms`` to an empty dict. ``data`` and
    ``error`` are passed through unchanged (``None`` when absent).
    """
    return {
        "ok": ok,
        "action": action,
        "browser_context": (browser_context or BrowserContext(mode="none")).to_dict(),
        "data": data,
        "artifacts": [a.to_dict() for a in (artifacts or [])],
        "timing_ms": timing_ms or {},
        "warnings": warnings or [],
        "error": error,
    }


def make_error(
    code: str,
    message: str,
    hint: str,
    details: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the structured ``error`` payload for ``make_result``.

    ``details`` defaults to an empty dict so the key is always present.
    """
    return {
        "code": code,
        "message": message,
        "hint": hint,
        "details": details or {},
    }


class Timer:
    """Context manager that records elapsed milliseconds at named marks.

    Example:
        with Timer() as timer:
            do_work()
            timer.mark("sdk")
        timer.timing_ms  # {"sdk": ..., "total": ...}
    """

    def __init__(self) -> None:
        # Start from "now" so mark() is meaningful even if the timer is never entered.
        self._start: float = time.perf_counter()
        self._marks: dict[str, int] = {}
        self._running: bool = False

    def __enter__(self) -> Timer:
        self._start = time.perf_counter()
        self._running = True
        return self

    def __exit__(self, *args: Any) -> None:
        self._marks["total"] = self._elapsed_ms()
        self._running = False

    def _elapsed_ms(self) -> int:
        """Whole milliseconds elapsed since the timer started."""
        return int((time.perf_counter() - self._start) * 1000)

    def mark(self, name: str) -> None:
        """Record the elapsed time so far under *name*."""
        self._marks[name] = self._elapsed_ms()

    @property
    def timing_ms(self) -> dict[str, int]:
        """Copy of the recorded marks.

        While the ``with`` block is still executing, a running ``"total"`` is
        included, so results built inside the block (for example on an error
        path that returns early) still report how long the call took. After
        exit, ``"total"`` is the final value stored by ``__exit__``.
        """
        marks = self._marks.copy()
        if self._running:
            marks["total"] = self._elapsed_ms()
        return marks
class BrowserNotAvailableError(Exception):
    """Raised when no browser session is available."""


@dataclass
class SessionState:
    """Per-context state of the browser session used by the MCP tools."""

    browser: SkyvernBrowser | None = None
    context: BrowserContext | None = None
    console_messages: list[dict[str, Any]] = field(default_factory=list)
    tracing_active: bool = False
    har_enabled: bool = False


# NOTE(review): a ContextVar is only visible to code running in the same (or a
# copied) context; confirm the MCP server dispatches tool calls in a way that
# lets one call observe the session set by a previous call.
_current_session: ContextVar[SessionState | None] = ContextVar("mcp_session", default=None)


def get_current_session() -> SessionState:
    """Return the current session state, creating and storing an empty one if unset."""
    state = _current_session.get()
    if state is None:
        state = SessionState()
        _current_session.set(state)
    return state


def set_current_session(state: SessionState) -> None:
    """Replace the current session state with *state*."""
    _current_session.set(state)


async def resolve_browser(
    session_id: str | None = None,
    cdp_url: str | None = None,
    local: bool = False,
    create_session: bool = False,
    timeout: int | None = None,
    headless: bool = False,
) -> tuple[SkyvernBrowser, BrowserContext]:
    """Resolve browser from parameters or current session.

    Resolution order: ``session_id`` (cloud session), ``cdp_url`` (CDP attach),
    ``local`` (launch a local browser), ``create_session`` (launch a cloud
    browser), and finally whatever session is already active.

    When ``session_id`` or ``cdp_url`` names the session that is already active,
    the existing browser is returned instead of opening another connection. Tools
    pass these identifiers on every call, and reconnecting each time would
    abandon the previous connection and discard the accumulated ``SessionState``.

    A successful connection becomes the current session. If connecting fails, any
    half-opened browser is closed and the previously active session is left in
    place.

    Note: For MCP tools, sessions are stored in ContextVar and persist across tool calls.
    Cleanup is done via explicit skyvern_session_close() call. For scripts that need
    guaranteed cleanup, use the browser_session() context manager instead.

    Raises:
        BrowserNotAvailableError: No connection parameters were given and there
            is no active session.
        Exception: Whatever the SDK raises while connecting or launching.
    """
    current = get_current_session()
    has_active = current.browser is not None and current.context is not None

    if has_active:
        assert current.browser is not None and current.context is not None  # narrowing only
        same_cloud_session = bool(session_id) and current.context.session_id == session_id
        same_cdp_endpoint = not session_id and bool(cdp_url) and current.context.cdp_url == cdp_url
        if same_cloud_session or same_cdp_endpoint:
            return current.browser, current.context

    if session_id or cdp_url or local or create_session:
        # Only build the SDK client when a connection is actually needed.
        skyvern = get_skyvern()
        browser: SkyvernBrowser | None = None
        try:
            if session_id:
                browser = await skyvern.connect_to_cloud_browser_session(session_id)
                ctx = BrowserContext(mode="cloud_session", session_id=session_id)
            elif cdp_url:
                browser = await skyvern.connect_to_browser_over_cdp(cdp_url)
                ctx = BrowserContext(mode="cdp", cdp_url=cdp_url)
            elif local:
                browser = await skyvern.launch_local_browser(headless=headless)
                ctx = BrowserContext(mode="local")
            else:
                browser = await skyvern.launch_cloud_browser(timeout=timeout)
                ctx = BrowserContext(mode="cloud_session", session_id=browser.browser_session_id)
            set_current_session(SessionState(browser=browser, context=ctx))
            return browser, ctx
        except Exception:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    pass  # Best effort: the original failure is what the caller needs to see.
            raise

    if has_active:
        assert current.browser is not None and current.context is not None  # narrowing only
        return current.browser, current.context

    raise BrowserNotAvailableError()


async def get_page(
    session_id: str | None = None,
    cdp_url: str | None = None,
) -> tuple[SkyvernBrowserPage, BrowserContext]:
    """Get the working page from the current or specified browser session.

    Raises:
        BrowserNotAvailableError: Propagated from ``resolve_browser`` when no
            session is specified and none is active.
    """
    browser, ctx = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
    page = await browser.get_working_page()
    return page, ctx


@asynccontextmanager
async def browser_session(
    session_id: str | None = None,
    cdp_url: str | None = None,
    local: bool = False,
    timeout: int | None = None,
    headless: bool = False,
) -> AsyncIterator[tuple[SkyvernBrowser, BrowserContext]]:
    """Context manager for browser sessions with guaranteed cleanup.

    Use this in scripts that need guaranteed resource cleanup on error.
    MCP tools use resolve_browser() directly since sessions persist across calls.
    When none of ``session_id``, ``cdp_url`` or ``local`` is given, a new cloud
    browser session is created.

    Example:
        async with browser_session(local=True) as (browser, ctx):
            page = await browser.get_working_page()
            await page.goto("https://example.com")
        # Browser is automatically closed on exit or exception
    """
    browser, ctx = await resolve_browser(
        session_id=session_id,
        cdp_url=cdp_url,
        local=local,
        create_session=not (session_id or cdp_url or local),
        timeout=timeout,
        headless=headless,
    )
    try:
        yield browser, ctx
    finally:
        try:
            await browser.close()
        except Exception:
            pass  # Best effort cleanup
        set_current_session(SessionState())


def no_browser_error() -> dict[str, Any]:
    """Return the structured error payload used when no browser session is available."""
    return make_error(
        ErrorCode.NO_ACTIVE_BROWSER,
        "No browser session available",
        "Create a session with skyvern_session_create, provide session_id, or cdp_url",
    )
+""" + +from fastmcp import FastMCP + +from .browser import ( + skyvern_act, + skyvern_click, + skyvern_evaluate, + skyvern_extract, + skyvern_navigate, + skyvern_press_key, + skyvern_run_task, + skyvern_screenshot, + skyvern_scroll, + skyvern_select_option, + skyvern_type, + skyvern_validate, + skyvern_wait, +) +from .session import ( + skyvern_session_close, + skyvern_session_connect, + skyvern_session_create, + skyvern_session_get, + skyvern_session_list, +) + +mcp = FastMCP( + "Skyvern", + instructions="""Use Skyvern tools whenever the task involves visiting, browsing, or interacting with ANY website or web application. + +## When to Use These Tools +Reach for Skyvern tools when the user asks you to: +- Visit, browse, or interact with ANY website or web application +- Extract data from web pages (prices, listings, articles, tables, search results, etc.) +- Fill out forms, log in, sign up, or complete web-based workflows +- Check the current state of a web page or verify something on a site +- Do anything you would otherwise attempt with requests, beautifulsoup, selenium, or playwright +- Access website data where you are unsure whether an API endpoint exists + +DO NOT try to scrape websites by guessing API endpoints or writing HTTP requests. +Instead, use skyvern_navigate + skyvern_extract to get real data from actual pages. +These tools give you a real browser — use them instead of writing scraping code. + +## Examples +| User says | Use | +|-----------|-----| +| "Go to amazon.com" | skyvern_navigate | +| "What's on this page?" | skyvern_screenshot | +| "Get all product prices" | skyvern_extract | +| "Click the login button" | skyvern_act or skyvern_click | +| "Fill out this form" | skyvern_act | +| "Log in and buy the first item" | skyvern_run_task | +| "Is checkout complete?" 
| skyvern_validate | +| "Write a script to do this" | Skyvern SDK (see below) | + +## Writing Scripts and Code +When asked to write an automation script, use the Skyvern Python SDK with the **hybrid xpath+prompt +pattern** for production-quality scripts. The hybrid form tries the xpath/selector first (fast, +deterministic) and falls back to AI if the selector breaks — this is the recommended pattern. + + from skyvern import Skyvern + skyvern = Skyvern(api_key="YOUR_API_KEY") + browser = await skyvern.launch_cloud_browser() + page = await browser.get_working_page() + await page.goto("https://example.com") + + # BEST: hybrid selector+prompt — fast deterministic selector with AI fallback + await page.click("xpath=//button[@id='submit']", prompt="the Submit button") + await page.fill("xpath=//input[@name='email']", "user@example.com", prompt="email input field") + + # OK for exploration, but prefer hybrid for production scripts: + await page.click(prompt="the Submit button") + + data = await page.extract("Get all product names and prices") + +To get xpaths for hybrid calls, use precision tools (skyvern_click, skyvern_type) during exploration. +The `resolved_selector` field in responses gives you the xpath the AI resolved to. Use it in scripts: + explore: skyvern_click(intent="Submit button") → response includes resolved_selector="xpath=//button[@id='submit']" + script: await page.click("xpath=//button[@id='submit']", prompt="Submit button") + +IMPORTANT: NEVER import from skyvern.cli.mcp_tools — those are internal server modules. +The public SDK is: from skyvern import Skyvern + +## Recommended Workflow +1. **Connect** — Create or connect to a browser session +2. **Explore** — Navigate pages, take screenshots, extract data with AI +3. **Build** — Capture selectors and data schemas to construct deterministic workflows +4. 
**Test** — Validate workflows via skyvern_run_task + +## Primary Tools (use these first) +These are the tools you should reach for by default: + +- **skyvern_act** — Execute actions from natural language: "log in with test@example.com", "add the first item to cart". Best for exploration and testing flows. +- **skyvern_extract** — Pull structured data from any page with natural language + optional JSON Schema. THE differentiator over raw Playwright. +- **skyvern_validate** — Assert page conditions with AI: "is the user logged in?", "does the cart have 3 items?" +- **skyvern_run_task** — Delegate a full multi-step task to an autonomous AI agent with observability. Use for end-to-end task execution. +- **skyvern_navigate** — Go to a URL. Always the first step after connecting. +- **skyvern_screenshot** — See what's on the page. Essential for understanding page state. +- **skyvern_evaluate** — Run JavaScript to read DOM state, get URLs, or check values. + +## Precision Tools (for debugging and exact control) +Use these when the primary tools aren't specific enough, or when you need deterministic +selector-based actions (e.g., replaying a known flow): + +- **skyvern_click** — Click a specific element by selector or AI intent +- **skyvern_type** — Type into a specific input field by selector or AI intent +- **skyvern_scroll** — Scroll the page or an element into view +- **skyvern_select_option** — Select a dropdown option by selector or AI intent +- **skyvern_press_key** — Press a keyboard key (Enter, Tab, Escape, etc.) +- **skyvern_wait** — Wait for a condition, element, or time delay + +## Tool Modes (precision tools) +Precision tools support three modes. When unsure, use `intent`. + +1. **Intent mode** — AI-powered element finding: + `skyvern_click(intent="the blue Submit button")` + +2. **Hybrid mode** — tries selector first, AI fallback: + `skyvern_click(selector="#submit-btn", intent="the Submit button")` + +3. 
**Selector mode** — deterministic CSS/XPath targeting: + `skyvern_click(selector="#submit-btn")` + +## Replay Story: From Exploration to Production Scripts +When you use precision tools (skyvern_click, skyvern_type, etc.) with intent mode, the response +includes `resolved_selector` — the xpath/CSS the AI found. Capture these to build hybrid scripts. + +**The hybrid pattern** is the recommended default for SDK scripts: + await page.click("xpath=//button[@id='submit']", prompt="the Submit button") +It tries the selector first (fast, no AI cost), then falls back to AI if the selector breaks. + +**Workflow for generating scripts:** +1. Explore: Use skyvern_click(intent="Submit button") during interactive exploration +2. Capture: Note the `resolved_selector` from the response (e.g., "//button[@id='submit']") +3. Script: Write `page.click("xpath=//button[@id='submit']", prompt="Submit button")` + +The `sdk_equivalent` field in each tool response shows the correct hybrid call to use in scripts. +Always prefer hybrid xpath+prompt over prompt-only in generated scripts. + +Note: Currently only skyvern_click returns resolved_selector. Support for skyvern_type and +skyvern_select_option is planned (SKY-7905). For those tools, use the selector you provided +as input, or fall back to prompt-only until SKY-7905 ships. + +## Getting Started +Create a session with skyvern_session_create, then use browser tools to interact with pages. 
+""", +) + +# -- Session management -- +mcp.tool()(skyvern_session_create) +mcp.tool()(skyvern_session_close) +mcp.tool()(skyvern_session_list) +mcp.tool()(skyvern_session_get) +mcp.tool()(skyvern_session_connect) + +# -- Primary tools (AI-powered exploration + observation) -- +mcp.tool()(skyvern_act) +mcp.tool()(skyvern_extract) +mcp.tool()(skyvern_validate) +mcp.tool()(skyvern_run_task) +mcp.tool()(skyvern_navigate) +mcp.tool()(skyvern_screenshot) +mcp.tool()(skyvern_evaluate) + +# -- Precision tools (selector/intent-based browser primitives) -- +mcp.tool()(skyvern_click) +mcp.tool()(skyvern_type) +mcp.tool()(skyvern_scroll) +mcp.tool()(skyvern_select_option) +mcp.tool()(skyvern_press_key) +mcp.tool()(skyvern_wait) + +__all__ = [ + "mcp", + # Session + "skyvern_session_create", + "skyvern_session_close", + "skyvern_session_list", + "skyvern_session_get", + "skyvern_session_connect", + # Primary (AI-powered) + "skyvern_act", + "skyvern_extract", + "skyvern_validate", + "skyvern_run_task", + "skyvern_navigate", + "skyvern_screenshot", + "skyvern_evaluate", + # Precision (selector/intent browser primitives) + "skyvern_click", + "skyvern_type", + "skyvern_scroll", + "skyvern_select_option", + "skyvern_press_key", + "skyvern_wait", +] diff --git a/skyvern/cli/mcp_tools/_client.py b/skyvern/cli/mcp_tools/_client.py new file mode 100644 index 00000000..f08f1e64 --- /dev/null +++ b/skyvern/cli/mcp_tools/_client.py @@ -0,0 +1,11 @@ +"""Skyvern HTTP API client accessor. + +Workflow tools import from here to get the API client without pulling in +browser/Playwright dependencies. +""" + +from __future__ import annotations + +from skyvern.cli.core.client import get_skyvern + +__all__ = ["get_skyvern"] diff --git a/skyvern/cli/mcp_tools/_common.py b/skyvern/cli/mcp_tools/_common.py new file mode 100644 index 00000000..0c7cd3ff --- /dev/null +++ b/skyvern/cli/mcp_tools/_common.py @@ -0,0 +1,20 @@ +"""Backward-compatible re-exports from skyvern.cli.core. 
+ +MCP tools import from here; the canonical implementations live in core/. +""" + +from __future__ import annotations + +from skyvern.cli.core.artifacts import get_artifact_dir, save_artifact +from skyvern.cli.core.result import Artifact, BrowserContext, ErrorCode, Timer, make_error, make_result + +__all__ = [ + "Artifact", + "BrowserContext", + "ErrorCode", + "Timer", + "get_artifact_dir", + "make_error", + "make_result", + "save_artifact", +] diff --git a/skyvern/cli/mcp_tools/_session.py b/skyvern/cli/mcp_tools/_session.py new file mode 100644 index 00000000..0fd345fe --- /dev/null +++ b/skyvern/cli/mcp_tools/_session.py @@ -0,0 +1,30 @@ +"""Backward-compatible re-exports from skyvern.cli.core. + +MCP tools import from here; the canonical implementations live in core/. +""" + +from __future__ import annotations + +from skyvern.cli.core.client import get_skyvern +from skyvern.cli.core.session_manager import ( + BrowserNotAvailableError, + SessionState, + browser_session, + get_current_session, + get_page, + no_browser_error, + resolve_browser, + set_current_session, +) + +__all__ = [ + "BrowserNotAvailableError", + "SessionState", + "browser_session", + "get_current_session", + "get_page", + "get_skyvern", + "no_browser_error", + "resolve_browser", + "set_current_session", +] diff --git a/skyvern/cli/mcp_tools/browser.py b/skyvern/cli/mcp_tools/browser.py new file mode 100644 index 00000000..e12e521e --- /dev/null +++ b/skyvern/cli/mcp_tools/browser.py @@ -0,0 +1,939 @@ +from __future__ import annotations + +import asyncio +import base64 +import json +from datetime import datetime, timezone +from typing import Annotated, Any + +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from pydantic import Field + +from ._common import ( + ErrorCode, + Timer, + make_error, + make_result, + save_artifact, +) +from ._session import BrowserNotAvailableError, get_page, no_browser_error + + +def _resolve_ai_mode( + selector: str | None, + intent: str | 
None, +) -> tuple[str | None, str | None]: + """Determine AI mode from selector/intent combination. + + Returns (ai_mode, error_code) — if error_code is set, the call should fail. + """ + if intent and not selector: + return "proactive", None + if intent and selector: + return "fallback", None + if selector and not intent: + return None, None + return None, "INVALID_INPUT" + + +async def skyvern_navigate( + url: Annotated[str, "The URL to navigate to"], + session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, + cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, + timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=120000)] = 30000, + wait_until: Annotated[str | None, Field(description="Wait condition: load, domcontentloaded, networkidle")] = None, +) -> dict[str, Any]: + """Open a website in the browser. Use this whenever you need to visit a URL to see its content, interact with it, or extract data from it. + + Returns the final URL (after redirects) and page title. 
+ """ + if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"): + return make_result( + "skyvern_navigate", + ok=False, + error=make_error( + ErrorCode.INVALID_INPUT, + f"Invalid wait_until: {wait_until}", + "Use load, domcontentloaded, networkidle, or commit", + ), + ) + + try: + page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) + except BrowserNotAvailableError: + return make_result("skyvern_navigate", ok=False, error=no_browser_error()) + + with Timer() as timer: + try: + await page.goto(url, timeout=timeout, wait_until=wait_until) + timer.mark("sdk") + final_url = page.url + title = await page.title() + except Exception as e: + return make_result( + "skyvern_navigate", + ok=False, + browser_context=ctx, + timing_ms=timer.timing_ms, + error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the URL is valid and accessible"), + ) + + return make_result( + "skyvern_navigate", + browser_context=ctx, + data={"url": final_url, "title": title, "sdk_equivalent": f'await page.goto("{url}")'}, + timing_ms=timer.timing_ms, + ) + + +async def skyvern_click( + session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, + cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, + intent: Annotated[ + str | None, Field(description="Natural language description of the element to click (uses AI)") + ] = None, + selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None, + timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000, + button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None, + click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None, +) -> dict[str, Any]: + """Click an element on the page. 
Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation. + + Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting, + or both for resilience (tries selector first, falls back to AI). + """ + if button is not None and button not in ("left", "right", "middle"): + return make_result( + "skyvern_click", + ok=False, + error=make_error(ErrorCode.INVALID_INPUT, f"Invalid button: {button}", "Use left, right, or middle"), + ) + + ai_mode, err = _resolve_ai_mode(selector, intent) + if err: + return make_result( + "skyvern_click", + ok=False, + error=make_error( + ErrorCode.INVALID_INPUT, + "Must provide intent, selector, or both", + "Use intent='describe what to click' for AI-powered clicking, or selector='#css-selector' for precise targeting", + ), + ) + + try: + page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) + except BrowserNotAvailableError: + return make_result("skyvern_click", ok=False, error=no_browser_error()) + + with Timer() as timer: + try: + kwargs: dict[str, Any] = {"timeout": timeout} + if button: + kwargs["button"] = button + if click_count is not None: + kwargs["click_count"] = click_count + + if ai_mode is not None: + resolved = await page.click(selector=selector, prompt=intent, ai=ai_mode, **kwargs) # type: ignore[arg-type] + else: + assert selector is not None + resolved = await page.click(selector=selector, **kwargs) + timer.mark("sdk") + except PlaywrightTimeoutError as e: + return make_result( + "skyvern_click", + ok=False, + browser_context=ctx, + timing_ms=timer.timing_ms, + error=make_error( + ErrorCode.SELECTOR_NOT_FOUND, + str(e), + "Verify the selector matches an element on the page, or use intent for AI-powered finding", + ), + ) + except Exception as e: + code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED + return make_result( + "skyvern_click", + ok=False, + browser_context=ctx, + timing_ms=timer.timing_ms, + 
error=make_error( + code, + str(e), + "The element may be hidden, disabled, or intercepted by another element", + ), + ) + + data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode} + if resolved and resolved != selector: + data["resolved_selector"] = resolved + # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts. + # resolved_selector already contains the "xpath=" prefix (e.g. "xpath=//button[@id='x']"), + # so pass it directly as the selector positional arg. + resolved_sel = resolved if resolved and resolved != selector else selector + if resolved_sel and intent: + data["sdk_equivalent"] = f'await page.click("{resolved_sel}", prompt="{intent}")' + elif ai_mode: + data["sdk_equivalent"] = f'await page.click(prompt="{intent}")' + elif selector: + data["sdk_equivalent"] = f'await page.click("{selector}")' + + return make_result( + "skyvern_click", + browser_context=ctx, + data=data, + timing_ms=timer.timing_ms, + ) + + +async def skyvern_type( + text: Annotated[str, "Text to type into the element"], + session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, + cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, + intent: Annotated[ + str | None, Field(description="Natural language description of the input field (uses AI)") + ] = None, + selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None, + timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000, + clear: Annotated[bool, Field(description="Clear existing content before typing")] = True, + delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None, +) -> dict[str, Any]: + """Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation. 
+ + Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting, + or both for resilience (tries selector first, falls back to AI). Clears existing content by default. + """ + ai_mode, err = _resolve_ai_mode(selector, intent) + if err: + return make_result( + "skyvern_type", + ok=False, + error=make_error( + ErrorCode.INVALID_INPUT, + "Must provide intent, selector, or both", + "Use intent='describe the input field' for AI-powered targeting, or selector='#css-selector' for precise targeting", + ), + ) + + try: + page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) + except BrowserNotAvailableError: + return make_result("skyvern_type", ok=False, error=no_browser_error()) + + with Timer() as timer: + try: + if clear: + if ai_mode is not None: + await page.fill(selector=selector, value=text, prompt=intent, ai=ai_mode, timeout=timeout) # type: ignore[arg-type] + else: + assert selector is not None + await page.fill(selector, text, timeout=timeout) + else: + kwargs: dict[str, Any] = {"timeout": timeout} + if delay is not None: + kwargs["delay"] = delay + if ai_mode is not None: + loc = page.locator(selector=selector, prompt=intent, ai=ai_mode) # type: ignore[arg-type] + await loc.type(text, **kwargs) + else: + assert selector is not None + await page.type(selector, text, **kwargs) + timer.mark("sdk") + except PlaywrightTimeoutError as e: + return make_result( + "skyvern_type", + ok=False, + browser_context=ctx, + timing_ms=timer.timing_ms, + error=make_error( + ErrorCode.SELECTOR_NOT_FOUND, + str(e), + "Verify the selector matches an editable element, or use intent for AI-powered finding", + ), + ) + except Exception as e: + code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED + return make_result( + "skyvern_type", + ok=False, + browser_context=ctx, + timing_ms=timer.timing_ms, + error=make_error( + code, + str(e), + "The element may not be editable or may be hidden", + ), + ) + + # NOTE: The SDK fill() 
async def skyvern_screenshot(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    full_page: Annotated[bool, Field(description="Capture full scrollable page")] = False,
    selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None,
    inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False,
) -> dict[str, Any]:
    """See what's currently on the page. Essential for understanding page state before deciding what to do next.

    By default saves to ~/.skyvern/artifacts/ and returns the file path.
    Set inline=true to get base64 data directly (increases token usage).
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_screenshot", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            # An element screenshot takes precedence; full_page only applies to page captures.
            if selector:
                png = await page.locator(selector).screenshot()
            else:
                png = await page.screenshot(full_page=full_page)
            timer.mark("sdk")
        except Exception as e:
            failure = make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the page or element is visible")
            return make_result(
                "skyvern_screenshot",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=failure,
            )

    if not inline:
        # Default path: persist the PNG as an artifact and hand back its location.
        stamp = datetime.now(timezone.utc).strftime("%H%M%S_%f")
        artifact = save_artifact(
            png,
            kind="screenshot",
            filename=f"screenshot_{stamp}.png",
            mime="image/png",
            session_id=ctx.session_id,
        )
        return make_result(
            "skyvern_screenshot",
            browser_context=ctx,
            data={"path": artifact.path, "sdk_equivalent": "await page.screenshot(path='screenshot.png')"},
            artifacts=[artifact],
            timing_ms=timer.timing_ms,
        )

    # Inline path: return the image itself, base64-encoded.
    payload = {
        "inline": True,
        "data": base64.b64encode(png).decode("utf-8"),
        "mime": "image/png",
        "bytes": len(png),
        "sdk_equivalent": "await page.screenshot()",
    }
    return make_result(
        "skyvern_screenshot",
        browser_context=ctx,
        data=payload,
        timing_ms=timer.timing_ms,
        warnings=["Inline mode increases token usage"],
    )
async def skyvern_scroll(
    direction: Annotated[str, Field(description="Direction: up, down, left, right")],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    amount: Annotated[int | None, Field(description="Pixels to scroll (default 500)")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of element to scroll into view (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector of scrollable element")] = None,
) -> dict[str, Any]:
    """Scroll the page or use AI to scroll a specific element into view.

    Use `intent` to scroll an AI-located element into view (with or without selector for hybrid fallback).
    Without intent, scrolls the page or a selector-targeted container by pixel amount.
    """
    # `direction` is only validated for pixel scrolling; the intent path ignores it.
    valid_directions = ("up", "down", "left", "right")
    if not intent and direction not in valid_directions:
        return make_result(
            "skyvern_scroll",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT, f"Invalid direction: {direction}", "Use up, down, left, or right"
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_scroll", ok=False, error=no_browser_error())

    def _lit(value: str | None) -> str:
        # Escaped double-quoted literal so generated snippets stay valid Python.
        return json.dumps(value, ensure_ascii=False)

    if intent:
        # With a selector the AI is only a fallback; without one it locates the element itself.
        ai_mode = "fallback" if selector else "proactive"
        with Timer() as timer:
            try:
                loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)
                await loc.scroll_into_view_if_needed()
                timer.mark("sdk")
            except Exception as e:
                code = ErrorCode.AI_FALLBACK_FAILED if ai_mode == "fallback" else ErrorCode.ACTION_FAILED
                return make_result(
                    "skyvern_scroll",
                    ok=False,
                    browser_context=ctx,
                    timing_ms=timer.timing_ms,
                    error=make_error(code, str(e), "Could not find element to scroll into view"),
                )

        if selector:
            sdk_eq = f"await page.locator({_lit(selector)}, prompt={_lit(intent)}).scroll_into_view_if_needed()"
        else:
            sdk_eq = f"await page.locator(prompt={_lit(intent)}).scroll_into_view_if_needed()"
        return make_result(
            "skyvern_scroll",
            browser_context=ctx,
            data={
                "direction": "into_view",
                "intent": intent,
                "ai_mode": ai_mode,
                "sdk_equivalent": sdk_eq,
            },
            timing_ms=timer.timing_ms,
        )

    # None and 0 both fall back to the 500px default.
    pixels = amount or 500
    direction_map = {
        "up": (0, -pixels),
        "down": (0, pixels),
        "left": (-pixels, 0),
        "right": (pixels, 0),
    }
    dx, dy = direction_map[direction]

    with Timer() as timer:
        try:
            if selector:
                # Scroll inside the targeted container rather than the window.
                await page.locator(selector).evaluate(f"el => el.scrollBy({dx}, {dy})")
            else:
                await page.evaluate(f"window.scrollBy({dx}, {dy})")
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_scroll",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Scroll action failed"),
            )

    # Report the call that actually ran: container scroll vs. window scroll.
    if selector:
        sdk_eq = f'await page.locator({_lit(selector)}).evaluate("el => el.scrollBy({dx}, {dy})")'
    else:
        sdk_eq = f'await page.evaluate("window.scrollBy({dx}, {dy})")'

    return make_result(
        "skyvern_scroll",
        browser_context=ctx,
        data={
            "direction": direction,
            "pixels": pixels,
            "sdk_equivalent": sdk_eq,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_select_option(
    value: Annotated[str, "Value to select"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None,
    selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None,
    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
    by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False,
) -> dict[str, Any]:
    """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision.

    Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting,
    or both for resilience (tries selector first, falls back to AI).
    """
    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
        return make_result(
            "skyvern_select_option",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or both",
                "Use intent='describe the dropdown' for AI-powered selection, or selector='#css-selector' for precise targeting",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_select_option", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            if ai_mode is not None:
                # AI paths: pass value= directly -- the AI interprets the text
                # regardless of whether it represents a value or label.
                await page.select_option(selector=selector, value=value, prompt=intent, ai=ai_mode, timeout=timeout)  # type: ignore[arg-type]
            else:
                assert selector is not None
                if by_label:
                    # Bypass SkyvernPage to avoid value="" coercion conflicting with label kwarg.
                    await page.page.locator(selector).select_option(label=value, timeout=timeout)
                else:
                    await page.select_option(selector, value=value, timeout=timeout)
            timer.mark("sdk")
        except Exception as e:
            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
            return make_result(
                "skyvern_select_option",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(code, str(e), "Check selector and available options"),
            )

    def _lit(text: str | None) -> str:
        # Escaped double-quoted literal so generated snippets stay valid Python.
        return json.dumps(text, ensure_ascii=False)

    # NOTE: The SDK select_option() returns the selected value, not a resolved
    # selector. Unlike click(), we cannot return resolved_selector here.
    # SKY-7905 will update the SDK to return element metadata from all action methods.
    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode, "value": value}
    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts
    if selector and intent:
        data["sdk_equivalent"] = (
            f"await page.select_option({_lit(selector)}, value={_lit(value)}, prompt={_lit(intent)})"
        )
    elif ai_mode:
        data["sdk_equivalent"] = f"await page.select_option(prompt={_lit(intent)}, value={_lit(value)})"
    elif selector:
        if by_label:
            # Mirror the label-based call made above instead of claiming value=.
            data["sdk_equivalent"] = f"await page.page.locator({_lit(selector)}).select_option(label={_lit(value)})"
        else:
            data["sdk_equivalent"] = f"await page.select_option({_lit(selector)}, value={_lit(value)})"
    return make_result(
        "skyvern_select_option",
        browser_context=ctx,
        data=data,
        timing_ms=timer.timing_ms,
    )
async def skyvern_press_key(
    key: Annotated[str, "Key to press (e.g., Enter, Tab, Escape, ArrowDown)"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
        str | None, Field(description="Natural language description of element to focus first (uses AI)")
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector to focus first")] = None,
) -> dict[str, Any]:
    """Press a keyboard key -- Enter, Tab, Escape, arrow keys, shortcuts, etc.

    Use `intent` or `selector` to focus a specific element before pressing.
    Without either, presses the key on the currently focused element.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_press_key", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            if intent or selector:
                # Target a specific element; the resolver's error slot is irrelevant
                # here because at least one of intent/selector is set.
                ai_mode, _ = _resolve_ai_mode(selector, intent)
                if ai_mode is not None:
                    loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)  # type: ignore[arg-type]
                else:
                    assert selector is not None
                    loc = page.locator(selector)
                await loc.press(key)
            else:
                # No target: send the key to whatever currently has focus.
                await page.keyboard.press(key)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_press_key",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check key name is valid"),
            )

    def _lit(text: str | None) -> str:
        # Escaped double-quoted literal so generated snippets stay valid Python.
        return json.dumps(text, ensure_ascii=False)

    if selector and intent:
        sdk_eq = f"await page.locator({_lit(selector)}, prompt={_lit(intent)}).press({_lit(key)})"
    elif intent:
        sdk_eq = f"await page.locator(prompt={_lit(intent)}).press({_lit(key)})"
    elif selector:
        sdk_eq = f"await page.locator({_lit(selector)}).press({_lit(key)})"
    else:
        sdk_eq = f"await page.keyboard.press({_lit(key)})"

    return make_result(
        "skyvern_press_key",
        browser_context=ctx,
        data={
            "key": key,
            "selector": selector,
            "intent": intent,
            "sdk_equivalent": sdk_eq,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_wait(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    time_ms: Annotated[int | None, Field(description="Time to wait in milliseconds")] = None,
    intent: Annotated[str | None, Field(description="Natural language condition to wait for (uses AI polling)")] = None,
    selector: Annotated[str | None, Field(description="CSS selector to wait for")] = None,
    state: Annotated[str | None, Field(description="Element state: visible, hidden, attached, detached")] = "visible",
    timeout: Annotated[int, Field(description="Max wait time in milliseconds", ge=1000, le=120000)] = 30000,
    poll_interval_ms: Annotated[
        int, Field(description="Polling interval for intent-based waits in ms", ge=500, le=10000)
    ] = 5000,
) -> dict[str, Any]:
    """Wait for a condition, element, or time delay before proceeding. Use intent for AI-powered condition checking.

    Use `intent` to poll with AI validation (e.g., "wait until the loading spinner disappears").
    Use `selector` to wait for an element state. Use `time_ms` for a simple delay.
    """
    # Precedence when several are given: time_ms, then intent, then selector.
    valid_states = ("visible", "hidden", "attached", "detached")
    if state is not None and state not in valid_states:
        return make_result(
            "skyvern_wait",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                f"Invalid state: {state}",
                "Use visible, hidden, attached, or detached",
            ),
        )

    if time_ms is None and not selector and not intent:
        return make_result(
            "skyvern_wait",
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                "Must provide intent, selector, or time_ms",
                "Use intent='condition to wait for' for AI-powered waiting, selector='#element' for element visibility, or time_ms=5000 for a delay",
            ),
        )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_wait", ok=False, error=no_browser_error())

    # Always bound, even though the validation above guarantees one branch runs.
    waited_for = ""
    with Timer() as timer:
        try:
            if time_ms is not None:
                await page.wait_for_timeout(time_ms)
                waited_for = "time"
            elif intent:
                loop = asyncio.get_running_loop()
                deadline = loop.time() + timeout / 1000
                last_error: Exception | None = None
                while True:
                    try:
                        result = await page.validate(intent)
                        last_error = None
                    except Exception as poll_err:
                        # A failed check counts as "not yet"; remember why for the final report.
                        result = False
                        last_error = poll_err
                    if result:
                        break
                    remaining_ms = (deadline - loop.time()) * 1000
                    if remaining_ms <= 0:
                        code = ErrorCode.SDK_ERROR if last_error else ErrorCode.TIMEOUT
                        msg = str(last_error) if last_error else f"Condition not met within {timeout}ms: {intent}"
                        return make_result(
                            "skyvern_wait",
                            ok=False,
                            browser_context=ctx,
                            timing_ms=timer.timing_ms,
                            error=make_error(
                                code,
                                msg,
                                "Increase timeout or check that the condition can be satisfied",
                            ),
                        )
                    # Never sleep past the deadline: otherwise the wait can overshoot
                    # `timeout` by up to a full poll interval.
                    await page.wait_for_timeout(min(poll_interval_ms, remaining_ms))
                waited_for = "intent"
            elif selector:
                await page.wait_for_selector(selector, state=state, timeout=timeout)
                waited_for = "selector"
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_wait",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.TIMEOUT, str(e), "Condition was not met within timeout"),
            )

    def _lit(text: str | None) -> str:
        # Escaped double-quoted literal so generated snippets stay valid Python.
        return json.dumps(text, ensure_ascii=False)

    sdk_eq = ""
    if waited_for == "time":
        sdk_eq = f"await page.wait_for_timeout({time_ms})"
    elif waited_for == "intent":
        sdk_eq = f"await page.validate({_lit(intent)})"
    elif waited_for == "selector":
        # Only spell out `state` when it differs from the "visible" default.
        state_arg = f", state={_lit(state)}" if state not in (None, "visible") else ""
        sdk_eq = f"await page.wait_for_selector({_lit(selector)}{state_arg})"
    return make_result(
        "skyvern_wait",
        browser_context=ctx,
        data={"waited_for": waited_for, "sdk_equivalent": sdk_eq},
        timing_ms=timer.timing_ms,
    )
async def skyvern_evaluate(
    expression: Annotated[str, "JavaScript expression to evaluate"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Run JavaScript on the page to read DOM state, get URLs, check values, or discover CSS selectors for faster subsequent actions.

    Security: This executes arbitrary JS in the page context. Only use with trusted expressions.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_evaluate", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            result = await page.evaluate(expression)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_evaluate",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check JavaScript syntax"),
            )

    # The snippet shows only the first 80 characters of the expression. It is
    # rendered as an escaped literal because JS routinely contains double quotes
    # and newlines that would otherwise break the generated Python.
    preview = json.dumps(expression[:80], ensure_ascii=False)
    return make_result(
        "skyvern_evaluate",
        browser_context=ctx,
        data={"result": result, "sdk_equivalent": f"await page.evaluate({preview})"},
        timing_ms=timer.timing_ms,
    )
async def skyvern_extract(
    prompt: Annotated[str, "Natural language description of what data to extract from the page"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    schema: Annotated[
        str | None, Field(description="JSON Schema string defining the expected output structure")
    ] = None,
) -> dict[str, Any]:
    """Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language.

    Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
    """
    # Validate the schema before touching the browser so bad input fails fast.
    parsed_schema: dict[str, Any] | None = None
    if schema is not None:
        try:
            decoded = json.loads(schema)
        except (json.JSONDecodeError, TypeError) as e:
            return make_result(
                "skyvern_extract",
                ok=False,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid JSON schema: {e}",
                    "Provide schema as a valid JSON string",
                ),
            )
        # Valid JSON is not enough: "[]" or "3" parse fine but are not a schema object.
        if not isinstance(decoded, dict):
            return make_result(
                "skyvern_extract",
                ok=False,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid JSON schema: expected a JSON object, got {type(decoded).__name__}",
                    "Provide schema as a valid JSON string",
                ),
            )
        parsed_schema = decoded

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_extract", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            extracted = await page.extract(prompt=prompt, schema=parsed_schema)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_extract",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check that the page has loaded and the prompt is clear"),
            )

    # Escape the prompt so quotes/newlines cannot break the generated snippet, and
    # include the schema when one was used (repr of decoded JSON is a Python literal).
    sdk_eq = f"await page.extract(prompt={json.dumps(prompt, ensure_ascii=False)}"
    if parsed_schema is not None:
        sdk_eq += f", schema={parsed_schema!r}"
    sdk_eq += ")"

    return make_result(
        "skyvern_extract",
        browser_context=ctx,
        data={"extracted": extracted, "sdk_equivalent": sdk_eq},
        timing_ms=timer.timing_ms,
    )
async def skyvern_validate(
    prompt: Annotated[str, "Validation condition to check (e.g., 'the login form is visible')"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'

    Returns whether the described condition is true or false.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_validate", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            valid = await page.validate(prompt)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_validate",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check that the page has loaded and the prompt is clear"),
            )

    # Render the prompt as an escaped literal: prompts often contain quotes
    # (see the docstring examples), which would otherwise break the snippet.
    prompt_literal = json.dumps(prompt, ensure_ascii=False)
    return make_result(
        "skyvern_validate",
        browser_context=ctx,
        data={"prompt": prompt, "valid": valid, "sdk_equivalent": f"await page.validate({prompt_literal})"},
        timing_ms=timer.timing_ms,
    )
async def skyvern_act(
    prompt: Annotated[str, "Natural language instruction for the action to perform (e.g., 'close the cookie banner')"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task.

    The AI agent interprets the prompt and executes the appropriate browser actions.
    For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_act", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            await page.act(prompt)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_act",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Simplify the prompt or break the task into steps"),
            )

    # Escape the prompt so embedded quotes or newlines cannot break the snippet.
    prompt_literal = json.dumps(prompt, ensure_ascii=False)
    return make_result(
        "skyvern_act",
        browser_context=ctx,
        data={"prompt": prompt, "completed": True, "sdk_equivalent": f"await page.act({prompt_literal})"},
        timing_ms=timer.timing_ms,
    )
async def skyvern_run_task(
    prompt: Annotated[str, "Natural language description of the task to automate"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    url: Annotated[
        str | None, Field(description="URL to navigate to before running (uses current page if omitted)")
    ] = None,
    data_extraction_schema: Annotated[
        str | None, Field(description="JSON Schema string defining what data to extract")
    ] = None,
    max_steps: Annotated[int | None, Field(description="Maximum number of agent steps")] = None,
    timeout_seconds: Annotated[
        int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800)
    ] = 180,
) -> dict[str, Any]:
    """Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end.

    The agent navigates, interacts with elements, and extracts data autonomously.
    For simple single-step actions, use skyvern_act instead.
    """
    # Validate the schema before acquiring a browser (same order as skyvern_extract)
    # so malformed input fails fast.
    parsed_schema: dict[str, Any] | str | None = None
    if data_extraction_schema is not None:
        try:
            parsed_schema = json.loads(data_extraction_schema)
        except (json.JSONDecodeError, TypeError) as e:
            return make_result(
                "skyvern_run_task",
                ok=False,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid data_extraction_schema JSON: {e}",
                    "Provide schema as a valid JSON string",
                ),
            )

    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
    except BrowserNotAvailableError:
        return make_result("skyvern_run_task", ok=False, error=no_browser_error())

    with Timer() as timer:
        try:
            response = await page.agent.run_task(
                prompt=prompt,
                url=url,
                data_extraction_schema=parsed_schema,
                max_steps=max_steps,
                timeout=timeout_seconds,
            )
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_run_task",
                ok=False,
                browser_context=ctx,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check the prompt, URL, and timeout settings"),
            )

    # Escaped literals keep the snippet valid Python; include url when one was
    # supplied, since the task starts from a different page without it.
    sdk_eq = f"await page.agent.run_task(prompt={json.dumps(prompt, ensure_ascii=False)}"
    if url:
        sdk_eq += f", url={json.dumps(url, ensure_ascii=False)}"
    sdk_eq += ")"

    return make_result(
        "skyvern_run_task",
        browser_context=ctx,
        data={
            "run_id": response.run_id,
            "status": response.status,
            "output": response.output,
            "failure_reason": response.failure_reason,
            "recording_url": response.recording_url,
            "app_url": response.app_url,
            "sdk_equivalent": sdk_eq,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_session_create(
    timeout: Annotated[int | None, Field(description="Session timeout in minutes (5-1440)")] = 60,
    proxy_location: Annotated[str | None, Field(description="Proxy location: RESIDENTIAL, US, etc.")] = None,
    local: Annotated[bool, Field(description="Launch local browser instead of cloud")] = False,
    headless: Annotated[bool, Field(description="Run local browser in headless mode")] = False,
) -> dict[str, Any]:
    """Create a new browser session to start interacting with websites. Creates a cloud browser by default.

    Use local=true for a local Chromium instance.
    The session persists across tool calls until explicitly closed.
    """
    # Parse proxy_location up front (cloud sessions only; the local path never
    # uses it). Otherwise a bad value raises ValueError inside the block below
    # and gets reported with the unrelated "requires SKYVERN_API_KEY" hint.
    # NOTE(review): assumes ProxyLocation(...) raises ValueError for unknown values -- confirm.
    proxy = None
    if not local and proxy_location:
        try:
            proxy = ProxyLocation(proxy_location)
        except ValueError:
            return make_result(
                "skyvern_session_create",
                ok=False,
                error=make_error(
                    ErrorCode.INVALID_INPUT,
                    f"Invalid proxy_location: {proxy_location}",
                    "Use a supported proxy location such as RESIDENTIAL or US",
                ),
            )

    with Timer() as timer:
        try:
            skyvern = get_skyvern()

            if local:
                browser = await skyvern.launch_local_browser(headless=headless)
                ctx = BrowserContext(mode="local")
                # Remember the session so later tool calls can reuse it.
                set_current_session(SessionState(browser=browser, context=ctx))
                timer.mark("sdk")
                return make_result(
                    "skyvern_session_create",
                    browser_context=ctx,
                    data={"local": True, "headless": headless},
                    timing_ms=timer.timing_ms,
                )

            browser = await skyvern.launch_cloud_browser(timeout=timeout, proxy_location=proxy)
            ctx = BrowserContext(mode="cloud_session", session_id=browser.browser_session_id)
            set_current_session(SessionState(browser=browser, context=ctx))
            timer.mark("sdk")

        except ValueError as e:
            return make_result(
                "skyvern_session_create",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR,
                    str(e),
                    "Cloud sessions require SKYVERN_API_KEY. Check your environment.",
                ),
            )
        except Exception as e:
            return make_result(
                "skyvern_session_create",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Failed to create browser session"),
            )

    return make_result(
        "skyvern_session_create",
        browser_context=ctx,
        data={
            "session_id": browser.browser_session_id,
            "timeout_minutes": timeout,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_session_close(
    session_id: Annotated[str | None, Field(description="Session ID to close (uses current if not specified)")] = None,
) -> dict[str, Any]:
    """Close a browser session when you're done. Frees cloud resources.

    Closes the specified session or the current active session.
    """
    active = get_current_session()

    with Timer() as timer:
        try:
            if not session_id:
                # No explicit ID: close whatever session is currently tracked.
                if active.browser is None:
                    return make_result(
                        "skyvern_session_close",
                        ok=False,
                        error=make_error(
                            ErrorCode.NO_ACTIVE_BROWSER,
                            "No active session to close",
                            "Provide a session_id or create a session first",
                        ),
                    )

                closed_id = None
                if active.context:
                    closed_id = active.context.session_id
                await active.browser.close()
                set_current_session(SessionState())
                timer.mark("sdk")
            else:
                # Explicit ID: close through the API client, and forget the
                # tracked session only if it is the one being closed.
                client = get_skyvern()
                await client.close_browser_session(session_id)
                if active.context and active.context.session_id == session_id:
                    set_current_session(SessionState())
                timer.mark("sdk")
                return make_result(
                    "skyvern_session_close",
                    data={"session_id": session_id, "closed": True},
                    timing_ms=timer.timing_ms,
                )

        except Exception as e:
            failure = make_error(ErrorCode.SDK_ERROR, str(e), "Failed to close session")
            return make_result(
                "skyvern_session_close",
                ok=False,
                timing_ms=timer.timing_ms,
                error=failure,
            )

    return make_result(
        "skyvern_session_close",
        data={"session_id": closed_id, "closed": True},
        timing_ms=timer.timing_ms,
    )


async def skyvern_session_list() -> dict[str, Any]:
    """List all active browser sessions. Use to find available sessions to connect to."""
    with Timer() as timer:
        try:
            client = get_skyvern()
            sessions = await client.get_browser_sessions()
            timer.mark("sdk")

            # Flatten each session into a JSON-friendly summary row.
            session_data = []
            for entry in sessions:
                started = entry.started_at.isoformat() if entry.started_at else None
                # "available" = not bound to a run and reachable via a browser address.
                is_free = entry.runnable_id is None and entry.browser_address is not None
                session_data.append(
                    {
                        "session_id": entry.browser_session_id,
                        "status": entry.status,
                        "started_at": started,
                        "timeout": entry.timeout,
                        "runnable_id": entry.runnable_id,
                        "available": is_free,
                    }
                )

        except ValueError as e:
            return make_result(
                "skyvern_session_list",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(
                    ErrorCode.SDK_ERROR,
                    str(e),
                    "Listing sessions requires SKYVERN_API_KEY",
                ),
            )
        except Exception as e:
            return make_result(
                "skyvern_session_list",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.SDK_ERROR, str(e), "Failed to list sessions"),
            )

    active = get_current_session()
    current_id = None
    if active.context:
        current_id = active.context.session_id

    summary = {
        "sessions": session_data,
        "count": len(session_data),
        "current_session_id": current_id,
    }
    return make_result("skyvern_session_list", data=summary, timing_ms=timer.timing_ms)
async def skyvern_session_get(
    session_id: Annotated[str, "Browser session ID to get details for"],
) -> dict[str, Any]:
    """Get details about a specific browser session -- status, timeout, availability."""
    with Timer() as timer:
        try:
            skyvern = get_skyvern()
            session = await skyvern.get_browser_session(session_id)
            timer.mark("sdk")
        except Exception as e:
            return make_result(
                "skyvern_session_get",
                ok=False,
                timing_ms=timer.timing_ms,
                error=make_error(ErrorCode.BROWSER_NOT_FOUND, str(e), "Check the session ID is correct"),
            )

    current = get_current_session()
    # Explicit boolean: `context and ...` would put None (not False) into the
    # payload whenever no session is currently tracked.
    is_current = current.context is not None and current.context.session_id == session_id

    return make_result(
        "skyvern_session_get",
        # Only attach a browser context when the queried session is the active one.
        browser_context=BrowserContext(mode="cloud_session", session_id=session_id) if is_current else None,
        data={
            "session_id": session.browser_session_id,
            "status": session.status,
            "started_at": session.started_at.isoformat() if session.started_at else None,
            "completed_at": session.completed_at.isoformat() if session.completed_at else None,
            "timeout": session.timeout,
            "runnable_id": session.runnable_id,
            "is_current": is_current,
        },
        timing_ms=timer.timing_ms,
    )
async def skyvern_session_connect(
    session_id: Annotated[str | None, Field(description="Cloud session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
    """Connect to an existing browser -- a cloud session by ID or any browser via CDP URL.

    Use this to resume work in a previously created session or attach to an external browser.
    """
    # At least one way of identifying the target browser is required.
    if not (session_id or cdp_url):
        missing_target = make_error(
            ErrorCode.INVALID_INPUT,
            "Must provide session_id or cdp_url",
            "Specify which browser to connect to",
        )
        return make_result("skyvern_session_connect", ok=False, error=missing_target)

    with Timer() as timer:
        try:
            # Only the resolved context is reported; the browser handle is not used here.
            _browser, ctx = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
            timer.mark("sdk")
        except Exception as e:
            not_found = make_error(ErrorCode.BROWSER_NOT_FOUND, str(e), "Check the session ID or CDP URL is valid")
            return make_result(
                "skyvern_session_connect",
                ok=False,
                timing_ms=timer.timing_ms,
                error=not_found,
            )

    return make_result(
        "skyvern_session_connect",
        browser_context=ctx,
        data={"connected": True},
        timing_ms=timer.timing_ms,
    )
+from skyvern.cli.mcp_tools import mcp # Uses standalone fastmcp (v2.x) from skyvern.cli.utils import start_services from skyvern.client import SkyvernEnvironment from skyvern.config import settings @@ -27,8 +27,6 @@ from skyvern.utils.env_paths import resolve_backend_env_path, resolve_frontend_e run_app = typer.Typer(help="Commands to run Skyvern services such as the API server or UI.") -mcp = FastMCP("Skyvern") - @mcp.tool() async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]: @@ -53,12 +51,9 @@ async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]: res = await skyvern_agent.run_task(prompt=prompt, url=url, user_agent="skyvern-mcp", wait_for_completion=True) output = res.model_dump()["output"] - # Primary: use app_url from API response (handles both task and workflow run IDs correctly) if res.app_url: task_url = res.app_url else: - # Fallback when app_url is not available (e.g., older API versions) - # Determine route based on run_id prefix: 'wr_' for workflows, otherwise tasks if res.run_id and res.run_id.startswith("wr_"): task_url = f"{settings.SKYVERN_APP_URL.rstrip('/')}/runs/{res.run_id}/overview" else: