SKY-7801/7802: MCP Foundation + Hybrid Browser Tools (selector + AI intent) (#4660)

2026-02-07 02:33:13 -08:00
parent cb7225c6e6
commit 4f1bf25768
12 changed files with 1826 additions and 6 deletions
--- a/skyvern/cli/core/__init__.py
+++ b/skyvern/cli/core/__init__.py
@@ -0,0 +1,43 @@
+"""Shared core layer for Skyvern CLI and MCP tools.
+
+This package provides reusable primitives that both MCP tools and CLI commands
+import from, preventing logic duplication across interfaces.
+"""
+
+from .artifacts import get_artifact_dir, save_artifact
+from .client import get_skyvern
+from .result import Artifact, BrowserContext, ErrorCode, Timer, make_error, make_result
+from .session_manager import (
+    BrowserNotAvailableError,
+    SessionState,
+    browser_session,
+    get_current_session,
+    get_page,
+    no_browser_error,
+    resolve_browser,
+    set_current_session,
+)
+
+# Public names re-exported by this package. Entries are grouped by the
+# submodule that defines them, mirroring the imports above.
+__all__ = [
+    # client.py
+    "get_skyvern",
+    # result.py
+    "Artifact",
+    "BrowserContext",
+    "ErrorCode",
+    "Timer",
+    "make_error",
+    "make_result",
+    # artifacts.py
+    "get_artifact_dir",
+    "save_artifact",
+    # session_manager.py
+    "BrowserNotAvailableError",
+    "SessionState",
+    "browser_session",
+    "get_current_session",
+    "get_page",
+    "no_browser_error",
+    "resolve_browser",
+    "set_current_session",
+]
--- a/skyvern/cli/core/artifacts.py
+++ b/skyvern/cli/core/artifacts.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from pathlib import Path
+
+from .result import Artifact
+
+
+def get_artifact_dir(session_id: str | None = None, run_id: str | None = None) -> Path:
+    """Return the directory in which artifacts for a session or run are stored.
+
+    The result is ``~/.skyvern/artifacts/<UTC date>/<owner>`` where ``<owner>``
+    is ``session_id`` when truthy, else ``run_id`` when truthy, else the literal
+    ``"anonymous"``. The directory is only computed here, not created.
+    """
+    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    owner = session_id or run_id or "anonymous"
+    return Path.home().joinpath(".skyvern", "artifacts", today, owner)
+
+
+def save_artifact(
+    content: bytes,
+    kind: str,
+    filename: str,
+    mime: str,
+    session_id: str | None = None,
+) -> Artifact:
+    """Write ``content`` into the session's artifact directory and describe the file.
+
+    The directory comes from ``get_artifact_dir(session_id)`` and is created
+    (with parents) if missing. An existing file with the same name is overwritten.
+
+    Returns:
+        An ``Artifact`` carrying ``kind``, the written path as a string, ``mime``
+        and the content length in bytes.
+    """
+    out_dir = get_artifact_dir(session_id)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    destination = out_dir.joinpath(filename)
+    destination.write_bytes(content)
+
+    size = len(content)
+    return Artifact(kind=kind, path=str(destination), mime=mime, bytes=size)
--- a/skyvern/cli/core/client.py
+++ b/skyvern/cli/core/client.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import os
+from contextvars import ContextVar
+
+from skyvern.client import SkyvernEnvironment
+from skyvern.config import settings
+from skyvern.library.skyvern import Skyvern
+
+# Client cached per execution context; None until get_skyvern() builds one.
+_skyvern_instance: ContextVar[Skyvern | None] = ContextVar("skyvern_instance", default=None)
+
+
+def get_skyvern() -> Skyvern:
+    """Return the Skyvern client for the current context, building it on first use.
+
+    The API key and base URL are taken from ``settings`` first and from the
+    ``SKYVERN_API_KEY`` / ``SKYVERN_BASE_URL`` environment variables otherwise.
+    With an API key a cloud client is created; without one ``Skyvern.local()``
+    is used. The chosen client is stored in the context variable and returned
+    by later calls made in the same context.
+    """
+    cached = _skyvern_instance.get()
+    if cached is not None:
+        return cached
+
+    api_key = settings.SKYVERN_API_KEY or os.environ.get("SKYVERN_API_KEY")
+    base_url = settings.SKYVERN_BASE_URL or os.environ.get("SKYVERN_BASE_URL")
+
+    client = (
+        Skyvern(api_key=api_key, environment=SkyvernEnvironment.CLOUD, base_url=base_url)
+        if api_key
+        else Skyvern.local()
+    )
+
+    _skyvern_instance.set(client)
+    return client
--- a/skyvern/cli/core/result.py
+++ b/skyvern/cli/core/result.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+
+class ErrorCode:
+    """String constants used as the ``code`` value of error payloads.
+
+    Each value is a plain string equal to its attribute name, so it can be passed
+    directly as the ``code`` argument of ``make_error``.
+    """
+
+    NO_ACTIVE_BROWSER = "NO_ACTIVE_BROWSER"
+    BROWSER_NOT_FOUND = "BROWSER_NOT_FOUND"
+    SELECTOR_NOT_FOUND = "SELECTOR_NOT_FOUND"
+    ACTION_FAILED = "ACTION_FAILED"
+    AI_FALLBACK_FAILED = "AI_FALLBACK_FAILED"
+    SDK_ERROR = "SDK_ERROR"
+    TIMEOUT = "TIMEOUT"
+    INVALID_INPUT = "INVALID_INPUT"
+
+
+@dataclass
+class Artifact:
+    """Metadata describing a file produced by a tool call."""
+
+    kind: str
+    path: str
+    mime: str
+    bytes: int  # size of the stored content; save_artifact passes len(content)
+    # ISO-8601 UTC timestamp taken when the instance is created.
+    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the fields as a plain dict, keyed in declaration order."""
+        names = ("kind", "path", "mime", "bytes", "created_at")
+        return {name: getattr(self, name) for name in names}
+
+
+@dataclass
+class BrowserContext:
+    """Describes how the browser behind a result was obtained.
+
+    ``mode`` is a free-form string; the values used in this package are
+    ``"none"``, ``"cloud_session"``, ``"cdp"`` and ``"local"``.
+    """
+
+    mode: str
+    session_id: str | None = None
+    cdp_url: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return ``mode``, ``session_id`` and ``cdp_url`` as a plain dict."""
+        return dict(mode=self.mode, session_id=self.session_id, cdp_url=self.cdp_url)
+
+
+def make_result(
+    action: str,
+    *,
+    ok: bool = True,
+    browser_context: BrowserContext | None = None,
+    data: dict[str, Any] | None = None,
+    artifacts: list[Artifact] | None = None,
+    timing_ms: dict[str, int] | None = None,
+    warnings: list[str] | None = None,
+    error: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Build the result envelope returned by every tool.
+
+    Missing optional parts are normalised so the envelope always has the same
+    keys: a ``BrowserContext(mode="none")`` stands in for a missing context,
+    ``artifacts`` and ``warnings`` default to empty lists and ``timing_ms`` to an
+    empty dict. ``data`` and ``error`` are passed through unchanged (possibly None).
+    """
+    context = browser_context if browser_context else BrowserContext(mode="none")
+    serialized_artifacts = [item.to_dict() for item in artifacts] if artifacts else []
+    timings = timing_ms if timing_ms else {}
+    notes = warnings if warnings else []
+
+    return {
+        "ok": ok,
+        "action": action,
+        "browser_context": context.to_dict(),
+        "data": data,
+        "artifacts": serialized_artifacts,
+        "timing_ms": timings,
+        "warnings": notes,
+        "error": error,
+    }
+
+
+def make_error(
+    code: str,
+    message: str,
+    hint: str,
+    details: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Build the ``error`` payload of a result envelope.
+
+    ``details`` is replaced by an empty dict when it is None or empty, so the
+    returned dict always has the keys ``code``, ``message``, ``hint`` and ``details``.
+    """
+    extra = details if details else {}
+    return dict(code=code, message=message, hint=hint, details=extra)
+
+
+class Timer:
+    """Context manager that records elapsed milliseconds at named points.
+
+    Entering starts the clock; ``mark(name)`` stores the whole milliseconds
+    elapsed since then under ``name``; leaving the ``with`` block stores the
+    final elapsed time under ``"total"``.
+    """
+
+    def __init__(self) -> None:
+        self._start: float = 0
+        self._marks: dict[str, int] = {}
+
+    def __enter__(self) -> Timer:
+        self._start = time.perf_counter()
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        # Returns None, so an exception raised inside the block still propagates.
+        self._marks["total"] = self._elapsed_ms()
+
+    def _elapsed_ms(self) -> int:
+        """Whole milliseconds since the timer was started."""
+        seconds = time.perf_counter() - self._start
+        return int(seconds * 1000)
+
+    def mark(self, name: str) -> None:
+        """Record the time elapsed so far under ``name`` (overwriting a previous mark)."""
+        self._marks[name] = self._elapsed_ms()
+
+    @property
+    def timing_ms(self) -> dict[str, int]:
+        """A snapshot copy of the marks recorded so far."""
+        return dict(self._marks)
--- a/skyvern/cli/core/session_manager.py
+++ b/skyvern/cli/core/session_manager.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+from contextvars import ContextVar
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, AsyncIterator
+
+from .client import get_skyvern
+from .result import BrowserContext, ErrorCode, make_error
+
+if TYPE_CHECKING:
+    from skyvern.library.skyvern_browser import SkyvernBrowser
+    from skyvern.library.skyvern_browser_page import SkyvernBrowserPage
+
+
+@dataclass
+class SessionState:
+    """Mutable state tracked for one browser session.
+
+    A default-constructed instance means "no browser attached".
+    """
+
+    # Connected browser handle, or None when no browser is attached.
+    browser: SkyvernBrowser | None = None
+    # How the browser was obtained (mode, session id, CDP URL); None when unset.
+    context: BrowserContext | None = None
+    # NOTE(review): presumably captured browser console entries; nothing in this module writes to it.
+    console_messages: list[dict[str, Any]] = field(default_factory=list)
+    # NOTE(review): the two flags below look like tracing / HAR-recording toggles;
+    # they are neither read nor written in this module, so confirm against callers.
+    tracing_active: bool = False
+    har_enabled: bool = False
+
+
+# Session bound to the current execution context; None until first requested.
+_current_session: ContextVar[SessionState | None] = ContextVar("mcp_session", default=None)
+
+
+def get_current_session() -> SessionState:
+    """Return the session bound to the current context, creating an empty one on first use."""
+    existing = _current_session.get()
+    if existing is not None:
+        return existing
+
+    fresh = SessionState()
+    _current_session.set(fresh)
+    return fresh
+
+
+def set_current_session(state: SessionState) -> None:
+    """Bind ``state`` as the session for the current context, replacing any previous one."""
+    _current_session.set(state)
+
+
+async def resolve_browser(
+    session_id: str | None = None,
+    cdp_url: str | None = None,
+    local: bool = False,
+    create_session: bool = False,
+    timeout: int | None = None,
+    headless: bool = False,
+) -> tuple[SkyvernBrowser, BrowserContext]:
+    """Resolve browser from parameters or current session.
+
+    Sources are tried in this order and the first truthy one wins:
+
+    1. ``session_id`` - connect to that cloud browser session.
+    2. ``cdp_url`` - connect to a browser over CDP.
+    3. ``local`` - launch a local browser (``headless`` applies only here).
+    4. ``create_session`` - launch a new cloud browser (``timeout`` applies only here).
+    5. Otherwise reuse the browser stored in the current session.
+
+    When ``session_id`` or ``cdp_url`` identifies the session that is already
+    current, the stored browser is returned instead of opening another
+    connection, so repeated calls with the same identifier share one handle.
+
+    Note: For MCP tools, sessions are stored in ContextVar and persist across tool calls.
+    Cleanup is done via explicit skyvern_session_close() call. For scripts that need
+    guaranteed cleanup, use the browser_session() context manager instead.
+
+    Returns:
+        The browser and the BrowserContext describing how it was obtained. A newly
+        opened browser also becomes the current session.
+
+    Raises:
+        BrowserNotAvailableError: No source was requested and no session is current.
+        Exception: Whatever the SDK raises while connecting or launching. A browser
+            opened by the failed attempt is closed (best effort) and the previously
+            current session is left in place.
+    """
+    skyvern = get_skyvern()
+    current = get_current_session()
+    active_browser, active_ctx = current.browser, current.context
+
+    browser: SkyvernBrowser | None = None
+    try:
+        if session_id:
+            # Already attached to this cloud session: reuse the handle rather than
+            # opening (and never closing) a new connection on every call.
+            if active_browser is not None and active_ctx is not None and active_ctx.session_id == session_id:
+                return active_browser, active_ctx
+            browser = await skyvern.connect_to_cloud_browser_session(session_id)
+            ctx = BrowserContext(mode="cloud_session", session_id=session_id)
+            set_current_session(SessionState(browser=browser, context=ctx))
+            return browser, ctx
+
+        if cdp_url:
+            # Same reuse rule for a CDP endpoint we are already connected to.
+            if active_browser is not None and active_ctx is not None and active_ctx.cdp_url == cdp_url:
+                return active_browser, active_ctx
+            browser = await skyvern.connect_to_browser_over_cdp(cdp_url)
+            ctx = BrowserContext(mode="cdp", cdp_url=cdp_url)
+            set_current_session(SessionState(browser=browser, context=ctx))
+            return browser, ctx
+
+        if local:
+            browser = await skyvern.launch_local_browser(headless=headless)
+            ctx = BrowserContext(mode="local")
+            set_current_session(SessionState(browser=browser, context=ctx))
+            return browser, ctx
+
+        if create_session:
+            browser = await skyvern.launch_cloud_browser(timeout=timeout)
+            ctx = BrowserContext(mode="cloud_session", session_id=browser.browser_session_id)
+            set_current_session(SessionState(browser=browser, context=ctx))
+            return browser, ctx
+    except Exception:
+        # Only the browser opened by this attempt is ours to clean up. The current
+        # session is replaced as the last step before returning, so on failure it
+        # still refers to the previous (possibly working) browser and is kept.
+        if browser is not None:
+            try:
+                await browser.close()
+            except Exception:
+                pass  # Best effort cleanup; the original error is re-raised below
+        raise
+
+    if active_browser is not None and active_ctx is not None:
+        return active_browser, active_ctx
+
+    raise BrowserNotAvailableError()
+
+
+async def get_page(
+    session_id: str | None = None,
+    cdp_url: str | None = None,
+) -> tuple[SkyvernBrowserPage, BrowserContext]:
+    """Return the working page of the resolved browser together with its context.
+
+    The browser is resolved through ``resolve_browser`` from ``session_id`` /
+    ``cdp_url`` or, when neither is given, from the current session. Errors from
+    ``resolve_browser`` (including ``BrowserNotAvailableError``) propagate.
+    """
+    resolved_browser, context = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
+    working_page = await resolved_browser.get_working_page()
+    return working_page, context
+
+
+@asynccontextmanager
+async def browser_session(
+    session_id: str | None = None,
+    cdp_url: str | None = None,
+    local: bool = False,
+    timeout: int | None = None,
+    headless: bool = False,
+) -> AsyncIterator[tuple[SkyvernBrowser, BrowserContext]]:
+    """Async context manager that yields a browser and always cleans it up.
+
+    Intended for scripts that must release the browser even when an error occurs;
+    MCP tools call resolve_browser() directly because their sessions outlive a
+    single call. When none of ``session_id``, ``cdp_url`` or ``local`` is given,
+    a new cloud session is requested.
+
+    Example:
+        async with browser_session(local=True) as (browser, ctx):
+            page = await browser.get_working_page()
+            await page.goto("https://example.com")
+        # Browser is automatically closed on exit or exception
+    """
+    has_explicit_source = bool(session_id or cdp_url or local)
+    opened_browser, context = await resolve_browser(
+        session_id=session_id,
+        cdp_url=cdp_url,
+        local=local,
+        create_session=not has_explicit_source,
+        timeout=timeout,
+        headless=headless,
+    )
+    try:
+        yield opened_browser, context
+    finally:
+        try:
+            await opened_browser.close()
+        except Exception:
+            pass  # Best effort cleanup
+        # Drop the stored reference so later calls do not pick up the closed browser.
+        set_current_session(SessionState())
+
+
+class BrowserNotAvailableError(Exception):
+    """Raised when no browser session is available.
+
+    ``resolve_browser`` raises this when none of session_id, cdp_url, local or
+    create_session is requested and the current session holds no browser.
+    """
+
+
+def no_browser_error() -> dict[str, Any]:
+    """Return the standard error payload for "no browser session is available"."""
+    message = "No browser session available"
+    hint = "Create a session with skyvern_session_create, provide session_id, or cdp_url"
+    return make_error(ErrorCode.NO_ACTIVE_BROWSER, message, hint)
--- a/skyvern/cli/mcp_tools/__init__.py
+++ b/skyvern/cli/mcp_tools/__init__.py
@@ -0,0 +1,201 @@
+"""Skyvern MCP Tools.
+
+This module provides MCP (Model Context Protocol) tools for browser automation.
+Tools are registered with FastMCP and can be used by AI assistants like Claude.
+"""
+
+from fastmcp import FastMCP
+
+from .browser import (
+    skyvern_act,
+    skyvern_click,
+    skyvern_evaluate,
+    skyvern_extract,
+    skyvern_navigate,
+    skyvern_press_key,
+    skyvern_run_task,
+    skyvern_screenshot,
+    skyvern_scroll,
+    skyvern_select_option,
+    skyvern_type,
+    skyvern_validate,
+    skyvern_wait,
+)
+from .session import (
+    skyvern_session_close,
+    skyvern_session_connect,
+    skyvern_session_create,
+    skyvern_session_get,
+    skyvern_session_list,
+)
+
+mcp = FastMCP(
+    "Skyvern",
+    instructions="""Use Skyvern tools whenever the task involves visiting, browsing, or interacting with ANY website or web application.
+
+## When to Use These Tools
+Reach for Skyvern tools when the user asks you to:
+- Visit, browse, or interact with ANY website or web application
+- Extract data from web pages (prices, listings, articles, tables, search results, etc.)
+- Fill out forms, log in, sign up, or complete web-based workflows
+- Check the current state of a web page or verify something on a site
+- Do anything you would otherwise attempt with requests, beautifulsoup, selenium, or playwright
+- Access website data where you are unsure whether an API endpoint exists
+
+DO NOT try to scrape websites by guessing API endpoints or writing HTTP requests.
+Instead, use skyvern_navigate + skyvern_extract to get real data from actual pages.
+These tools give you a real browser — use them instead of writing scraping code.
+
+## Examples
+| User says | Use |
+|-----------|-----|
+| "Go to amazon.com" | skyvern_navigate |
+| "What's on this page?" | skyvern_screenshot |
+| "Get all product prices" | skyvern_extract |
+| "Click the login button" | skyvern_act or skyvern_click |
+| "Fill out this form" | skyvern_act |
+| "Log in and buy the first item" | skyvern_run_task |
+| "Is checkout complete?" | skyvern_validate |
+| "Write a script to do this" | Skyvern SDK (see below) |
+
+## Writing Scripts and Code
+When asked to write an automation script, use the Skyvern Python SDK with the **hybrid xpath+prompt
+pattern** for production-quality scripts. The hybrid form tries the xpath/selector first (fast,
+deterministic) and falls back to AI if the selector breaks — this is the recommended pattern.
+
+    from skyvern import Skyvern
+    skyvern = Skyvern(api_key="YOUR_API_KEY")
+    browser = await skyvern.launch_cloud_browser()
+    page = await browser.get_working_page()
+    await page.goto("https://example.com")
+
+    # BEST: hybrid selector+prompt — fast deterministic selector with AI fallback
+    await page.click("xpath=//button[@id='submit']", prompt="the Submit button")
+    await page.fill("xpath=//input[@name='email']", "user@example.com", prompt="email input field")
+
+    # OK for exploration, but prefer hybrid for production scripts:
+    await page.click(prompt="the Submit button")
+
+    data = await page.extract("Get all product names and prices")
+
+To get xpaths for hybrid calls, use precision tools (skyvern_click, skyvern_type) during exploration.
+The `resolved_selector` field in responses gives you the xpath the AI resolved to. Use it in scripts:
+  explore: skyvern_click(intent="Submit button") → response includes resolved_selector="xpath=//button[@id='submit']"
+  script:  await page.click("xpath=//button[@id='submit']", prompt="Submit button")
+
+IMPORTANT: NEVER import from skyvern.cli.mcp_tools — those are internal server modules.
+The public SDK is: from skyvern import Skyvern
+
+## Recommended Workflow
+1. **Connect** — Create or connect to a browser session
+2. **Explore** — Navigate pages, take screenshots, extract data with AI
+3. **Build** — Capture selectors and data schemas to construct deterministic workflows
+4. **Test** — Validate workflows via skyvern_run_task
+
+## Primary Tools (use these first)
+These are the tools you should reach for by default:
+
+- **skyvern_act** — Execute actions from natural language: "log in with test@example.com", "add the first item to cart". Best for exploration and testing flows.
+- **skyvern_extract** — Pull structured data from any page with natural language + optional JSON Schema. THE differentiator over raw Playwright.
+- **skyvern_validate** — Assert page conditions with AI: "is the user logged in?", "does the cart have 3 items?"
+- **skyvern_run_task** — Delegate a full multi-step task to an autonomous AI agent with observability. Use for end-to-end task execution.
+- **skyvern_navigate** — Go to a URL. Always the first step after connecting.
+- **skyvern_screenshot** — See what's on the page. Essential for understanding page state.
+- **skyvern_evaluate** — Run JavaScript to read DOM state, get URLs, or check values.
+
+## Precision Tools (for debugging and exact control)
+Use these when the primary tools aren't specific enough, or when you need deterministic
+selector-based actions (e.g., replaying a known flow):
+
+- **skyvern_click** — Click a specific element by selector or AI intent
+- **skyvern_type** — Type into a specific input field by selector or AI intent
+- **skyvern_scroll** — Scroll the page or an element into view
+- **skyvern_select_option** — Select a dropdown option by selector or AI intent
+- **skyvern_press_key** — Press a keyboard key (Enter, Tab, Escape, etc.)
+- **skyvern_wait** — Wait for a condition, element, or time delay
+
+## Tool Modes (precision tools)
+Precision tools support three modes. When unsure, use `intent`.
+
+1. **Intent mode** — AI-powered element finding:
+   `skyvern_click(intent="the blue Submit button")`
+
+2. **Hybrid mode** — tries selector first, AI fallback:
+   `skyvern_click(selector="#submit-btn", intent="the Submit button")`
+
+3. **Selector mode** — deterministic CSS/XPath targeting:
+   `skyvern_click(selector="#submit-btn")`
+
+## Replay Story: From Exploration to Production Scripts
+When you use precision tools (skyvern_click, skyvern_type, etc.) with intent mode, the response
+includes `resolved_selector` — the xpath/CSS the AI found. Capture these to build hybrid scripts.
+
+**The hybrid pattern** is the recommended default for SDK scripts:
+    await page.click("xpath=//button[@id='submit']", prompt="the Submit button")
+It tries the selector first (fast, no AI cost), then falls back to AI if the selector breaks.
+
+**Workflow for generating scripts:**
+1. Explore: Use skyvern_click(intent="Submit button") during interactive exploration
+2. Capture: Note the `resolved_selector` from the response (e.g., "xpath=//button[@id='submit']")
+3. Script: Write `page.click("xpath=//button[@id='submit']", prompt="Submit button")`
+
+The `sdk_equivalent` field in each tool response shows the correct hybrid call to use in scripts.
+Always prefer hybrid xpath+prompt over prompt-only in generated scripts.
+
+Note: Currently only skyvern_click returns resolved_selector. Support for skyvern_type and
+skyvern_select_option is planned (SKY-7905). For those tools, use the selector you provided
+as input, or fall back to prompt-only until SKY-7905 ships.
+
+## Getting Started
+Create a session with skyvern_session_create, then use browser tools to interact with pages.
+""",
+)
+
+# Register every tool with the server. The tuple keeps the original order:
+# session management first, then primary tools, then precision tools.
+for _tool in (
+    # -- Session management --
+    skyvern_session_create,
+    skyvern_session_close,
+    skyvern_session_list,
+    skyvern_session_get,
+    skyvern_session_connect,
+    # -- Primary tools (AI-powered exploration + observation) --
+    skyvern_act,
+    skyvern_extract,
+    skyvern_validate,
+    skyvern_run_task,
+    skyvern_navigate,
+    skyvern_screenshot,
+    skyvern_evaluate,
+    # -- Precision tools (selector/intent-based browser primitives) --
+    skyvern_click,
+    skyvern_type,
+    skyvern_scroll,
+    skyvern_select_option,
+    skyvern_press_key,
+    skyvern_wait,
+):
+    mcp.tool()(_tool)
+del _tool  # keep the loop variable out of the package namespace
+
+# Public names of this package: the FastMCP server object plus every tool
+# function imported above, grouped the same way the tools are registered.
+__all__ = [
+    "mcp",
+    # Session
+    "skyvern_session_create",
+    "skyvern_session_close",
+    "skyvern_session_list",
+    "skyvern_session_get",
+    "skyvern_session_connect",
+    # Primary (AI-powered)
+    "skyvern_act",
+    "skyvern_extract",
+    "skyvern_validate",
+    "skyvern_run_task",
+    "skyvern_navigate",
+    "skyvern_screenshot",
+    "skyvern_evaluate",
+    # Precision (selector/intent browser primitives)
+    "skyvern_click",
+    "skyvern_type",
+    "skyvern_scroll",
+    "skyvern_select_option",
+    "skyvern_press_key",
+    "skyvern_wait",
+]
--- a/skyvern/cli/mcp_tools/_client.py
+++ b/skyvern/cli/mcp_tools/_client.py
@@ -0,0 +1,11 @@
+"""Skyvern HTTP API client accessor.
+
+Workflow tools import from here to get the API client without pulling in
+browser/Playwright dependencies.
+"""
+
+from __future__ import annotations
+
+from skyvern.cli.core.client import get_skyvern
+
+# The client accessor is the only name re-exported from this module.
+__all__ = ["get_skyvern"]
--- a/skyvern/cli/mcp_tools/_common.py
+++ b/skyvern/cli/mcp_tools/_common.py
@@ -0,0 +1,20 @@
+"""Backward-compatible re-exports from skyvern.cli.core.
+
+MCP tools import from here; the canonical implementations live in core/.
+"""
+
+from __future__ import annotations
+
+from skyvern.cli.core.artifacts import get_artifact_dir, save_artifact
+from skyvern.cli.core.result import Artifact, BrowserContext, ErrorCode, Timer, make_error, make_result
+
+# Names re-exported from skyvern.cli.core.artifacts and skyvern.cli.core.result.
+__all__ = [
+    "Artifact",
+    "BrowserContext",
+    "ErrorCode",
+    "Timer",
+    "get_artifact_dir",
+    "make_error",
+    "make_result",
+    "save_artifact",
+]
--- a/skyvern/cli/mcp_tools/_session.py
+++ b/skyvern/cli/mcp_tools/_session.py
@@ -0,0 +1,30 @@
+"""Backward-compatible re-exports from skyvern.cli.core.
+
+MCP tools import from here; the canonical implementations live in core/.
+"""
+
+from __future__ import annotations
+
+from skyvern.cli.core.client import get_skyvern
+from skyvern.cli.core.session_manager import (
+    BrowserNotAvailableError,
+    SessionState,
+    browser_session,
+    get_current_session,
+    get_page,
+    no_browser_error,
+    resolve_browser,
+    set_current_session,
+)
+
+# Names re-exported from skyvern.cli.core.client and skyvern.cli.core.session_manager.
+__all__ = [
+    "BrowserNotAvailableError",
+    "SessionState",
+    "browser_session",
+    "get_current_session",
+    "get_page",
+    "get_skyvern",
+    "no_browser_error",
+    "resolve_browser",
+    "set_current_session",
+]
--- a/skyvern/cli/mcp_tools/browser.py
+++ b/skyvern/cli/mcp_tools/browser.py
@@ -0,0 +1,939 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+from datetime import datetime, timezone
+from typing import Annotated, Any
+
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from pydantic import Field
+
+from ._common import (
+    ErrorCode,
+    Timer,
+    make_error,
+    make_result,
+    save_artifact,
+)
+from ._session import BrowserNotAvailableError, get_page, no_browser_error
+
+
+def _resolve_ai_mode(
+    selector: str | None,
+    intent: str | None,
+) -> tuple[str | None, str | None]:
+    """Map a selector/intent pair to the AI mode used for the SDK call.
+
+    Returns ``(ai_mode, error_code)``:
+
+    * intent only          -> ``("proactive", None)``
+    * intent and selector  -> ``("fallback", None)``
+    * selector only        -> ``(None, None)`` (no AI involved)
+    * neither              -> ``(None, "INVALID_INPUT")``; the call should fail
+
+    Empty strings count as "not provided".
+    """
+    has_selector = bool(selector)
+    has_intent = bool(intent)
+
+    if has_intent:
+        mode = "fallback" if has_selector else "proactive"
+        return mode, None
+    if has_selector:
+        return None, None
+    return None, "INVALID_INPUT"
+
+
+async def skyvern_navigate(
+    url: Annotated[str, "The URL to navigate to"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=120000)] = 30000,
+    wait_until: Annotated[
+        str | None, Field(description="Wait condition: load, domcontentloaded, networkidle, commit")
+    ] = None,
+) -> dict[str, Any]:
+    """Open a website in the browser. Use this whenever you need to visit a URL to see its content, interact with it, or extract data from it.
+
+    Returns the final URL (after redirects) and page title.
+    """
+    # Reject unknown wait conditions before touching the browser.
+    if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"):
+        return make_result(
+            "skyvern_navigate",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                f"Invalid wait_until: {wait_until}",
+                "Use load, domcontentloaded, networkidle, or commit",
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_navigate", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            await page.goto(url, timeout=timeout, wait_until=wait_until)
+            timer.mark("sdk")
+            final_url = page.url
+            title = await page.title()
+        except Exception as e:
+            # Returned from inside the `with`, so this timing snapshot is taken
+            # before the timer records its "total" mark.
+            return make_result(
+                "skyvern_navigate",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the URL is valid and accessible"),
+            )
+
+    # Emit the URL as a properly escaped double-quoted literal so that a quote or
+    # backslash in the URL cannot produce an invalid code snippet. Ordinary URLs
+    # render exactly as before.
+    url_literal = json.dumps(url, ensure_ascii=False)
+    return make_result(
+        "skyvern_navigate",
+        browser_context=ctx,
+        data={"url": final_url, "title": title, "sdk_equivalent": f"await page.goto({url_literal})"},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_click(
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    intent: Annotated[
+        str | None, Field(description="Natural language description of the element to click (uses AI)")
+    ] = None,
+    selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None,
+    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None,
+    click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None,
+) -> dict[str, Any]:
+    """Click an element on the page. Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation.
+
+    Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting,
+    or both for resilience (tries selector first, falls back to AI).
+    """
+
+    def _lit(value: Any) -> str:
+        # Render a value as an escaped double-quoted string literal. Selectors such
+        # as input[name="q"] contain quotes that would otherwise break the snippet.
+        return json.dumps(str(value), ensure_ascii=False)
+
+    # Validate inputs before touching the browser.
+    if button is not None and button not in ("left", "right", "middle"):
+        return make_result(
+            "skyvern_click",
+            ok=False,
+            error=make_error(ErrorCode.INVALID_INPUT, f"Invalid button: {button}", "Use left, right, or middle"),
+        )
+
+    ai_mode, err = _resolve_ai_mode(selector, intent)
+    if err:
+        return make_result(
+            "skyvern_click",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                "Must provide intent, selector, or both",
+                "Use intent='describe what to click' for AI-powered clicking, or selector='#css-selector' for precise targeting",
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_click", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            # Only forward optional click settings the caller actually supplied.
+            kwargs: dict[str, Any] = {"timeout": timeout}
+            if button:
+                kwargs["button"] = button
+            if click_count is not None:
+                kwargs["click_count"] = click_count
+
+            if ai_mode is not None:
+                resolved = await page.click(selector=selector, prompt=intent, ai=ai_mode, **kwargs)  # type: ignore[arg-type]
+            else:
+                # _resolve_ai_mode only yields (None, None) when a selector was given;
+                # the assert exists to narrow the type for the checker.
+                assert selector is not None
+                resolved = await page.click(selector=selector, **kwargs)
+            timer.mark("sdk")
+        except PlaywrightTimeoutError as e:
+            return make_result(
+                "skyvern_click",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    ErrorCode.SELECTOR_NOT_FOUND,
+                    str(e),
+                    "Verify the selector matches an element on the page, or use intent for AI-powered finding",
+                ),
+            )
+        except Exception as e:
+            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
+            return make_result(
+                "skyvern_click",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    code,
+                    str(e),
+                    "The element may be hidden, disabled, or intercepted by another element",
+                ),
+            )
+
+    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode}
+    if resolved and resolved != selector:
+        data["resolved_selector"] = resolved
+    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts.
+    # resolved_selector already contains the "xpath=" prefix (e.g. "xpath=//button[@id='x']"),
+    # so pass it directly as the selector positional arg.
+    resolved_sel = resolved if resolved and resolved != selector else selector
+    if resolved_sel and intent:
+        data["sdk_equivalent"] = f"await page.click({_lit(resolved_sel)}, prompt={_lit(intent)})"
+    elif ai_mode:
+        data["sdk_equivalent"] = f"await page.click(prompt={_lit(intent)})"
+    elif selector:
+        data["sdk_equivalent"] = f"await page.click({_lit(selector)})"
+
+    return make_result(
+        "skyvern_click",
+        browser_context=ctx,
+        data=data,
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_type(
+    text: Annotated[str, "Text to type into the element"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    intent: Annotated[
+        str | None, Field(description="Natural language description of the input field (uses AI)")
+    ] = None,
+    selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None,
+    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    clear: Annotated[bool, Field(description="Clear existing content before typing")] = True,
+    delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None,
+) -> dict[str, Any]:
+    """Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation.
+
+    Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting,
+    or both for resilience (tries selector first, falls back to AI). Clears existing content by default.
+    """
+    ai_mode, err = _resolve_ai_mode(selector, intent)
+    if err:
+        return make_result(
+            "skyvern_type",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                "Must provide intent, selector, or both",
+                "Use intent='describe the input field' for AI-powered targeting, or selector='#css-selector' for precise targeting",
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_type", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            if clear:
+                if ai_mode is not None:
+                    await page.fill(selector=selector, value=text, prompt=intent, ai=ai_mode, timeout=timeout)  # type: ignore[arg-type]
+                else:
+                    assert selector is not None
+                    await page.fill(selector, text, timeout=timeout)
+            else:
+                kwargs: dict[str, Any] = {"timeout": timeout}
+                if delay is not None:
+                    kwargs["delay"] = delay
+                if ai_mode is not None:
+                    loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)  # type: ignore[arg-type]
+                    await loc.type(text, **kwargs)
+                else:
+                    assert selector is not None
+                    await page.type(selector, text, **kwargs)
+            timer.mark("sdk")
+        except PlaywrightTimeoutError as e:
+            return make_result(
+                "skyvern_type",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    ErrorCode.SELECTOR_NOT_FOUND,
+                    str(e),
+                    "Verify the selector matches an editable element, or use intent for AI-powered finding",
+                ),
+            )
+        except Exception as e:
+            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
+            return make_result(
+                "skyvern_type",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    code,
+                    str(e),
+                    "The element may not be editable or may be hidden",
+                ),
+            )
+
+    # NOTE: The SDK fill() returns the typed value, not a resolved selector.
+    # Unlike click(), we cannot return resolved_selector here. SKY-7905 will
+    # update the SDK to return element metadata from all action methods.
+    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode, "text_length": len(text)}
+    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts
+    if selector and intent:
+        data["sdk_equivalent"] = f'await page.fill("{selector}", "{text}", prompt="{intent}")'
+    elif ai_mode:
+        data["sdk_equivalent"] = f'await page.fill(prompt="{intent}", value="{text}")'
+    elif selector:
+        data["sdk_equivalent"] = f'await page.fill("{selector}", "{text}")'
+    return make_result(
+        "skyvern_type",
+        browser_context=ctx,
+        data=data,
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_screenshot(
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    full_page: Annotated[bool, Field(description="Capture full scrollable page")] = False,
+    selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None,
+    inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False,
+) -> dict[str, Any]:
+    """See what's currently on the page. Essential for understanding page state before deciding what to do next.
+
+    By default saves to ~/.skyvern/artifacts/ and returns the file path.
+    Set inline=true to get base64 data directly (increases token usage).
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_screenshot", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            if selector:
+                element = page.locator(selector)
+                screenshot_bytes = await element.screenshot()
+            else:
+                screenshot_bytes = await page.screenshot(full_page=full_page)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_screenshot",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check that the page or element is visible"),
+            )
+
+    if inline:
+        data_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+        return make_result(
+            "skyvern_screenshot",
+            browser_context=ctx,
+            data={
+                "inline": True,
+                "data": data_b64,
+                "mime": "image/png",
+                "bytes": len(screenshot_bytes),
+                "sdk_equivalent": "await page.screenshot()",
+            },
+            timing_ms=timer.timing_ms,
+            warnings=["Inline mode increases token usage"],
+        )
+
+    ts = datetime.now(timezone.utc).strftime("%H%M%S_%f")
+    filename = f"screenshot_{ts}.png"
+    artifact = save_artifact(
+        screenshot_bytes,
+        kind="screenshot",
+        filename=filename,
+        mime="image/png",
+        session_id=ctx.session_id,
+    )
+
+    return make_result(
+        "skyvern_screenshot",
+        browser_context=ctx,
+        data={"path": artifact.path, "sdk_equivalent": "await page.screenshot(path='screenshot.png')"},
+        artifacts=[artifact],
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_scroll(
+    direction: Annotated[str, Field(description="Direction: up, down, left, right")],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    amount: Annotated[int | None, Field(description="Pixels to scroll (default 500)")] = None,
+    intent: Annotated[
+        str | None, Field(description="Natural language description of element to scroll into view (uses AI)")
+    ] = None,
+    selector: Annotated[str | None, Field(description="CSS selector of scrollable element")] = None,
+) -> dict[str, Any]:
+    """Scroll the page or use AI to scroll a specific element into view.
+
+    Use `intent` to scroll an AI-located element into view (with or without selector for hybrid fallback).
+    Without intent, scrolls the page or a selector-targeted container by pixel amount.
+    """
+    valid_directions = ("up", "down", "left", "right")
+    if not intent and direction not in valid_directions:
+        return make_result(
+            "skyvern_scroll",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT, f"Invalid direction: {direction}", "Use up, down, left, or right"
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_scroll", ok=False, error=no_browser_error())
+
+    if intent:
+        ai_mode = "fallback" if selector else "proactive"
+        with Timer() as timer:
+            try:
+                loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)
+                await loc.scroll_into_view_if_needed()
+                timer.mark("sdk")
+            except Exception as e:
+                code = ErrorCode.AI_FALLBACK_FAILED if ai_mode == "fallback" else ErrorCode.ACTION_FAILED
+                return make_result(
+                    "skyvern_scroll",
+                    ok=False,
+                    browser_context=ctx,
+                    timing_ms=timer.timing_ms,
+                    error=make_error(code, str(e), "Could not find element to scroll into view"),
+                )
+
+        return make_result(
+            "skyvern_scroll",
+            browser_context=ctx,
+            data={
+                "direction": "into_view",
+                "intent": intent,
+                "ai_mode": ai_mode,
+                "sdk_equivalent": (
+                    f'await page.locator("{selector}", prompt="{intent}").scroll_into_view_if_needed()'
+                    if selector
+                    else f'await page.locator(prompt="{intent}").scroll_into_view_if_needed()'
+                ),
+            },
+            timing_ms=timer.timing_ms,
+        )
+
+    pixels = amount or 500
+    direction_map = {
+        "up": (0, -pixels),
+        "down": (0, pixels),
+        "left": (-pixels, 0),
+        "right": (pixels, 0),
+    }
+    dx, dy = direction_map[direction]
+
+    with Timer() as timer:
+        try:
+            if selector:
+                await page.locator(selector).evaluate(f"el => el.scrollBy({dx}, {dy})")
+            else:
+                await page.evaluate(f"window.scrollBy({dx}, {dy})")
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_scroll",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Scroll action failed"),
+            )
+
+    return make_result(
+        "skyvern_scroll",
+        browser_context=ctx,
+        data={
+            "direction": direction,
+            "pixels": pixels,
+            "sdk_equivalent": f'await page.evaluate("window.scrollBy({dx}, {dy})")',
+        },
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_select_option(
+    value: Annotated[str, "Value to select"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None,
+    selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None,
+    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False,
+) -> dict[str, Any]:
+    """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision.
+
+    Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting,
+    or both for resilience (tries selector first, falls back to AI).
+    """
+    ai_mode, err = _resolve_ai_mode(selector, intent)
+    if err:
+        return make_result(
+            "skyvern_select_option",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                "Must provide intent, selector, or both",
+                "Use intent='describe the dropdown' for AI-powered selection, or selector='#css-selector' for precise targeting",
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_select_option", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            if ai_mode is not None:
+                # AI paths: pass value= directly -- the AI interprets the text
+                # regardless of whether it represents a value or label.
+                await page.select_option(selector=selector, value=value, prompt=intent, ai=ai_mode, timeout=timeout)  # type: ignore[arg-type]
+            else:
+                assert selector is not None
+                if by_label:
+                    # Bypass SkyvernPage to avoid value="" coercion conflicting with label kwarg.
+                    await page.page.locator(selector).select_option(label=value, timeout=timeout)
+                else:
+                    await page.select_option(selector, value=value, timeout=timeout)
+            timer.mark("sdk")
+        except Exception as e:
+            code = ErrorCode.AI_FALLBACK_FAILED if ai_mode else ErrorCode.ACTION_FAILED
+            return make_result(
+                "skyvern_select_option",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(code, str(e), "Check selector and available options"),
+            )
+
+    # NOTE: The SDK select_option() returns the selected value, not a resolved
+    # selector. Unlike click(), we cannot return resolved_selector here.
+    # SKY-7905 will update the SDK to return element metadata from all action methods.
+    data: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": ai_mode, "value": value}
+    # Build sdk_equivalent: prefer hybrid selector+prompt for production scripts
+    if selector and intent:
+        data["sdk_equivalent"] = f'await page.select_option("{selector}", value="{value}", prompt="{intent}")'
+    elif ai_mode:
+        data["sdk_equivalent"] = f'await page.select_option(prompt="{intent}", value="{value}")'
+    elif selector:
+        data["sdk_equivalent"] = f'await page.select_option("{selector}", value="{value}")'
+    return make_result(
+        "skyvern_select_option",
+        browser_context=ctx,
+        data=data,
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_press_key(
+    key: Annotated[str, "Key to press (e.g., Enter, Tab, Escape, ArrowDown)"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    intent: Annotated[
+        str | None, Field(description="Natural language description of element to focus first (uses AI)")
+    ] = None,
+    selector: Annotated[str | None, Field(description="CSS selector to focus first")] = None,
+) -> dict[str, Any]:
+    """Press a keyboard key -- Enter, Tab, Escape, arrow keys, shortcuts, etc.
+
+    Use `intent` or `selector` to focus a specific element before pressing.
+    Without either, presses the key on the currently focused element.
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_press_key", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            if intent or selector:
+                ai_mode, _ = _resolve_ai_mode(selector, intent)
+                if ai_mode is not None:
+                    loc = page.locator(selector=selector, prompt=intent, ai=ai_mode)  # type: ignore[arg-type]
+                else:
+                    assert selector is not None
+                    loc = page.locator(selector)
+                await loc.press(key)
+            else:
+                await page.keyboard.press(key)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_press_key",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check key name is valid"),
+            )
+
+    if selector and intent:
+        sdk_eq = f'await page.locator("{selector}", prompt="{intent}").press("{key}")'
+    elif intent:
+        sdk_eq = f'await page.locator(prompt="{intent}").press("{key}")'
+    elif selector:
+        sdk_eq = f'await page.locator("{selector}").press("{key}")'
+    else:
+        sdk_eq = f'await page.keyboard.press("{key}")'
+
+    return make_result(
+        "skyvern_press_key",
+        browser_context=ctx,
+        data={
+            "key": key,
+            "selector": selector,
+            "intent": intent,
+            "sdk_equivalent": sdk_eq,
+        },
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_wait(
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    time_ms: Annotated[int | None, Field(description="Time to wait in milliseconds")] = None,
+    intent: Annotated[str | None, Field(description="Natural language condition to wait for (uses AI polling)")] = None,
+    selector: Annotated[str | None, Field(description="CSS selector to wait for")] = None,
+    state: Annotated[str | None, Field(description="Element state: visible, hidden, attached, detached")] = "visible",
+    timeout: Annotated[int, Field(description="Max wait time in milliseconds", ge=1000, le=120000)] = 30000,
+    poll_interval_ms: Annotated[
+        int, Field(description="Polling interval for intent-based waits in ms", ge=500, le=10000)
+    ] = 5000,
+) -> dict[str, Any]:
+    """Wait for a condition, element, or time delay before proceeding. Use intent for AI-powered condition checking.
+
+    Use `intent` to poll with AI validation (e.g., "wait until the loading spinner disappears").
+    Use `selector` to wait for an element state. Use `time_ms` for a simple delay.
+    """
+    valid_states = ("visible", "hidden", "attached", "detached")
+    if state is not None and state not in valid_states:
+        return make_result(
+            "skyvern_wait",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                f"Invalid state: {state}",
+                "Use visible, hidden, attached, or detached",
+            ),
+        )
+
+    if time_ms is None and not selector and not intent:
+        return make_result(
+            "skyvern_wait",
+            ok=False,
+            error=make_error(
+                ErrorCode.INVALID_INPUT,
+                "Must provide intent, selector, or time_ms",
+                "Use intent='condition to wait for' for AI-powered waiting, selector='#element' for element visibility, or time_ms=5000 for a delay",
+            ),
+        )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_wait", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            if time_ms is not None:
+                await page.wait_for_timeout(time_ms)
+                waited_for = "time"
+            elif intent:
+                loop = asyncio.get_running_loop()
+                deadline = loop.time() + timeout / 1000
+                last_error: Exception | None = None
+                while True:
+                    try:
+                        result = await page.validate(intent)
+                        last_error = None
+                    except Exception as poll_err:
+                        result = False
+                        last_error = poll_err
+                    if result:
+                        break
+                    if loop.time() >= deadline:
+                        code = ErrorCode.SDK_ERROR if last_error else ErrorCode.TIMEOUT
+                        msg = str(last_error) if last_error else f"Condition not met within {timeout}ms: {intent}"
+                        return make_result(
+                            "skyvern_wait",
+                            ok=False,
+                            browser_context=ctx,
+                            timing_ms=timer.timing_ms,
+                            error=make_error(
+                                code,
+                                msg,
+                                "Increase timeout or check that the condition can be satisfied",
+                            ),
+                        )
+                    await page.wait_for_timeout(poll_interval_ms)
+                waited_for = "intent"
+            elif selector:
+                await page.wait_for_selector(selector, state=state, timeout=timeout)
+                waited_for = "selector"
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_wait",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.TIMEOUT, str(e), "Condition was not met within timeout"),
+            )
+
+    sdk_eq = ""
+    if waited_for == "time":
+        sdk_eq = f"await page.wait_for_timeout({time_ms})"
+    elif waited_for == "intent":
+        sdk_eq = f'await page.validate("{intent}")'
+    elif waited_for == "selector":
+        sdk_eq = f'await page.wait_for_selector("{selector}")'
+    return make_result(
+        "skyvern_wait",
+        browser_context=ctx,
+        data={"waited_for": waited_for, "sdk_equivalent": sdk_eq},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_evaluate(
+    expression: Annotated[str, "JavaScript expression to evaluate"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+) -> dict[str, Any]:
+    """Run JavaScript on the page to read DOM state, get URLs, check values, or discover CSS selectors for faster subsequent actions.
+
+    Security: This executes arbitrary JS in the page context. Only use with trusted expressions.
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_evaluate", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            result = await page.evaluate(expression)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_evaluate",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.ACTION_FAILED, str(e), "Check JavaScript syntax"),
+            )
+
+    return make_result(
+        "skyvern_evaluate",
+        browser_context=ctx,
+        data={"result": result, "sdk_equivalent": f'await page.evaluate("{expression[:80]}")'},
+        timing_ms=timer.timing_ms,
+    )
+
+
+# ---------------------------------------------------------------------------
+# AI Differentiator Tools
+# ---------------------------------------------------------------------------
+
+
+async def skyvern_extract(
+    prompt: Annotated[str, "Natural language description of what data to extract from the page"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    schema: Annotated[
+        str | None, Field(description="JSON Schema string defining the expected output structure")
+    ] = None,
+) -> dict[str, Any]:
+    """Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language.
+
+    Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
+    """
+    parsed_schema: dict[str, Any] | None = None
+    if schema is not None:
+        try:
+            parsed_schema = json.loads(schema)
+        except (json.JSONDecodeError, TypeError) as e:
+            return make_result(
+                "skyvern_extract",
+                ok=False,
+                error=make_error(
+                    ErrorCode.INVALID_INPUT,
+                    f"Invalid JSON schema: {e}",
+                    "Provide schema as a valid JSON string",
+                ),
+            )
+
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_extract", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            extracted = await page.extract(prompt=prompt, schema=parsed_schema)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_extract",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check that the page has loaded and the prompt is clear"),
+            )
+
+    return make_result(
+        "skyvern_extract",
+        browser_context=ctx,
+        data={"extracted": extracted, "sdk_equivalent": f'await page.extract(prompt="{prompt}")'},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_validate(
+    prompt: Annotated[str, "Validation condition to check (e.g., 'the login form is visible')"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+) -> dict[str, Any]:
+    """Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'
+
+    Returns whether the described condition is true or false.
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_validate", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            valid = await page.validate(prompt)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_validate",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check that the page has loaded and the prompt is clear"),
+            )
+
+    return make_result(
+        "skyvern_validate",
+        browser_context=ctx,
+        data={"prompt": prompt, "valid": valid, "sdk_equivalent": f'await page.validate("{prompt}")'},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_act(
+    prompt: Annotated[str, "Natural language instruction for the action to perform (e.g., 'close the cookie banner')"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+) -> dict[str, Any]:
+    """Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task.
+
+    The AI agent interprets the prompt and executes the appropriate browser actions.
+    For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead.
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_act", ok=False, error=no_browser_error())
+
+    with Timer() as timer:
+        try:
+            await page.act(prompt)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_act",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(e), "Simplify the prompt or break the task into steps"),
+            )
+
+    return make_result(
+        "skyvern_act",
+        browser_context=ctx,
+        data={"prompt": prompt, "completed": True, "sdk_equivalent": f'await page.act("{prompt}")'},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_run_task(
+    prompt: Annotated[str, "Natural language description of the task to automate"],
+    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+    url: Annotated[
+        str | None, Field(description="URL to navigate to before running (uses current page if omitted)")
+    ] = None,
+    data_extraction_schema: Annotated[
+        str | None, Field(description="JSON Schema string defining what data to extract")
+    ] = None,
+    max_steps: Annotated[int | None, Field(description="Maximum number of agent steps")] = None,
+    timeout_seconds: Annotated[
+        int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800)
+    ] = 180,
+) -> dict[str, Any]:
+    """Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end.
+
+    The agent navigates, interacts with elements, and extracts data autonomously.
+    For simple single-step actions, use skyvern_act instead.
+    """
+    try:
+        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
+    except BrowserNotAvailableError:
+        return make_result("skyvern_run_task", ok=False, error=no_browser_error())
+
+    parsed_schema: dict[str, Any] | str | None = None
+    if data_extraction_schema is not None:
+        try:
+            parsed_schema = json.loads(data_extraction_schema)
+        except (json.JSONDecodeError, TypeError) as e:
+            return make_result(
+                "skyvern_run_task",
+                ok=False,
+                browser_context=ctx,
+                error=make_error(
+                    ErrorCode.INVALID_INPUT,
+                    f"Invalid data_extraction_schema JSON: {e}",
+                    "Provide schema as a valid JSON string",
+                ),
+            )
+
+    with Timer() as timer:
+        try:
+            response = await page.agent.run_task(
+                prompt=prompt,
+                url=url,
+                data_extraction_schema=parsed_schema,
+                max_steps=max_steps,
+                timeout=timeout_seconds,
+            )
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_run_task",
+                ok=False,
+                browser_context=ctx,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(e), "Check the prompt, URL, and timeout settings"),
+            )
+
+    return make_result(
+        "skyvern_run_task",
+        browser_context=ctx,
+        data={
+            "run_id": response.run_id,
+            "status": response.status,
+            "output": response.output,
+            "failure_reason": response.failure_reason,
+            "recording_url": response.recording_url,
+            "app_url": response.app_url,
+            "sdk_equivalent": f'await page.agent.run_task(prompt="{prompt}")',
+        },
+        timing_ms=timer.timing_ms,
+    )
--- a/skyvern/cli/mcp_tools/session.py
+++ b/skyvern/cli/mcp_tools/session.py
@@ -0,0 +1,261 @@
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from pydantic import Field
+
+from skyvern.schemas.runs import ProxyLocation
+
+from ._common import BrowserContext, ErrorCode, Timer, make_error, make_result
+from ._session import (
+    SessionState,
+    get_current_session,
+    get_skyvern,
+    resolve_browser,
+    set_current_session,
+)
+
+
+async def skyvern_session_create(
+    timeout: Annotated[int | None, Field(description="Session timeout in minutes (5-1440)")] = 60,
+    proxy_location: Annotated[str | None, Field(description="Proxy location: RESIDENTIAL, US, etc.")] = None,
+    local: Annotated[bool, Field(description="Launch local browser instead of cloud")] = False,
+    headless: Annotated[bool, Field(description="Run local browser in headless mode")] = False,
+) -> dict[str, Any]:
+    """Create a new browser session to start interacting with websites. Creates a cloud browser by default.
+
+    Use local=true for a local Chromium instance.
+    The session persists across tool calls until explicitly closed.
+
+    The new session is stored as the current session. ``proxy_location`` is only
+    used for cloud sessions; an unrecognised value is reported as INVALID_INPUT.
+    Failures are returned as an ``ok=False`` result instead of being raised.
+    """
+    with Timer() as timer:
+        # Parse the proxy before touching the SDK. ProxyLocation(...) is expected
+        # to raise ValueError for an unknown value; left inside the try below it
+        # would be caught by the ValueError handler and reported with a
+        # misleading "requires SKYVERN_API_KEY" hint.
+        proxy = None
+        if proxy_location and not local:
+            try:
+                proxy = ProxyLocation(proxy_location)
+            except ValueError:
+                return make_result(
+                    "skyvern_session_create",
+                    ok=False,
+                    timing_ms=timer.timing_ms,
+                    error=make_error(
+                        ErrorCode.INVALID_INPUT,
+                        f"Invalid proxy_location: {proxy_location!r}",
+                        "Use a supported proxy location value, e.g. RESIDENTIAL",
+                    ),
+                )
+
+        try:
+            skyvern = get_skyvern()
+
+            if local:
+                # Local path: timeout and proxy_location are not used.
+                browser = await skyvern.launch_local_browser(headless=headless)
+                ctx = BrowserContext(mode="local")
+                set_current_session(SessionState(browser=browser, context=ctx))
+                timer.mark("sdk")
+                return make_result(
+                    "skyvern_session_create",
+                    browser_context=ctx,
+                    data={"local": True, "headless": headless},
+                    timing_ms=timer.timing_ms,
+                )
+
+            browser = await skyvern.launch_cloud_browser(timeout=timeout, proxy_location=proxy)
+            ctx = BrowserContext(mode="cloud_session", session_id=browser.browser_session_id)
+            set_current_session(SessionState(browser=browser, context=ctx))
+            timer.mark("sdk")
+
+        except ValueError as e:
+            # With proxy parsing handled above, a ValueError here comes from the
+            # SDK calls; the hint assumes a missing API key -- TODO confirm.
+            return make_result(
+                "skyvern_session_create",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    ErrorCode.SDK_ERROR,
+                    str(e),
+                    "Cloud sessions require SKYVERN_API_KEY. Check your environment.",
+                ),
+            )
+        except Exception as e:
+            return make_result(
+                "skyvern_session_create",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(e), "Failed to create browser session"),
+            )
+
+    # Only the cloud success path reaches this point.
+    return make_result(
+        "skyvern_session_create",
+        browser_context=ctx,
+        data={
+            "session_id": browser.browser_session_id,
+            "timeout_minutes": timeout,
+        },
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_session_close(
+    session_id: Annotated[str | None, Field(description="Session ID to close (uses current if not specified)")] = None,
+) -> dict[str, Any]:
+    """Close a browser session when you're done. Frees cloud resources.
+
+    Closes the specified session or the current active session.
+
+    With ``session_id`` the session is closed through the Skyvern client and the
+    current session is reset only if it is the one that was closed. Without it,
+    the current session's browser is closed; if there is none, a
+    NO_ACTIVE_BROWSER error result is returned. Failures are returned as
+    ``ok=False`` results instead of being raised.
+    """
+    active = get_current_session()
+
+    with Timer() as timer:
+        try:
+            if not session_id:
+                # No explicit ID: fall back to whatever session is current.
+                if active.browser is None:
+                    return make_result(
+                        "skyvern_session_close",
+                        ok=False,
+                        error=make_error(
+                            ErrorCode.NO_ACTIVE_BROWSER,
+                            "No active session to close",
+                            "Provide a session_id or create a session first",
+                        ),
+                    )
+
+                # Capture the ID before the state is reset so it can be reported.
+                closed_session_id = None
+                if active.context:
+                    closed_session_id = active.context.session_id
+                await active.browser.close()
+                set_current_session(SessionState())
+                timer.mark("sdk")
+            else:
+                client = get_skyvern()
+                await client.close_browser_session(session_id)
+                # Forget the current session only when it is the one just closed.
+                if active.context and active.context.session_id == session_id:
+                    set_current_session(SessionState())
+                timer.mark("sdk")
+                return make_result(
+                    "skyvern_session_close",
+                    data={"session_id": session_id, "closed": True},
+                    timing_ms=timer.timing_ms,
+                )
+
+        except Exception as exc:
+            return make_result(
+                "skyvern_session_close",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Failed to close session"),
+            )
+
+    # Reached only after the current session was closed successfully.
+    return make_result(
+        "skyvern_session_close",
+        data={"session_id": closed_session_id, "closed": True},
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_session_list() -> dict[str, Any]:
+    """List all active browser sessions. Use to find available sessions to connect to.
+
+    Each entry reports the session ID, status, start time (ISO 8601 or None),
+    timeout, runnable ID and an ``available`` flag. The result also carries the
+    entry count and the ID of the current session, if any. Failures are returned
+    as ``ok=False`` results instead of being raised.
+    """
+    with Timer() as timer:
+        try:
+            client = get_skyvern()
+            remote_sessions = await client.get_browser_sessions()
+            timer.mark("sdk")
+
+            session_data: list[dict[str, Any]] = []
+            for entry in remote_sessions:
+                session_data.append(
+                    {
+                        "session_id": entry.browser_session_id,
+                        "status": entry.status,
+                        "started_at": entry.started_at.isoformat() if entry.started_at else None,
+                        "timeout": entry.timeout,
+                        "runnable_id": entry.runnable_id,
+                        # Available = no runnable attached and a browser address is set.
+                        "available": entry.runnable_id is None and entry.browser_address is not None,
+                    }
+                )
+
+        except ValueError as exc:
+            return make_result(
+                "skyvern_session_list",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(
+                    ErrorCode.SDK_ERROR,
+                    str(exc),
+                    "Listing sessions requires SKYVERN_API_KEY",
+                ),
+            )
+        except Exception as exc:
+            return make_result(
+                "skyvern_session_list",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.SDK_ERROR, str(exc), "Failed to list sessions"),
+            )
+
+    active = get_current_session()
+    if active.context:
+        current_id = active.context.session_id
+    else:
+        current_id = None
+
+    payload = {
+        "sessions": session_data,
+        "count": len(session_data),
+        "current_session_id": current_id,
+    }
+    return make_result("skyvern_session_list", data=payload, timing_ms=timer.timing_ms)
+
+
+async def skyvern_session_get(
+    session_id: Annotated[str, Field(description="Browser session ID to get details for")],
+) -> dict[str, Any]:
+    """Get details about a specific browser session -- status, timeout, availability.
+
+    The result data contains the session ID, status, start/completion times
+    (ISO 8601 or None), timeout, runnable ID and an ``is_current`` boolean that
+    tells whether this is the current session. Any lookup failure is returned as
+    an ``ok=False`` BROWSER_NOT_FOUND result instead of being raised.
+    """
+    with Timer() as timer:
+        try:
+            skyvern = get_skyvern()
+            session = await skyvern.get_browser_session(session_id)
+            timer.mark("sdk")
+        except Exception as e:
+            return make_result(
+                "skyvern_session_get",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.BROWSER_NOT_FOUND, str(e), "Check the session ID is correct"),
+            )
+
+    current = get_current_session()
+    # bool(...) keeps "is_current" a strict True/False: a bare `and` evaluates to
+    # the falsy context itself (e.g. None) when no session is current.
+    is_current = bool(current.context and current.context.session_id == session_id)
+
+    return make_result(
+        "skyvern_session_get",
+        # Browser context is attached only when the session is the current one.
+        browser_context=BrowserContext(mode="cloud_session", session_id=session_id) if is_current else None,
+        data={
+            "session_id": session.browser_session_id,
+            "status": session.status,
+            "started_at": session.started_at.isoformat() if session.started_at else None,
+            "completed_at": session.completed_at.isoformat() if session.completed_at else None,
+            "timeout": session.timeout,
+            "runnable_id": session.runnable_id,
+            "is_current": is_current,
+        },
+        timing_ms=timer.timing_ms,
+    )
+
+
+async def skyvern_session_connect(
+    session_id: Annotated[str | None, Field(description="Cloud session ID (pbs_...)")] = None,
+    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
+) -> dict[str, Any]:
+    """Connect to an existing browser -- a cloud session by ID or any browser via CDP URL.
+
+    Use this to resume work in a previously created session or attach to an external browser.
+
+    At least one of ``session_id`` or ``cdp_url`` is required; otherwise an
+    INVALID_INPUT error result is returned. Resolution failures are returned as
+    an ``ok=False`` BROWSER_NOT_FOUND result instead of being raised.
+    """
+    if not (session_id or cdp_url):
+        missing_target = make_error(
+            ErrorCode.INVALID_INPUT,
+            "Must provide session_id or cdp_url",
+            "Specify which browser to connect to",
+        )
+        return make_result("skyvern_session_connect", ok=False, error=missing_target)
+
+    with Timer() as timer:
+        try:
+            # Only the browser context is reported; the browser handle is unused here.
+            _browser, ctx = await resolve_browser(session_id=session_id, cdp_url=cdp_url)
+            timer.mark("sdk")
+        except Exception as exc:
+            return make_result(
+                "skyvern_session_connect",
+                ok=False,
+                timing_ms=timer.timing_ms,
+                error=make_error(ErrorCode.BROWSER_NOT_FOUND, str(exc), "Check the session ID or CDP URL is valid"),
+            )
+
+    return make_result(
+        "skyvern_session_connect",
+        browser_context=ctx,
+        data={"connected": True},
+        timing_ms=timer.timing_ms,
+    )
--- a/skyvern/cli/run_commands.py
+++ b/skyvern/cli/run_commands.py
@@ -10,11 +10,11 @@ import psutil
 import typer
 import uvicorn
 from dotenv import load_dotenv, set_key
-from mcp.server.fastmcp import FastMCP
 from rich.panel import Panel
 from rich.prompt import Confirm

 from skyvern.cli.console import console
+from skyvern.cli.mcp_tools import mcp  # Uses standalone fastmcp (v2.x)
 from skyvern.cli.utils import start_services
 from skyvern.client import SkyvernEnvironment
 from skyvern.config import settings
@@ -27,8 +27,6 @@ from skyvern.utils.env_paths import resolve_backend_env_path, resolve_frontend_e

 run_app = typer.Typer(help="Commands to run Skyvern services such as the API server or UI.")

-mcp = FastMCP("Skyvern")
-

@mcp.tool()
 async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]:
@@ -53,12 +51,9 @@ async def skyvern_run_task(prompt: str, url: str) -> dict[str, Any]:
    res = await skyvern_agent.run_task(prompt=prompt, url=url, user_agent="skyvern-mcp", wait_for_completion=True)

    output = res.model_dump()["output"]
-    # Primary: use app_url from API response (handles both task and workflow run IDs correctly)
    if res.app_url:
        task_url = res.app_url
    else:
-        # Fallback when app_url is not available (e.g., older API versions)
-        # Determine route based on run_id prefix: 'wr_' for workflows, otherwise tasks
        if res.run_id and res.run_id.startswith("wr_"):
            task_url = f"{settings.SKYVERN_APP_URL.rstrip('/')}/runs/{res.run_id}/overview"
        else: