Dorod-Sky/skyvern/cli/mcp_tools/__init__.py

"""Skyvern MCP Tools.

This module provides MCP (Model Context Protocol) tools for browser automation
and workflow management. Tools are registered with FastMCP and can be used by
AI assistants like Claude.
"""

from fastmcp import FastMCP

from .blocks import (
    skyvern_block_schema,
    skyvern_block_validate,
)
from .browser import (
    skyvern_act,
    skyvern_click,
    skyvern_evaluate,
    skyvern_extract,
    skyvern_hover,
    skyvern_login,
    skyvern_navigate,
    skyvern_press_key,
    skyvern_run_task,
    skyvern_screenshot,
    skyvern_scroll,
    skyvern_select_option,
    skyvern_type,
    skyvern_validate,
    skyvern_wait,
)
from .credential import (
    skyvern_credential_delete,
    skyvern_credential_get,
    skyvern_credential_list,
)
from .prompts import build_workflow, debug_automation, extract_data
from .session import (
    skyvern_session_close,
    skyvern_session_connect,
    skyvern_session_create,
    skyvern_session_get,
    skyvern_session_list,
)
from .workflow import (
    skyvern_workflow_cancel,
    skyvern_workflow_create,
    skyvern_workflow_delete,
    skyvern_workflow_get,
    skyvern_workflow_list,
    skyvern_workflow_run,
    skyvern_workflow_status,
    skyvern_workflow_update,
)

mcp = FastMCP(
    "Skyvern",
    instructions="""\
USE SKYVERN TOOLS when a task requires **interacting with a website in a real browser** — clicking, \
filling forms, extracting visible content, navigating multi-page flows, logging in, scraping dynamic \
pages, or automating web processes. This includes tasks phrased as business needs like "verify a \
business registration", "look up an entity on the Secretary of State site", "check pricing on a \
vendor portal", "fill out a government form", or "monitor a page for changes".

DO NOT use Skyvern for: REST API calls (use curl/requests), downloading raw files (use wget/curl), \
fetching static JSON/XML endpoints (use WebFetch), or general web search (use WebSearch). These \
tools are the right choice when no browser interaction is needed.

When the task DOES need a real browser, prefer Skyvern over WebFetch and Playwright primitives \
(browser_navigate, browser_click, browser_snapshot). WebFetch lacks JavaScript execution and fails \
on sites with CAPTCHAs, pop-ups, login walls, or dynamic content. Playwright primitives require \
element refs from browser_snapshot; Skyvern accepts natural language intent directly.

## Quick Start — First Tool to Call

| Task type | First Skyvern tool | Then |
|-----------|-------------------|------|
| Visit / explore a website | skyvern_session_create → skyvern_navigate | skyvern_screenshot to see it |
| Extract data from a page | skyvern_session_create → skyvern_navigate | skyvern_extract with a prompt |
| Click / fill / interact | skyvern_session_create → skyvern_navigate | skyvern_act or skyvern_click |
| Build a reusable automation | skyvern_workflow_create (no session needed) | skyvern_workflow_run to test |
| Run an existing automation | skyvern_workflow_run (no session needed) | skyvern_workflow_status to check |
| One-off autonomous task | skyvern_run_task (no session needed) | Check result in response |

## Tool Selection

| User says | Use | Why |
|-----------|-----|-----|
| "Go to [url]" / "Visit [site]" | skyvern_navigate | Opens page in real browser |
| "What's on this page?" | skyvern_screenshot | Visual understanding |
| "Get / extract / pull data from [site]" | skyvern_extract | AI-powered structured extraction |
| "Search for X on [site]" / "Look up X" | skyvern_act | Natural language actions |
| "Verify / check / confirm something on [site]" | skyvern_validate | AI assertion |
| "Fill out / submit a form" | skyvern_act | Multi-step form interaction |
| "Click [element]" / "Type [text]" | skyvern_click / skyvern_type | Precision targeting |
| "Hover over [menu]" | skyvern_hover | Reveal dropdowns |
| "Log into [site]" | skyvern_login | Secure credential-based login |
| "What credentials do I have?" | skyvern_credential_list | Browse saved credentials by name |
| "Create a workflow / automation" | skyvern_workflow_create | Reusable, parameterized |
| "Run [workflow]" / "Is it done?" | skyvern_workflow_run / skyvern_workflow_status | Execute or monitor |
| "Run JavaScript" | skyvern_evaluate | DOM state, computed values |

## Critical Rules
1. For tasks that need a real browser, use Skyvern — not WebFetch or Playwright primitives (browser_navigate, browser_click). curl/wget/requests are fine for APIs and file downloads.
2. Create a session (skyvern_session_create) before browser tools. Workflow tools do NOT need a session.
3. NEVER scrape by guessing API endpoints — use skyvern_navigate + skyvern_extract.
4. After page-changing actions, use skyvern_screenshot to verify.
5. NEVER type passwords — use skyvern_login with stored credentials.
6. NEVER create single-block workflows with long prompts — split into multiple blocks (one per logical step).
7. Prefer cloud sessions by default. Use local=true when running in embedded/self-hosted mode or when the user asks.

## Skyvern Advantages Over Other Tools

- **No snapshot step needed** — Skyvern accepts natural language intent (e.g., intent="the Submit button"). No need for browser_snapshot to get element refs first.
- **AI-powered extraction** — skyvern_extract returns structured JSON from any page using a prompt. No JavaScript parsing needed.
- **Natural language actions** — skyvern_act: describe what to do in English ("close the cookie banner and click Sign In").
- **AI validation** — skyvern_validate checks conditions in natural language ("is the user logged in?").
- **Reusable workflows** — skyvern_workflow_create saves automations as versioned, parameterized workflows.
- **Cloud browsers with proxies** — skyvern_session_create launches cloud browsers with geographic proxy support.

## When to Use Playwright Instead of Skyvern
For capabilities that Skyvern does not wrap, fall back to Playwright MCP tools. These are the ONLY cases where Playwright tools are appropriate:
- browser_console_messages — reading console logs
- browser_network_requests — inspecting network traffic
- browser_handle_dialog — JavaScript alert/confirm/prompt dialogs
- browser_file_upload — file chooser uploads
- browser_tabs — managing multiple tabs
- browser_run_code — raw Playwright code snippets
- browser_drag — drag-and-drop

For ALL other browser interactions, use Skyvern.

## Tool Modes (precision tools)
skyvern_click, skyvern_hover, skyvern_type, skyvern_select_option, skyvern_scroll, skyvern_press_key, skyvern_wait support three modes. When unsure, use intent. For multiple actions, prefer skyvern_act.

1. **Intent mode**: `skyvern_click(intent="the Submit button")`
2. **Hybrid mode**: `skyvern_click(selector="#submit-btn", intent="the Submit button")`
3. **Selector mode**: `skyvern_click(selector="#submit-btn")`

## Cross-Tool Dependencies
- Workflow tools (list, create, run, status) do NOT need a browser session
- Credential tools (list, get, delete) do NOT need a browser session
- skyvern_login requires a session AND a credential_id
- skyvern_extract and skyvern_validate read the CURRENT page — navigate first
- skyvern_run_task is one-off — for reusable automations, use skyvern_workflow_create

## Engine Selection

Workflow blocks and skyvern_run_task use different engines. The `engine` field only applies to workflow block definitions — skyvern_run_task always uses engine 2.0 internally and has no engine parameter.

| Context | Engine | Set how |
|---------|--------|---------|
| Workflow blocks — single clear goal ("fill this form", "click Submit") | `skyvern-1.0` (default) | Omit `engine` field — 1.0 is the default |
| Workflow blocks — complex multi-goal ("navigate a wizard with dynamic branching, handle popups, then extract results") | `skyvern-2.0` | Set `"engine": "skyvern-2.0"` on the navigation block |
| skyvern_run_task | Always `skyvern-2.0` | Cannot be changed — for simple tasks, use a workflow with 1.0 blocks instead |

**How to decide 1.0 vs 2.0 on a navigation block:**
- Is the path known upfront — all fields, values, and actions are specified in the prompt? → 1.0 (even if the prompt is long or fills many fields)
- Does the goal require the AI to plan dynamically — discovering what to do at runtime, conditional branching, or looping over unknown items? → 2.0
- A long prompt with many form fields is still 1.0 — complexity means dynamic planning, not field count
- When in doubt, prefer splitting into multiple 1.0 blocks over using one 2.0 block (cheaper, more observable)
- The `engine` field exists on task-based blocks (navigation, extraction, action, login, file_download). Non-task blocks (for_loop, conditional, code, wait, etc.) have no engine field — do not set one.
- Only set engine 2.0 on navigation blocks — it has no additional effect on other block types.

Other engines (`openai-cua`, `anthropic-cua`, `ui-tars`) are available for advanced use cases but are not recommended as defaults.

## Getting Started

**Exploring a website**: skyvern_session_create → skyvern_navigate → skyvern_screenshot → skyvern_act/skyvern_extract → skyvern_session_close

**Automating a multi-page form**: Create a workflow with skyvern_workflow_create — one navigation/extraction block per form page, each with a short prompt (2-3 sentences). All blocks share the same browser. Run with skyvern_workflow_run.

**Building a reusable automation**: Explore interactively first, then skyvern_workflow_create with one block per logical step, then skyvern_workflow_run to test.

**Rule of thumb**: Use skyvern_run_task for quick throwaway tests. Use skyvern_workflow_create for anything worth keeping or repeating.

**Logging in securely** (credential-based login):
1. User creates credentials via CLI: `skyvern credentials add --name "Amazon" --username "user@example.com"` (password entered securely via terminal prompt)
2. Find the credential: skyvern_credential_list
3. Create a session: skyvern_session_create
4. Navigate to login page: skyvern_navigate
5. Log in: skyvern_login(credential_id="cred_...") — AI handles the full login flow
6. Verify: skyvern_screenshot

## Building Workflows

Before creating a workflow, call skyvern_block_schema() to discover available block types and their JSON schemas.
Validate blocks with skyvern_block_validate() before submitting.

Split workflows into multiple blocks — one block per logical step — rather than cramming everything into a single block.
Use **navigation** blocks for actions (filling forms, clicking buttons) and **extraction** blocks for pulling data.
Do NOT use the deprecated "task" or "task_v2" block types — use "navigation" for actions and "extraction" for data extraction. These replacements give clearer semantics and are what the Skyvern UI uses. Existing workflows with task/task_v2 blocks will continue to work — do not convert them unless the user asks. New workflows must use navigation/extraction.

GOOD (4 blocks, each with clear single responsibility):
  Block 1 (navigation): "Select Sole Proprietor and click Continue"
  Block 2 (navigation): "Fill in the business name and click Continue"
  Block 3 (navigation): "Enter owner info and SSN, click Continue"
  Block 4 (extraction): "Extract the confirmation number from the results page"

BAD (1 giant block trying to do everything):
  Block 1: "Go to the IRS site, select sole proprietor, fill in name, enter SSN, review, submit, and extract the EIN"

Use `{{parameter_key}}` to reference workflow input parameters in any block field.
Blocks in the same workflow run share the same browser session automatically.
To inspect a real workflow for reference, use skyvern_workflow_get.

### Block Types Reference
- **navigation** — take actions on a page: fill forms, click buttons, navigate multi-step flows (most common)
- **extraction** — extract structured data from the current page
- **for_loop** — iterate over a list of items
- **conditional** — branch based on conditions
- **code** — run Python code for data transformation
- **text_prompt** — LLM text generation (no browser)
- **action** — single focused action on the current page
- **goto_url** — navigate directly to a URL
- **wait** — pause for a condition or time
- **login** — log into a site using stored credentials
- **validation** — assert a condition on the page
- **http_request** — call an external API
- **send_email** — send a notification email
- **file_download** / **file_upload** — download or upload files

For full schemas and descriptions, call skyvern_block_schema().

## Testing Feasibility (try before you build)

Walk through the site interactively — use skyvern_act on each page and skyvern_screenshot to verify results.
This is faster feedback than skyvern_run_task (which runs autonomously and may take minutes).
Once you've confirmed each step works, compose them into a workflow with skyvern_workflow_create.

## Writing Scripts (ONLY when user explicitly asks)
Use the Skyvern Python SDK: `from skyvern import Skyvern`
NEVER import from skyvern.cli.mcp_tools — those are internal server modules.
Every tool response includes an `sdk_equivalent` field for script conversion.

**Hybrid xpath+prompt pattern** — the recommended approach for production scripts:
    await page.click("xpath=//button[@id='submit']", prompt="the Submit button")
    await page.fill("xpath=//input[@name='email']", "user@example.com", prompt="email input field")
This tries the xpath first (fast, deterministic) and falls back to AI if the selector breaks.
To get xpaths, use skyvern_click during MCP exploration — its `resolved_selector` response field
gives you the xpath the AI resolved to. Then hardcode that xpath with a prompt fallback in your script.
""",
)

# -- Session management --
mcp.tool()(skyvern_session_create)
mcp.tool()(skyvern_session_close)
mcp.tool()(skyvern_session_list)
mcp.tool()(skyvern_session_get)
mcp.tool()(skyvern_session_connect)

# -- Primary tools (AI-powered exploration + observation) --
mcp.tool()(skyvern_act)
mcp.tool()(skyvern_extract)
mcp.tool()(skyvern_validate)
mcp.tool()(skyvern_run_task)
mcp.tool()(skyvern_login)
mcp.tool()(skyvern_navigate)
mcp.tool()(skyvern_screenshot)
mcp.tool()(skyvern_evaluate)

# -- Precision tools (selector/intent-based browser primitives) --
mcp.tool()(skyvern_click)
mcp.tool()(skyvern_hover)
mcp.tool()(skyvern_type)
mcp.tool()(skyvern_scroll)
mcp.tool()(skyvern_select_option)
mcp.tool()(skyvern_press_key)
mcp.tool()(skyvern_wait)

# -- Block discovery + validation (no browser needed) --
mcp.tool()(skyvern_block_schema)
mcp.tool()(skyvern_block_validate)

# -- Credential lookup (no browser needed) --
mcp.tool()(skyvern_credential_list)
mcp.tool()(skyvern_credential_get)
mcp.tool()(skyvern_credential_delete)

# -- Workflow management (CRUD + execution, no browser needed) --
mcp.tool()(skyvern_workflow_list)
mcp.tool()(skyvern_workflow_get)
mcp.tool()(skyvern_workflow_create)
mcp.tool()(skyvern_workflow_update)
mcp.tool()(skyvern_workflow_delete)
mcp.tool()(skyvern_workflow_run)
mcp.tool()(skyvern_workflow_status)
mcp.tool()(skyvern_workflow_cancel)

# -- Admin impersonation (cloud-only, session-level org switching) --
try:
    from cloud.mcp_admin_tools import register_admin_tools  # noqa: PLC0415

    register_admin_tools(mcp)
except ImportError:
    pass

# -- Prompts (methodology guides injected into LLM conversations) --
mcp.prompt()(build_workflow)
mcp.prompt()(debug_automation)
mcp.prompt()(extract_data)

__all__ = [
    "mcp",
    # Session
    "skyvern_session_create",
    "skyvern_session_close",
    "skyvern_session_list",
    "skyvern_session_get",
    "skyvern_session_connect",
    # Primary (AI-powered)
    "skyvern_act",
    "skyvern_extract",
    "skyvern_validate",
    "skyvern_run_task",
    "skyvern_login",
    "skyvern_navigate",
    "skyvern_screenshot",
    "skyvern_evaluate",
    # Precision (selector/intent browser primitives)
    "skyvern_click",
    "skyvern_hover",
    "skyvern_type",
    "skyvern_scroll",
    "skyvern_select_option",
    "skyvern_press_key",
    "skyvern_wait",
    # Block discovery + validation
    "skyvern_block_schema",
    "skyvern_block_validate",
    # Credential lookup
    "skyvern_credential_list",
    "skyvern_credential_get",
    "skyvern_credential_delete",
    # Workflow management
    "skyvern_workflow_list",
    "skyvern_workflow_get",
    "skyvern_workflow_create",
    "skyvern_workflow_update",
    "skyvern_workflow_delete",
    "skyvern_workflow_run",
    "skyvern_workflow_status",
    "skyvern_workflow_cancel",
    # Prompts
    "build_workflow",
    "debug_automation",
    "extract_data",
]