diff --git a/skyvern/cli/mcp_tools/__init__.py b/skyvern/cli/mcp_tools/__init__.py index b4b0cefb..10f8c396 100644 --- a/skyvern/cli/mcp_tools/__init__.py +++ b/skyvern/cli/mcp_tools/__init__.py @@ -7,6 +7,10 @@ AI assistants like Claude. from fastmcp import FastMCP +from .blocks import ( + skyvern_block_schema, + skyvern_block_validate, +) from .browser import ( skyvern_act, skyvern_click, @@ -44,21 +48,52 @@ mcp = FastMCP( "Skyvern", instructions="""Use Skyvern tools whenever the task involves visiting, browsing, or interacting with ANY website or web application. -## When to Use These Tools -Reach for Skyvern tools when the user asks you to: -- Visit, browse, or interact with ANY website or web application -- Extract data from web pages (prices, listings, articles, tables, search results, etc.) -- Fill out forms, log in, sign up, or complete web-based workflows -- Check the current state of a web page or verify something on a site -- Do anything you would otherwise attempt with requests, beautifulsoup, selenium, or playwright -- Access website data where you are unsure whether an API endpoint exists -- Create, run, monitor, or manage web automations (Skyvern workflows) -- Set up reusable, parameterized automations that run on Skyvern's cloud -- Check the status of running automations or retrieve their results +## Tool Selection (read this first) -DO NOT try to scrape websites by guessing API endpoints or writing HTTP requests. -Instead, use skyvern_navigate + skyvern_extract to get real data from actual pages. -These tools give you a real browser — use them instead of writing scraping code. 
+**Which tool do I use?** + +| Scenario | Use | Why | +|----------|-----|-----| +| Visit a website | skyvern_navigate | First step — opens the page | +| See what's on the page | skyvern_screenshot | Visual understanding before acting | +| Get data from a page | skyvern_extract | AI-powered structured extraction | +| Do something on a page (click, fill, scroll) | skyvern_act | Natural language actions | +| Click/type/select a specific element | skyvern_click / skyvern_type / skyvern_select_option | Precision targeting by selector or AI intent | +| Check if something is true | skyvern_validate | AI assertion ("is the user logged in?") | +| Run a quick one-off task | skyvern_run_task | Autonomous agent, one-time, nothing saved | +| Build an automation (any multi-step task) | skyvern_workflow_create | Reusable, versioned, per-step observability | +| Run an existing automation | skyvern_workflow_run | Execute saved workflow with parameters | +| Run JavaScript | skyvern_evaluate | Read DOM state, get values | + +**Rule of thumb**: For anything worth keeping or repeating, create a workflow. Use skyvern_run_task only for quick throwaway tests. + +**Common mistake**: Don't create a single-block workflow with a long prompt listing all steps. +Split into separate blocks — one per logical step. Each block should have a prompt of 2-3 sentences. + +## Critical Rules +1. ALWAYS create a session (skyvern_session_create) before using browser tools. +2. NEVER scrape by guessing API endpoints or writing HTTP requests — use skyvern_navigate + skyvern_extract. +3. NEVER create single-block workflows with long prompts — split into multiple blocks. +4. NEVER import from skyvern.cli.mcp_tools — use `from skyvern import Skyvern` for SDK scripts. +5. After page-changing actions (skyvern_click, skyvern_act), use skyvern_screenshot to verify the result. 
+ +## Cross-Tool Dependencies +- Workflow tools (list, create, run, status) do NOT need a browser session +- skyvern_extract and skyvern_validate read the CURRENT page — navigate first +- skyvern_run_task is a one-off throwaway agent run — for reusable automations, use skyvern_workflow_create instead + +## Tool Modes (precision tools) +Precision tools (skyvern_click, skyvern_type, skyvern_select_option, skyvern_scroll, skyvern_press_key, skyvern_wait) +support three modes. When unsure, use `intent`. For multiple actions in sequence, prefer skyvern_act. + +1. **Intent mode** — AI-powered element finding: + `skyvern_click(intent="the blue Submit button")` + +2. **Hybrid mode** — tries selector first, AI fallback: + `skyvern_click(selector="#submit-btn", intent="the Submit button")` + +3. **Selector mode** — deterministic CSS/XPath targeting: + `skyvern_click(selector="#submit-btn")` ## Examples | User says | Use | @@ -68,13 +103,13 @@ These tools give you a real browser — use them instead of writing scraping cod | "Get all product prices" | skyvern_extract | | "Click the login button" | skyvern_act or skyvern_click | | "Fill out this form" | skyvern_act | -| "Log in and buy the first item" | skyvern_run_task | +| "Log in and download the report" | skyvern_run_task (one-off) or skyvern_workflow_create (keep it) | | "Is checkout complete?" | skyvern_validate | -| "List my workflows" | skyvern_workflow_list | +| "Fill out this 6-page application form" | skyvern_workflow_create (one block per page) | +| "Set up a reusable automation" | Explore with browser tools, then skyvern_workflow_create | | "Create a workflow that monitors prices" | skyvern_workflow_create | | "Run the login workflow" | skyvern_workflow_run | | "Is my workflow done?" 
| skyvern_workflow_status | -| "Set up a reusable automation for this" | Explore with browser tools, then skyvern_workflow_create | | "Write a script to do this" | Skyvern SDK (see below) | ## Getting Started @@ -84,43 +119,63 @@ These tools give you a real browser — use them instead of writing scraping cod 2. Navigate and interact with browser tools 3. Close with skyvern_session_close when done -**Managing automations** (running, listing, or monitoring workflows): -No browser session needed — use workflow tools directly: -skyvern_workflow_list, skyvern_workflow_run, skyvern_workflow_status, etc. +**Automating a multi-page form** (the most common use case): +1. Create a workflow with skyvern_workflow_create — one task block per form page +2. Each block gets a short, focused prompt (2-3 sentences max) +3. All blocks in a run share the same browser automatically +4. Run with skyvern_workflow_run **Building a reusable automation** (explore a site, then save as a workflow): 1. **Explore** — Create a browser session, navigate the site, use skyvern_extract and skyvern_screenshot to understand the page structure 2. **Create** — Build a workflow definition and save it with skyvern_workflow_create 3. **Test** — Run the workflow with skyvern_workflow_run and check results with skyvern_workflow_status -## Workflows vs Scripts +**Managing automations** (running, listing, or monitoring workflows): +No browser session needed — use workflow tools directly: +skyvern_workflow_list, skyvern_workflow_run, skyvern_workflow_status, etc. -When the user wants something **persistent, versioned, and managed in Skyvern's dashboard** — create a workflow. -Trigger words: "automation", "workflow", "reusable", "schedule", "monitor", "set up" -→ Use skyvern_workflow_create with a JSON definition (see example below) +## Building Workflows -When the user wants **custom Python code** to run in their own environment — write an SDK script. 
-Trigger words: "script", "code", "function", "program" -→ Use `from skyvern import Skyvern` (see Writing Scripts section) +Before creating a workflow, call skyvern_block_schema() to discover available block types and their JSON schemas. +Validate blocks with skyvern_block_validate() before submitting. -### Workflow definition example (JSON, for skyvern_workflow_create): - { - "title": "Price Monitor", - "workflow_definition": { - "parameters": [ - {"parameter_type": "workflow", "key": "url", "workflow_parameter_type": "string"} - ], - "blocks": [ - {"block_type": "task", "label": "extract_prices", "url": "{{url}}", "engine": "skyvern-2.0", - "navigation_goal": "Extract all product names and prices from the page", - "data_extraction_goal": "Get product names and prices as a list", - "data_schema": {"type": "object", "properties": {"products": {"type": "array", - "items": {"type": "object", "properties": {"name": {"type": "string"}, "price": {"type": "string"}}}}}}} - ] - } - } -Use `{{parameter_key}}` to reference workflow parameters in block fields. -To inspect a real workflow for reference, use skyvern_workflow_get on an existing workflow. +ALWAYS split workflows into multiple blocks — one task block per logical step: + +GOOD (4 blocks, each with clear single responsibility): + Block 1: "Select Sole Proprietor and click Continue" + Block 2: "Fill in the business name and click Continue" + Block 3: "Enter owner info and SSN, click Continue" + Block 4: "Review and submit. Extract the confirmation number." + +BAD (1 giant block trying to do everything): + Block 1: "Go to the IRS site, select sole proprietor, fill in name, enter SSN, review, submit, and extract the EIN" + +Use {{parameter_key}} to reference workflow input parameters in any block field. 
+ +## Data Flow Between Blocks +- Use `{{parameter_key}}` to reference workflow input parameters in any block field +- Blocks in the same workflow run share the same browser session automatically +- To inspect a real workflow for reference, use skyvern_workflow_get on an existing workflow + +## Block Types Reference +Common block types for workflow definitions: +- **task** — AI agent interacts with a page (the most common block type) +- **for_loop** — iterate over a list of items +- **conditional** — branch based on conditions +- **code** — run Python code for data transformation +- **text_prompt** — LLM text generation (no browser) +- **extraction** — extract data from current page +- **action** — single AI action on current page +- **navigation** — navigate to a URL +- **wait** — pause for a condition or time +- **login** — log into a site using stored credentials +- **validation** — assert a condition on the page +- **http_request** — call an external API +- **send_email** — send a notification email +- **file_download** / **file_upload** — download or upload files +- **goto_url** — navigate to a specific URL within a workflow + +For full schemas and descriptions, call skyvern_block_schema(). ## Writing Scripts and Code When asked to write an automation script, use the Skyvern Python SDK with the **hybrid xpath+prompt @@ -150,66 +205,9 @@ The `resolved_selector` field in responses gives you the xpath the AI resolved t IMPORTANT: NEVER import from skyvern.cli.mcp_tools — those are internal server modules. The public SDK is: from skyvern import Skyvern -## Primary Tools (use these first) -These are the tools you should reach for by default: +Every tool response includes an `sdk_equivalent` field showing the corresponding SDK call for scripts. +Currently only skyvern_click returns `resolved_selector`. Support for other tools is planned (SKY-7905). 
-- **skyvern_act** — Execute actions from natural language: "log in with test@example.com", "add the first item to cart". Best for exploration and testing flows. -- **skyvern_extract** — Pull structured data from any page with natural language + optional JSON Schema. THE differentiator over raw Playwright. -- **skyvern_validate** — Assert page conditions with AI: "is the user logged in?", "does the cart have 3 items?" -- **skyvern_run_task** — Delegate a full multi-step task to an autonomous AI agent with observability. Use for end-to-end task execution. -- **skyvern_navigate** — Go to a URL. Always the first step after connecting. -- **skyvern_screenshot** — See what's on the page. Essential for understanding page state. -- **skyvern_evaluate** — Run JavaScript to read DOM state, get URLs, or check values. - -## Precision Tools (for debugging and exact control) -Use these when the primary tools aren't specific enough, or when you need deterministic -selector-based actions (e.g., replaying a known flow): - -- **skyvern_click** — Click a specific element by selector or AI intent -- **skyvern_type** — Type into a specific input field by selector or AI intent -- **skyvern_scroll** — Scroll the page or an element into view -- **skyvern_select_option** — Select a dropdown option by selector or AI intent -- **skyvern_press_key** — Press a keyboard key (Enter, Tab, Escape, etc.) -- **skyvern_wait** — Wait for a condition, element, or time delay - -## Tool Modes (precision tools) -Precision tools support three modes. When unsure, use `intent`. - -1. **Intent mode** — AI-powered element finding: - `skyvern_click(intent="the blue Submit button")` - -2. **Hybrid mode** — tries selector first, AI fallback: - `skyvern_click(selector="#submit-btn", intent="the Submit button")` - -3. 
**Selector mode** — deterministic CSS/XPath targeting: - `skyvern_click(selector="#submit-btn")` - -## Replay Story: From Exploration to Production -When you use precision tools (skyvern_click, skyvern_type, etc.) with intent mode, the response -includes `resolved_selector` — the xpath/CSS the AI found. Capture these for hybrid scripts or -workflow definitions. - -**The hybrid pattern** is the recommended default for SDK scripts: - await page.click("xpath=//button[@id='submit']", prompt="the Submit button") -It tries the selector first (fast, no AI cost), then falls back to AI if the selector breaks. - -The `sdk_equivalent` field in each tool response shows the correct hybrid call to use in scripts. - -Note: Currently only skyvern_click returns resolved_selector. Support for skyvern_type and -skyvern_select_option is planned (SKY-7905). - -## Workflow Management -Use these tools to create, manage, and run Skyvern workflows programmatically. -Workflows are persistent, versioned, multi-step automations that can be parameterized and scheduled. 
- -- **skyvern_workflow_list** — Find workflows by name or browse all available workflows -- **skyvern_workflow_get** — Get the full definition of a workflow to inspect its blocks and parameters -- **skyvern_workflow_create** — Create a new workflow from a YAML or JSON definition -- **skyvern_workflow_update** — Update an existing workflow's definition (creates a new version) -- **skyvern_workflow_delete** — Delete a workflow (requires force=true confirmation) -- **skyvern_workflow_run** — Execute a workflow with parameters (returns immediately by default, or wait for completion) -- **skyvern_workflow_status** — Check the status and progress of a running or completed workflow run -- **skyvern_workflow_cancel** — Cancel a running workflow """, ) @@ -237,6 +235,10 @@ mcp.tool()(skyvern_select_option) mcp.tool()(skyvern_press_key) mcp.tool()(skyvern_wait) +# -- Block discovery + validation (no browser needed) -- +mcp.tool()(skyvern_block_schema) +mcp.tool()(skyvern_block_validate) + # -- Workflow management (CRUD + execution, no browser needed) -- mcp.tool()(skyvern_workflow_list) mcp.tool()(skyvern_workflow_get) @@ -270,6 +272,9 @@ __all__ = [ "skyvern_select_option", "skyvern_press_key", "skyvern_wait", + # Block discovery + validation + "skyvern_block_schema", + "skyvern_block_validate", # Workflow management "skyvern_workflow_list", "skyvern_workflow_get", diff --git a/skyvern/cli/mcp_tools/blocks.py b/skyvern/cli/mcp_tools/blocks.py new file mode 100644 index 00000000..4c00f99b --- /dev/null +++ b/skyvern/cli/mcp_tools/blocks.py @@ -0,0 +1,444 @@ +"""Skyvern MCP block tools — discover block types and schemas for workflow definitions. + +Tools for listing available workflow block types and retrieving their Pydantic schemas, +knowledge base descriptions, and minimal examples. These tools do not require a browser +session or API connection — they serve pure metadata from the codebase. 
+""" + +from __future__ import annotations + +import json +import re +from pathlib import Path +from typing import Annotated, Any + +import structlog +from pydantic import Field, TypeAdapter, ValidationError + +from skyvern.schemas.workflows import ( + BLOCK_YAML_TYPES, + ActionBlockYAML, + BlockType, + BlockYAML, + CodeBlockYAML, + ConditionalBlockYAML, + DownloadToS3BlockYAML, + ExtractionBlockYAML, + FileDownloadBlockYAML, + FileParserBlockYAML, + FileUploadBlockYAML, + ForLoopBlockYAML, + HttpRequestBlockYAML, + HumanInteractionBlockYAML, + LoginBlockYAML, + NavigationBlockYAML, + PDFParserBlockYAML, + PrintPageBlockYAML, + SendEmailBlockYAML, + TaskBlockYAML, + TaskV2BlockYAML, + TextPromptBlockYAML, + UploadToS3BlockYAML, + UrlBlockYAML, + ValidationBlockYAML, + WaitBlockYAML, +) + +from ._common import ErrorCode, make_error, make_result + +LOG = structlog.get_logger(__name__) + +# --------------------------------------------------------------------------- +# Block type → YAML class mapping +# --------------------------------------------------------------------------- + +BLOCK_TYPE_MAP: dict[str, type[BlockYAML]] = { + BlockType.TASK.value: TaskBlockYAML, + BlockType.TaskV2.value: TaskV2BlockYAML, + BlockType.FOR_LOOP.value: ForLoopBlockYAML, + BlockType.CONDITIONAL.value: ConditionalBlockYAML, + BlockType.CODE.value: CodeBlockYAML, + BlockType.TEXT_PROMPT.value: TextPromptBlockYAML, + BlockType.EXTRACTION.value: ExtractionBlockYAML, + BlockType.ACTION.value: ActionBlockYAML, + BlockType.NAVIGATION.value: NavigationBlockYAML, + BlockType.LOGIN.value: LoginBlockYAML, + BlockType.WAIT.value: WaitBlockYAML, + BlockType.VALIDATION.value: ValidationBlockYAML, + BlockType.HTTP_REQUEST.value: HttpRequestBlockYAML, + BlockType.SEND_EMAIL.value: SendEmailBlockYAML, + BlockType.FILE_DOWNLOAD.value: FileDownloadBlockYAML, + BlockType.FILE_UPLOAD.value: FileUploadBlockYAML, + BlockType.GOTO_URL.value: UrlBlockYAML, + BlockType.DOWNLOAD_TO_S3.value: DownloadToS3BlockYAML, 
+ BlockType.UPLOAD_TO_S3.value: UploadToS3BlockYAML, + BlockType.FILE_URL_PARSER.value: FileParserBlockYAML, + BlockType.PDF_PARSER.value: PDFParserBlockYAML, + BlockType.HUMAN_INTERACTION.value: HumanInteractionBlockYAML, + BlockType.PRINT_PAGE.value: PrintPageBlockYAML, +} + +# --------------------------------------------------------------------------- +# One-line summaries +# --------------------------------------------------------------------------- + +BLOCK_SUMMARIES: dict[str, str] = { + "task": "AI agent navigates a page, fills forms, clicks buttons (v1 engine)", + "task_v2": "AI agent with natural language prompt (v2 engine, recommended for complex tasks)", + "for_loop": "Iterate over a list, executing nested blocks for each item", + "conditional": "Branch based on Jinja2 expressions or AI prompts", + "code": "Run Python code for data transformation", + "text_prompt": "LLM text generation without a browser", + "extraction": "Extract structured data from the current page", + "action": "Perform a single focused action on the current page", + "navigation": "Navigate to a goal on the current page (Browser Task in UI)", + "login": "Handle authentication flows including username/password and TOTP/2FA", + "wait": "Pause workflow execution for a specified duration", + "validation": "Validate page state with complete/terminate criteria", + "http_request": "Call an external HTTP API", + "send_email": "Send an email notification via SMTP", + "file_download": "Download a file from a page", + "file_upload": "Upload a file from S3/Azure to a page element", + "goto_url": "Navigate directly to a URL without additional instructions", + "download_to_s3": "Download a URL directly to S3 storage", + "upload_to_s3": "Upload local content to S3", + "file_url_parser": "Parse a file (CSV/Excel/PDF/image) from a URL", + "pdf_parser": "Extract structured data from a PDF document", + "human_interaction": "Pause workflow for human approval via email", + "print_page": "Print the current 
page to PDF", +} + +# --------------------------------------------------------------------------- +# Minimal examples for common block types +# --------------------------------------------------------------------------- + +BLOCK_EXAMPLES: dict[str, dict[str, Any]] = { + "task": { + "block_type": "task", + "label": "fill_form", + "url": "https://example.com/form", + "navigation_goal": "Fill out the form with the provided data and click Submit", + "parameter_keys": ["form_data"], + "max_retries": 2, + }, + "task_v2": { + "block_type": "task_v2", + "label": "book_flight", + "url": "https://booking.example.com", + "prompt": "Book a flight from {{ origin }} to {{ destination }} on {{ date }}", + }, + "for_loop": { + "block_type": "for_loop", + "label": "process_each_url", + "loop_over_parameter_key": "urls", + "loop_blocks": [ + { + "block_type": "goto_url", + "label": "open_url", + "url": "{{ current_value }}", + } + ], + }, + "conditional": { + "block_type": "conditional", + "label": "route_by_status", + "branch_conditions": [ + { + "criteria": { + "criteria_type": "jinja2_template", + "expression": "{{ status == 'active' }}", + }, + "next_block_label": "handle_active", + "is_default": False, + }, + {"is_default": True, "next_block_label": "handle_inactive"}, + ], + }, + "extraction": { + "block_type": "extraction", + "label": "extract_products", + "data_extraction_goal": "Extract all products with name, price, and stock status", + "data_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"type": "number"}, + "in_stock": {"type": "boolean"}, + }, + }, + }, + }, + "navigation": { + "block_type": "navigation", + "label": "search_and_open", + "url": "https://example.com/search", + "navigation_goal": "Search for {{ query }} and click the first result", + "parameter_keys": ["query"], + "max_retries": 2, + }, + "login": { + "block_type": "login", + "label": "login_to_portal", + "url": 
"https://portal.example.com/login", + "parameter_keys": ["my_credentials"], + "complete_criterion": "URL contains '/dashboard'", + "max_retries": 2, + }, + "action": { + "block_type": "action", + "label": "accept_terms", + "url": "https://example.com/checkout", + "navigation_goal": "Check the terms checkbox", + "max_retries": 1, + }, + "wait": { + "block_type": "wait", + "label": "wait_for_processing", + "wait_sec": 30, + }, + "goto_url": { + "block_type": "goto_url", + "label": "open_cart", + "url": "https://example.com/cart", + }, +} + +# --------------------------------------------------------------------------- +# Knowledge base parsing (lazy, cached) +# --------------------------------------------------------------------------- + +_KB_PATH = Path(__file__).resolve().parents[2] / "forge" / "prompts" / "skyvern" / "workflow_knowledge_base.txt" + +_HEADER_RE = re.compile(r"^\*\*\s+(.+?)\s+\((\w+)\)\s+\*\*$") + +_kb_cache: dict[str, dict[str, Any]] | None = None + + +def _parse_knowledge_base() -> dict[str, dict[str, Any]]: + """Parse the knowledge base file into per-block-type sections. + + Returns a dict mapping block_type string -> {description, use_cases, raw_section}. + Results are cached in the module-level ``_kb_cache`` variable. 
+ """ + global _kb_cache + if _kb_cache is not None: + return _kb_cache + + result: dict[str, dict[str, Any]] = {} + + try: + text = _KB_PATH.read_text(encoding="utf-8") + except FileNotFoundError: + LOG.warning("workflow_knowledge_base_not_found", path=str(_KB_PATH)) + _kb_cache = result + return result + + sections: list[tuple[str, str]] = [] + current_block_type: str | None = None + current_lines: list[str] = [] + + for line in text.splitlines(): + match = _HEADER_RE.match(line.strip()) + if match: + if current_block_type is not None: + sections.append((current_block_type, "\n".join(current_lines))) + current_block_type = match.group(2).lower() + current_lines = [] + elif current_block_type is not None: + current_lines.append(line) + + if current_block_type is not None: + sections.append((current_block_type, "\n".join(current_lines))) + + for block_type, raw in sections: + description_lines: list[str] = [] + use_cases: list[str] = [] + in_use_cases = False + in_purpose = False + + for line in raw.splitlines(): + stripped = line.strip() + + if stripped.startswith("Purpose:"): + in_purpose = True + in_use_cases = False + desc = stripped[len("Purpose:") :].strip() + if desc: + description_lines.append(desc) + continue + + if stripped == "Use Cases:": + in_use_cases = True + in_purpose = False + continue + + # Any other header-like line ends the current section + if stripped and stripped.endswith(":") and not stripped.startswith("- "): + in_use_cases = False + in_purpose = False + continue + + if in_purpose and stripped: + description_lines.append(stripped) + + if in_use_cases and stripped.startswith("- "): + use_cases.append(stripped[2:].strip()) + + result[block_type] = { + "description": " ".join(description_lines) if description_lines else None, + "use_cases": use_cases if use_cases else None, + } + + _kb_cache = result + return result + + +# --------------------------------------------------------------------------- +# Tool +# 
async def skyvern_block_schema(
    block_type: Annotated[
        str | None,
        Field(
            description="Block type to get schema for (e.g., 'task_v2', 'for_loop'). Omit to list all available types."
        ),
    ] = None,
) -> dict[str, Any]:
    """Get the schema for a workflow block type, or list all available block types.

    Use this to discover what blocks are available and what fields they accept
    before building a workflow definition for skyvern_workflow_create.

    Call with no arguments to see all block types. Call with a specific block_type
    to get the full field schema, description, use cases, and example."""
    # NOTE(review): the docstring above is kept verbatim on purpose — this function is
    # registered via mcp.tool(), so it is presumably surfaced as the tool description;
    # confirm against FastMCP before rewording it.

    tool_name = "skyvern_block_schema"

    # Listing mode: no type requested, so return the one-line summary of every type.
    if block_type is None:
        listing = {
            "block_types": BLOCK_SUMMARIES,
            "count": len(BLOCK_SUMMARIES),
            "hint": "Call skyvern_block_schema(block_type='task_v2') for the full schema of a specific type",
        }
        return make_result(tool_name, data=listing)

    # Lookup is whitespace- and case-insensitive.
    key = block_type.strip().lower()
    if key not in BLOCK_TYPE_MAP:
        known_types = ", ".join(sorted(BLOCK_TYPE_MAP.keys()))
        problem = make_error(
            ErrorCode.INVALID_INPUT,
            f"Unknown block type: {block_type!r}",
            f"Available types: {known_types}",
        )
        return make_result(tool_name, ok=False, error=problem)

    model_cls = BLOCK_TYPE_MAP[key]

    # Description and use cases come from the knowledge base file when it has an
    # entry for this type; otherwise both are reported as None.
    kb_entry = _parse_knowledge_base().get(key, {})

    details = {
        "block_type": key,
        "summary": BLOCK_SUMMARIES.get(key, ""),
        "description": kb_entry.get("description"),
        "use_cases": kb_entry.get("use_cases"),
        "schema": model_cls.model_json_schema(),
        "example": BLOCK_EXAMPLES.get(key),
    }
    return make_result(tool_name, data=details)
# ---------------------------------------------------------------------------
# Block validation adapter (lazy)
# ---------------------------------------------------------------------------

# BLOCK_YAML_TYPES is a large Union of ~23 block models; mypy/pyright cannot resolve it as a TypeAdapter generic argument
_block_adapter: TypeAdapter[BLOCK_YAML_TYPES] | None = None  # type: ignore[type-arg]

# How many individual validation errors are spelled out in the error message.
_MAX_REPORTED_ERRORS = 5


def _get_block_adapter() -> TypeAdapter[BLOCK_YAML_TYPES]:  # type: ignore[type-arg]
    """Return the shared ``TypeAdapter`` for block definitions, building it on first use."""
    global _block_adapter
    if _block_adapter is None:
        _block_adapter = TypeAdapter(BLOCK_YAML_TYPES)
    return _block_adapter


def _format_validation_errors(details: list[dict[str, Any]]) -> str:
    """Render pydantic error details as one human-readable summary line.

    At most ``_MAX_REPORTED_ERRORS`` entries are listed; when more exist, the
    number omitted is stated explicitly so the caller knows the list is partial.
    """
    total = len(details)
    rendered = []
    for err in details[:_MAX_REPORTED_ERRORS]:
        loc = " → ".join(str(p) for p in err["loc"]) if err["loc"] else "(root)"
        rendered.append(f"{loc}: {err['msg']}")

    message = f"Block validation failed ({total} error{'s' if total != 1 else ''}): " + "; ".join(rendered)
    hidden = total - len(rendered)
    if hidden > 0:
        message += f" (+{hidden} more not shown)"
    return message


# ---------------------------------------------------------------------------
# Validate tool
# ---------------------------------------------------------------------------


async def skyvern_block_validate(
    block_json: Annotated[
        str,
        Field(description="JSON string of a single block definition to validate"),
    ],
) -> dict[str, Any]:
    """Validate a workflow block definition before using it in skyvern_workflow_create.

    Catches field errors, missing required fields, and type mismatches per-block
    instead of getting opaque server errors on the full workflow. Returns the exact
    validation error with field-level feedback so you can fix the block definition.
    """
    action = "skyvern_block_validate"

    # Step 1: the input must be parseable JSON.
    try:
        raw = json.loads(block_json)
    except (json.JSONDecodeError, TypeError) as exc:
        return make_result(
            action,
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                f"Invalid JSON: {exc}",
                "Provide a valid JSON string representing a block definition",
            ),
        )

    # Step 2: a block definition is a JSON object, not a list or scalar.
    if not isinstance(raw, dict):
        return make_result(
            action,
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                f"Expected a JSON object, got {type(raw).__name__}",
                "Provide a JSON object with at least block_type and label fields",
            ),
        )

    # Step 3: validate against the union of block models.
    try:
        block = _get_block_adapter().validate_python(raw)
    except ValidationError as exc:
        # errors() is called once and reused for both the count and the listing.
        return make_result(
            action,
            ok=False,
            error=make_error(
                ErrorCode.INVALID_INPUT,
                _format_validation_errors(exc.errors()),
                "Fix the fields listed above. Call skyvern_block_schema(block_type='...') to see the correct schema.",
            ),
        )

    return make_result(
        action,
        data={
            "valid": True,
            "block_type": block.block_type,
            "label": block.label,
            # Number of explicitly provided fields, not counting the block_type discriminator.
            "field_count": len([f for f in block.model_fields_set if f != "block_type"]),
        },
    )
""" if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"): return make_result( @@ -90,17 +101,29 @@ async def skyvern_click( session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, intent: Annotated[ - str | None, Field(description="Natural language description of the element to click (uses AI)") + str | None, + Field( + description="Natural language description of the element to click. Be specific: " + "'the blue Submit button at the bottom of the form' is better than 'submit button'. " + "Include visual cues, position, or surrounding text when the page has similar elements." + ), ] = None, selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None, - timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000, + timeout: Annotated[ + int, + Field( + description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)", + ge=1000, + le=60000, + ), + ] = 30000, button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None, click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None, ) -> dict[str, Any]: """Click an element on the page. Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation. - Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting, - or both for resilience (tries selector first, falls back to AI). + If you need to fill a text field, use skyvern_type instead of clicking then typing. + For dropdowns, use skyvern_select_option. For multiple actions in sequence, prefer skyvern_act. 
""" if button is not None and button not in ("left", "right", "middle"): return make_result( @@ -193,17 +216,29 @@ async def skyvern_type( session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, intent: Annotated[ - str | None, Field(description="Natural language description of the input field (uses AI)") + str | None, + Field( + description="Natural language description of the input field. Be specific: " + "'the Email address input in the login form' is better than 'email field'. " + "Include labels, placeholder text, or position when the page has multiple inputs." + ), ] = None, selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None, - timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000, + timeout: Annotated[ + int, + Field( + description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)", + ge=1000, + le=60000, + ), + ] = 30000, clear: Annotated[bool, Field(description="Clear existing content before typing")] = True, delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None, ) -> dict[str, Any]: """Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation. - Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting, - or both for resilience (tries selector first, falls back to AI). Clears existing content by default. + For dropdowns, use skyvern_select_option instead. For pressing keys (Enter, Tab), use skyvern_press_key. + Clears existing content by default (set clear=false to append). 
""" ai_mode, err = _resolve_ai_mode(selector, intent) if err: @@ -293,8 +328,10 @@ async def skyvern_screenshot( selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None, inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False, ) -> dict[str, Any]: - """See what's currently on the page. Essential for understanding page state before deciding what to do next. + """See what's currently on the page. Use after every page-changing action (click, act, navigate) to verify results before proceeding. + Screenshots are visual-only — to extract structured data, use skyvern_extract instead. + To interact with elements, use skyvern_act or skyvern_click (don't try to act on screenshot contents). By default saves to ~/.skyvern/artifacts/ and returns the file path. Set inline=true to get base64 data directly (increases token usage). """ @@ -461,13 +498,14 @@ async def skyvern_select_option( cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None, selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None, - timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000, + timeout: Annotated[ + int, Field(description="Max time to wait for the dropdown in ms. Default 30000 (30s)", ge=1000, le=60000) + ] = 30000, by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False, ) -> dict[str, Any]: - """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision. + """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision, or both for resilient automation. 
- Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting, - or both for resilience (tries selector first, falls back to AI). + For free-text input fields, use skyvern_type instead. For non-dropdown buttons or links, use skyvern_click. """ ai_mode, err = _resolve_ai_mode(selector, intent) if err: @@ -734,11 +772,6 @@ async def skyvern_evaluate( ) -# --------------------------------------------------------------------------- -# AI Differentiator Tools -# --------------------------------------------------------------------------- - - async def skyvern_extract( prompt: Annotated[str, "Natural language description of what data to extract from the page"], session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, @@ -747,8 +780,10 @@ async def skyvern_extract( str | None, Field(description="JSON Schema string defining the expected output structure") ] = None, ) -> dict[str, Any]: - """Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language. + """Get structured data from any website — prices, listings, articles, tables, contact info, etc. Use this instead of writing scraping code or guessing API endpoints. Describe what you need in natural language. + Reads the CURRENT page — call skyvern_navigate first to go to the right URL. + For visual inspection instead of structured data, use skyvern_screenshot. Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string). 
""" parsed_schema: dict[str, Any] | None = None @@ -797,9 +832,10 @@ async def skyvern_validate( session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, ) -> dict[str, Any]: - """Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?' + """Check if something is true on the current page using AI — 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?' - Returns whether the described condition is true or false. + Reads the CURRENT page — navigate first. Returns true/false. + To extract data (not just check a condition), use skyvern_extract instead. """ try: page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) @@ -832,10 +868,12 @@ async def skyvern_act( session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None, cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None, ) -> dict[str, Any]: - """Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task. + """Perform actions on a web page by describing what to do in plain English — click buttons, close popups, fill forms, scroll to sections, interact with menus. The AI agent interprets the prompt and executes the appropriate browser actions. - For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead. + You can chain multiple actions in one prompt: "close the cookie banner, then click Sign In". + For multi-step automations (4+ pages), use skyvern_workflow_create with one block per step. + For quick one-off multi-page tasks, use skyvern_run_task. 
""" try: page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) @@ -878,10 +916,10 @@ async def skyvern_run_task( int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800) ] = 180, ) -> dict[str, Any]: - """Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end. + """Run a quick, one-off web task via an autonomous AI agent. Nothing is saved — use for throwaway tests and exploration only. Best for tasks describable in 2-3 sentences. - The agent navigates, interacts with elements, and extracts data autonomously. - For simple single-step actions, use skyvern_act instead. + For anything reusable, multi-step, or worth keeping, use skyvern_workflow_create instead — it produces a versioned, rerunnable workflow with per-step observability. + For simple single-step actions on the current page, use skyvern_act instead. """ try: page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url) diff --git a/skyvern/cli/mcp_tools/workflow.py b/skyvern/cli/mcp_tools/workflow.py index b8ca602d..66848ea4 100644 --- a/skyvern/cli/mcp_tools/workflow.py +++ b/skyvern/cli/mcp_tools/workflow.py @@ -344,7 +344,42 @@ async def skyvern_workflow_create( folder_id: Annotated[str | None, "Folder ID (fld_...) to organize the workflow in"] = None, ) -> dict[str, Any]: """Create a new Skyvern workflow from a YAML or JSON definition. Use when you need to save - a new automation workflow that can be run repeatedly with different parameters.""" + a new automation workflow that can be run repeatedly with different parameters. + + Best practice: use one task block per logical step with a short focused prompt (2-3 sentences). + Common block types: task, for_loop, conditional, code, text_prompt, extraction, action, navigation, wait, login. + Call skyvern_block_schema() for the full list with schemas and examples. 
+ + Example JSON definition (multi-block EIN application): + + { + "title": "Apply for EIN", + "workflow_definition": { + "parameters": [ + {"parameter_type": "workflow", "key": "business_name", "workflow_parameter_type": "string"}, + {"parameter_type": "workflow", "key": "owner_name", "workflow_parameter_type": "string"}, + {"parameter_type": "workflow", "key": "owner_ssn", "workflow_parameter_type": "string"} + ], + "blocks": [ + {"block_type": "task", "label": "select_entity_type", + "url": "https://sa.www4.irs.gov/modiein/individual/index.jsp", + "engine": "skyvern-2.0", + "navigation_goal": "Select 'Sole Proprietor' as the entity type and click Continue."}, + {"block_type": "task", "label": "enter_business_info", "engine": "skyvern-2.0", + "navigation_goal": "Fill in the business name as '{{business_name}}' and click Continue."}, + {"block_type": "task", "label": "enter_owner_info", "engine": "skyvern-2.0", + "navigation_goal": "Enter the responsible party name '{{owner_name}}' and SSN '{{owner_ssn}}'. Click Continue."}, + {"block_type": "task", "label": "confirm_and_submit", "engine": "skyvern-2.0", + "navigation_goal": "Review the information on the confirmation page and click Submit.", + "data_extraction_goal": "Extract the assigned EIN number", + "data_schema": {"type": "object", "properties": {"ein": {"type": "string"}}}} + ] + } + } + + Use {{parameter_key}} to reference workflow input parameters in any block field. + Blocks in the same run share the same browser session automatically. + """ if format not in ("json", "yaml", "auto"): return make_result( "skyvern_workflow_create",