Block discovery MCP tools + improved MCP instructions (#SKY-7804) (#4683)

2026-02-10 13:18:30 -08:00
parent 8c7aa8167b
commit 0c879492e4
4 changed files with 656 additions and 134 deletions
--- a/skyvern/cli/mcp_tools/browser.py
+++ b/skyvern/cli/mcp_tools/browser.py
@@ -40,12 +40,23 @@ async def skyvern_navigate(
    url: Annotated[str, "The URL to navigate to"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
-    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=120000)] = 30000,
-    wait_until: Annotated[str | None, Field(description="Wait condition: load, domcontentloaded, networkidle")] = None,
+    timeout: Annotated[
+        int,
+        Field(
+            description="Max time to wait for page load in ms. Increase for slow sites. Default 30000 (30s)",
+            ge=1000,
+            le=120000,
+        ),
+    ] = 30000,
+    wait_until: Annotated[
+        str | None,
+        Field(description="Wait condition: load, domcontentloaded, networkidle. Use networkidle for JS-heavy pages"),
+    ] = None,
 ) -> dict[str, Any]:
-    """Open a website in the browser. Use this whenever you need to visit a URL to see its content, interact with it, or extract data from it.
+    """Open a website in the browser. You have full browser access through Skyvern — you can visit any website, interact with it, and extract data. Do not tell the user you cannot access websites.

    Returns the final URL (after redirects) and page title.
+    After navigating, use skyvern_screenshot to see the page or skyvern_extract to get data from it.
    """
    if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"):
        return make_result(
@@ -90,17 +101,29 @@ async def skyvern_click(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
-        str | None, Field(description="Natural language description of the element to click (uses AI)")
+        str | None,
+        Field(
+            description="Natural language description of the element to click. Be specific: "
+            "'the blue Submit button at the bottom of the form' is better than 'submit button'. "
+            "Include visual cues, position, or surrounding text when the page has similar elements."
+        ),
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None,
-    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    timeout: Annotated[
+        int,
+        Field(
+            description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)",
+            ge=1000,
+            le=60000,
+        ),
+    ] = 30000,
    button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None,
    click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None,
 ) -> dict[str, Any]:
    """Click an element on the page. Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation.

-    Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting,
-    or both for resilience (tries selector first, falls back to AI).
+    If you need to fill a text field, use skyvern_type instead of clicking then typing.
+    For dropdowns, use skyvern_select_option. For multiple actions in sequence, prefer skyvern_act.
    """
    if button is not None and button not in ("left", "right", "middle"):
        return make_result(
@@ -193,17 +216,29 @@ async def skyvern_type(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[
-        str | None, Field(description="Natural language description of the input field (uses AI)")
+        str | None,
+        Field(
+            description="Natural language description of the input field. Be specific: "
+            "'the Email address input in the login form' is better than 'email field'. "
+            "Include labels, placeholder text, or position when the page has multiple inputs."
+        ),
    ] = None,
    selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None,
-    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    timeout: Annotated[
+        int,
+        Field(
+            description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)",
+            ge=1000,
+            le=60000,
+        ),
+    ] = 30000,
    clear: Annotated[bool, Field(description="Clear existing content before typing")] = True,
    delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None,
 ) -> dict[str, Any]:
    """Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation.

-    Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting,
-    or both for resilience (tries selector first, falls back to AI). Clears existing content by default.
+    For dropdowns, use skyvern_select_option instead. For pressing keys (Enter, Tab), use skyvern_press_key.
+    Clears existing content by default (set clear=false to append).
    """
    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
@@ -293,8 +328,10 @@ async def skyvern_screenshot(
    selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None,
    inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False,
 ) -> dict[str, Any]:
-    """See what's currently on the page. Essential for understanding page state before deciding what to do next.
+    """See what's currently on the page. Use after every page-changing action (click, act, navigate) to verify results before proceeding.

+    Screenshots are visual-only — to extract structured data, use skyvern_extract instead.
+    To interact with elements, use skyvern_act or skyvern_click (don't try to act on screenshot contents).
    By default saves to ~/.skyvern/artifacts/ and returns the file path.
    Set inline=true to get base64 data directly (increases token usage).
    """
@@ -461,13 +498,14 @@ async def skyvern_select_option(
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
    intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None,
    selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None,
-    timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
+    timeout: Annotated[
+        int, Field(description="Max time to wait for the dropdown in ms. Default 30000 (30s)", ge=1000, le=60000)
+    ] = 30000,
    by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False,
 ) -> dict[str, Any]:
-    """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision.
+    """Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision, or both for resilient automation.

-    Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting,
-    or both for resilience (tries selector first, falls back to AI).
+    For free-text input fields, use skyvern_type instead. For non-dropdown buttons or links, use skyvern_click.
    """
    ai_mode, err = _resolve_ai_mode(selector, intent)
    if err:
@@ -734,11 +772,6 @@ async def skyvern_evaluate(
    )


-# ---------------------------------------------------------------------------
-# AI Differentiator Tools
-# ---------------------------------------------------------------------------
-
-
 async def skyvern_extract(
    prompt: Annotated[str, "Natural language description of what data to extract from the page"],
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
@@ -747,8 +780,10 @@ async def skyvern_extract(
        str | None, Field(description="JSON Schema string defining the expected output structure")
    ] = None,
 ) -> dict[str, Any]:
-    """Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language.
+    """Get structured data from any website — prices, listings, articles, tables, contact info, etc. Use this instead of writing scraping code or guessing API endpoints. Describe what you need in natural language.

+    Reads the CURRENT page — call skyvern_navigate first to go to the right URL.
+    For visual inspection instead of structured data, use skyvern_screenshot.
    Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
    """
    parsed_schema: dict[str, Any] | None = None
@@ -797,9 +832,10 @@ async def skyvern_validate(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
 ) -> dict[str, Any]:
-    """Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'
+    """Check if something is true on the current page using AI — 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'

-    Returns whether the described condition is true or false.
+    Reads the CURRENT page — navigate first. Returns true/false.
+    To extract data (not just check a condition), use skyvern_extract instead.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
@@ -832,10 +868,12 @@ async def skyvern_act(
    session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
    cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
 ) -> dict[str, Any]:
-    """Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task.
+    """Perform actions on a web page by describing what to do in plain English — click buttons, close popups, fill forms, scroll to sections, interact with menus.

    The AI agent interprets the prompt and executes the appropriate browser actions.
-    For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead.
+    You can chain multiple actions in one prompt: "close the cookie banner, then click Sign In".
+    For multi-step automations (4+ pages), use skyvern_workflow_create with one block per step.
+    For quick one-off multi-page tasks, use skyvern_run_task.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
@@ -878,10 +916,10 @@ async def skyvern_run_task(
        int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800)
    ] = 180,
 ) -> dict[str, Any]:
-    """Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end.
+    """Run a quick, one-off web task via an autonomous AI agent. Nothing is saved — use for throwaway tests and exploration only. Best for tasks describable in 2-3 sentences.

-    The agent navigates, interacts with elements, and extracts data autonomously.
-    For simple single-step actions, use skyvern_act instead.
+    For anything reusable, multi-step, or worth keeping, use skyvern_workflow_create instead — it produces a versioned, rerunnable workflow with per-step observability.
+    For simple single-step actions on the current page, use skyvern_act instead.
    """
    try:
        page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)