Block discovery MCP tools + improved MCP instructions (#SKY-7804) (#4683)

This commit is contained in:
Marc Kelechava
2026-02-10 13:18:30 -08:00
committed by GitHub
parent 8c7aa8167b
commit 0c879492e4
4 changed files with 656 additions and 134 deletions

View File

@@ -40,12 +40,23 @@ async def skyvern_navigate(
url: Annotated[str, "The URL to navigate to"],
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=120000)] = 30000,
wait_until: Annotated[str | None, Field(description="Wait condition: load, domcontentloaded, networkidle")] = None,
timeout: Annotated[
int,
Field(
description="Max time to wait for page load in ms. Increase for slow sites. Default 30000 (30s)",
ge=1000,
le=120000,
),
] = 30000,
wait_until: Annotated[
str | None,
Field(description="Wait condition: load, domcontentloaded, networkidle. Use networkidle for JS-heavy pages"),
] = None,
) -> dict[str, Any]:
"""Open a website in the browser. Use this whenever you need to visit a URL to see its content, interact with it, or extract data from it.
"""Open a website in the browser. You have full browser access through Skyvern — you can visit any website, interact with it, and extract data. Do not tell the user you cannot access websites.
Returns the final URL (after redirects) and page title.
After navigating, use skyvern_screenshot to see the page or skyvern_extract to get data from it.
"""
if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"):
return make_result(
@@ -90,17 +101,29 @@ async def skyvern_click(
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
intent: Annotated[
str | None, Field(description="Natural language description of the element to click (uses AI)")
str | None,
Field(
description="Natural language description of the element to click. Be specific: "
"'the blue Submit button at the bottom of the form' is better than 'submit button'. "
"Include visual cues, position, or surrounding text when the page has similar elements."
),
] = None,
selector: Annotated[str | None, Field(description="CSS selector or XPath for the element to click")] = None,
timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
timeout: Annotated[
int,
Field(
description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)",
ge=1000,
le=60000,
),
] = 30000,
button: Annotated[str | None, Field(description="Mouse button: left, right, middle")] = None,
click_count: Annotated[int | None, Field(description="Number of clicks (2 for double-click)")] = None,
) -> dict[str, Any]:
"""Click an element on the page. Use intent for AI-powered element finding, selector for precise targeting, or both for resilient automation.
Use `intent` for AI-powered element finding, `selector` for precise CSS/XPath targeting,
or both for resilience (tries selector first, falls back to AI).
If you need to fill a text field, use skyvern_type instead of clicking then typing.
For dropdowns, use skyvern_select_option. For multiple actions in sequence, prefer skyvern_act.
"""
if button is not None and button not in ("left", "right", "middle"):
return make_result(
@@ -193,17 +216,29 @@ async def skyvern_type(
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
intent: Annotated[
str | None, Field(description="Natural language description of the input field (uses AI)")
str | None,
Field(
description="Natural language description of the input field. Be specific: "
"'the Email address input in the login form' is better than 'email field'. "
"Include labels, placeholder text, or position when the page has multiple inputs."
),
] = None,
selector: Annotated[str | None, Field(description="CSS selector or XPath for the input element")] = None,
timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
timeout: Annotated[
int,
Field(
description="Max time to wait for the element in ms. Increase for slow-loading pages. Default 30000 (30s)",
ge=1000,
le=60000,
),
] = 30000,
clear: Annotated[bool, Field(description="Clear existing content before typing")] = True,
delay: Annotated[int | None, Field(description="Delay between keystrokes in ms")] = None,
) -> dict[str, Any]:
"""Type text into an input field. Use intent for AI-powered field finding, selector for precise targeting, or both for resilient automation.
Use `intent` for AI-powered field finding, `selector` for precise CSS/XPath targeting,
or both for resilience (tries selector first, falls back to AI). Clears existing content by default.
For dropdowns, use skyvern_select_option instead. For pressing keys (Enter, Tab), use skyvern_press_key.
Clears existing content by default (set clear=false to append).
"""
ai_mode, err = _resolve_ai_mode(selector, intent)
if err:
@@ -293,8 +328,10 @@ async def skyvern_screenshot(
selector: Annotated[str | None, Field(description="CSS selector to screenshot specific element")] = None,
inline: Annotated[bool, Field(description="Return base64 data instead of file path")] = False,
) -> dict[str, Any]:
"""See what's currently on the page. Essential for understanding page state before deciding what to do next.
"""See what's currently on the page. Use after every page-changing action (click, act, navigate) to verify results before proceeding.
Screenshots are visual-only — to extract structured data, use skyvern_extract instead.
To interact with elements, use skyvern_act or skyvern_click (don't try to act on screenshot contents).
By default saves to ~/.skyvern/artifacts/ and returns the file path.
Set inline=true to get base64 data directly (increases token usage).
"""
@@ -461,13 +498,14 @@ async def skyvern_select_option(
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
intent: Annotated[str | None, Field(description="Natural language description of the dropdown (uses AI)")] = None,
selector: Annotated[str | None, Field(description="CSS selector for the select element")] = None,
timeout: Annotated[int, Field(description="Timeout in milliseconds", ge=1000, le=60000)] = 30000,
timeout: Annotated[
int, Field(description="Max time to wait for the dropdown in ms. Default 30000 (30s)", ge=1000, le=60000)
] = 30000,
by_label: Annotated[bool, Field(description="Select by visible label instead of value")] = False,
) -> dict[str, Any]:
"""Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision.
"""Select an option from a dropdown menu. Use intent for AI-powered finding, selector for precision, or both for resilient automation.
Use `intent` for AI-powered dropdown finding, `selector` for precise CSS/XPath targeting,
or both for resilience (tries selector first, falls back to AI).
For free-text input fields, use skyvern_type instead. For non-dropdown buttons or links, use skyvern_click.
"""
ai_mode, err = _resolve_ai_mode(selector, intent)
if err:
@@ -734,11 +772,6 @@ async def skyvern_evaluate(
)
# ---------------------------------------------------------------------------
# AI Differentiator Tools
# ---------------------------------------------------------------------------
async def skyvern_extract(
prompt: Annotated[str, "Natural language description of what data to extract from the page"],
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
@@ -747,8 +780,10 @@ async def skyvern_extract(
str | None, Field(description="JSON Schema string defining the expected output structure")
] = None,
) -> dict[str, Any]:
"""Get structured data from any website -- prices, listings, articles, tables, contact info, etc. Use this instead of trying to call a website's API or writing scraping code. Describe what you need in natural language.
"""Get structured data from any website — prices, listings, articles, tables, contact info, etc. Use this instead of writing scraping code or guessing API endpoints. Describe what you need in natural language.
Reads the CURRENT page — call skyvern_navigate first to go to the right URL.
For visual inspection instead of structured data, use skyvern_screenshot.
Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
"""
parsed_schema: dict[str, Any] | None = None
@@ -797,9 +832,10 @@ async def skyvern_validate(
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
"""Check if something is true on the current page using AI -- 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'
"""Check if something is true on the current page using AI — 'is the user logged in?', 'does the cart have 3 items?', 'is the form submitted?'
Returns whether the described condition is true or false.
Reads the CURRENT page — navigate first. Returns true/false.
To extract data (not just check a condition), use skyvern_extract instead.
"""
try:
page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
@@ -832,10 +868,12 @@ async def skyvern_act(
session_id: Annotated[str | None, Field(description="Browser session ID (pbs_...)")] = None,
cdp_url: Annotated[str | None, Field(description="CDP WebSocket URL")] = None,
) -> dict[str, Any]:
"""Perform actions on a web page by describing what to do in plain English -- click buttons, close popups, fill forms, scroll to sections, interact with menus. Use for any website interaction task.
"""Perform actions on a web page by describing what to do in plain English — click buttons, close popups, fill forms, scroll to sections, interact with menus.
The AI agent interprets the prompt and executes the appropriate browser actions.
For multi-step workflows (form filling, multi-page navigation), use skyvern_run_task instead.
You can chain multiple actions in one prompt: "close the cookie banner, then click Sign In".
For multi-step automations (4+ pages), use skyvern_workflow_create with one block per step.
For quick one-off multi-page tasks, use skyvern_run_task.
"""
try:
page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
@@ -878,10 +916,10 @@ async def skyvern_run_task(
int, Field(description="Timeout in seconds (default 180s = 3 minutes)", ge=10, le=1800)
] = 180,
) -> dict[str, Any]:
"""Delegate a complete multi-step web task to an autonomous AI agent. Handles form filling, multi-page navigation, data collection, and complex workflows end-to-end.
"""Run a quick, one-off web task via an autonomous AI agent. Nothing is saved — use for throwaway tests and exploration only. Best for tasks describable in 2-3 sentences.
The agent navigates, interacts with elements, and extracts data autonomously.
For simple single-step actions, use skyvern_act instead.
For anything reusable, multi-step, or worth keeping, use skyvern_workflow_create instead — it produces a versioned, rerunnable workflow with per-step observability.
For simple single-step actions on the current page, use skyvern_act instead.
"""
try:
page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)