Extract shared core from MCP tools, add CLI browser commands (#4768)

This commit is contained in:
Marc Kelechava
2026-02-17 11:24:56 -08:00
committed by GitHub
parent aacc612365
commit 7c5be8fefe
14 changed files with 1304 additions and 113 deletions

View File

@@ -0,0 +1,87 @@
"""Shared browser operations for MCP tools and CLI commands.
Each function: validate inputs -> call SDK -> return typed result.
Session resolution and output formatting are caller responsibilities.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any
from .guards import GuardError
@dataclass
class NavigateResult:
    """Outcome of a navigation, as assembled by ``do_navigate``."""

    # ``page.url`` as read after ``goto`` returned.
    url: str
    # Value returned by ``await page.title()``.
    title: str
@dataclass
class ScreenshotResult:
    """Screenshot payload together with the ``full_page`` flag that was requested."""

    # Whatever the SDK ``screenshot()`` call returned.
    data: bytes
    # The caller's ``full_page`` argument echoed back; False unless specified.
    full_page: bool = False
@dataclass
class ActResult:
    """Record of a prompt-driven action issued through ``do_act``."""

    # The prompt that was handed to ``page.act``.
    prompt: str
    # Completion flag; True unless the constructor is told otherwise.
    completed: bool = True
@dataclass
class ExtractResult:
    """Container for the value produced by an extraction call."""

    # Raw return value of ``page.extract``; its shape is not constrained here.
    extracted: Any = None
def parse_extract_schema(schema: str | dict[str, Any] | None) -> dict[str, Any] | None:
"""Parse and validate an extraction schema payload."""
if schema is None:
return None
if isinstance(schema, dict):
return schema
try:
return json.loads(schema)
except (json.JSONDecodeError, TypeError) as e:
raise GuardError(f"Invalid JSON schema: {e}", "Provide schema as a valid JSON string")
async def do_navigate(
    page: Any,
    url: str,
    timeout: int = 30000,
    wait_until: str | None = None,
) -> NavigateResult:
    """Send ``page`` to ``url`` and report where it ended up.

    Args:
        page: Object exposing awaitable ``goto`` and ``title`` plus a ``url`` attribute.
        url: Destination handed to ``page.goto``.
        timeout: Forwarded unchanged to ``page.goto``.
        wait_until: Forwarded unchanged to ``page.goto``; not validated here.

    Returns:
        A ``NavigateResult`` holding ``page.url`` and the page title, both read
        after ``goto`` has returned.
    """
    await page.goto(url, timeout=timeout, wait_until=wait_until)
    landed_url = page.url
    page_title = await page.title()
    return NavigateResult(url=landed_url, title=page_title)
async def do_screenshot(
    page: Any,
    full_page: bool = False,
    selector: str | None = None,
) -> ScreenshotResult:
    """Capture either a single element or the page itself.

    Args:
        page: Object exposing ``locator`` and an awaitable ``screenshot``.
        full_page: Passed to ``page.screenshot`` when no selector is given.
        selector: When truthy, the element located by it is captured instead
            of the page.

    Returns:
        A ``ScreenshotResult`` with the captured data. ``full_page`` is echoed
        back exactly as passed, including when a selector capture was taken.
    """
    if not selector:
        shot = await page.screenshot(full_page=full_page)
    else:
        target = page.locator(selector)
        shot = await target.screenshot()
    return ScreenshotResult(data=shot, full_page=full_page)
async def do_act(page: Any, prompt: str) -> ActResult:
    """Run ``page.act`` with ``prompt`` and report it as completed.

    The return value of ``page.act`` is discarded; reaching the return
    statement (i.e. no exception was raised) is what marks completion.
    """
    await page.act(prompt)
    outcome = ActResult(prompt=prompt, completed=True)
    return outcome
async def do_extract(
    page: Any,
    prompt: str,
    schema: str | dict[str, Any] | None = None,
) -> ExtractResult:
    """Extract data from ``page`` according to ``prompt`` and an optional schema.

    Args:
        page: Object exposing an awaitable ``extract(prompt=..., schema=...)``.
        prompt: Extraction instruction forwarded to ``page.extract``.
        schema: ``None``, a dict, or a JSON string; normalised through
            ``parse_extract_schema`` before the SDK call.

    Returns:
        An ``ExtractResult`` wrapping whatever ``page.extract`` returned.

    Raises:
        GuardError: Propagated from ``parse_extract_schema`` when ``schema``
            is not decodable JSON; the page is not contacted in that case.
    """
    # Normalise the schema first so malformed input fails before any SDK call.
    schema_obj = parse_extract_schema(schema)
    payload = await page.extract(prompt=prompt, schema=schema_obj)
    return ExtractResult(extracted=payload)

View File

@@ -0,0 +1,81 @@
"""Shared input validation guards for both MCP and CLI surfaces."""
from __future__ import annotations
import re
# Natural-language terms that mark a prompt as credential-related.
# Matching is case-insensitive and anchored on word boundaries.
PASSWORD_PATTERN = re.compile(
    r"\bpass(?:word|phrase|code)s?\b|\bsecret\b|\bcredential\b|\bpin\s*(?:code)?\b|\bpwd\b|\bpasswd\b",
    re.IGNORECASE,
)

# JavaScript that refers to a password input and, later on the same line
# (``.`` does not cross newlines here), assigns to ``.value``.
JS_PASSWORD_PATTERN = re.compile(
    r"""(?:type\s*=\s*['"]?password|\.type\s*===?\s*['"]password|input\[type=password\]).*?\.value\s*=""",
    re.IGNORECASE,
)

# Remediation text attached to the GuardError raised by the credential guards.
CREDENTIAL_HINT = (
    "Use skyvern_login with a stored credential to authenticate. "
    "Create credentials via CLI: skyvern credentials add. "
    "Never pass passwords through tool calls."
)

# Closed sets of accepted option values.
VALID_WAIT_UNTIL = ("load", "domcontentloaded", "networkidle", "commit")
VALID_BUTTONS = ("left", "right", "middle")
VALID_ELEMENT_STATES = ("visible", "hidden", "attached", "detached")
class GuardError(Exception):
    """Raised when an input guard blocks an operation.

    Attributes:
        hint: Suggested remedy supplied by the raiser; empty string when
            none was given.
    """

    def __init__(self, message: str, hint: str = "") -> None:
        # ``message`` becomes the exception's sole positional arg (``str(err)``).
        self.hint = hint
        super().__init__(message)
def check_password_prompt(text: str) -> None:
    """Reject prompts that mention password or credential terms.

    Args:
        text: Prompt text to scan with ``PASSWORD_PATTERN``.

    Raises:
        GuardError: If any part of ``text`` matches; carries ``CREDENTIAL_HINT``.
    """
    if PASSWORD_PATTERN.search(text) is None:
        return
    raise GuardError(
        "Cannot perform password/credential actions — credentials must not be passed through tool calls",
        CREDENTIAL_HINT,
    )
def check_js_password(expression: str) -> None:
    """Reject JavaScript that assigns a value to a password field.

    Args:
        expression: JavaScript source to scan with ``JS_PASSWORD_PATTERN``.

    Raises:
        GuardError: If the pattern is found; carries ``CREDENTIAL_HINT``.
    """
    if JS_PASSWORD_PATTERN.search(expression) is None:
        return
    raise GuardError(
        "Cannot set password field values via JavaScript — credentials must not be passed through tool calls",
        CREDENTIAL_HINT,
    )
def validate_wait_until(value: str | None) -> None:
if value is not None and value not in VALID_WAIT_UNTIL:
raise GuardError(
f"Invalid wait_until: {value}",
"Use load, domcontentloaded, networkidle, or commit",
)
def validate_button(value: str | None) -> None:
if value is not None and value not in VALID_BUTTONS:
raise GuardError(f"Invalid button: {value}", "Use left, right, or middle")
def resolve_ai_mode(
selector: str | None,
intent: str | None,
) -> tuple[str | None, str | None]:
"""Determine AI mode from selector/intent combination.
Returns (ai_mode, error_code) -- if error_code is set, the call should fail.
"""
if intent and not selector:
return "proactive", None
if intent and selector:
return "fallback", None
if selector and not intent:
return None, None
return None, "INVALID_INPUT"

View File

@@ -0,0 +1,74 @@
"""Shared session operations for MCP tools and CLI commands."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from skyvern.schemas.runs import ProxyLocation
@dataclass
class SessionCreateResult:
session_id: str | None
local: bool = False
headless: bool = False
timeout_minutes: int | None = None
@dataclass
class SessionCloseResult:
session_id: str | None
closed: bool = True
@dataclass
class SessionInfo:
session_id: str
status: str | None
started_at: str | None
timeout: int | None
runnable_id: str | None = None
available: bool = False
async def do_session_create(
    skyvern: Any,
    timeout: int = 60,
    proxy_location: str | None = None,
    local: bool = False,
    headless: bool = False,
) -> tuple[Any, SessionCreateResult]:
    """Launch a local or cloud browser and describe the resulting session.

    Args:
        skyvern: Client exposing awaitable ``launch_local_browser`` and
            ``launch_cloud_browser``.
        timeout: Forwarded to ``launch_cloud_browser`` and recorded as
            ``timeout_minutes``; unused for local launches.
        proxy_location: When truthy, converted with ``ProxyLocation(...)``
            for the cloud launch; unused for local launches.
        local: Launch a local browser instead of a cloud one.
        headless: Forwarded to ``launch_local_browser``; unused for cloud launches.

    Returns:
        ``(browser, result)``. Local results carry ``session_id=None``; cloud
        results carry ``browser.browser_session_id``.
    """
    if not local:
        if proxy_location:
            proxy = ProxyLocation(proxy_location)
        else:
            proxy = None
        cloud_browser = await skyvern.launch_cloud_browser(timeout=timeout, proxy_location=proxy)
        cloud_result = SessionCreateResult(
            session_id=cloud_browser.browser_session_id,
            timeout_minutes=timeout,
        )
        return cloud_browser, cloud_result

    local_browser = await skyvern.launch_local_browser(headless=headless)
    local_result = SessionCreateResult(session_id=None, local=True, headless=headless)
    return local_browser, local_result
async def do_session_close(skyvern: Any, session_id: str) -> SessionCloseResult:
    """Ask ``skyvern`` to close the session identified by ``session_id``.

    Returns:
        A ``SessionCloseResult`` for ``session_id`` (``closed`` left at its
        default) once ``close_browser_session`` has returned without raising.
    """
    await skyvern.close_browser_session(session_id)
    outcome = SessionCloseResult(session_id=session_id)
    return outcome
async def do_session_list(skyvern: Any) -> list[SessionInfo]:
    """Fetch all browser sessions and convert each one to a ``SessionInfo``.

    Returns:
        One ``SessionInfo`` per item yielded by ``skyvern.get_browser_sessions()``,
        in the same order.
    """
    listing: list[SessionInfo] = []
    for session in await skyvern.get_browser_sessions():
        started = session.started_at
        # Flagged available when nothing is attached (no runnable_id) and a
        # browser address is present.
        is_available = session.runnable_id is None and session.browser_address is not None
        listing.append(
            SessionInfo(
                session_id=session.browser_session_id,
                status=session.status,
                started_at=started.isoformat() if started else None,
                timeout=session.timeout,
                runnable_id=session.runnable_id,
                available=is_available,
            )
        )
    return listing