Extract shared core from MCP tools, add CLI browser commands (#4768)

This commit is contained in:
Marc Kelechava
2026-02-17 11:24:56 -08:00
committed by GitHub
parent aacc612365
commit 7c5be8fefe
14 changed files with 1304 additions and 113 deletions

View File

@@ -4,13 +4,24 @@ import asyncio
import base64
import json
import logging
import re
from datetime import datetime, timezone
from typing import Annotated, Any
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from pydantic import Field
from skyvern.cli.core.browser_ops import do_act, do_extract, do_navigate, do_screenshot, parse_extract_schema
from skyvern.cli.core.guards import (
CREDENTIAL_HINT,
JS_PASSWORD_PATTERN,
PASSWORD_PATTERN,
GuardError,
check_password_prompt,
)
from skyvern.cli.core.guards import resolve_ai_mode as _resolve_ai_mode
from skyvern.cli.core.guards import (
validate_wait_until,
)
from skyvern.schemas.run_blocks import CredentialType
from ._common import (
@@ -24,39 +35,6 @@ from ._session import BrowserNotAvailableError, get_page, no_browser_error
LOG = logging.getLogger(__name__)
_PASSWORD_PATTERN = re.compile(
r"\bpass(?:word|phrase|code)s?\b|\bsecret\b|\bcredential\b|\bpin\s*(?:code)?\b|\bpwd\b|\bpasswd\b",
re.IGNORECASE,
)
_CREDENTIAL_ERROR_HINT = (
"Use skyvern_login with a stored credential to authenticate. "
"Create credentials via CLI: skyvern credentials add. "
"Never pass passwords through tool calls."
)
_JS_PASSWORD_PATTERN = re.compile(
r"""(?:type\s*=\s*['"]?password|\.type\s*===?\s*['"]password|input\[type=password\]).*?\.value\s*=""",
re.IGNORECASE,
)
def _resolve_ai_mode(
selector: str | None,
intent: str | None,
) -> tuple[str | None, str | None]:
"""Determine AI mode from selector/intent combination.
Returns (ai_mode, error_code) — if error_code is set, the call should fail.
"""
if intent and not selector:
return "proactive", None
if intent and selector:
return "fallback", None
if selector and not intent:
return None, None
return None, "INVALID_INPUT"
async def skyvern_navigate(
url: Annotated[str, "The URL to navigate to"],
@@ -80,15 +58,13 @@ async def skyvern_navigate(
Returns the final URL (after redirects) and page title.
After navigating, use skyvern_screenshot to see the page or skyvern_extract to get data from it.
"""
if wait_until is not None and wait_until not in ("load", "domcontentloaded", "networkidle", "commit"):
try:
validate_wait_until(wait_until)
except GuardError as e:
return make_result(
"skyvern_navigate",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
f"Invalid wait_until: {wait_until}",
"Use load, domcontentloaded, networkidle, or commit",
),
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
try:
@@ -98,10 +74,16 @@ async def skyvern_navigate(
with Timer() as timer:
try:
await page.goto(url, timeout=timeout, wait_until=wait_until)
result = await do_navigate(page, url, timeout=timeout, wait_until=wait_until)
timer.mark("sdk")
final_url = page.url
title = await page.title()
except GuardError as e:
return make_result(
"skyvern_navigate",
ok=False,
browser_context=ctx,
timing_ms=timer.timing_ms,
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
except Exception as e:
return make_result(
"skyvern_navigate",
@@ -114,7 +96,7 @@ async def skyvern_navigate(
return make_result(
"skyvern_navigate",
browser_context=ctx,
data={"url": final_url, "title": title, "sdk_equivalent": f'await page.goto("{url}")'},
data={"url": result.url, "title": result.title, "sdk_equivalent": f'await page.goto("{url}")'},
timing_ms=timer.timing_ms,
)
@@ -355,14 +337,14 @@ async def skyvern_type(
"""
# Block password entry — redirect to skyvern_login
target_text = f"{intent or ''} {selector or ''}"
if _PASSWORD_PATTERN.search(target_text):
if PASSWORD_PATTERN.search(target_text):
return make_result(
"skyvern_type",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
"Cannot type into password fields — credentials must not be passed through tool calls",
_CREDENTIAL_ERROR_HINT,
CREDENTIAL_HINT,
),
)
@@ -402,7 +384,7 @@ async def skyvern_type(
error=make_error(
ErrorCode.INVALID_INPUT,
"Cannot type into password fields — credentials must not be passed through tool calls",
_CREDENTIAL_ERROR_HINT,
CREDENTIAL_HINT,
),
)
@@ -491,11 +473,7 @@ async def skyvern_screenshot(
with Timer() as timer:
try:
if selector:
element = page.locator(selector)
screenshot_bytes = await element.screenshot()
else:
screenshot_bytes = await page.screenshot(full_page=full_page)
result = await do_screenshot(page, full_page=full_page, selector=selector)
timer.mark("sdk")
except Exception as e:
return make_result(
@@ -507,7 +485,7 @@ async def skyvern_screenshot(
)
if inline:
data_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
data_b64 = base64.b64encode(result.data).decode("utf-8")
return make_result(
"skyvern_screenshot",
browser_context=ctx,
@@ -515,7 +493,7 @@ async def skyvern_screenshot(
"inline": True,
"data": data_b64,
"mime": "image/png",
"bytes": len(screenshot_bytes),
"bytes": len(result.data),
"sdk_equivalent": "await page.screenshot()",
},
timing_ms=timer.timing_ms,
@@ -525,7 +503,7 @@ async def skyvern_screenshot(
ts = datetime.now(timezone.utc).strftime("%H%M%S_%f")
filename = f"screenshot_{ts}.png"
artifact = save_artifact(
screenshot_bytes,
result.data,
kind="screenshot",
filename=filename,
mime="image/png",
@@ -896,14 +874,14 @@ async def skyvern_evaluate(
Security: This executes arbitrary JS in the page context. Only use with trusted expressions.
"""
# Block JS that sets password field values
if _JS_PASSWORD_PATTERN.search(expression):
if JS_PASSWORD_PATTERN.search(expression):
return make_result(
"skyvern_evaluate",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
"Cannot set password field values via JavaScript — credentials must not be passed through tool calls",
_CREDENTIAL_ERROR_HINT,
CREDENTIAL_HINT,
),
)
@@ -947,20 +925,17 @@ async def skyvern_extract(
For visual inspection instead of structured data, use skyvern_screenshot.
Optionally provide a JSON `schema` to enforce the output structure (pass as a JSON string).
"""
parsed_schema: dict[str, Any] | None = None
if schema is not None:
try:
parsed_schema = json.loads(schema)
except (json.JSONDecodeError, TypeError) as e:
parsed_schema = parse_extract_schema(schema)
except GuardError as e:
return make_result(
"skyvern_extract",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
f"Invalid JSON schema: {e}",
"Provide schema as a valid JSON string",
),
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
else:
parsed_schema = None
try:
page, ctx = await get_page(session_id=session_id, cdp_url=cdp_url)
@@ -969,8 +944,16 @@ async def skyvern_extract(
with Timer() as timer:
try:
extracted = await page.extract(prompt=prompt, schema=parsed_schema)
result = await do_extract(page, prompt, schema=parsed_schema)
timer.mark("sdk")
except GuardError as e:
return make_result(
"skyvern_extract",
ok=False,
browser_context=ctx,
timing_ms=timer.timing_ms,
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
except Exception as e:
return make_result(
"skyvern_extract",
@@ -983,7 +966,10 @@ async def skyvern_extract(
return make_result(
"skyvern_extract",
browser_context=ctx,
data={"extracted": extracted, "sdk_equivalent": f'await page.extract(prompt="{prompt}")'},
data={
"extracted": result.extracted,
"sdk_equivalent": f'await page.extract(prompt="{prompt}")',
},
timing_ms=timer.timing_ms,
)
@@ -1037,16 +1023,13 @@ async def skyvern_act(
For multi-step automations (4+ pages), use skyvern_workflow_create with one block per step.
For quick one-off multi-page tasks, use skyvern_run_task.
"""
# Block login/password actions — redirect to skyvern_login
if _PASSWORD_PATTERN.search(prompt):
try:
check_password_prompt(prompt)
except GuardError as e:
return make_result(
"skyvern_act",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
"Cannot perform password/credential actions — credentials must not be passed through tool calls",
_CREDENTIAL_ERROR_HINT,
),
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
try:
@@ -1056,8 +1039,16 @@ async def skyvern_act(
with Timer() as timer:
try:
await page.act(prompt)
result = await do_act(page, prompt)
timer.mark("sdk")
except GuardError as e:
return make_result(
"skyvern_act",
ok=False,
browser_context=ctx,
timing_ms=timer.timing_ms,
error=make_error(ErrorCode.INVALID_INPUT, str(e), e.hint),
)
except Exception as e:
return make_result(
"skyvern_act",
@@ -1070,7 +1061,11 @@ async def skyvern_act(
return make_result(
"skyvern_act",
browser_context=ctx,
data={"prompt": prompt, "completed": True, "sdk_equivalent": f'await page.act("{prompt}")'},
data={
"prompt": result.prompt,
"completed": result.completed,
"sdk_equivalent": f'await page.act("{prompt}")',
},
timing_ms=timer.timing_ms,
)
@@ -1099,14 +1094,14 @@ async def skyvern_run_task(
For simple single-step actions on the current page, use skyvern_act instead.
"""
# Block password/credential actions — redirect to skyvern_login
if _PASSWORD_PATTERN.search(prompt):
if PASSWORD_PATTERN.search(prompt):
return make_result(
"skyvern_run_task",
ok=False,
error=make_error(
ErrorCode.INVALID_INPUT,
"Cannot perform password/credential actions — credentials must not be passed through tool calls",
_CREDENTIAL_ERROR_HINT,
CREDENTIAL_HINT,
),
)