Files
Dorod-Sky/skyvern/cli/commands/browser.py

921 lines
39 KiB
Python

from __future__ import annotations
import asyncio
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal
import typer
from skyvern.cli.commands._output import output, output_error
from skyvern.cli.commands._state import CLIState, clear_state, load_state, save_state
from skyvern.cli.core.artifacts import save_artifact
from skyvern.cli.core.browser_ops import do_act, do_extract, do_navigate, do_screenshot
from skyvern.cli.core.client import get_skyvern
from skyvern.cli.core.guards import (
CREDENTIAL_HINT,
PASSWORD_PATTERN,
VALID_ELEMENT_STATES,
GuardError,
check_js_password,
check_password_prompt,
resolve_ai_mode,
validate_button,
validate_wait_until,
)
from skyvern.cli.core.session_ops import do_session_close, do_session_create, do_session_list
from skyvern.cli.mcp_tools.browser import skyvern_login as tool_login
from skyvern.cli.mcp_tools.browser import skyvern_run_task as tool_run_task
# Root Typer application for the browser automation commands; prints help when
# invoked without arguments.
browser_app = typer.Typer(help="Browser automation commands.", no_args_is_help=True)
# Sub-application holding the session management commands.
session_app = typer.Typer(help="Manage browser sessions.", no_args_is_help=True)
# Mount the session commands under the "session" name of the browser app.
browser_app.add_typer(session_app, name="session")
@dataclass(frozen=True)
class ConnectionTarget:
mode: Literal["cloud", "cdp"]
session_id: str | None = None
cdp_url: str | None = None
def _resolve_connection(session: str | None, cdp: str | None) -> ConnectionTarget:
    """Pick the browser connection a command should use.

    Explicit flags win over persisted state. When neither flag is given the
    saved CLI state is consulted: a state whose mode is "cdp" and that carries
    a CDP URL is preferred, then any saved session ID, then any saved CDP URL.

    Raises:
        typer.BadParameter: both flags were supplied, or no flag was supplied
            and the saved state yields no usable target.
    """
    if session and cdp:
        raise typer.BadParameter("Pass only one of --session or --cdp.")
    if session:
        return ConnectionTarget(mode="cloud", session_id=session)
    if cdp:
        return ConnectionTarget(mode="cdp", cdp_url=cdp)

    saved = load_state()
    if saved:
        saved_cdp_url = saved.cdp_url
        if saved_cdp_url and saved.mode == "cdp":
            return ConnectionTarget(mode="cdp", cdp_url=saved_cdp_url)
        if saved.session_id:
            return ConnectionTarget(mode="cloud", session_id=saved.session_id)
        if saved_cdp_url:
            return ConnectionTarget(mode="cdp", cdp_url=saved_cdp_url)

    no_connection_message = (
        "No active browser connection. Create one with: skyvern browser session create\n"
        "Or connect with: skyvern browser session connect --cdp ws://...\n"
        "Or specify: --session pbs_... / --cdp ws://..."
    )
    raise typer.BadParameter(no_connection_message)
async def _connect_browser(connection: ConnectionTarget) -> Any:
    """Open a browser handle for ``connection`` through the Skyvern client.

    Raises:
        typer.BadParameter: the target lacks the identifier its mode needs
            (session ID for cloud, CDP URL otherwise).
    """
    client = get_skyvern()

    if connection.mode != "cloud":
        cdp_url = connection.cdp_url
        if not cdp_url:
            raise typer.BadParameter("CDP mode requires --cdp or an active CDP URL in state.")
        return await client.connect_to_browser_over_cdp(cdp_url)

    session_id = connection.session_id
    if not session_id:
        raise typer.BadParameter("Cloud mode requires --session or an active cloud session in state.")
    return await client.connect_to_cloud_browser_session(session_id)
def _resolve_ai_target(selector: str | None, intent: str | None, *, operation: str) -> str | None:
    """Return the AI mode that ``resolve_ai_mode`` picks for ``selector`` / ``intent``.

    ``operation`` is only used to word the hint shown to the user.

    Raises:
        GuardError: ``resolve_ai_mode`` reported an error for the given pair.
    """
    mode, problem = resolve_ai_mode(selector, intent)
    if not problem:
        return mode

    hint = (
        f"Use intent='describe what to {operation}' for AI-powered targeting, "
        "or selector='#css-selector' for precise targeting"
    )
    raise GuardError("Must provide intent, selector, or both", hint)
def _validate_wait_state(state: str) -> None:
    """Raise ``GuardError`` unless ``state`` is a member of ``VALID_ELEMENT_STATES``."""
    if state in VALID_ELEMENT_STATES:
        return
    raise GuardError(f"Invalid state: {state}", "Use visible, hidden, attached, or detached")
def _emit_tool_result(result: dict[str, Any], *, json_output: bool, action: str) -> None:
if json_output:
json.dump(result, sys.stdout, indent=2, default=str)
sys.stdout.write("\n")
if not result.get("ok", False):
raise SystemExit(1)
return
if result.get("ok", False):
output(result.get("data"), action=action, json_mode=False)
return
err = result.get("error") or {}
output_error(str(err.get("message") or "Unknown error"), hint=str(err.get("hint") or ""), json_mode=False)
# ---------------------------------------------------------------------------
# Session commands
# ---------------------------------------------------------------------------
@session_app.command("create")
def session_create(
    timeout: int = typer.Option(60, help="Session timeout in minutes."),
    proxy: str | None = typer.Option(None, help="Proxy location (e.g. RESIDENTIAL)."),
    local: bool = typer.Option(False, "--local", help="Launch a local browser instead of cloud."),
    headless: bool = typer.Option(False, "--headless", help="Run local browser headless."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Create a new browser session."""
    # --local is rejected up front; --headless is accepted but not read below.
    # NOTE(review): this presumably relies on output_error ending the process — confirm.
    if local:
        output_error(
            "Local browser sessions are not yet supported in CLI mode.",
            hint="Use MCP (skyvern run mcp) for local browser sessions, or omit --local for cloud sessions.",
            json_mode=json_output,
        )

    async def _create_cloud_session() -> dict:
        """Create the session, persist it as the active cloud state, and summarise it."""
        client = get_skyvern()
        _unused_browser, created = await do_session_create(client, timeout=timeout, proxy_location=proxy)
        active_state = CLIState(session_id=created.session_id, cdp_url=None, mode="cloud")
        save_state(active_state)
        return {
            "session_id": created.session_id,
            "mode": "cloud",
            "timeout_minutes": created.timeout_minutes,
        }

    try:
        payload = asyncio.run(_create_cloud_session())
        output(payload, action="session_create", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check your API key and network connection.", json_mode=json_output)
@session_app.command("close")
def session_close(
    session: str | None = typer.Option(None, help="Browser session ID to close."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL to detach from."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Close a browser session."""

    async def _close_or_detach() -> dict:
        """Close the resolved cloud session, or just clear saved state for a CDP target."""
        target = _resolve_connection(session, cdp)

        if target.mode != "cdp":
            if not target.session_id:
                raise typer.BadParameter("Cloud mode requires a browser session ID.")
            client = get_skyvern()
            outcome = await do_session_close(client, target.session_id)
            clear_state()
            return {"session_id": outcome.session_id, "closed": outcome.closed}

        # CDP targets are only detached: the saved state is cleared, nothing is closed.
        clear_state()
        return {"cdp_url": target.cdp_url, "closed": False, "detached": True}

    try:
        payload = asyncio.run(_close_or_detach())
        output(payload, action="session_close", json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Verify the session ID or CDP URL is correct.", json_mode=json_output)
@session_app.command("connect")
def session_connect(
    session: str | None = typer.Option(None, help="Cloud browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Connect to an existing browser session (cloud or CDP) and persist it as active state."""
    # Unlike the other commands, saved state is never a fallback here: a flag is required.
    if not (session or cdp):
        raise typer.BadParameter("Specify one of --session or --cdp.")

    async def _connect_and_persist() -> dict:
        """Connect, fetch the working page as a reachability check, then save the state."""
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        await browser.get_working_page()

        if target.mode == "cdp":
            active_state = CLIState(session_id=None, cdp_url=target.cdp_url, mode="cdp")
            summary = {"connected": True, "mode": "cdp", "cdp_url": target.cdp_url}
        else:
            active_state = CLIState(session_id=target.session_id, cdp_url=None, mode="cloud")
            summary = {"connected": True, "mode": "cloud", "session_id": target.session_id}
        save_state(active_state)
        return summary

    try:
        payload = asyncio.run(_connect_and_persist())
        output(payload, action="session_connect", json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Verify the session ID or CDP URL is reachable.", json_mode=json_output)
@session_app.command("list")
def session_list(
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """List all browser sessions."""

    async def _fetch_sessions() -> list[dict]:
        """Fetch the sessions and convert each dataclass record to a plain dict."""
        client = get_skyvern()
        records = await do_session_list(client)
        return list(map(asdict, records))

    try:
        payload = asyncio.run(_fetch_sessions())
        output(payload, action="session_list", json_mode=json_output)
    except Exception as exc:
        output_error(str(exc), hint="Check your API key and network connection.", json_mode=json_output)
@session_app.command("get")
def session_get(
    session: str = typer.Option(..., "--session", "--id", help="Browser session ID."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Get details for a browser session."""

    def _iso_or_none(moment: Any) -> Any:
        """Return ``moment.isoformat()`` for a truthy value, otherwise ``None``."""
        return moment.isoformat() if moment else None

    async def _describe_session() -> dict:
        """Fetch the session and flag whether it is the saved active cloud session."""
        client = get_skyvern()
        details = await client.get_browser_session(session)
        saved = load_state()
        matches_saved = bool(saved and saved.mode == "cloud" and saved.session_id == session)
        return {
            "session_id": details.browser_session_id,
            "status": details.status,
            "started_at": _iso_or_none(details.started_at),
            "completed_at": _iso_or_none(details.completed_at),
            "timeout": details.timeout,
            "runnable_id": details.runnable_id,
            "is_current": matches_saved,
        }

    try:
        payload = asyncio.run(_describe_session())
        output(payload, action="session_get", json_mode=json_output)
    except Exception as exc:
        output_error(str(exc), hint="Verify the session ID exists and is accessible.", json_mode=json_output)
# ---------------------------------------------------------------------------
# Browser commands
# ---------------------------------------------------------------------------
@browser_app.command("navigate")
def navigate(
    url: str = typer.Option(..., help="URL to navigate to."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    timeout: int = typer.Option(30000, help="Navigation timeout in milliseconds."),
    wait_until: str | None = typer.Option(None, help="Wait condition: load, domcontentloaded, networkidle, commit."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Navigate to a URL in the browser session."""

    async def _goto() -> dict:
        """Validate ``wait_until``, connect, navigate, and report the final URL and title."""
        # Validation runs before any connection is attempted.
        validate_wait_until(wait_until)
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        nav = await do_navigate(working_page, url, timeout=timeout, wait_until=wait_until)
        return {"url": nav.url, "title": nav.title}

    try:
        payload = asyncio.run(_goto())
        output(payload, action="navigate", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check the URL is valid and the session is active.", json_mode=json_output)
@browser_app.command("screenshot")
def screenshot(
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    full_page: bool = typer.Option(False, "--full-page", help="Capture the full scrollable page."),
    selector: str | None = typer.Option(None, help="CSS selector to screenshot."),
    output_path: str | None = typer.Option(None, "--output", help="Custom output file path."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Take a screenshot of the current page."""

    async def _capture() -> dict:
        """Capture the image and store it at ``output_path`` or as a timestamped artifact."""
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        shot = await do_screenshot(working_page, full_page=full_page, selector=selector)

        if not output_path:
            # No explicit destination: save as an artifact named after the current UTC time.
            stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
            saved = save_artifact(
                content=shot.data,
                kind="screenshot",
                filename=f"screenshot_{stamp}.png",
                mime="image/png",
                session_id=target.session_id,
            )
            return {"path": saved.path, "bytes": saved.bytes, "full_page": shot.full_page}

        destination = Path(output_path)
        # Create missing parent directories so a fresh path can be written.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(shot.data)
        return {"path": str(destination), "bytes": len(shot.data), "full_page": shot.full_page}

    try:
        payload = asyncio.run(_capture())
        output(payload, action="screenshot", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Ensure the session is active and the page has loaded.", json_mode=json_output)
@browser_app.command("evaluate")
def evaluate(
    expression: str = typer.Option(..., help="JavaScript expression to evaluate."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Run JavaScript on the current page."""

    async def _evaluate_on_page() -> dict:
        """Run the password guard on the expression, then evaluate it on the working page."""
        # The guard runs before any connection is attempted.
        check_js_password(expression)
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        value = await working_page.evaluate(expression)
        return {"result": value}

    try:
        payload = asyncio.run(_evaluate_on_page())
        output(payload, action="evaluate", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check JavaScript syntax and page state.", json_mode=json_output)
@browser_app.command("click")
def click(
    intent: str | None = typer.Option(None, help="Natural language description of the element to click."),
    selector: str | None = typer.Option(None, help="CSS selector or XPath for the element to click."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    timeout: int = typer.Option(30000, help="Max wait time in milliseconds."),
    button: str | None = typer.Option(None, help="Mouse button: left, right, or middle."),
    click_count: int | None = typer.Option(None, "--click-count", help="Number of clicks (2 for double-click)."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Click an element using selector, intent, or both."""

    async def _click_target() -> dict:
        """Validate inputs, click the target, and report what was clicked."""
        # Input validation runs before any connection is attempted.
        validate_button(button)
        mode = _resolve_ai_target(selector, intent, operation="click")
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()

        # Optional settings are only forwarded when the caller supplied them.
        click_options: dict[str, Any] = {"timeout": timeout}
        if button:
            click_options["button"] = button
        if click_count is not None:
            click_options["click_count"] = click_count

        if mode is None:
            assert selector is not None
            clicked = await working_page.click(selector=selector, **click_options)
        else:
            clicked = await working_page.click(selector=selector, prompt=intent, ai=mode, **click_options)  # type: ignore[arg-type]

        summary: dict[str, Any] = {"selector": selector, "intent": intent, "ai_mode": mode}
        # Report the value returned by click only when it differs from the selector given.
        if clicked and clicked != selector:
            summary["resolved_selector"] = clicked
        return summary

    try:
        payload = asyncio.run(_click_target())
        output(payload, action="click", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Element may be hidden, disabled, or not yet available.", json_mode=json_output)
@browser_app.command("hover")
def hover(
    intent: str | None = typer.Option(None, help="Natural language description of the element to hover."),
    selector: str | None = typer.Option(None, help="CSS selector or XPath for the element to hover."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    timeout: int = typer.Option(30000, help="Max wait time in milliseconds."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Hover over an element using selector, intent, or both."""

    async def _hover_target() -> dict:
        """Resolve the target, build a locator for it, and hover over it."""
        mode = _resolve_ai_target(selector, intent, operation="hover")
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()

        if mode is None:
            assert selector is not None
            element = working_page.locator(selector)
        else:
            element = working_page.locator(selector=selector, prompt=intent, ai=mode)  # type: ignore[arg-type]
        await element.hover(timeout=timeout)
        return {"selector": selector, "intent": intent, "ai_mode": mode}

    try:
        payload = asyncio.run(_hover_target())
        output(payload, action="hover", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Element may be hidden or not interactable.", json_mode=json_output)
@browser_app.command("type")
def type_text(
    text: str = typer.Option(..., help="Text to type into the input."),
    intent: str | None = typer.Option(None, help="Natural language description of the input field."),
    selector: str | None = typer.Option(None, help="CSS selector or XPath for the input field."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    timeout: int = typer.Option(30000, help="Max wait time in milliseconds."),
    clear: bool = typer.Option(True, "--clear/--no-clear", help="Clear existing content before typing."),
    delay: int | None = typer.Option(None, help="Delay between keystrokes in milliseconds."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Type into an input field using selector, intent, or both."""

    def _password_field_error() -> GuardError:
        """Build the error raised whenever the target looks like a password field."""
        return GuardError(
            "Cannot type into password fields — credentials must not be passed through tool calls",
            CREDENTIAL_HINT,
        )

    async def _type_into_target() -> dict:
        """Guard against password fields, then fill or type ``text`` into the target."""
        # First guard: the wording of the intent/selector itself.
        described_target = f"{intent or ''} {selector or ''}"
        if PASSWORD_PATTERN.search(described_target):
            raise _password_field_error()

        mode = _resolve_ai_target(selector, intent, operation="type")
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()

        # Second guard: ask the DOM whether the selected element is type="password".
        if selector:
            try:
                looks_like_password = await working_page.evaluate(
                    "(s) => { const el = document.querySelector(s); return !!(el && el.type === 'password'); }",
                    selector,
                )
            except Exception:
                # Best effort: a failed probe is treated as "not a password field".
                looks_like_password = False
            if looks_like_password:
                raise _password_field_error()

        if not clear:
            # Typing path: --delay is forwarded only here, and only when given.
            type_options: dict[str, Any] = {"timeout": timeout}
            if delay is not None:
                type_options["delay"] = delay
            if mode is None:
                assert selector is not None
                await working_page.type(selector, text, **type_options)
            else:
                element = working_page.locator(selector=selector, prompt=intent, ai=mode)  # type: ignore[arg-type]
                await element.type(text, **type_options)
        elif mode is None:
            assert selector is not None
            await working_page.fill(selector, text, timeout=timeout)
        else:
            await working_page.fill(selector=selector, value=text, prompt=intent, ai=mode, timeout=timeout)  # type: ignore[arg-type]

        # Only the length of the text is echoed back, never the text itself.
        return {"selector": selector, "intent": intent, "ai_mode": mode, "text_length": len(text)}

    try:
        payload = asyncio.run(_type_into_target())
        output(payload, action="type", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Element may not be editable or may be obscured.", json_mode=json_output)
@browser_app.command("scroll")
def scroll(
    direction: str = typer.Option(..., help="Direction: up, down, left, right."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    amount: int | None = typer.Option(None, help="Pixels to scroll (default 500)."),
    intent: str | None = typer.Option(None, help="Natural language element to scroll into view."),
    selector: str | None = typer.Option(None, help="CSS selector of scrollable element."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Scroll the page or scroll a targeted element into view."""

    async def _scroll() -> dict:
        """Scroll the intent target into view, or scroll the page/element by a pixel offset."""
        # Unit step (x, y) for each accepted direction.
        unit_steps = {"up": (0, -1), "down": (0, 1), "left": (-1, 0), "right": (1, 0)}
        # The direction is only checked when it will be used, i.e. when no intent is given.
        if not intent and direction not in unit_steps:
            raise GuardError(f"Invalid direction: {direction}", "Use up, down, left, or right")

        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()

        if intent:
            mode = "fallback" if selector else "proactive"
            element = working_page.locator(selector=selector, prompt=intent, ai=mode)
            await element.scroll_into_view_if_needed()
            return {"direction": "into_view", "intent": intent, "selector": selector, "ai_mode": mode}

        # `or` means an explicit 0 falls back to 500 as well as None.
        pixels = amount or 500
        step_x, step_y = unit_steps[direction]
        dx = step_x * pixels
        dy = step_y * pixels
        if selector:
            await working_page.locator(selector).evaluate(f"el => el.scrollBy({dx}, {dy})")
        else:
            await working_page.evaluate(f"window.scrollBy({dx}, {dy})")
        return {"direction": direction, "pixels": pixels, "selector": selector}

    try:
        payload = asyncio.run(_scroll())
        output(payload, action="scroll", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Scroll failed; check selector and page readiness.", json_mode=json_output)
@browser_app.command("select")
def select(
    value: str = typer.Option(..., help="Option value to select."),
    intent: str | None = typer.Option(None, help="Natural language description of the dropdown."),
    selector: str | None = typer.Option(None, help="CSS selector for the dropdown."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    timeout: int = typer.Option(30000, help="Max wait time in milliseconds."),
    by_label: bool = typer.Option(False, "--by-label", help="Select by visible label instead of value."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Select an option from a dropdown."""

    async def _select_option() -> dict:
        """Resolve the dropdown and choose ``value`` in it."""
        mode = _resolve_ai_target(selector, intent, operation="select")
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()

        if mode is None:
            assert selector is not None
            if by_label:
                # Label matching goes through the wrapped `.page` locator.
                await working_page.page.locator(selector).select_option(label=value, timeout=timeout)
            else:
                await working_page.select_option(selector, value=value, timeout=timeout)
        else:
            # --by-label is not consulted on this path.
            await working_page.select_option(selector=selector, value=value, prompt=intent, ai=mode, timeout=timeout)  # type: ignore[arg-type]

        return {"selector": selector, "intent": intent, "ai_mode": mode, "value": value, "by_label": by_label}

    try:
        payload = asyncio.run(_select_option())
        output(payload, action="select", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check dropdown selector and available options.", json_mode=json_output)
@browser_app.command("press-key")
def press_key(
    key: str = typer.Option(..., help="Key to press (e.g., Enter, Tab, Escape)."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    intent: str | None = typer.Option(None, help="Natural language description of element to focus first."),
    selector: str | None = typer.Option(None, help="CSS selector to focus first."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Press a keyboard key."""

    async def _press() -> dict:
        """Press ``key`` on the targeted element, or on the page keyboard when untargeted."""
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        summary = {"key": key, "selector": selector, "intent": intent}

        if not (intent or selector):
            # No target given: send the key through the page-level keyboard.
            await working_page.keyboard.press(key)
            return summary

        mode, problem = resolve_ai_mode(selector, intent)
        if problem:
            raise GuardError(
                "Must provide intent, selector, or both",
                "Use intent='describe where to press' or selector='#css-selector'",
            )
        if mode is None:
            assert selector is not None
            element = working_page.locator(selector)
        else:
            element = working_page.locator(selector=selector, prompt=intent, ai=mode)  # type: ignore[arg-type]
        await element.press(key)
        return summary

    try:
        payload = asyncio.run(_press())
        output(payload, action="press_key", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check key name and focused target.", json_mode=json_output)
@browser_app.command("wait")
def wait(
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    time_ms: int | None = typer.Option(None, "--time", help="Milliseconds to wait."),
    intent: str | None = typer.Option(None, help="Natural language condition to wait for."),
    selector: str | None = typer.Option(None, help="CSS selector to wait for."),
    state: str = typer.Option("visible", help="Element state: visible, hidden, attached, detached."),
    timeout: int = typer.Option(30000, help="Max wait time in milliseconds."),
    poll_interval: int = typer.Option(5000, "--poll-interval", help="Polling interval for intent waits in ms."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Wait for time, selector state, or AI condition."""
    # When several targets are given, precedence is --time, then --intent, then --selector.

    async def _poll_intent(page: Any) -> None:
        """Re-check ``intent`` with ``page.validate`` until it holds or ``timeout`` elapses.

        Raises:
            RuntimeError: the deadline passed and the most recent check raised;
                the original exception is chained as the cause.
            TimeoutError: the deadline passed and the most recent check was falsy.
        """
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout / 1000
        while True:
            last_error: Exception | None = None
            try:
                ready = await page.validate(intent)
            except Exception as poll_error:
                # A failing check counts as "not ready"; it is only surfaced at the deadline.
                ready = False
                last_error = poll_error
            if ready:
                return

            remaining_ms = (deadline - loop.time()) * 1000
            if remaining_ms <= 0:
                if last_error is not None:
                    # Chain the cause so the original traceback is not lost.
                    raise RuntimeError(str(last_error)) from last_error
                raise TimeoutError(f"Condition not met within {timeout}ms: {intent}")

            # Never sleep past the deadline: a timeout shorter than the poll
            # interval must not be stretched to a full interval.
            await page.wait_for_timeout(min(poll_interval, remaining_ms))

    async def _run() -> dict:
        """Validate the options, connect, and perform the selected kind of wait."""
        _validate_wait_state(state)
        if time_ms is None and not selector and not intent:
            raise GuardError(
                "Must provide intent, selector, or time_ms",
                "Use --time, --selector, or --intent to specify what to wait for",
            )
        connection = _resolve_connection(session, cdp)
        browser = await _connect_browser(connection)
        page = await browser.get_working_page()

        if time_ms is not None:
            await page.wait_for_timeout(time_ms)
            waited_for = "time"
        elif intent:
            await _poll_intent(page)
            waited_for = "intent"
        else:
            assert selector is not None
            await page.wait_for_selector(selector, state=state, timeout=timeout)
            waited_for = "selector"
        return {"waited_for": waited_for, "state": state, "selector": selector, "intent": intent}

    try:
        data = asyncio.run(_run())
        output(data, action="wait", json_mode=json_output)
    except GuardError as e:
        output_error(str(e), hint=e.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as e:
        output_error(str(e), hint="Condition was not met within timeout.", json_mode=json_output)
@browser_app.command("act")
def act(
    prompt: str = typer.Option(..., help="Natural language action to perform."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Perform a natural language action on the current page."""

    async def _perform_action() -> dict:
        """Run the password guard on the prompt, then execute the action on the working page."""
        # The guard runs before any connection is attempted.
        check_password_prompt(prompt)
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        outcome = await do_act(working_page, prompt)
        return {"prompt": outcome.prompt, "completed": outcome.completed}

    try:
        payload = asyncio.run(_perform_action())
        output(payload, action="act", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Simplify the prompt or break into steps.", json_mode=json_output)
@browser_app.command("extract")
def extract(
    prompt: str = typer.Option(..., help="What data to extract from the page."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    schema: str | None = typer.Option(None, help="JSON schema for structured extraction."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Extract data from the current page using natural language."""

    async def _extract_from_page() -> dict:
        """Connect and run the extraction; ``schema`` is forwarded unchanged."""
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        extraction = await do_extract(working_page, prompt, schema=schema)
        return {"prompt": prompt, "extracted": extraction.extracted}

    try:
        payload = asyncio.run(_extract_from_page())
        output(payload, action="extract", json_mode=json_output)
    except GuardError as exc:
        output_error(str(exc), hint=exc.hint, json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Simplify the prompt or provide a JSON schema.", json_mode=json_output)
@browser_app.command("validate")
def validate(
    prompt: str = typer.Option(..., help="Validation condition to check."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Check whether a natural language condition is true on the current page."""

    async def _check_condition() -> dict:
        """Connect and pass the prompt to the working page's ``validate``."""
        target = _resolve_connection(session, cdp)
        browser = await _connect_browser(target)
        working_page = await browser.get_working_page()
        verdict = await working_page.validate(prompt)
        return {"prompt": prompt, "valid": verdict}

    try:
        payload = asyncio.run(_check_condition())
        output(payload, action="validate", json_mode=json_output)
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check the page state and validation prompt.", json_mode=json_output)
@browser_app.command("run-task")
def run_task(
    prompt: str = typer.Option(..., help="Natural language description of the task to automate."),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    url: str | None = typer.Option(None, help="URL to navigate to before running."),
    data_extraction_schema: str | None = typer.Option(
        None,
        "--schema",
        "--data-extraction-schema",
        help="JSON Schema string defining what data to extract.",
    ),
    max_steps: int | None = typer.Option(None, "--max-steps", min=1, help="Maximum number of agent steps."),
    timeout_seconds: int = typer.Option(180, "--timeout", min=10, max=1800, help="Timeout in seconds."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Run a quick one-off browser automation task."""

    async def _delegate_to_tool() -> dict[str, Any]:
        """Resolve the connection and hand the task to the MCP ``skyvern_run_task`` tool."""
        target = _resolve_connection(session, cdp)
        # Only the identifier matching the resolved mode is forwarded; the other stays None.
        if target.mode == "cloud":
            cloud_session_id, cdp_endpoint = target.session_id, None
        elif target.mode == "cdp":
            cloud_session_id, cdp_endpoint = None, target.cdp_url
        else:
            cloud_session_id, cdp_endpoint = None, None
        return await tool_run_task(
            prompt=prompt,
            session_id=cloud_session_id,
            cdp_url=cdp_endpoint,
            url=url,
            data_extraction_schema=data_extraction_schema,
            max_steps=max_steps,
            timeout_seconds=timeout_seconds,
        )

    try:
        envelope = asyncio.run(_delegate_to_tool())
        _emit_tool_result(envelope, json_output=json_output, action="run_task")
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(str(exc), hint="Check the prompt, active connection, and timeout settings.", json_mode=json_output)
@browser_app.command("login")
def login(
    credential_type: str = typer.Option(
        "skyvern",
        "--credential-type",
        help="Credential provider: skyvern, bitwarden, 1password, or azure_vault.",
    ),
    session: str | None = typer.Option(None, help="Browser session ID."),
    cdp: str | None = typer.Option(None, "--cdp", help="CDP WebSocket URL."),
    url: str | None = typer.Option(None, help="Login page URL."),
    credential_id: str | None = typer.Option(None, "--credential-id", help="Skyvern credential ID for type=skyvern."),
    bitwarden_item_id: str | None = typer.Option(None, "--bitwarden-item-id", help="Bitwarden item ID."),
    bitwarden_collection_id: str | None = typer.Option(
        None, "--bitwarden-collection-id", help="Bitwarden collection ID."
    ),
    onepassword_vault_id: str | None = typer.Option(None, "--onepassword-vault-id", help="1Password vault ID."),
    onepassword_item_id: str | None = typer.Option(None, "--onepassword-item-id", help="1Password item ID."),
    azure_vault_name: str | None = typer.Option(None, "--azure-vault-name", help="Azure Vault name."),
    azure_vault_username_key: str | None = typer.Option(
        None, "--azure-vault-username-key", help="Azure Vault username key."
    ),
    azure_vault_password_key: str | None = typer.Option(
        None, "--azure-vault-password-key", help="Azure Vault password key."
    ),
    azure_vault_totp_secret_key: str | None = typer.Option(
        None, "--azure-vault-totp-secret-key", help="Azure Vault TOTP secret key."
    ),
    prompt: str | None = typer.Option(None, help="Additional login instructions."),
    totp_identifier: str | None = typer.Option(None, "--totp-identifier", help="TOTP identifier for 2FA."),
    totp_url: str | None = typer.Option(None, "--totp-url", help="URL to fetch TOTP codes."),
    timeout_seconds: int = typer.Option(180, "--timeout", min=10, max=600, help="Timeout in seconds."),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON."),
) -> None:
    """Log into a site using stored credentials from a supported provider."""

    async def _delegate_to_tool() -> dict[str, Any]:
        """Resolve the connection and hand every option to the MCP ``skyvern_login`` tool."""
        target = _resolve_connection(session, cdp)
        # Only the identifier matching the resolved mode is forwarded; the other stays None.
        if target.mode == "cloud":
            cloud_session_id, cdp_endpoint = target.session_id, None
        elif target.mode == "cdp":
            cloud_session_id, cdp_endpoint = None, target.cdp_url
        else:
            cloud_session_id, cdp_endpoint = None, None
        return await tool_login(
            credential_type=credential_type,
            session_id=cloud_session_id,
            cdp_url=cdp_endpoint,
            url=url,
            credential_id=credential_id,
            bitwarden_item_id=bitwarden_item_id,
            bitwarden_collection_id=bitwarden_collection_id,
            onepassword_vault_id=onepassword_vault_id,
            onepassword_item_id=onepassword_item_id,
            azure_vault_name=azure_vault_name,
            azure_vault_username_key=azure_vault_username_key,
            azure_vault_password_key=azure_vault_password_key,
            azure_vault_totp_secret_key=azure_vault_totp_secret_key,
            prompt=prompt,
            totp_identifier=totp_identifier,
            totp_url=totp_url,
            timeout_seconds=timeout_seconds,
        )

    try:
        envelope = asyncio.run(_delegate_to_tool())
        _emit_tool_result(envelope, json_output=json_output, action="login")
    except typer.BadParameter:
        # Let Typer render usage errors itself.
        raise
    except Exception as exc:
        output_error(
            str(exc), hint="Check credential inputs, active connection, and timeout settings.", json_mode=json_output
        )