Added Ollama & Openrouter & Groq & improved cdp browser (#2283)

This commit is contained in:
Prakash Maheshwaran
2025-05-05 03:03:23 -04:00
committed by GitHub
parent 0540e65d06
commit c3072d7572
5 changed files with 207 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ services:
# comment out if you want to externally call skyvern API
ports:
- 8000:8000
- 9222:9222 # for cdp browser forwarding
volumes:
- ./artifacts:/data/artifacts
- ./videos:/data/videos
@@ -36,9 +37,26 @@ services:
environment:
- DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern
- BROWSER_TYPE=chromium-headful
- ENABLE_OPENAI=true
- LLM_KEY=OPENAI_GPT4O
- OPENAI_API_KEY=<your_openai_key>
# - BROWSER_TYPE=cdp-connect
# Use this command to start Chrome with remote debugging:
# "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\chrome-cdp-profile" --no-first-run --no-default-browser-check
# /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --user-data-dir="/Users/yourusername/chrome-cdp-profile" --no-first-run --no-default-browser-check
# - BROWSER_REMOTE_DEBUGGING_URL=http://host.docker.internal:9222/
# =========================
# LLM Settings
# =========================
# OpenAI Support:
# If you want to use OpenAI as your LLM provider, uncomment the following lines and fill in your OpenAI API key.
# - ENABLE_OPENAI=true
# - LLM_KEY=OPENAI_GPT4O
# - OPENAI_API_KEY=<your_openai_key>
# Gemini Support:
# Gemini is a new LLM provider that is currently in beta. You can use it by uncommenting the following lines and filling in your Gemini API key.
- LLM_KEY=GEMINI
- ENABLE_GEMINI=true
- GEMINI_API_KEY=YOUR_GEMINI_KEY
- LLM_KEY=GEMINI_2.5_PRO_PREVIEW_03_25
# If you want to use another LLM provider, such as Azure or Anthropic:
# - ENABLE_ANTHROPIC=true
# - LLM_KEY=ANTHROPIC_CLAUDE3.5_SONNET
@@ -72,7 +90,26 @@ services:
# - AWS_REGION=us-west-2 # Replace this with a different AWS region, if you desire
# - AWS_ACCESS_KEY_ID=FILL_ME_IN_PLEASE
# - AWS_SECRET_ACCESS_KEY=FILL_ME_IN_PLEASE
#
# Ollama Support:
# Ollama is a local LLM provider that can be used to run models locally on your machine.
# - LLM_KEY=OLLAMA
# - ENABLE_OLLAMA=true
# - OLLAMA_MODEL=qwen2.5:7b-instruct
# - OLLAMA_SERVER_URL=http://host.docker.internal:11434
# Open Router Support:
# - ENABLE_OPENROUTER=true
# - LLM_KEY=OPENROUTER
# - OPENROUTER_API_KEY=<your_openrouter_api_key>
# - OPENROUTER_MODEL=mistralai/mistral-small-3.1-24b-instruct
# Groq Support:
# - ENABLE_GROQ=true
# - LLM_KEY=GROQ
# - GROQ_API_KEY=<your_groq_api_key>
# - GROQ_MODEL=llama-3.1-8b-instant
# Maximum tokens to use: (only set for OpenRouter and Ollama)
# - LLM_CONFIG_MAX_TOKENS=128000
# Bitwarden Settings
# If you are looking to integrate Skyvern with a password manager (e.g. Bitwarden), you can use the following environment variables.
# - BITWARDEN_SERVER=http://localhost # OPTIONAL IF YOU ARE SELF HOSTING BITWARDEN
@@ -80,7 +117,7 @@ services:
# - BITWARDEN_CLIENT_ID=FILL_ME_IN_PLEASE
# - BITWARDEN_CLIENT_SECRET=FILL_ME_IN_PLEASE
# - BITWARDEN_MASTER_PASSWORD=FILL_ME_IN_PLEASE
depends_on:
postgres:
condition: service_healthy

View File

@@ -7,6 +7,8 @@ import time
import uuid
from pathlib import Path
from typing import Optional
import requests
from urllib.parse import urlparse
import typer
import uvicorn
@@ -472,6 +474,82 @@ def setup_browser_config() -> tuple[str, Optional[str], Optional[str]]:
print("\nTo use CDP connection, Chrome must be running with remote debugging enabled.")
print("Example: chrome --remote-debugging-port=9222")
print("Default debugging URL: http://localhost:9222")
default_port = "9222"
if remote_debugging_url is None:
remote_debugging_url = "http://localhost:9222"
elif ":" in remote_debugging_url.split("/")[-1]:
default_port = remote_debugging_url.split(":")[-1].split("/")[0]
parsed_url = urlparse(remote_debugging_url)
version_url = f"{parsed_url.scheme}://{parsed_url.netloc}/json/version"
print(f"\nChecking if Chrome is already running with remote debugging on port {default_port}...")
try:
response = requests.get(version_url, timeout=2)
if response.status_code == 200:
try:
browser_info = response.json()
print(f"Chrome is already running with remote debugging!")
if "Browser" in browser_info:
print(f"Browser: {browser_info['Browser']}")
if "webSocketDebuggerUrl" in browser_info:
print(f"WebSocket URL: {browser_info['webSocketDebuggerUrl']}")
print(f"Connected to {remote_debugging_url}")
return selected_browser, browser_location, remote_debugging_url
except json.JSONDecodeError:
print("Port is in use, but doesn't appear to be Chrome with remote debugging.")
except requests.RequestException:
print(f"No Chrome instance detected on {remote_debugging_url}")
print("\nExecuting Chrome with remote debugging enabled:")
if host_system == "darwin" or host_system == "linux":
chrome_cmd = f'{browser_location} --remote-debugging-port={default_port} --user-data-dir="$HOME/chrome-cdp-profile" --no-first-run --no-default-browser-check'
print(f" {chrome_cmd}")
elif host_system == "windows" or host_system == "wsl":
chrome_cmd = f'"{browser_location}" --remote-debugging-port={default_port} --user-data-dir="C:\\chrome-cdp-profile" --no-first-run --no-default-browser-check'
print(f" {chrome_cmd}")
else:
print("Unsupported OS for Chrome configuration. Please set it up manually.")
# Ask user if they want to execute the command
execute_browser = input("\nWould you like to start Chrome with remote debugging now? (y/n) [y]: ").strip().lower()
if not execute_browser or execute_browser == "y":
print(f"Starting Chrome with remote debugging on port {default_port}...")
try:
# Execute in background - different approach per OS
if host_system in ["darwin", "linux"]:
subprocess.Popen(f"nohup {chrome_cmd} > /dev/null 2>&1 &", shell=True)
elif host_system == "windows":
subprocess.Popen(f"start {chrome_cmd}", shell=True)
elif host_system == "wsl":
subprocess.Popen(f"cmd.exe /c start {chrome_cmd}", shell=True)
print(f"Chrome started successfully. Connecting to {remote_debugging_url}")
print("Waiting for Chrome to initialize...")
time.sleep(2)
try:
verification_response = requests.get(version_url, timeout=5)
if verification_response.status_code == 200:
try:
browser_info = verification_response.json()
print("Connection verified! Chrome is running with remote debugging.")
if "Browser" in browser_info:
print(f"Browser: {browser_info['Browser']}")
except json.JSONDecodeError:
print("Warning: Response from Chrome debugging port is not valid JSON.")
else:
print(f"Warning: Chrome responded with status code {verification_response.status_code}")
except requests.RequestException as e:
print(f"Warning: Could not verify Chrome is running properly: {e}")
print("You may need to check Chrome manually or try a different port.")
except Exception as e:
print(f"Error starting Chrome: {e}")
print("Please start Chrome manually using the command above.")
remote_debugging_url = input("Enter remote debugging URL (press Enter for default): ").strip()
if not remote_debugging_url:
remote_debugging_url = "http://localhost:9222"

View File

@@ -214,6 +214,23 @@ class Settings(BaseSettings):
NOVITA_API_KEY: str | None = None
NOVITA_API_VERSION: str = "v3"
# OLLAMA
ENABLE_OLLAMA: bool = False
OLLAMA_SERVER_URL: str | None = None
OLLAMA_MODEL: str | None = None
# OPENROUTER
ENABLE_OPENROUTER: bool = False
OPENROUTER_API_KEY: str | None = None
OPENROUTER_MODEL: str | None = None
OPENROUTER_API_BASE: str = "https://api.openrouter.ai/v1"
# GROQ
ENABLE_GROQ: bool = False
GROQ_API_KEY: str | None = None
GROQ_MODEL: str | None = None
GROQ_API_BASE: str = "https://api.groq.com/openai/v1"
# TOTP Settings
TOTP_LIFESPAN_MINUTES: int = 10
VERIFICATION_CODE_INITIAL_WAIT_TIME_SECS: int = 40

View File

@@ -804,7 +804,67 @@ if settings.ENABLE_VERTEX_AI:
),
)
# Ollama: the "OLLAMA" key is registered only when the provider is enabled
# AND a model name has been configured; otherwise nothing is registered.
if settings.ENABLE_OLLAMA and settings.OLLAMA_MODEL:
    model_name = settings.OLLAMA_MODEL
    LLMConfigRegistry.register_config(
        "OLLAMA",
        LLMConfig(
            f"ollama/{model_name}",
            ["OLLAMA_SERVER_URL", "OLLAMA_MODEL"],
            # Ollama does not support vision yet
            supports_vision=False,
            add_assistant_prefix=False,
            max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
            litellm_params=LiteLLMParams(
                # Requests go to the configured Ollama server; no key/version is sent.
                api_base=settings.OLLAMA_SERVER_URL,
                api_key=None,
                api_version=None,
                model_info={"model_name": f"ollama/{model_name}"},
            ),
        ),
    )
# OpenRouter: the "OPENROUTER" key is registered only when the provider is
# enabled AND a model slug has been configured.
if settings.ENABLE_OPENROUTER and settings.OPENROUTER_MODEL:
    model_name = settings.OPENROUTER_MODEL
    LLMConfigRegistry.register_config(
        "OPENROUTER",
        LLMConfig(
            f"openrouter/{model_name}",
            ["OPENROUTER_API_KEY", "OPENROUTER_MODEL"],
            # Vision support and the token ceiling come from the generic LLM_CONFIG_* settings.
            supports_vision=settings.LLM_CONFIG_SUPPORT_VISION,
            add_assistant_prefix=False,
            max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
            litellm_params=LiteLLMParams(
                api_key=settings.OPENROUTER_API_KEY,
                api_base=settings.OPENROUTER_API_BASE,
                api_version=None,
                model_info={"model_name": f"openrouter/{model_name}"},
            ),
        ),
    )
# Groq: the "GROQ" key is registered only when the provider is enabled
# AND a model name has been configured.
if settings.ENABLE_GROQ and settings.GROQ_MODEL:
    model_name = settings.GROQ_MODEL
    LLMConfigRegistry.register_config(
        "GROQ",
        LLMConfig(
            f"groq/{model_name}",
            ["GROQ_API_KEY", "GROQ_MODEL"],
            # Vision support and the token ceiling come from the generic LLM_CONFIG_* settings.
            supports_vision=settings.LLM_CONFIG_SUPPORT_VISION,
            add_assistant_prefix=False,
            max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
            litellm_params=LiteLLMParams(
                api_key=settings.GROQ_API_KEY,
                api_version=None,
                api_base=settings.GROQ_API_BASE,
                model_info={"model_name": f"groq/{model_name}"},
            ),
        ),
    )
# Add support for dynamically configuring OpenAI-compatible LLM models
# Based on liteLLM's support for OpenAI-compatible APIs
# See documentation: https://docs.litellm.ai/docs/providers/openai_compatible

View File

@@ -465,10 +465,18 @@ async def _create_cdp_connection_browser(
raise Exception("Port 9222 is already in use. Another process may be using this port.")
browser_process = subprocess.Popen(
[browser_path, "--remote-debugging-port=9222"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
[
browser_path,
"--remote-debugging-port=9222",
"--no-first-run",
"--no-default-browser-check",
"--remote-debugging-address=0.0.0.0",
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
# Add small delay to allow browser to start
time.sleep(1)
time.sleep(2)
if browser_process.poll() is not None:
raise Exception(f"Failed to open browser. browser_path: {browser_path}")