Added Ollama & Openrouter & Groq & improved cdp browser (#2283)

This commit is contained in:
Prakash Maheshwaran
2025-05-05 03:03:23 -04:00
committed by GitHub
parent 0540e65d06
commit c3072d7572
5 changed files with 207 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ services:
# comment out if you want to externally call skyvern API # comment out if you want to externally call skyvern API
ports: ports:
- 8000:8000 - 8000:8000
- 9222:9222 # for cdp browser forwarding
volumes: volumes:
- ./artifacts:/data/artifacts - ./artifacts:/data/artifacts
- ./videos:/data/videos - ./videos:/data/videos
@@ -36,9 +37,26 @@ services:
environment: environment:
- DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern
- BROWSER_TYPE=chromium-headful - BROWSER_TYPE=chromium-headful
- ENABLE_OPENAI=true # - BROWSER_TYPE=cdp-connect
- LLM_KEY=OPENAI_GPT4O # Use this command to start Chrome with remote debugging:
- OPENAI_API_KEY=<your_openai_key> # "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\chrome-cdp-profile" --no-first-run --no-default-browser-check
# /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --user-data-dir="/Users/yourusername/chrome-cdp-profile" --no-first-run --no-default-browser-check
# - BROWSER_REMOTE_DEBUGGING_URL=http://host.docker.internal:9222/
# =========================
# LLM Settings
# =========================
# OpenAI Support:
# If you want to use OpenAI as your LLM provider, uncomment the following lines and fill in your OpenAI API key.
# - ENABLE_OPENAI=true
# - LLM_KEY=OPENAI_GPT4O
# - OPENAI_API_KEY=<your_openai_key>
# Gemini Support:
# Gemini is a new LLM provider that is currently in beta. You can use it by uncommenting the following lines and filling in your Gemini API key.
# - ENABLE_GEMINI=true
# - LLM_KEY=GEMINI_2.5_PRO_PREVIEW_03_25
# - GEMINI_API_KEY=<your_gemini_api_key>
# If you want to use other LLM provider, like azure and anthropic: # If you want to use other LLM provider, like azure and anthropic:
# - ENABLE_ANTHROPIC=true # - ENABLE_ANTHROPIC=true
# - LLM_KEY=ANTHROPIC_CLAUDE3.5_SONNET # - LLM_KEY=ANTHROPIC_CLAUDE3.5_SONNET
@@ -72,7 +90,26 @@ services:
# - AWS_REGION=us-west-2 # Replace this with a different AWS region, if you desire # - AWS_REGION=us-west-2 # Replace this with a different AWS region, if you desire
# - AWS_ACCESS_KEY_ID=FILL_ME_IN_PLEASE # - AWS_ACCESS_KEY_ID=FILL_ME_IN_PLEASE
# - AWS_SECRET_ACCESS_KEY=FILL_ME_IN_PLEASE # - AWS_SECRET_ACCESS_KEY=FILL_ME_IN_PLEASE
# Ollama Support:
# Ollama is a local LLM provider that runs models on your own machine.
# - LLM_KEY=OLLAMA
# - ENABLE_OLLAMA=true
# - OLLAMA_MODEL=qwen2.5:7b-instruct
# - OLLAMA_SERVER_URL=http://host.docker.internal:11434
# Open Router Support:
# - ENABLE_OPENROUTER=true
# - LLM_KEY=OPENROUTER
# - OPENROUTER_API_KEY=<your_openrouter_api_key>
# - OPENROUTER_MODEL=mistralai/mistral-small-3.1-24b-instruct
# Groq Support:
# - ENABLE_GROQ=true
# - LLM_KEY=GROQ
# - GROQ_API_KEY=<your_groq_api_key>
# - GROQ_MODEL=llama-3.1-8b-instant
# Maximum tokens to use: (only set for OpenRouter and Ollama)
# - LLM_CONFIG_MAX_TOKENS=128000
# Bitwarden Settings # Bitwarden Settings
# If you are looking to integrate Skyvern with a password manager (eg Bitwarden), you can use the following environment variables. # If you are looking to integrate Skyvern with a password manager (eg Bitwarden), you can use the following environment variables.
# - BITWARDEN_SERVER=http://localhost # OPTIONAL IF YOU ARE SELF HOSTING BITWARDEN # - BITWARDEN_SERVER=http://localhost # OPTIONAL IF YOU ARE SELF HOSTING BITWARDEN
@@ -80,7 +117,7 @@ services:
# - BITWARDEN_CLIENT_ID=FILL_ME_IN_PLEASE # - BITWARDEN_CLIENT_ID=FILL_ME_IN_PLEASE
# - BITWARDEN_CLIENT_SECRET=FILL_ME_IN_PLEASE # - BITWARDEN_CLIENT_SECRET=FILL_ME_IN_PLEASE
# - BITWARDEN_MASTER_PASSWORD=FILL_ME_IN_PLEASE # - BITWARDEN_MASTER_PASSWORD=FILL_ME_IN_PLEASE
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy

View File

@@ -7,6 +7,8 @@ import time
import uuid import uuid
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import requests
from urllib.parse import urlparse
import typer import typer
import uvicorn import uvicorn
@@ -472,6 +474,82 @@ def setup_browser_config() -> tuple[str, Optional[str], Optional[str]]:
print("\nTo use CDP connection, Chrome must be running with remote debugging enabled.") print("\nTo use CDP connection, Chrome must be running with remote debugging enabled.")
print("Example: chrome --remote-debugging-port=9222") print("Example: chrome --remote-debugging-port=9222")
print("Default debugging URL: http://localhost:9222") print("Default debugging URL: http://localhost:9222")
default_port = "9222"
if remote_debugging_url is None:
remote_debugging_url = "http://localhost:9222"
elif ":" in remote_debugging_url.split("/")[-1]:
default_port = remote_debugging_url.split(":")[-1].split("/")[0]
parsed_url = urlparse(remote_debugging_url)
version_url = f"{parsed_url.scheme}://{parsed_url.netloc}/json/version"
print(f"\nChecking if Chrome is already running with remote debugging on port {default_port}...")
try:
response = requests.get(version_url, timeout=2)
if response.status_code == 200:
try:
browser_info = response.json()
print(f"Chrome is already running with remote debugging!")
if "Browser" in browser_info:
print(f"Browser: {browser_info['Browser']}")
if "webSocketDebuggerUrl" in browser_info:
print(f"WebSocket URL: {browser_info['webSocketDebuggerUrl']}")
print(f"Connected to {remote_debugging_url}")
return selected_browser, browser_location, remote_debugging_url
except json.JSONDecodeError:
print("Port is in use, but doesn't appear to be Chrome with remote debugging.")
except requests.RequestException:
print(f"No Chrome instance detected on {remote_debugging_url}")
print("\nExecuting Chrome with remote debugging enabled:")
if host_system == "darwin" or host_system == "linux":
chrome_cmd = f'{browser_location} --remote-debugging-port={default_port} --user-data-dir="$HOME/chrome-cdp-profile" --no-first-run --no-default-browser-check'
print(f" {chrome_cmd}")
elif host_system == "windows" or host_system == "wsl":
chrome_cmd = f'"{browser_location}" --remote-debugging-port={default_port} --user-data-dir="C:\\chrome-cdp-profile" --no-first-run --no-default-browser-check'
print(f" {chrome_cmd}")
else:
print("Unsupported OS for Chrome configuration. Please set it up manually.")
# Ask user if they want to execute the command
execute_browser = input("\nWould you like to start Chrome with remote debugging now? (y/n) [y]: ").strip().lower()
if not execute_browser or execute_browser == "y":
print(f"Starting Chrome with remote debugging on port {default_port}...")
try:
# Execute in background - different approach per OS
if host_system in ["darwin", "linux"]:
subprocess.Popen(f"nohup {chrome_cmd} > /dev/null 2>&1 &", shell=True)
elif host_system == "windows":
subprocess.Popen(f"start {chrome_cmd}", shell=True)
elif host_system == "wsl":
subprocess.Popen(f"cmd.exe /c start {chrome_cmd}", shell=True)
print(f"Chrome started successfully. Connecting to {remote_debugging_url}")
print("Waiting for Chrome to initialize...")
time.sleep(2)
try:
verification_response = requests.get(version_url, timeout=5)
if verification_response.status_code == 200:
try:
browser_info = verification_response.json()
print("Connection verified! Chrome is running with remote debugging.")
if "Browser" in browser_info:
print(f"Browser: {browser_info['Browser']}")
except json.JSONDecodeError:
print("Warning: Response from Chrome debugging port is not valid JSON.")
else:
print(f"Warning: Chrome responded with status code {verification_response.status_code}")
except requests.RequestException as e:
print(f"Warning: Could not verify Chrome is running properly: {e}")
print("You may need to check Chrome manually or try a different port.")
except Exception as e:
print(f"Error starting Chrome: {e}")
print("Please start Chrome manually using the command above.")
remote_debugging_url = input("Enter remote debugging URL (press Enter for default): ").strip() remote_debugging_url = input("Enter remote debugging URL (press Enter for default): ").strip()
if not remote_debugging_url: if not remote_debugging_url:
remote_debugging_url = "http://localhost:9222" remote_debugging_url = "http://localhost:9222"

View File

@@ -214,6 +214,23 @@ class Settings(BaseSettings):
NOVITA_API_KEY: str | None = None NOVITA_API_KEY: str | None = None
NOVITA_API_VERSION: str = "v3" NOVITA_API_VERSION: str = "v3"
# OLLAMA
# Local LLM provider; both URL and model must be set for the OLLAMA config to register.
ENABLE_OLLAMA: bool = False
OLLAMA_SERVER_URL: str | None = None  # e.g. http://host.docker.internal:11434 when Skyvern runs in Docker
OLLAMA_MODEL: str | None = None  # model tag as known to the Ollama server, e.g. "qwen2.5:7b-instruct"
# OPENROUTER
ENABLE_OPENROUTER: bool = False
OPENROUTER_API_KEY: str | None = None
OPENROUTER_MODEL: str | None = None  # provider-qualified model id, e.g. "mistralai/mistral-small-3.1-24b-instruct"
# NOTE(review): OpenRouter's documented API base is https://openrouter.ai/api/v1,
# not api.openrouter.ai — verify this default against the OpenRouter docs.
OPENROUTER_API_BASE: str = "https://api.openrouter.ai/v1"
# GROQ
ENABLE_GROQ: bool = False
GROQ_API_KEY: str | None = None
GROQ_MODEL: str | None = None  # e.g. "llama-3.1-8b-instant"
GROQ_API_BASE: str = "https://api.groq.com/openai/v1"  # Groq's OpenAI-compatible endpoint
# TOTP Settings # TOTP Settings
TOTP_LIFESPAN_MINUTES: int = 10 TOTP_LIFESPAN_MINUTES: int = 10
VERIFICATION_CODE_INITIAL_WAIT_TIME_SECS: int = 40 VERIFICATION_CODE_INITIAL_WAIT_TIME_SECS: int = 40

View File

@@ -804,7 +804,67 @@ if settings.ENABLE_VERTEX_AI:
), ),
) )
# Register the Ollama / OpenRouter / Groq providers with the LLM config registry.
# Each registration is gated on both the ENABLE_* flag and the corresponding
# *_MODEL setting being present; a flag set without a model is silently skipped.
if settings.ENABLE_OLLAMA:
    # Register Ollama model configured in settings
    if settings.OLLAMA_MODEL:
        model_name = settings.OLLAMA_MODEL
        LLMConfigRegistry.register_config(
            "OLLAMA",
            LLMConfig(
                # "ollama/" prefix routes the call through litellm's Ollama provider
                f"ollama/{model_name}",
                ["OLLAMA_SERVER_URL", "OLLAMA_MODEL"],
                supports_vision=False,  # Ollama does not support vision yet
                add_assistant_prefix=False,
                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
                litellm_params=LiteLLMParams(
                    api_base=settings.OLLAMA_SERVER_URL,
                    # Ollama is unauthenticated locally, so no key/version is sent
                    api_key=None,
                    api_version=None,
                    model_info={"model_name": f"ollama/{model_name}"},
                ),
            ),
        )
if settings.ENABLE_OPENROUTER:
    # Register OpenRouter model configured in settings
    if settings.OPENROUTER_MODEL:
        model_name = settings.OPENROUTER_MODEL
        LLMConfigRegistry.register_config(
            "OPENROUTER",
            LLMConfig(
                # "openrouter/" prefix routes the call through litellm's OpenRouter provider
                f"openrouter/{model_name}",
                ["OPENROUTER_API_KEY", "OPENROUTER_MODEL"],
                # vision support is user-controlled; depends on the chosen OpenRouter model
                supports_vision=settings.LLM_CONFIG_SUPPORT_VISION,
                add_assistant_prefix=False,
                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
                litellm_params=LiteLLMParams(
                    api_key=settings.OPENROUTER_API_KEY,
                    api_base=settings.OPENROUTER_API_BASE,
                    api_version=None,
                    model_info={"model_name": f"openrouter/{model_name}"},
                ),
            ),
        )
if settings.ENABLE_GROQ:
    # Register Groq model configured in settings
    if settings.GROQ_MODEL:
        model_name = settings.GROQ_MODEL
        LLMConfigRegistry.register_config(
            "GROQ",
            LLMConfig(
                # "groq/" prefix routes the call through litellm's Groq provider
                f"groq/{model_name}",
                ["GROQ_API_KEY", "GROQ_MODEL"],
                # vision support is user-controlled; depends on the chosen Groq model
                supports_vision=settings.LLM_CONFIG_SUPPORT_VISION,
                add_assistant_prefix=False,
                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
                litellm_params=LiteLLMParams(
                    api_key=settings.GROQ_API_KEY,
                    api_version=None,
                    api_base=settings.GROQ_API_BASE,
                    model_info={"model_name": f"groq/{model_name}"},
                ),
            ),
        )
# Add support for dynamically configuring OpenAI-compatible LLM models # Add support for dynamically configuring OpenAI-compatible LLM models
# Based on liteLLM's support for OpenAI-compatible APIs # Based on liteLLM's support for OpenAI-compatible APIs
# See documentation: https://docs.litellm.ai/docs/providers/openai_compatible # See documentation: https://docs.litellm.ai/docs/providers/openai_compatible

View File

@@ -465,10 +465,18 @@ async def _create_cdp_connection_browser(
raise Exception("Port 9222 is already in use. Another process may be using this port.") raise Exception("Port 9222 is already in use. Another process may be using this port.")
browser_process = subprocess.Popen( browser_process = subprocess.Popen(
[browser_path, "--remote-debugging-port=9222"], stdout=subprocess.PIPE, stderr=subprocess.PIPE [
browser_path,
"--remote-debugging-port=9222",
"--no-first-run",
"--no-default-browser-check",
"--remote-debugging-address=0.0.0.0",
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
) )
# Add small delay to allow browser to start # Add small delay to allow browser to start
time.sleep(1) time.sleep(2)
if browser_process.poll() is not None: if browser_process.poll() is not None:
raise Exception(f"Failed to open browser. browser_path: {browser_path}") raise Exception(f"Failed to open browser. browser_path: {browser_path}")