Added Ollama & Openrouter & Groq & improved cdp browser (#2283)

2025-05-05 03:03:23 -04:00
parent 0540e65d06
commit c3072d7572
5 changed files with 207 additions and 7 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -24,6 +24,7 @@ services:
    # comment out if you want to externally call skyvern API
    ports:
      - 8000:8000
+      - 9222:9222 # for cdp browser forwarding
    volumes:
      - ./artifacts:/data/artifacts
      - ./videos:/data/videos
@@ -36,9 +37,26 @@ services:
    environment:
      - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern
      - BROWSER_TYPE=chromium-headful
-      - ENABLE_OPENAI=true
-      - LLM_KEY=OPENAI_GPT4O
-      - OPENAI_API_KEY=<your_openai_key>
+      # - BROWSER_TYPE=cdp-connect
+      # Use one of these commands to start Chrome with remote debugging (Windows first, then macOS):
+      # "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\chrome-cdp-profile" --no-first-run --no-default-browser-check
+      # /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --user-data-dir="/Users/yourusername/chrome-cdp-profile" --no-first-run --no-default-browser-check
+      # - BROWSER_REMOTE_DEBUGGING_URL=http://host.docker.internal:9222/
+
+      # =========================
+      #       LLM Settings
+      # =========================
+      # OpenAI Support:
+      # If you want to use OpenAI as your LLM provider, uncomment the following lines and fill in your OpenAI API key.
+      # - ENABLE_OPENAI=true
+      # - LLM_KEY=OPENAI_GPT4O
+      # - OPENAI_API_KEY=<your_openai_key>
+      # Gemini Support:
+      # Gemini is a new LLM provider that is currently in beta. It is enabled by default below; fill in your Gemini API key. NOTE(review): LLM_KEY is set twice below - confirm which value is intended.
+      - LLM_KEY=GEMINI
+      - ENABLE_GEMINI=true
+      - GEMINI_API_KEY=YOUR_GEMINI_KEY
+      - LLM_KEY=GEMINI_2.5_PRO_PREVIEW_03_25
      # If you want to use other LLM provider, like azure and anthropic:
      # - ENABLE_ANTHROPIC=true
      # - LLM_KEY=ANTHROPIC_CLAUDE3.5_SONNET
@@ -72,7 +90,26 @@ services:
      # - AWS_REGION=us-west-2                         # Replace this with a different AWS region, if you desire
      # - AWS_ACCESS_KEY_ID=FILL_ME_IN_PLEASE
      # - AWS_SECRET_ACCESS_KEY=FILL_ME_IN_PLEASE
-      # 
+      # Ollama Support:
+      # Ollama is a local LLM provider that can be used to run models locally on your machine.
+      # - LLM_KEY=OLLAMA
+      # - ENABLE_OLLAMA=true
+      # - OLLAMA_MODEL=qwen2.5:7b-instruct
+      # - OLLAMA_SERVER_URL=http://host.docker.internal:11434
+      # OpenRouter Support:
+      # - ENABLE_OPENROUTER=true
+      # - LLM_KEY=OPENROUTER
+      # - OPENROUTER_API_KEY=<your_openrouter_api_key>
+      # - OPENROUTER_MODEL=mistralai/mistral-small-3.1-24b-instruct
+      # Groq Support:
+      # - ENABLE_GROQ=true
+      # - LLM_KEY=GROQ
+      # - GROQ_API_KEY=<your_groq_api_key>
+      # - GROQ_MODEL=llama-3.1-8b-instant
+
+      # Maximum tokens to use: (only set for OpenRouter and Ollama)
+      # - LLM_CONFIG_MAX_TOKENS=128000
+
      # Bitwarden Settings
      # If you are looking to integrate Skyvern with a password manager (eg Bitwarden), you can use the following environment variables.
      # - BITWARDEN_SERVER=http://localhost  # OPTIONAL IF YOU ARE SELF HOSTING BITWARDEN
@@ -80,7 +117,7 @@ services:
      # - BITWARDEN_CLIENT_ID=FILL_ME_IN_PLEASE
      # - BITWARDEN_CLIENT_SECRET=FILL_ME_IN_PLEASE
      # - BITWARDEN_MASTER_PASSWORD=FILL_ME_IN_PLEASE
-
+      
    depends_on:
      postgres:
        condition: service_healthy
--- a/skyvern/cli/commands.py
+++ b/skyvern/cli/commands.py
@@ -7,6 +7,8 @@ import time
 import uuid
 from pathlib import Path
 from typing import Optional
+import requests
+from urllib.parse import urlparse

 import typer
 import uvicorn
@@ -472,6 +474,82 @@ def setup_browser_config() -> tuple[str, Optional[str], Optional[str]]:
        print("\nTo use CDP connection, Chrome must be running with remote debugging enabled.")
        print("Example: chrome --remote-debugging-port=9222")
        print("Default debugging URL: http://localhost:9222")
+        
+        default_port = "9222"  
+        if remote_debugging_url is None:
+            remote_debugging_url = "http://localhost:9222"
+        elif ":" in remote_debugging_url.split("/")[-1]:
+            default_port = remote_debugging_url.split(":")[-1].split("/")[0]
+        
+        parsed_url = urlparse(remote_debugging_url)
+        version_url = f"{parsed_url.scheme}://{parsed_url.netloc}/json/version"
+        
+        print(f"\nChecking if Chrome is already running with remote debugging on port {default_port}...")
+        try:
+            response = requests.get(version_url, timeout=2)
+            if response.status_code == 200:
+                try:
+                    browser_info = response.json()
+                    print(f"Chrome is already running with remote debugging!")
+                    if "Browser" in browser_info:
+                        print(f"Browser: {browser_info['Browser']}")
+                    if "webSocketDebuggerUrl" in browser_info:
+                        print(f"WebSocket URL: {browser_info['webSocketDebuggerUrl']}")
+                    print(f"Connected to {remote_debugging_url}")
+                    return selected_browser, browser_location, remote_debugging_url
+                except json.JSONDecodeError:
+                    print("Port is in use, but doesn't appear to be Chrome with remote debugging.")
+        except requests.RequestException:
+            print(f"No Chrome instance detected on {remote_debugging_url}")
+        
+        print("\nExecuting Chrome with remote debugging enabled:")
+        
+        if host_system == "darwin" or host_system == "linux":
+            chrome_cmd = f'{browser_location} --remote-debugging-port={default_port} --user-data-dir="$HOME/chrome-cdp-profile" --no-first-run --no-default-browser-check'
+            print(f"    {chrome_cmd}")
+        elif host_system == "windows" or host_system == "wsl":
+            chrome_cmd = f'"{browser_location}" --remote-debugging-port={default_port} --user-data-dir="C:\\chrome-cdp-profile" --no-first-run --no-default-browser-check'
+            print(f"    {chrome_cmd}")
+        else:
+            print("Unsupported OS for Chrome configuration. Please set it up manually.")
+        
+        # Ask user if they want to execute the command
+        execute_browser = input("\nWould you like to start Chrome with remote debugging now? (y/n) [y]: ").strip().lower()
+        if not execute_browser or execute_browser == "y":
+            print(f"Starting Chrome with remote debugging on port {default_port}...")
+            try:
+                # Execute in background - different approach per OS
+                if host_system in ["darwin", "linux"]:
+                    subprocess.Popen(f"nohup {chrome_cmd} > /dev/null 2>&1 &", shell=True)
+                elif host_system == "windows":
+                    subprocess.Popen(f"start {chrome_cmd}", shell=True)
+                elif host_system == "wsl":
+                    subprocess.Popen(f"cmd.exe /c start {chrome_cmd}", shell=True)
+                
+                print(f"Chrome started successfully. Connecting to {remote_debugging_url}")
+                
+                print("Waiting for Chrome to initialize...")
+                time.sleep(2)
+                
+                try:
+                    verification_response = requests.get(version_url, timeout=5)
+                    if verification_response.status_code == 200:
+                        try:
+                            browser_info = verification_response.json()
+                            print("Connection verified! Chrome is running with remote debugging.")
+                            if "Browser" in browser_info:
+                                print(f"Browser: {browser_info['Browser']}")
+                        except json.JSONDecodeError:
+                            print("Warning: Response from Chrome debugging port is not valid JSON.")
+                    else:
+                        print(f"Warning: Chrome responded with status code {verification_response.status_code}")
+                except requests.RequestException as e:
+                    print(f"Warning: Could not verify Chrome is running properly: {e}")
+                    print("You may need to check Chrome manually or try a different port.")
+            except Exception as e:
+                print(f"Error starting Chrome: {e}")
+                print("Please start Chrome manually using the command above.")
+        
        remote_debugging_url = input("Enter remote debugging URL (press Enter for default): ").strip()
        if not remote_debugging_url:
            remote_debugging_url = "http://localhost:9222"
--- a/skyvern/config.py
+++ b/skyvern/config.py
@@ -214,6 +214,23 @@ class Settings(BaseSettings):
    NOVITA_API_KEY: str | None = None
    NOVITA_API_VERSION: str = "v3"

+    # OLLAMA (ENABLE_OLLAMA gates registration of the "OLLAMA" LLM key)
+    ENABLE_OLLAMA: bool = False
+    OLLAMA_SERVER_URL: str | None = None
+    OLLAMA_MODEL: str | None = None
+
+    # OPENROUTER (ENABLE_OPENROUTER gates registration of the "OPENROUTER" LLM key)
+    ENABLE_OPENROUTER: bool = False
+    OPENROUTER_API_KEY: str | None = None
+    OPENROUTER_MODEL: str | None = None
+    OPENROUTER_API_BASE: str = "https://openrouter.ai/api/v1"  # OpenRouter serves its API under openrouter.ai/api/v1
+
+    # GROQ (ENABLE_GROQ gates registration of the "GROQ" LLM key)
+    ENABLE_GROQ: bool = False
+    GROQ_API_KEY: str | None = None
+    GROQ_MODEL: str | None = None
+    GROQ_API_BASE: str = "https://api.groq.com/openai/v1"
+
    # TOTP Settings
    TOTP_LIFESPAN_MINUTES: int = 10
    VERIFICATION_CODE_INITIAL_WAIT_TIME_SECS: int = 40
--- a/skyvern/forge/sdk/api/llm/config_registry.py
+++ b/skyvern/forge/sdk/api/llm/config_registry.py
@@ -804,7 +804,67 @@ if settings.ENABLE_VERTEX_AI:
        ),
    )

+if settings.ENABLE_OLLAMA:
+    # Register the "OLLAMA" LLM key, but only when OLLAMA_MODEL is set; otherwise nothing is registered.
+    if settings.OLLAMA_MODEL:
+        model_name = settings.OLLAMA_MODEL
+        LLMConfigRegistry.register_config(
+            "OLLAMA",
+            LLMConfig(
+                f"ollama/{model_name}",
+                ["OLLAMA_SERVER_URL", "OLLAMA_MODEL"],
+                supports_vision=False,              # vision is hard-disabled for this config, regardless of LLM_CONFIG_SUPPORT_VISION
+                add_assistant_prefix=False,
+                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
+                litellm_params=LiteLLMParams(
+                    api_base=settings.OLLAMA_SERVER_URL,
+                    api_key=None,
+                    api_version=None,
+                    model_info={"model_name": f"ollama/{model_name}"},
+                ),
+            ),
+        )

+if settings.ENABLE_OPENROUTER:
+    # Register the "OPENROUTER" LLM key, but only when OPENROUTER_MODEL is set; otherwise nothing is registered.
+    if settings.OPENROUTER_MODEL:
+        model_name = settings.OPENROUTER_MODEL
+        LLMConfigRegistry.register_config(
+            "OPENROUTER",
+            LLMConfig(
+                f"openrouter/{model_name}",
+                ["OPENROUTER_API_KEY", "OPENROUTER_MODEL"],
+                supports_vision=settings.LLM_CONFIG_SUPPORT_VISION, 
+                add_assistant_prefix=False,
+                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
+                litellm_params=LiteLLMParams(
+                    api_key=settings.OPENROUTER_API_KEY,
+                    api_base=settings.OPENROUTER_API_BASE,
+                    api_version=None,
+                    model_info={"model_name": f"openrouter/{model_name}"},
+                ),
+            ),
+        )
+if settings.ENABLE_GROQ:
+    # Register the "GROQ" LLM key, but only when GROQ_MODEL is set; otherwise nothing is registered.
+    if settings.GROQ_MODEL:
+        model_name = settings.GROQ_MODEL
+        LLMConfigRegistry.register_config(
+            "GROQ",
+            LLMConfig(
+                f"groq/{model_name}",
+                ["GROQ_API_KEY", "GROQ_MODEL"],
+                supports_vision=settings.LLM_CONFIG_SUPPORT_VISION,
+                add_assistant_prefix=False,
+                max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS,
+                litellm_params=LiteLLMParams(
+                    api_key=settings.GROQ_API_KEY,
+                    api_version=None,
+                    api_base=settings.GROQ_API_BASE,
+                    model_info={"model_name": f"groq/{model_name}"},
+                ),
+            ),
+        )
 # Add support for dynamically configuring OpenAI-compatible LLM models
 # Based on liteLLM's support for OpenAI-compatible APIs
 # See documentation: https://docs.litellm.ai/docs/providers/openai_compatible
--- a/skyvern/webeye/browser_factory.py
+++ b/skyvern/webeye/browser_factory.py
@@ -465,10 +465,18 @@ async def _create_cdp_connection_browser(
            raise Exception("Port 9222 is already in use. Another process may be using this port.")

        browser_process = subprocess.Popen(
-            [browser_path, "--remote-debugging-port=9222"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            [
+                browser_path,
+                "--remote-debugging-port=9222",
+                "--no-first-run",
+                "--no-default-browser-check",
+                "--remote-debugging-address=0.0.0.0",
+            ], 
+            stdout=subprocess.PIPE, 
+            stderr=subprocess.PIPE
        )
        # Add small delay to allow browser to start
-        time.sleep(1)
+        time.sleep(2)
        if browser_process.poll() is not None:
            raise Exception(f"Failed to open browser. browser_path: {browser_path}")