diff --git a/docker-compose.yml b/docker-compose.yml index 4dcebf59..ee42669b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,6 +24,7 @@ services: # comment out if you want to externally call skyvern API ports: - 8000:8000 + - 9222:9222 # for cdp browser forwarding volumes: - ./artifacts:/data/artifacts - ./videos:/data/videos @@ -36,9 +37,26 @@ services: environment: - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern - BROWSER_TYPE=chromium-headful - - ENABLE_OPENAI=true - - LLM_KEY=OPENAI_GPT4O - - OPENAI_API_KEY= + # - BROWSER_TYPE=cdp-connect + # Use this command to start Chrome with remote debugging: + # "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\chrome-cdp-profile" --no-first-run --no-default-browser-check + # /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --user-data-dir="/Users/yourusername/chrome-cdp-profile" --no-first-run --no-default-browser-check + # - BROWSER_REMOTE_DEBUGGING_URL=http://host.docker.internal:9222/ + + # ========================= + # LLM Settings + # ========================= + # OpenAI Support: + # If you want to use OpenAI as your LLM provider, uncomment the following lines and fill in your OpenAI API key. + # - ENABLE_OPENAI=true + # - LLM_KEY=OPENAI_GPT4O + # - OPENAI_API_KEY= + # Gemini Support: + # Gemini is a new LLM provider that is currently in beta. You can use it by uncommenting the following lines and filling in your Gemini API key. 
+ # - LLM_KEY=GEMINI + - ENABLE_GEMINI=true + - GEMINI_API_KEY=YOUR_GEMINI_KEY + - LLM_KEY=GEMINI_2.5_PRO_PREVIEW_03_25 # If you want to use other LLM provider, like azure and anthropic: # - ENABLE_ANTHROPIC=true # - LLM_KEY=ANTHROPIC_CLAUDE3.5_SONNET @@ -72,7 +90,26 @@ services: # - AWS_REGION=us-west-2 # Replace this with a different AWS region, if you desire # - AWS_ACCESS_KEY_ID=FILL_ME_IN_PLEASE # - AWS_SECRET_ACCESS_KEY=FILL_ME_IN_PLEASE - # + # Ollama Support: + # Ollama is a local LLM provider that can be used to run models locally on your machine. + # - LLM_KEY=OLLAMA + # - ENABLE_OLLAMA=true + # - OLLAMA_MODEL=qwen2.5:7b-instruct + # - OLLAMA_SERVER_URL=http://host.docker.internal:11434 + # Open Router Support: + # - ENABLE_OPENROUTER=true + # - LLM_KEY=OPENROUTER + # - OPENROUTER_API_KEY= + # - OPENROUTER_MODEL=mistralai/mistral-small-3.1-24b-instruct + # Groq Support: + # - ENABLE_GROQ=true + # - LLM_KEY=GROQ + # - GROQ_API_KEY= + # - GROQ_MODEL=llama-3.1-8b-instant + + # Maximum tokens to use: (only set for OpenRouter and Ollama) + # - LLM_CONFIG_MAX_TOKENS=128000 + # Bitwarden Settings + # If you are looking to integrate Skyvern with a password manager (eg Bitwarden), you can use the following environment variables. 
# - BITWARDEN_SERVER=http://localhost # OPTIONAL IF YOU ARE SELF HOSTING BITWARDEN @@ -80,7 +117,7 @@ services: # - BITWARDEN_CLIENT_ID=FILL_ME_IN_PLEASE # - BITWARDEN_CLIENT_SECRET=FILL_ME_IN_PLEASE # - BITWARDEN_MASTER_PASSWORD=FILL_ME_IN_PLEASE - + depends_on: postgres: condition: service_healthy diff --git a/skyvern/cli/commands.py b/skyvern/cli/commands.py index 2978e284..ea38ebce 100644 --- a/skyvern/cli/commands.py +++ b/skyvern/cli/commands.py @@ -7,6 +7,8 @@ import time import uuid from pathlib import Path from typing import Optional +import requests +from urllib.parse import urlparse import typer import uvicorn @@ -472,6 +474,82 @@ def setup_browser_config() -> tuple[str, Optional[str], Optional[str]]: print("\nTo use CDP connection, Chrome must be running with remote debugging enabled.") print("Example: chrome --remote-debugging-port=9222") print("Default debugging URL: http://localhost:9222") + + default_port = "9222" + if remote_debugging_url is None: + remote_debugging_url = "http://localhost:9222" + elif ":" in remote_debugging_url.split("/")[-1]: + default_port = remote_debugging_url.split(":")[-1].split("/")[0] + + parsed_url = urlparse(remote_debugging_url) + version_url = f"{parsed_url.scheme}://{parsed_url.netloc}/json/version" + + print(f"\nChecking if Chrome is already running with remote debugging on port {default_port}...") + try: + response = requests.get(version_url, timeout=2) + if response.status_code == 200: + try: + browser_info = response.json() + print(f"Chrome is already running with remote debugging!") + if "Browser" in browser_info: + print(f"Browser: {browser_info['Browser']}") + if "webSocketDebuggerUrl" in browser_info: + print(f"WebSocket URL: {browser_info['webSocketDebuggerUrl']}") + print(f"Connected to {remote_debugging_url}") + return selected_browser, browser_location, remote_debugging_url + except json.JSONDecodeError: + print("Port is in use, but doesn't appear to be Chrome with remote debugging.") + except 
requests.RequestException: + print(f"No Chrome instance detected on {remote_debugging_url}") + + print("\nExecuting Chrome with remote debugging enabled:") + + if host_system == "darwin" or host_system == "linux": + chrome_cmd = f'{browser_location} --remote-debugging-port={default_port} --user-data-dir="$HOME/chrome-cdp-profile" --no-first-run --no-default-browser-check' + print(f" {chrome_cmd}") + elif host_system == "windows" or host_system == "wsl": + chrome_cmd = f'"{browser_location}" --remote-debugging-port={default_port} --user-data-dir="C:\\chrome-cdp-profile" --no-first-run --no-default-browser-check' + print(f" {chrome_cmd}") + else: + print("Unsupported OS for Chrome configuration. Please set it up manually.") + + # Ask user if they want to execute the command + execute_browser = input("\nWould you like to start Chrome with remote debugging now? (y/n) [y]: ").strip().lower() + if not execute_browser or execute_browser == "y": + print(f"Starting Chrome with remote debugging on port {default_port}...") + try: + # Execute in background - different approach per OS + if host_system in ["darwin", "linux"]: + subprocess.Popen(f"nohup {chrome_cmd} > /dev/null 2>&1 &", shell=True) + elif host_system == "windows": + subprocess.Popen(f'start "" {chrome_cmd}', shell=True) + elif host_system == "wsl": + subprocess.Popen(f'cmd.exe /c start "" {chrome_cmd}', shell=True) + + print(f"Chrome started successfully. Connecting to {remote_debugging_url}") + + print("Waiting for Chrome to initialize...") + time.sleep(2) + + try: + verification_response = requests.get(version_url, timeout=5) + if verification_response.status_code == 200: + try: + browser_info = verification_response.json() + print("Connection verified! 
Chrome is running with remote debugging.") + if "Browser" in browser_info: + print(f"Browser: {browser_info['Browser']}") + except json.JSONDecodeError: + print("Warning: Response from Chrome debugging port is not valid JSON.") + else: + print(f"Warning: Chrome responded with status code {verification_response.status_code}") + except requests.RequestException as e: + print(f"Warning: Could not verify Chrome is running properly: {e}") + print("You may need to check Chrome manually or try a different port.") + except Exception as e: + print(f"Error starting Chrome: {e}") + print("Please start Chrome manually using the command above.") + remote_debugging_url = input("Enter remote debugging URL (press Enter for default): ").strip() if not remote_debugging_url: remote_debugging_url = "http://localhost:9222" diff --git a/skyvern/config.py b/skyvern/config.py index 9b0db312..c0571456 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -214,6 +214,23 @@ class Settings(BaseSettings): NOVITA_API_KEY: str | None = None NOVITA_API_VERSION: str = "v3" + # OLLAMA + ENABLE_OLLAMA: bool = False + OLLAMA_SERVER_URL: str | None = None + OLLAMA_MODEL: str | None = None + + # OPENROUTER + ENABLE_OPENROUTER: bool = False + OPENROUTER_API_KEY: str | None = None + OPENROUTER_MODEL: str | None = None + OPENROUTER_API_BASE: str = "https://openrouter.ai/api/v1" + + # GROQ + ENABLE_GROQ: bool = False + GROQ_API_KEY: str | None = None + GROQ_MODEL: str | None = None + GROQ_API_BASE: str = "https://api.groq.com/openai/v1" + # TOTP Settings TOTP_LIFESPAN_MINUTES: int = 10 VERIFICATION_CODE_INITIAL_WAIT_TIME_SECS: int = 40 diff --git a/skyvern/forge/sdk/api/llm/config_registry.py b/skyvern/forge/sdk/api/llm/config_registry.py index a8f3cfd9..d95ad67b 100644 --- a/skyvern/forge/sdk/api/llm/config_registry.py +++ b/skyvern/forge/sdk/api/llm/config_registry.py @@ -804,7 +804,67 @@ if settings.ENABLE_VERTEX_AI: ), ) +if settings.ENABLE_OLLAMA: + # Register Ollama model configured in settings + 
if settings.OLLAMA_MODEL: + model_name = settings.OLLAMA_MODEL + LLMConfigRegistry.register_config( + "OLLAMA", + LLMConfig( + f"ollama/{model_name}", + ["OLLAMA_SERVER_URL", "OLLAMA_MODEL"], + supports_vision=False, # Ollama does not support vision yet + add_assistant_prefix=False, + max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS, + litellm_params=LiteLLMParams( + api_base=settings.OLLAMA_SERVER_URL, + api_key=None, + api_version=None, + model_info={"model_name": f"ollama/{model_name}"}, + ), + ), + ) +if settings.ENABLE_OPENROUTER: + # Register OpenRouter model configured in settings + if settings.OPENROUTER_MODEL: + model_name = settings.OPENROUTER_MODEL + LLMConfigRegistry.register_config( + "OPENROUTER", + LLMConfig( + f"openrouter/{model_name}", + ["OPENROUTER_API_KEY", "OPENROUTER_MODEL"], + supports_vision=settings.LLM_CONFIG_SUPPORT_VISION, + add_assistant_prefix=False, + max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS, + litellm_params=LiteLLMParams( + api_key=settings.OPENROUTER_API_KEY, + api_base=settings.OPENROUTER_API_BASE, + api_version=None, + model_info={"model_name": f"openrouter/{model_name}"}, + ), + ), + ) +if settings.ENABLE_GROQ: + # Register Groq model configured in settings + if settings.GROQ_MODEL: + model_name = settings.GROQ_MODEL + LLMConfigRegistry.register_config( + "GROQ", + LLMConfig( + f"groq/{model_name}", + ["GROQ_API_KEY", "GROQ_MODEL"], + supports_vision=settings.LLM_CONFIG_SUPPORT_VISION, + add_assistant_prefix=False, + max_completion_tokens=settings.LLM_CONFIG_MAX_TOKENS, + litellm_params=LiteLLMParams( + api_key=settings.GROQ_API_KEY, + api_version=None, + api_base=settings.GROQ_API_BASE, + model_info={"model_name": f"groq/{model_name}"}, + ), + ), + ) # Add support for dynamically configuring OpenAI-compatible LLM models # Based on liteLLM's support for OpenAI-compatible APIs # See documentation: https://docs.litellm.ai/docs/providers/openai_compatible diff --git a/skyvern/webeye/browser_factory.py 
b/skyvern/webeye/browser_factory.py index be53b433..c1d79fcd 100644 --- a/skyvern/webeye/browser_factory.py +++ b/skyvern/webeye/browser_factory.py @@ -465,10 +465,18 @@ async def _create_cdp_connection_browser( raise Exception("Port 9222 is already in use. Another process may be using this port.") browser_process = subprocess.Popen( - [browser_path, "--remote-debugging-port=9222"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + [ + browser_path, + "--remote-debugging-port=9222", + "--no-first-run", + "--no-default-browser-check", + "--remote-debugging-address=0.0.0.0", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE ) # Add small delay to allow browser to start - time.sleep(1) + time.sleep(2) if browser_process.poll() is not None: raise Exception(f"Failed to open browser. browser_path: {browser_path}")