From 296d2f903b16f967c266409f0c51a48c50adf028 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Fri, 13 Jun 2025 21:17:32 +0800 Subject: [PATCH] support volcengine + migrate ui tars to volcengine (#2705) --- .env.example | 14 +++--- skyvern/cli/llm_setup.py | 36 +++++++-------- skyvern/config.py | 11 +++-- skyvern/forge/agent.py | 2 +- skyvern/forge/app.py | 6 +-- .../forge/sdk/api/llm/api_handler_factory.py | 45 ++++++------------- skyvern/forge/sdk/api/llm/config_registry.py | 42 ++++++++++++++--- .../forge/sdk/api/llm/ui_tars_llm_caller.py | 2 +- skyvern/forge/sdk/api/llm/ui_tars_response.py | 24 +++++----- 9 files changed, 95 insertions(+), 87 deletions(-) diff --git a/.env.example b/.env.example index 46bad68f..183f1a9e 100644 --- a/.env.example +++ b/.env.example @@ -43,14 +43,12 @@ ENABLE_NOVITA=false # NOVITA_API_KEY: Your Novita AI API key. NOVITA_API_KEY="" -# ENABLE_UI_TARS: Set to true to enable UI-TARS (Seed1.5-VL) as a language model provider. -ENABLE_UI_TARS=false -# UI_TARS_API_KEY: Your ByteDance Doubao API key for accessing UI-TARS models. -UI_TARS_API_KEY="" -# UI_TARS_API_BASE: The base URL for ByteDance Doubao API. -UI_TARS_API_BASE="https://ark.cn-beijing.volces.com/api/v3" -# UI_TARS_MODEL: Your UI-TARS model endpoint ID from ByteDance Doubao. -UI_TARS_MODEL="doubao-1-5-thinking-vision-pro-250428" +# ENABLE_VOLCENGINE: Set to true to enable Volcengine(ByteDance Doubao) as a language model provider. +ENABLE_VOLCENGINE=false +# VOLCENGINE_API_KEY: Your Volcengine(ByteDance Doubao) API key. +VOLCENGINE_API_KEY="" +# VOLCENGINE_API_BASE: The base URL for Volcengine(ByteDance Doubao) API. +VOLCENGINE_API_BASE="https://ark.cn-beijing.volces.com/api/v3" # LLM_KEY: The chosen language model to use. This should be one of the models # provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V). diff --git a/skyvern/cli/llm_setup.py b/skyvern/cli/llm_setup.py index f34b8b48..f88d5fca 100644 --- a/skyvern/cli/llm_setup.py +++ b/skyvern/cli/llm_setup.py @@ -160,30 +160,26 @@ def setup_llm_providers() -> None: else: update_or_add_env_var("ENABLE_NOVITA", "false") - console.print("\n[bold blue]--- UI-TARS Configuration ---[/bold blue]") - console.print("To enable UI-TARS (Seed1.5-VL), you must have a ByteDance Doubao API key.") - console.print("UI-TARS now uses direct VolcEngine API calls for improved compatibility.") - enable_ui_tars = Confirm.ask("Do you want to enable UI-TARS?") - if enable_ui_tars: - ui_tars_api_key = Prompt.ask("Enter your ByteDance Doubao API key", password=True) - if not ui_tars_api_key: - console.print("[red]Error: UI-TARS API key is required. UI-TARS will not be enabled.[/red]") + console.print("\n[bold blue]--- VolcEngine Configuration ---[/bold blue]") + console.print("To enable VolcEngine, you must have a ByteDance Doubao API key.") + enable_volcengine = Confirm.ask("Do you want to enable VolcEngine?") + if enable_volcengine: + volcengine_api_key = Prompt.ask("Enter your VolcEngine(ByteDance Doubao) API key", password=True) + if not volcengine_api_key: + console.print("[red]Error: VolcEngine key is required. VolcEngine will not be enabled.[/red]") else: - update_or_add_env_var("UI_TARS_API_KEY", ui_tars_api_key) - update_or_add_env_var("ENABLE_UI_TARS", "true") + update_or_add_env_var("VOLCENGINE_API_KEY", volcengine_api_key) + update_or_add_env_var("ENABLE_VOLCENGINE", "true") - # Optional: Allow customizing model endpoint - custom_model = Confirm.ask( - "Do you want to use a custom model endpoint? (default: doubao-1-5-thinking-vision-pro-250428)" + model_options.extend( + [ + "VOLCENGINE_DOUBAO_SEED_1_6", + "VOLCENGINE_DOUBAO_SEED_1_6_FLASH", + "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO", + ] ) - if custom_model: - ui_tars_model = Prompt.ask("Enter your UI-TARS model endpoint ID") - if ui_tars_model: - update_or_add_env_var("UI_TARS_MODEL", ui_tars_model) - - model_options.append("UI_TARS_SEED1_5_VL") else: - update_or_add_env_var("ENABLE_UI_TARS", "false") + update_or_add_env_var("ENABLE_VOLCENGINE", "false") console.print("\n[bold blue]--- OpenAI-Compatible Provider Configuration ---[/bold blue]") console.print("To enable an OpenAI-compatible provider, you must have a model name, API key, and API base URL.") diff --git a/skyvern/config.py b/skyvern/config.py index 6dde64f3..6e1eb922 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -134,12 +134,11 @@ class Settings(BaseSettings): ANTHROPIC_API_KEY: str | None = None ANTHROPIC_CUA_LLM_KEY: str = "ANTHROPIC_CLAUDE3.7_SONNET" - # UI-TARS (Seed1.5-VL via Doubao) - UI_TARS_API_KEY: str | None = None - UI_TARS_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3" - UI_TARS_MODEL: str = "doubao-1-5-thinking-vision-pro-250428" - UI_TARS_LLM_KEY: str = "UI_TARS_SEED1_5_VL" - ENABLE_UI_TARS: bool = False + # VOLCENGINE (Doubao) + ENABLE_VOLCENGINE: bool = False + VOLCENGINE_API_KEY: str | None = None + VOLCENGINE_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3" + VOLCENGINE_CUA_LLM_KEY: str = "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO" # OPENAI COMPATIBLE OPENAI_COMPATIBLE_MODEL_NAME: str | None = None diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index fc9dafa5..8097c9f1 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -404,7 +404,7 @@ class ForgeAgent: llm_caller = LLMCallerManager.get_llm_caller(task.task_id) if not llm_caller: # create a new UI-TARS llm_caller - llm_key = task.llm_key or settings.UI_TARS_LLM_KEY + llm_key = task.llm_key or settings.VOLCENGINE_CUA_LLM_KEY llm_caller = UITarsLLMCaller(llm_key=llm_key, screenshot_scaling_enabled=True) llm_caller.initialize_conversation(task) diff --git a/skyvern/forge/app.py b/skyvern/forge/app.py index 324b01cc..108d3f7f 100644 --- a/skyvern/forge/app.py +++ b/skyvern/forge/app.py @@ -48,10 +48,10 @@ if SettingsManager.get_settings().ENABLE_BEDROCK_ANTHROPIC: # Add UI-TARS client setup UI_TARS_CLIENT = None -if SettingsManager.get_settings().ENABLE_UI_TARS: +if SettingsManager.get_settings().ENABLE_VOLCENGINE: UI_TARS_CLIENT = AsyncOpenAI( - api_key=SettingsManager.get_settings().UI_TARS_API_KEY, - base_url=SettingsManager.get_settings().UI_TARS_API_BASE, + api_key=SettingsManager.get_settings().VOLCENGINE_API_KEY, + base_url=SettingsManager.get_settings().VOLCENGINE_API_BASE, ) SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler( diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index d4555b56..a86bbd09 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -198,6 +198,7 @@ class LLMAPIHandlerFactory: ) if step or thought: try: + # FIXME: volcengine doesn't support litellm cost calculation. llm_cost = litellm.completion_cost(completion_response=response) except Exception as e: LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True) @@ -401,6 +402,7 @@ class LLMAPIHandlerFactory: if step or thought: try: + # FIXME: volcengine doesn't support litellm cost calculation. llm_cost = litellm.completion_cost(completion_response=response) except Exception as e: LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True) @@ -746,7 +748,7 @@ class LLMCaller: tools: list | None = None, timeout: float = settings.LLM_CONFIG_TIMEOUT, **active_parameters: dict[str, Any], - ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | Any: + ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse: if self.llm_key and "ANTHROPIC" in self.llm_key: return await self._call_anthropic(messages, tools, timeout, **active_parameters) @@ -802,14 +804,14 @@ class LLMCaller: tools: list | None = None, timeout: float = settings.LLM_CONFIG_TIMEOUT, **active_parameters: dict[str, Any], - ) -> Any: + ) -> UITarsResponse: """Custom UI-TARS API call using OpenAI client with VolcEngine endpoint.""" max_tokens = active_parameters.get("max_completion_tokens") or active_parameters.get("max_tokens") or 400 - model_name = self.llm_config.model_name + model_name = self.llm_config.model_name.replace("volcengine/", "") if not app.UI_TARS_CLIENT: raise ValueError( - "UI_TARS_CLIENT not initialized. Please ensure ENABLE_UI_TARS=true and UI_TARS_API_KEY is set." + "UI_TARS_CLIENT not initialized. Please ensure ENABLE_VOLCENGINE=true and VOLCENGINE_API_KEY is set." ) LOG.info( @@ -851,39 +853,18 @@ class LLMCaller: return response async def get_call_stats( - self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | dict[str, Any] | Any + self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse ) -> LLMCallStats: empty_call_stats = LLMCallStats() # Handle UI-TARS response (UITarsResponse object from _call_ui_tars) - if hasattr(response, "usage") and hasattr(response, "choices") and hasattr(response, "model"): - usage = response.usage - # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28) - input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0) - output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0) - llm_cost = input_token_cost + output_token_cost - + if isinstance(response, UITarsResponse): + ui_tars_usage = response.usage return LLMCallStats( - llm_cost=llm_cost, - input_tokens=usage.get("prompt_tokens", 0), - output_tokens=usage.get("completion_tokens", 0), - cached_tokens=0, # UI-TARS doesn't have cached tokens - reasoning_tokens=0, - ) - - # Handle UI-TARS response (dict format - fallback) - if isinstance(response, dict) and "choices" in response and "usage" in response: - usage = response["usage"] - # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28) - input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0) - output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0) - llm_cost = input_token_cost + output_token_cost - - return LLMCallStats( - llm_cost=llm_cost, - input_tokens=usage.get("prompt_tokens", 0), - output_tokens=usage.get("completion_tokens", 0), - cached_tokens=0, # UI-TARS doesn't have cached tokens + llm_cost=0, # TODO: calculate the cost according to the price: https://www.volcengine.com/docs/82379/1544106 + input_tokens=ui_tars_usage.get("prompt_tokens", 0), + output_tokens=ui_tars_usage.get("completion_tokens", 0), + cached_tokens=0, # only part of model support cached tokens reasoning_tokens=0, ) diff --git a/skyvern/forge/sdk/api/llm/config_registry.py b/skyvern/forge/sdk/api/llm/config_registry.py index 51824643..46ac9483 100644 --- a/skyvern/forge/sdk/api/llm/config_registry.py +++ b/skyvern/forge/sdk/api/llm/config_registry.py @@ -568,16 +568,46 @@ if settings.ENABLE_AZURE_O3: max_completion_tokens=100000, ), ) -if settings.ENABLE_UI_TARS: +if settings.ENABLE_VOLCENGINE: LLMConfigRegistry.register_config( - "UI_TARS_SEED1_5_VL", + "VOLCENGINE_DOUBAO_SEED_1_6", LLMConfig( - settings.UI_TARS_MODEL, - ["UI_TARS_API_KEY"], + "volcengine/doubao-seed-1.6-250615", + ["VOLCENGINE_API_KEY"], + litellm_params=LiteLLMParams( + api_base=settings.VOLCENGINE_API_BASE, + api_key=settings.VOLCENGINE_API_KEY, + ), + supports_vision=True, + add_assistant_prefix=False, + ), + ) + + LLMConfigRegistry.register_config( + "VOLCENGINE_DOUBAO_SEED_1_6_FLASH", + LLMConfig( + "volcengine/doubao-seed-1.6-flash-250615", + ["VOLCENGINE_API_KEY"], + litellm_params=LiteLLMParams( + api_base=settings.VOLCENGINE_API_BASE, + api_key=settings.VOLCENGINE_API_KEY, + ), + supports_vision=True, + add_assistant_prefix=False, + ), + ) + + LLMConfigRegistry.register_config( + "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO", + LLMConfig( + "volcengine/doubao-1-5-thinking-vision-pro-250428", + ["VOLCENGINE_API_KEY"], + litellm_params=LiteLLMParams( + api_base=settings.VOLCENGINE_API_BASE, + api_key=settings.VOLCENGINE_API_KEY, + ), supports_vision=True, add_assistant_prefix=False, - max_tokens=400, - temperature=0.0, ), ) diff --git a/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py b/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py index 62fe4765..e3987495 100644 --- a/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py +++ b/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py @@ -62,7 +62,7 @@ class UITarsLLMCaller(LLMCaller): # Handle None case for navigation_goal instruction = task.navigation_goal or "Default navigation task" system_prompt = _build_system_prompt(instruction) - self.message_history = [{"role": "user", "content": system_prompt}] + self.message_history: list = [{"role": "user", "content": system_prompt}] self._conversation_initialized = True LOG.debug("Initialized UI-TARS conversation", task_id=task.task_id) diff --git a/skyvern/forge/sdk/api/llm/ui_tars_response.py b/skyvern/forge/sdk/api/llm/ui_tars_response.py index 8f06cdd4..a97ea4b5 100644 --- a/skyvern/forge/sdk/api/llm/ui_tars_response.py +++ b/skyvern/forge/sdk/api/llm/ui_tars_response.py @@ -3,21 +3,25 @@ import json from typing import Any +from anthropic import BaseModel -class UITarsResponse: + +class Message: + def __init__(self, content: str): + self.content = content + self.role = "assistant" + + +class Choice: + def __init__(self, content: str): + self.message = Message(content) + + +class UITarsResponse(BaseModel): """A response object that mimics the ModelResponse interface for UI-TARS API responses.""" def __init__(self, content: str, model: str): # Create choice objects with proper nested structure for parse_api_response - class Message: - def __init__(self, content: str): - self.content = content - self.role = "assistant" - - class Choice: - def __init__(self, content: str): - self.message = Message(content) - self.choices = [Choice(content)] self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} self.model = model