support volcengine + migrate ui tars to volcengine (#2705)

2025-06-13 21:17:32 +08:00
parent 40e608f9cd
commit 296d2f903b
9 changed files with 95 additions and 87 deletions
--- a/skyvern/forge/sdk/api/llm/api_handler_factory.py
+++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py
@@ -198,6 +198,7 @@ class LLMAPIHandlerFactory:
            )
            if step or thought:
                try:
+                    # FIXME: volcengine doesn't support litellm cost calculation.
                    llm_cost = litellm.completion_cost(completion_response=response)
                except Exception as e:
                    LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True)
@@ -401,6 +402,7 @@ class LLMAPIHandlerFactory:

            if step or thought:
                try:
+                    # FIXME: volcengine doesn't support litellm cost calculation.
                    llm_cost = litellm.completion_cost(completion_response=response)
                except Exception as e:
                    LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True)
@@ -746,7 +748,7 @@ class LLMCaller:
        tools: list | None = None,
        timeout: float = settings.LLM_CONFIG_TIMEOUT,
        **active_parameters: dict[str, Any],
-    ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | Any:
+    ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse:
        if self.llm_key and "ANTHROPIC" in self.llm_key:
            return await self._call_anthropic(messages, tools, timeout, **active_parameters)

@@ -802,14 +804,14 @@ class LLMCaller:
        tools: list | None = None,
        timeout: float = settings.LLM_CONFIG_TIMEOUT,
        **active_parameters: dict[str, Any],
-    ) -> Any:
+    ) -> UITarsResponse:
        """Custom UI-TARS API call using OpenAI client with VolcEngine endpoint."""
        max_tokens = active_parameters.get("max_completion_tokens") or active_parameters.get("max_tokens") or 400
-        model_name = self.llm_config.model_name
+        model_name = self.llm_config.model_name.replace("volcengine/", "")

        if not app.UI_TARS_CLIENT:
            raise ValueError(
-                "UI_TARS_CLIENT not initialized. Please ensure ENABLE_UI_TARS=true and UI_TARS_API_KEY is set."
+                "UI_TARS_CLIENT not initialized. Please ensure ENABLE_VOLCENGINE=true and VOLCENGINE_API_KEY is set."
            )

        LOG.info(
@@ -851,39 +853,18 @@ class LLMCaller:
        return response

    async def get_call_stats(
-        self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | dict[str, Any] | Any
+        self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse
    ) -> LLMCallStats:
        empty_call_stats = LLMCallStats()

        # Handle UI-TARS response (UITarsResponse object from _call_ui_tars)
-        if hasattr(response, "usage") and hasattr(response, "choices") and hasattr(response, "model"):
-            usage = response.usage
-            # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
-            input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
-            output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
-            llm_cost = input_token_cost + output_token_cost
-
+        if isinstance(response, UITarsResponse):
+            ui_tars_usage = response.usage
            return LLMCallStats(
-                llm_cost=llm_cost,
-                input_tokens=usage.get("prompt_tokens", 0),
-                output_tokens=usage.get("completion_tokens", 0),
-                cached_tokens=0,  # UI-TARS doesn't have cached tokens
-                reasoning_tokens=0,
-            )
-
-        # Handle UI-TARS response (dict format - fallback)
-        if isinstance(response, dict) and "choices" in response and "usage" in response:
-            usage = response["usage"]
-            # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
-            input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
-            output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
-            llm_cost = input_token_cost + output_token_cost
-
-            return LLMCallStats(
-                llm_cost=llm_cost,
-                input_tokens=usage.get("prompt_tokens", 0),
-                output_tokens=usage.get("completion_tokens", 0),
-                cached_tokens=0,  # UI-TARS doesn't have cached tokens
+                llm_cost=0,  # TODO: calculate the cost according to the price: https://www.volcengine.com/docs/82379/1544106
+                input_tokens=ui_tars_usage.get("prompt_tokens", 0),
+                output_tokens=ui_tars_usage.get("completion_tokens", 0),
+                cached_tokens=0,  # only some models support cached tokens
                reasoning_tokens=0,
            )

--- a/skyvern/forge/sdk/api/llm/config_registry.py
+++ b/skyvern/forge/sdk/api/llm/config_registry.py
@@ -568,16 +568,46 @@ if settings.ENABLE_AZURE_O3:
            max_completion_tokens=100000,
        ),
    )
-if settings.ENABLE_UI_TARS:
+if settings.ENABLE_VOLCENGINE:
    LLMConfigRegistry.register_config(
-        "UI_TARS_SEED1_5_VL",
+        "VOLCENGINE_DOUBAO_SEED_1_6",
        LLMConfig(
-            settings.UI_TARS_MODEL,
-            ["UI_TARS_API_KEY"],
+            "volcengine/doubao-seed-1.6-250615",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
+            supports_vision=True,
+            add_assistant_prefix=False,
+        ),
+    )
+
+    LLMConfigRegistry.register_config(
+        "VOLCENGINE_DOUBAO_SEED_1_6_FLASH",
+        LLMConfig(
+            "volcengine/doubao-seed-1.6-flash-250615",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
+            supports_vision=True,
+            add_assistant_prefix=False,
+        ),
+    )
+
+    LLMConfigRegistry.register_config(
+        "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO",
+        LLMConfig(
+            "volcengine/doubao-1-5-thinking-vision-pro-250428",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
            supports_vision=True,
            add_assistant_prefix=False,
-            max_tokens=400,
-            temperature=0.0,
        ),
    )

--- a/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py
+++ b/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py
@@ -62,7 +62,7 @@ class UITarsLLMCaller(LLMCaller):
            # Handle None case for navigation_goal
            instruction = task.navigation_goal or "Default navigation task"
            system_prompt = _build_system_prompt(instruction)
-            self.message_history = [{"role": "user", "content": system_prompt}]
+            self.message_history: list = [{"role": "user", "content": system_prompt}]
            self._conversation_initialized = True
            LOG.debug("Initialized UI-TARS conversation", task_id=task.task_id)

--- a/skyvern/forge/sdk/api/llm/ui_tars_response.py
+++ b/skyvern/forge/sdk/api/llm/ui_tars_response.py
@@ -3,21 +3,25 @@
 import json
 from typing import Any

+from anthropic import BaseModel

-class UITarsResponse:
+
+class Message:
+    def __init__(self, content: str):
+        self.content = content
+        self.role = "assistant"
+
+
+class Choice:
+    def __init__(self, content: str):
+        self.message = Message(content)
+
+
+class UITarsResponse(BaseModel):
    """A response object that mimics the ModelResponse interface for UI-TARS API responses."""

    def __init__(self, content: str, model: str):
        # Create choice objects with proper nested structure for parse_api_response
-        class Message:
-            def __init__(self, content: str):
-                self.content = content
-                self.role = "assistant"
-
-        class Choice:
-            def __init__(self, content: str):
-                self.message = Message(content)
-
        self.choices = [Choice(content)]
        self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
        self.model = model