Pedro/thinking budget optimization (#3502)
@@ -47,6 +47,128 @@ class LLMCallStats(BaseModel):

class LLMAPIHandlerFactory:
    _custom_handlers: dict[str, LLMAPIHandler] = {}
    _thinking_budget_settings: dict[str, int] | None = None

    @staticmethod
    def _apply_thinking_budget_optimization(
        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
    ) -> None:
        """Apply thinking budget optimization based on model type and LiteLLM reasoning support."""
        # Compute a safe model label and a representative model for capability checks
        model_label = getattr(llm_config, "model_name", None)
        if model_label is None and isinstance(llm_config, LLMRouterConfig):
            model_label = getattr(llm_config, "main_model_group", "router")
        check_model = model_label
        if isinstance(llm_config, LLMRouterConfig) and getattr(llm_config, "model_list", None):
            try:
                check_model = llm_config.model_list[0].model_name or model_label  # type: ignore[attr-defined]
            except Exception:
                check_model = model_label
        try:
            # Early return if the model doesn't support reasoning
            if check_model and not litellm.supports_reasoning(model=check_model):
                LOG.info(
                    "Thinking budget optimization not supported for model",
                    prompt_name=prompt_name,
                    budget=new_budget,
                    model=model_label,
                )
                return

            # Apply the optimization based on model type
            model_label_lower = (model_label or "").lower()
            if "gemini" in model_label_lower:
                # Gemini models use the exact integer budget value
                LLMAPIHandlerFactory._apply_gemini_thinking_optimization(
                    parameters, new_budget, llm_config, prompt_name
                )
            elif "anthropic" in model_label_lower or "claude" in model_label_lower:
                # Anthropic/Claude models use "low" for all budget values (per LiteLLM constants)
                LLMAPIHandlerFactory._apply_anthropic_thinking_optimization(
                    parameters, new_budget, llm_config, prompt_name
                )
            else:
                # Other reasoning-capable models (DeepSeek, etc.) also use "low" for all budget values
                parameters["reasoning_effort"] = "low"
                LOG.info(
                    "Applied thinking budget optimization (reasoning_effort)",
                    prompt_name=prompt_name,
                    budget=new_budget,
                    reasoning_effort="low",
                    model=model_label,
                )

        except (AttributeError, KeyError, TypeError) as e:
            LOG.warning(
                "Failed to apply thinking budget optimization",
                prompt_name=prompt_name,
                budget=new_budget,
                model=model_label,
                error=str(e),
                exc_info=True,
            )
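
For orientation, a condensed, self-contained sketch of the dispatch above follows; it omits the LiteLLM capability check, the logging, and the Anthropic reasoning_effort branch, and the model name and budget are illustrative only:

from typing import Any


def sketch_apply_budget(parameters: dict[str, Any], model_label: str, new_budget: int) -> None:
    # Gemini and Anthropic/Claude receive a token budget; any other
    # reasoning-capable model falls back to reasoning_effort="low".
    label = model_label.lower()
    if "gemini" in label:
        parameters.setdefault("thinking", {})["budget_tokens"] = new_budget
    elif "anthropic" in label or "claude" in label:
        parameters["thinking"] = {"budget_tokens": new_budget, "type": "enabled"}
    else:
        parameters["reasoning_effort"] = "low"


p: dict[str, Any] = {}
sketch_apply_budget(p, "gemini-2.5-pro", 1024)
assert p == {"thinking": {"budget_tokens": 1024}}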

    @staticmethod
    def _apply_anthropic_thinking_optimization(
        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
    ) -> None:
        """Apply thinking optimization for Anthropic/Claude models."""
        if llm_config.reasoning_effort is not None:
            # Use reasoning_effort if configured in the LLM config - always "low" per LiteLLM constants
            parameters["reasoning_effort"] = "low"
            # Get a safe model label for logging
            model_label = getattr(llm_config, "model_name", None)
            if model_label is None and isinstance(llm_config, LLMRouterConfig):
                model_label = getattr(llm_config, "main_model_group", "router")

            LOG.info(
                "Applied thinking budget optimization (reasoning_effort)",
                prompt_name=prompt_name,
                budget=new_budget,
                reasoning_effort="low",
                model=model_label,
            )
        else:
            # Use the thinking parameter with budget_tokens for Anthropic models
            if "thinking" in parameters and isinstance(parameters["thinking"], dict):
                parameters["thinking"]["budget_tokens"] = new_budget
            else:
                parameters["thinking"] = {"budget_tokens": new_budget, "type": "enabled"}
            # Get a safe model label for logging
            model_label = getattr(llm_config, "model_name", None)
            if model_label is None and isinstance(llm_config, LLMRouterConfig):
                model_label = getattr(llm_config, "main_model_group", "router")

            LOG.info(
                "Applied thinking budget optimization (thinking)",
                prompt_name=prompt_name,
                budget=new_budget,
                model=model_label,
            )

    @staticmethod
    def _apply_gemini_thinking_optimization(
        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
    ) -> None:
        """Apply thinking optimization for Gemini models using the exact integer budget value."""
        if "thinking" in parameters and isinstance(parameters["thinking"], dict):
            parameters["thinking"]["budget_tokens"] = new_budget
        else:
            thinking_payload: dict[str, Any] = {"budget_tokens": new_budget}
            if settings.GEMINI_INCLUDE_THOUGHT:
                thinking_payload["type"] = "enabled"
            parameters["thinking"] = thinking_payload
        # Get a safe model label for logging
        model_label = getattr(llm_config, "model_name", None)
        if model_label is None and isinstance(llm_config, LLMRouterConfig):
            model_label = getattr(llm_config, "main_model_group", "router")

        LOG.info(
            "Applied thinking budget optimization (budget_tokens)",
            prompt_name=prompt_name,
            budget=new_budget,
            model=model_label,
        )
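
Concretely, the two helpers above leave the request parameters in one of the following shapes; the budget value is illustrative, and the Gemini "type": "enabled" key appears only when settings.GEMINI_INCLUDE_THOUGHT is set:

# Anthropic/Claude with llm_config.reasoning_effort configured:
parameters = {"reasoning_effort": "low"}

# Anthropic/Claude without reasoning_effort configured:
parameters = {"thinking": {"budget_tokens": 2048, "type": "enabled"}}

# Gemini, default:
parameters = {"thinking": {"budget_tokens": 2048}}

# Gemini, with GEMINI_INCLUDE_THOUGHT enabled:
parameters = {"thinking": {"budget_tokens": 2048, "type": "enabled"}}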

    @staticmethod
    def get_override_llm_api_handler(override_llm_key: str | None, *, default: LLMAPIHandler) -> LLMAPIHandler:
@@ -119,6 +241,16 @@ class LLMAPIHandlerFactory:
        if parameters is None:
            parameters = LLMAPIHandlerFactory.get_api_parameters(llm_config)

        # Apply thinking budget optimization if settings are available
        if (
            LLMAPIHandlerFactory._thinking_budget_settings
            and prompt_name in LLMAPIHandlerFactory._thinking_budget_settings
        ):
            new_budget = LLMAPIHandlerFactory._thinking_budget_settings[prompt_name]
            LLMAPIHandlerFactory._apply_thinking_budget_optimization(
                parameters, new_budget, llm_config, prompt_name
            )

        context = skyvern_context.current()
        if context and len(context.hashed_href_map) > 0:
            await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -323,6 +455,16 @@ class LLMAPIHandlerFactory:
        if llm_config.litellm_params:  # type: ignore
            active_parameters.update(llm_config.litellm_params)  # type: ignore

        # Apply thinking budget optimization if settings are available
        if (
            LLMAPIHandlerFactory._thinking_budget_settings
            and prompt_name in LLMAPIHandlerFactory._thinking_budget_settings
        ):
            new_budget = LLMAPIHandlerFactory._thinking_budget_settings[prompt_name]
            LLMAPIHandlerFactory._apply_thinking_budget_optimization(
                active_parameters, new_budget, llm_config, prompt_name
            )

        context = skyvern_context.current()
        if context and len(context.hashed_href_map) > 0:
            await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -374,6 +516,7 @@ class LLMAPIHandlerFactory:
                model=model_name,
                messages=messages,
                timeout=settings.LLM_CONFIG_TIMEOUT,
                drop_params=True,  # Drop unsupported parameters gracefully
                **active_parameters,
            )
        except litellm.exceptions.APIError as e:
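
The drop_params=True flag is LiteLLM's documented way to discard parameters a provider does not accept instead of raising, which matters here because reasoning parameters are injected for some models and not others. A minimal standalone illustration, with a hypothetical model name:

import litellm

# If the resolved provider rejects reasoning_effort, LiteLLM strips the
# parameter and sends the request anyway rather than raising an error.
response = litellm.completion(
    model="gpt-4o-mini",  # hypothetical; any LiteLLM-routable model works
    messages=[{"role": "user", "content": "hello"}],
    reasoning_effort="low",
    drop_params=True,
)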
@@ -534,6 +677,13 @@ class LLMAPIHandlerFactory:
            raise DuplicateCustomLLMProviderError(llm_key)
        cls._custom_handlers[llm_key] = handler

    @classmethod
    def set_thinking_budget_settings(cls, settings: dict[str, int] | None) -> None:
        """Set thinking budget optimization settings for the current task/workflow."""
        cls._thinking_budget_settings = settings
        if settings:
            LOG.info("Thinking budget optimization settings applied", settings=settings)
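
A caller would typically install the budgets before starting a run and clear them afterwards, since the settings live on the class. A usage sketch with hypothetical prompt names and budgets:

LLMAPIHandlerFactory.set_thinking_budget_settings(
    {
        "extract-actions": 1024,  # hypothetical prompt name
        "validate-output": 512,  # hypothetical prompt name
    }
)
try:
    ...  # run the task/workflow; handlers look budgets up by prompt_name
finally:
    # Reset so budgets do not leak into the next run
    LLMAPIHandlerFactory.set_thinking_budget_settings(None)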


class LLMCaller:
    """
@@ -800,7 +950,12 @@ class LLMCaller:
            return await self._call_ui_tars(messages, tools, timeout, **active_parameters)

        return await litellm.acompletion(
-            model=self.llm_config.model_name, messages=messages, tools=tools, timeout=timeout, **active_parameters
+            model=self.llm_config.model_name,
+            messages=messages,
+            tools=tools,
+            timeout=timeout,
+            drop_params=True,  # Drop unsupported parameters gracefully
+            **active_parameters,
        )

    async def _call_anthropic(