From 485b1e025eb8c5399956d481e73c682932d59ce7 Mon Sep 17 00:00:00 2001
From: pedrohsdb <pedro@skyvern.com>
Date: Tue, 23 Sep 2025 13:44:15 -0700
Subject: [PATCH] Pedro/thinking budget optimization (#3502)

---
 .../forge/sdk/api/llm/api_handler_factory.py  | 157 +++++++++++++++++-
 1 file changed, 156 insertions(+), 1 deletion(-)

diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py
index 064baea4..9a420b0b 100644
--- a/skyvern/forge/sdk/api/llm/api_handler_factory.py
+++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py
@@ -47,6 +47,128 @@ class LLMCallStats(BaseModel):
 
 class LLMAPIHandlerFactory:
     _custom_handlers: dict[str, LLMAPIHandler] = {}
+    _thinking_budget_settings: dict[str, int] | None = None
+
+    @staticmethod
+    def _apply_thinking_budget_optimization(
+        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
+    ) -> None:
+        """Apply thinking budget optimization based on model type and LiteLLM reasoning support."""
+        # Compute a safe model label and a representative model for capability checks
+        model_label = getattr(llm_config, "model_name", None)
+        if model_label is None and isinstance(llm_config, LLMRouterConfig):
+            model_label = getattr(llm_config, "main_model_group", "router")
+        check_model = model_label
+        if isinstance(llm_config, LLMRouterConfig) and getattr(llm_config, "model_list", None):
+            try:
+                check_model = llm_config.model_list[0].model_name or model_label  # type: ignore[attr-defined]
+            except Exception:
+                check_model = model_label
+        try:
+            # Early return if model doesn't support reasoning
+            if check_model and not litellm.supports_reasoning(model=check_model):
+                LOG.info(
+                    "Thinking budget optimization not supported for model",
+                    prompt_name=prompt_name,
+                    budget=new_budget,
+                    model=model_label,
+                )
+                return
+
+            # Apply optimization based on model type
+            model_label_lower = (model_label or "").lower()
+            if "gemini" in model_label_lower:
+                # Gemini models use the exact integer budget value
+                LLMAPIHandlerFactory._apply_gemini_thinking_optimization(
+                    parameters, new_budget, llm_config, prompt_name
+                )
+            elif "anthropic" in model_label_lower or "claude" in model_label_lower:
+                # Anthropic/Claude models use "low" for all budget values (per LiteLLM constants)
+                LLMAPIHandlerFactory._apply_anthropic_thinking_optimization(
+                    parameters, new_budget, llm_config, prompt_name
+                )
+            else:
+                # Other reasoning-capable models (Deepseek, etc.) - use "low" for all budget values
+                parameters["reasoning_effort"] = "low"
+                LOG.info(
+                    "Applied thinking budget optimization (reasoning_effort)",
+                    prompt_name=prompt_name,
+                    budget=new_budget,
+                    reasoning_effort="low",
+                    model=model_label,
+                )
+
+        except (AttributeError, KeyError, TypeError) as e:
+            LOG.warning(
+                "Failed to apply thinking budget optimization",
+                prompt_name=prompt_name,
+                budget=new_budget,
+                model=model_label,
+                error=str(e),
+                exc_info=True,
+            )
+
+    @staticmethod
+    def _apply_anthropic_thinking_optimization(
+        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
+    ) -> None:
+        """Apply thinking optimization for Anthropic/Claude models."""
+        if llm_config.reasoning_effort is not None:
+            # Use reasoning_effort if configured in LLM config - always use "low" per LiteLLM constants
+            parameters["reasoning_effort"] = "low"
+            # Get safe model label for logging
+            model_label = getattr(llm_config, "model_name", None)
+            if model_label is None and isinstance(llm_config, LLMRouterConfig):
+                model_label = getattr(llm_config, "main_model_group", "router")
+
+            LOG.info(
+                "Applied thinking budget optimization (reasoning_effort)",
+                prompt_name=prompt_name,
+                budget=new_budget,
+                reasoning_effort="low",
+                model=model_label,
+            )
+        else:
+            # Use thinking parameter with budget_tokens for Anthropic models
+            if "thinking" in parameters and isinstance(parameters["thinking"], dict):
+                parameters["thinking"]["budget_tokens"] = new_budget
+            else:
+                parameters["thinking"] = {"budget_tokens": new_budget, "type": "enabled"}
+            # Get safe model label for logging
+            model_label = getattr(llm_config, "model_name", None)
+            if model_label is None and isinstance(llm_config, LLMRouterConfig):
+                model_label = getattr(llm_config, "main_model_group", "router")
+
+            LOG.info(
+                "Applied thinking budget optimization (thinking)",
+                prompt_name=prompt_name,
+                budget=new_budget,
+                model=model_label,
+            )
+
+    @staticmethod
+    def _apply_gemini_thinking_optimization(
+        parameters: dict[str, Any], new_budget: int, llm_config: LLMConfig | LLMRouterConfig, prompt_name: str
+    ) -> None:
+        """Apply thinking optimization for Gemini models using exact integer budget value."""
+        if "thinking" in parameters and isinstance(parameters["thinking"], dict):
+            parameters["thinking"]["budget_tokens"] = new_budget
+        else:
+            thinking_payload: dict[str, Any] = {"budget_tokens": new_budget}
+            if settings.GEMINI_INCLUDE_THOUGHT:
+                thinking_payload["type"] = "enabled"
+            parameters["thinking"] = thinking_payload
+        # Get safe model label for logging
+        model_label = getattr(llm_config, "model_name", None)
+        if model_label is None and isinstance(llm_config, LLMRouterConfig):
+            model_label = getattr(llm_config, "main_model_group", "router")
+
+        LOG.info(
+            "Applied thinking budget optimization (budget_tokens)",
+            prompt_name=prompt_name,
+            budget=new_budget,
+            model=model_label,
+        )
 
     @staticmethod
     def get_override_llm_api_handler(override_llm_key: str | None, *, default: LLMAPIHandler) -> LLMAPIHandler:
@@ -119,6 +241,16 @@ class LLMAPIHandlerFactory:
             if parameters is None:
                 parameters = LLMAPIHandlerFactory.get_api_parameters(llm_config)
 
+            # Apply thinking budget optimization if settings are available
+            if (
+                LLMAPIHandlerFactory._thinking_budget_settings
+                and prompt_name in LLMAPIHandlerFactory._thinking_budget_settings
+            ):
+                new_budget = LLMAPIHandlerFactory._thinking_budget_settings[prompt_name]
+                LLMAPIHandlerFactory._apply_thinking_budget_optimization(
+                    parameters, new_budget, llm_config, prompt_name
+                )
+
             context = skyvern_context.current()
             if context and len(context.hashed_href_map) > 0:
                 await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -323,6 +455,16 @@ class LLMAPIHandlerFactory:
             if llm_config.litellm_params:  # type: ignore
                 active_parameters.update(llm_config.litellm_params)  # type: ignore
 
+            # Apply thinking budget optimization if settings are available
+            if (
+                LLMAPIHandlerFactory._thinking_budget_settings
+                and prompt_name in LLMAPIHandlerFactory._thinking_budget_settings
+            ):
+                new_budget = LLMAPIHandlerFactory._thinking_budget_settings[prompt_name]
+                LLMAPIHandlerFactory._apply_thinking_budget_optimization(
+                    active_parameters, new_budget, llm_config, prompt_name
+                )
+
             context = skyvern_context.current()
             if context and len(context.hashed_href_map) > 0:
                 await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -374,6 +516,7 @@ class LLMAPIHandlerFactory:
                     model=model_name,
                     messages=messages,
                     timeout=settings.LLM_CONFIG_TIMEOUT,
+                    drop_params=True,  # Drop unsupported parameters gracefully
                     **active_parameters,
                 )
             except litellm.exceptions.APIError as e:
@@ -534,6 +677,13 @@ class LLMAPIHandlerFactory:
             raise DuplicateCustomLLMProviderError(llm_key)
         cls._custom_handlers[llm_key] = handler
 
+    @classmethod
+    def set_thinking_budget_settings(cls, settings: dict[str, int] | None) -> None:
+        """Set thinking budget optimization settings for the current task/workflow."""
+        cls._thinking_budget_settings = settings
+        if settings:
+            LOG.info("Thinking budget optimization settings applied", settings=settings)
+
 
 class LLMCaller:
     """
@@ -800,7 +950,12 @@ class LLMCaller:
             return await self._call_ui_tars(messages, tools, timeout, **active_parameters)
 
         return await litellm.acompletion(
-            model=self.llm_config.model_name, messages=messages, tools=tools, timeout=timeout, **active_parameters
+            model=self.llm_config.model_name,
+            messages=messages,
+            tools=tools,
+            timeout=timeout,
+            drop_params=True,  # Drop unsupported parameters gracefully
+            **active_parameters,
         )
 
     async def _call_anthropic(