diff --git a/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py b/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py index e3987495..58cec24a 100644 --- a/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py +++ b/skyvern/forge/sdk/api/llm/ui_tars_llm_caller.py @@ -23,10 +23,7 @@ from PIL import Image from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCaller from skyvern.forge.sdk.models import Step -from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion -from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought from skyvern.forge.sdk.schemas.tasks import Task -from skyvern.utils.image_resizer import Resolution LOG = structlog.get_logger() @@ -145,54 +142,15 @@ class UITarsLLMCaller(LLMCaller): return "png" # Default to PNG for unsupported formats return format_str - async def call( - self, - prompt: str | None = None, - prompt_name: str | None = None, - step: Step | None = None, - task_v2: TaskV2 | None = None, - thought: Thought | None = None, - ai_suggestion: AISuggestion | None = None, - screenshots: list[bytes] | None = None, - parameters: dict[str, Any] | None = None, - tools: list[Any] | None = None, - use_message_history: bool = False, - raw_response: bool = False, - window_dimension: Resolution | None = None, - **extra_parameters: Any, - ) -> dict[str, Any]: - """Override call method to use standard LLM routing instead of direct LiteLLM.""" - - # Use raw_response=True to bypass JSON parsing since UI-TARS returns plain text - response = await super().call( - prompt=prompt, - prompt_name=prompt_name, + async def generate_ui_tars_response(self, step: Step) -> str: + """Generate UI-TARS response using the parent LLMCaller directly.""" + response = await self.call( step=step, - task_v2=task_v2, - thought=thought, - ai_suggestion=ai_suggestion, - screenshots=screenshots, - parameters=parameters, - tools=tools, - use_message_history=True, # Use message history for UI-TARS - raw_response=True, # Bypass JSON parsing - UI-TARS returns plain text - window_dimension=window_dimension, - **extra_parameters, + use_message_history=True, # Use conversation history + raw_response=True, # Skip JSON parsing for plain text ) - # Extract content from the raw response - if isinstance(response, dict) and "choices" in response: - content = response["choices"][0]["message"]["content"] - return {"content": content} - else: - # Fallback for unexpected response format - return {"content": str(response)} - - async def generate_ui_tars_response(self, step: Step) -> str: - """Generate UI-TARS response using the overridden call method.""" - response = await self.call(step=step) - - content = response.get("content", "").strip() + content = response["choices"][0]["message"]["content"] # Add the response to conversation history self.add_assistant_response(content) diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py index b9e5a835..586b5f07 100644 --- a/skyvern/forge/sdk/api/llm/utils.py +++ b/skyvern/forge/sdk/api/llm/utils.py @@ -97,7 +97,11 @@ async def llm_messages_builder_with_history( }, } current_user_messages.append(message) - messages.append({"role": "user", "content": current_user_messages}) + + # Only append a user message if there's actually content to add + if current_user_messages: + messages.append({"role": "user", "content": current_user_messages}) + # anthropic has hard limit of image & document messages (20 as of Apr 2025) # limit the number of image type messages to 10 for anthropic # delete the oldest image type message if the number of image type messages is greater than 10