ui tars integration fix (#2714)

Commit: 346b36fa4d (parent 84a0ff4e4c)
Author: Wyatt Marshall
Date: 2025-06-13 16:52:14 -04:00
Committed by: GitHub
2 changed files with 11 additions and 49 deletions

In the first changed file, UITarsLLMCaller drops three now-unused schema imports and removes its call() override:

@@ -23,10 +23,7 @@ from PIL import Image
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCaller
 from skyvern.forge.sdk.models import Step
-from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
-from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
-from skyvern.forge.sdk.schemas.tasks import Task
 from skyvern.utils.image_resizer import Resolution

 LOG = structlog.get_logger()
@@ -145,54 +142,15 @@ class UITarsLLMCaller(LLMCaller):
             return "png"  # Default to PNG for unsupported formats
         return format_str

-    async def call(
-        self,
-        prompt: str | None = None,
-        prompt_name: str | None = None,
-        step: Step | None = None,
-        task_v2: TaskV2 | None = None,
-        thought: Thought | None = None,
-        ai_suggestion: AISuggestion | None = None,
-        screenshots: list[bytes] | None = None,
-        parameters: dict[str, Any] | None = None,
-        tools: list[Any] | None = None,
-        use_message_history: bool = False,
-        raw_response: bool = False,
-        window_dimension: Resolution | None = None,
-        **extra_parameters: Any,
-    ) -> dict[str, Any]:
-        """Override call method to use standard LLM routing instead of direct LiteLLM."""
-        # Use raw_response=True to bypass JSON parsing since UI-TARS returns plain text
-        response = await super().call(
-            prompt=prompt,
-            prompt_name=prompt_name,
+    async def generate_ui_tars_response(self, step: Step) -> str:
+        """Generate UI-TARS response using the parent LLMCaller directly."""
+        response = await self.call(
             step=step,
-            task_v2=task_v2,
-            thought=thought,
-            ai_suggestion=ai_suggestion,
-            screenshots=screenshots,
-            parameters=parameters,
-            tools=tools,
-            use_message_history=True,  # Use message history for UI-TARS
-            raw_response=True,  # Bypass JSON parsing - UI-TARS returns plain text
-            window_dimension=window_dimension,
-            **extra_parameters,
+            use_message_history=True,  # Use conversation history
+            raw_response=True,  # Skip JSON parsing for plain text
         )
-        # Extract content from the raw response
-        if isinstance(response, dict) and "choices" in response:
-            content = response["choices"][0]["message"]["content"]
-            return {"content": content}
-        else:
-            # Fallback for unexpected response format
-            return {"content": str(response)}
-
-    async def generate_ui_tars_response(self, step: Step) -> str:
-        """Generate UI-TARS response using the overridden call method."""
-        response = await self.call(step=step)
-        content = response.get("content", "").strip()
+        content = response["choices"][0]["message"]["content"]

         # Add the response to conversation history
         self.add_assistant_response(content)
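The net effect of this hunk: instead of overriding call() to hard-code two flags and re-wrap the response in a {"content": ...} dict, UITarsLLMCaller now passes those flags directly to the inherited method. Below is a minimal, self-contained sketch of the resulting flow. It assumes the OpenAI-style choices[0].message.content response shape the diff indexes into; LLMCallerSketch, its stubbed call(), and the final return content (implied by the -> str annotation) are illustrative stand-ins, not Skyvern's actual code:

```python
import asyncio
from typing import Any


class LLMCallerSketch:
    """Hypothetical stand-in for skyvern's LLMCaller, reduced to what this diff touches."""

    def __init__(self) -> None:
        self.message_history: list[dict[str, Any]] = []

    async def call(self, **kwargs: Any) -> dict[str, Any]:
        # Stub: the real call() routes through Skyvern's standard LLM handler;
        # with raw_response=True it returns the unparsed completion dict.
        return {"choices": [{"message": {"content": "Thought: ...\nAction: click(...)"}}]}

    def add_assistant_response(self, content: str) -> None:
        # Record the model's reply so later turns see the full conversation.
        self.message_history.append({"role": "assistant", "content": content})

    async def generate_ui_tars_response(self, step: Any) -> str:
        # The fix: no call() override; just pass the two flags the old
        # override hard-coded (history on, JSON parsing off) to the parent.
        response = await self.call(step=step, use_message_history=True, raw_response=True)
        content = response["choices"][0]["message"]["content"]
        self.add_assistant_response(content)
        return content  # implied by the -> str annotation


print(asyncio.run(LLMCallerSketch().generate_ui_tars_response(step=None)))
```

Dropping the override also removes the {"content": ...} re-wrapping layer, so there is now exactly one response shape to reason about.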

In the second changed file, llm_messages_builder_with_history gains a guard before appending the user message:

@@ -97,7 +97,11 @@ async def llm_messages_builder_with_history(
                 },
             }
             current_user_messages.append(message)
-    messages.append({"role": "user", "content": current_user_messages})
+
+    # Only append a user message if there's actually content to add
+    if current_user_messages:
+        messages.append({"role": "user", "content": current_user_messages})
+
     # anthropic has hard limit of image & document messages (20 as of Apr 2025)
     # limit the number of image type messages to 10 for anthropic
     # delete the oldest image type message if the number of image type messages is greater than 10
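Why the guard matters: if there is no new content for this turn, current_user_messages is empty, and unconditionally appending a user turn with an empty content list produces an invalid message for some providers (Anthropic, for example, rejects messages with empty content). A minimal sketch of the guarded behavior; build_messages is a hypothetical, reduced stand-in for llm_messages_builder_with_history:

```python
from typing import Any


def build_messages(
    history: list[dict[str, Any]],
    current_user_messages: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Hypothetical, reduced stand-in for llm_messages_builder_with_history."""
    messages = list(history)
    # Only append a user message if there's actually content to add;
    # an empty content list would produce an invalid user turn.
    if current_user_messages:
        messages.append({"role": "user", "content": current_user_messages})
    return messages


history = [{"role": "assistant", "content": "clicked the button"}]
# With no new user content, the history passes through unchanged:
assert build_messages(history, []) == history
# With content, a single user turn is appended:
assert build_messages(history, [{"type": "text", "text": "next step"}])[-1]["role"] == "user"
```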