ui tars integration fix (#2714)
This commit is contained in:
@@ -23,10 +23,7 @@ from PIL import Image
|
|||||||
from skyvern.forge.prompts import prompt_engine
|
from skyvern.forge.prompts import prompt_engine
|
||||||
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCaller
|
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCaller
|
||||||
from skyvern.forge.sdk.models import Step
|
from skyvern.forge.sdk.models import Step
|
||||||
from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
|
|
||||||
from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
|
|
||||||
from skyvern.forge.sdk.schemas.tasks import Task
|
from skyvern.forge.sdk.schemas.tasks import Task
|
||||||
from skyvern.utils.image_resizer import Resolution
|
|
||||||
|
|
||||||
LOG = structlog.get_logger()
|
LOG = structlog.get_logger()
|
||||||
|
|
||||||
@@ -145,54 +142,15 @@ class UITarsLLMCaller(LLMCaller):
|
|||||||
return "png" # Default to PNG for unsupported formats
|
return "png" # Default to PNG for unsupported formats
|
||||||
return format_str
|
return format_str
|
||||||
|
|
||||||
async def call(
|
async def generate_ui_tars_response(self, step: Step) -> str:
|
||||||
self,
|
"""Generate UI-TARS response using the parent LLMCaller directly."""
|
||||||
prompt: str | None = None,
|
response = await self.call(
|
||||||
prompt_name: str | None = None,
|
|
||||||
step: Step | None = None,
|
|
||||||
task_v2: TaskV2 | None = None,
|
|
||||||
thought: Thought | None = None,
|
|
||||||
ai_suggestion: AISuggestion | None = None,
|
|
||||||
screenshots: list[bytes] | None = None,
|
|
||||||
parameters: dict[str, Any] | None = None,
|
|
||||||
tools: list[Any] | None = None,
|
|
||||||
use_message_history: bool = False,
|
|
||||||
raw_response: bool = False,
|
|
||||||
window_dimension: Resolution | None = None,
|
|
||||||
**extra_parameters: Any,
|
|
||||||
) -> dict[str, Any]:
|
|
||||||
"""Override call method to use standard LLM routing instead of direct LiteLLM."""
|
|
||||||
|
|
||||||
# Use raw_response=True to bypass JSON parsing since UI-TARS returns plain text
|
|
||||||
response = await super().call(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_name=prompt_name,
|
|
||||||
step=step,
|
step=step,
|
||||||
task_v2=task_v2,
|
use_message_history=True, # Use conversation history
|
||||||
thought=thought,
|
raw_response=True, # Skip JSON parsing for plain text
|
||||||
ai_suggestion=ai_suggestion,
|
|
||||||
screenshots=screenshots,
|
|
||||||
parameters=parameters,
|
|
||||||
tools=tools,
|
|
||||||
use_message_history=True, # Use message history for UI-TARS
|
|
||||||
raw_response=True, # Bypass JSON parsing - UI-TARS returns plain text
|
|
||||||
window_dimension=window_dimension,
|
|
||||||
**extra_parameters,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract content from the raw response
|
content = response["choices"][0]["message"]["content"]
|
||||||
if isinstance(response, dict) and "choices" in response:
|
|
||||||
content = response["choices"][0]["message"]["content"]
|
|
||||||
return {"content": content}
|
|
||||||
else:
|
|
||||||
# Fallback for unexpected response format
|
|
||||||
return {"content": str(response)}
|
|
||||||
|
|
||||||
async def generate_ui_tars_response(self, step: Step) -> str:
|
|
||||||
"""Generate UI-TARS response using the overridden call method."""
|
|
||||||
response = await self.call(step=step)
|
|
||||||
|
|
||||||
content = response.get("content", "").strip()
|
|
||||||
|
|
||||||
# Add the response to conversation history
|
# Add the response to conversation history
|
||||||
self.add_assistant_response(content)
|
self.add_assistant_response(content)
|
||||||
|
|||||||
@@ -97,7 +97,11 @@ async def llm_messages_builder_with_history(
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
current_user_messages.append(message)
|
current_user_messages.append(message)
|
||||||
messages.append({"role": "user", "content": current_user_messages})
|
|
||||||
|
# Only append a user message if there's actually content to add
|
||||||
|
if current_user_messages:
|
||||||
|
messages.append({"role": "user", "content": current_user_messages})
|
||||||
|
|
||||||
# anthropic has hard limit of image & document messages (20 as of Apr 2025)
|
# anthropic has hard limit of image & document messages (20 as of Apr 2025)
|
||||||
# limit the number of image type messages to 10 for anthropic
|
# limit the number of image type messages to 10 for anthropic
|
||||||
# delete the oldest image type message if the number of image type messages is greater than 10
|
# delete the oldest image type message if the number of image type messages is greater than 10
|
||||||
|
|||||||
Reference in New Issue
Block a user