ui-tars integration (#2656)

This commit is contained in:
Wyatt Marshall
2025-06-13 01:23:39 -04:00
committed by GitHub
parent 47cf755d9c
commit 15d46aab82
18 changed files with 986 additions and 13 deletions

View File

@@ -43,6 +43,15 @@ ENABLE_NOVITA=false
# NOVITA_API_KEY: Your Novita AI API key. # NOVITA_API_KEY: Your Novita AI API key.
NOVITA_API_KEY="" NOVITA_API_KEY=""
# ENABLE_UI_TARS: Set to true to enable UI-TARS (Seed1.5-VL) as a language model provider.
ENABLE_UI_TARS=false
# UI_TARS_API_KEY: Your ByteDance Doubao API key for accessing UI-TARS models.
UI_TARS_API_KEY=""
# UI_TARS_API_BASE: The base URL for ByteDance Doubao API.
UI_TARS_API_BASE="https://ark.cn-beijing.volces.com/api/v3"
# UI_TARS_MODEL: Your UI-TARS model endpoint ID from ByteDance Doubao.
UI_TARS_MODEL="doubao-1-5-thinking-vision-pro-250428"
# LLM_KEY: The chosen language model to use. This should be one of the models # LLM_KEY: The chosen language model to use. This should be one of the models
# provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V). # provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V).
LLM_KEY="" LLM_KEY=""

View File

@@ -24,6 +24,7 @@ This parameter defines the engine that powers the agent task.
- `skyvern-1.0`: performs really well for tasks with a simple goal, like filling a form, or searching for information on Google. - `skyvern-1.0`: performs really well for tasks with a simple goal, like filling a form, or searching for information on Google.
- `openai-cua`: uses OpenAI's CUA model. - `openai-cua`: uses OpenAI's CUA model.
- `anthropic-cua`: uses Anthropic's Claude Sonnet 3.7 model with the computer use tool. - `anthropic-cua`: uses Anthropic's Claude Sonnet 3.7 model with the computer use tool.
- `ui-tars`: uses the UI-TARS model (Seed1.5-VL) via Doubao API for computer vision and GUI automation with multi-turn conversation support (https://seed.bytedance.com/zh/tech/seed1_5_vl).
### [Data Extraction Schema](/api-reference/api-reference/agent/run-task#request.body.data_extraction_schema) ### [Data Extraction Schema](/api-reference/api-reference/agent/run-task#request.body.data_extraction_schema)

View File

@@ -160,6 +160,31 @@ def setup_llm_providers() -> None:
else: else:
update_or_add_env_var("ENABLE_NOVITA", "false") update_or_add_env_var("ENABLE_NOVITA", "false")
console.print("\n[bold blue]--- UI-TARS Configuration ---[/bold blue]")
console.print("To enable UI-TARS (Seed1.5-VL), you must have a ByteDance Doubao API key.")
console.print("UI-TARS now uses direct VolcEngine API calls for improved compatibility.")
enable_ui_tars = Confirm.ask("Do you want to enable UI-TARS?")
if enable_ui_tars:
ui_tars_api_key = Prompt.ask("Enter your ByteDance Doubao API key", password=True)
if not ui_tars_api_key:
console.print("[red]Error: UI-TARS API key is required. UI-TARS will not be enabled.[/red]")
else:
update_or_add_env_var("UI_TARS_API_KEY", ui_tars_api_key)
update_or_add_env_var("ENABLE_UI_TARS", "true")
# Optional: Allow customizing model endpoint
custom_model = Confirm.ask(
"Do you want to use a custom model endpoint? (default: doubao-1-5-thinking-vision-pro-250428)"
)
if custom_model:
ui_tars_model = Prompt.ask("Enter your UI-TARS model endpoint ID")
if ui_tars_model:
update_or_add_env_var("UI_TARS_MODEL", ui_tars_model)
model_options.append("UI_TARS_SEED1_5_VL")
else:
update_or_add_env_var("ENABLE_UI_TARS", "false")
console.print("\n[bold blue]--- OpenAI-Compatible Provider Configuration ---[/bold blue]") console.print("\n[bold blue]--- OpenAI-Compatible Provider Configuration ---[/bold blue]")
console.print("To enable an OpenAI-compatible provider, you must have a model name, API key, and API base URL.") console.print("To enable an OpenAI-compatible provider, you must have a model name, API key, and API base URL.")
enable_openai_compatible = Confirm.ask("Do you want to enable an OpenAI-compatible provider?") enable_openai_compatible = Confirm.ask("Do you want to enable an OpenAI-compatible provider?")

View File

@@ -2,4 +2,6 @@
import typing import typing
RunEngine = typing.Union[typing.Literal["skyvern-1.0", "skyvern-2.0", "openai-cua", "anthropic-cua"], typing.Any] RunEngine = typing.Union[
typing.Literal["skyvern-1.0", "skyvern-2.0", "openai-cua", "anthropic-cua", "ui-tars"], typing.Any
]

View File

@@ -134,6 +134,13 @@ class Settings(BaseSettings):
ANTHROPIC_API_KEY: str | None = None ANTHROPIC_API_KEY: str | None = None
ANTHROPIC_CUA_LLM_KEY: str = "ANTHROPIC_CLAUDE3.7_SONNET" ANTHROPIC_CUA_LLM_KEY: str = "ANTHROPIC_CLAUDE3.7_SONNET"
# UI-TARS (Seed1.5-VL via Doubao)
UI_TARS_API_KEY: str | None = None
UI_TARS_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3"
UI_TARS_MODEL: str = "doubao-1-5-thinking-vision-pro-250428"
UI_TARS_LLM_KEY: str = "UI_TARS_SEED1_5_VL"
ENABLE_UI_TARS: bool = False
# OPENAI COMPATIBLE # OPENAI COMPATIBLE
OPENAI_COMPATIBLE_MODEL_NAME: str | None = None OPENAI_COMPATIBLE_MODEL_NAME: str | None = None
OPENAI_COMPATIBLE_API_KEY: str | None = None OPENAI_COMPATIBLE_API_KEY: str | None = None

View File

@@ -59,6 +59,7 @@ from skyvern.forge.sdk.api.files import (
wait_for_download_finished, wait_for_download_finished,
) )
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory, LLMCaller, LLMCallerManager from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory, LLMCaller, LLMCallerManager
from skyvern.forge.sdk.api.llm.ui_tars_llm_caller import UITarsLLMCaller
from skyvern.forge.sdk.artifact.models import ArtifactType from skyvern.forge.sdk.artifact.models import ArtifactType
from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.core.security import generate_skyvern_webhook_headers from skyvern.forge.sdk.core.security import generate_skyvern_webhook_headers
@@ -91,7 +92,12 @@ from skyvern.webeye.actions.actions import (
from skyvern.webeye.actions.caching import retrieve_action_plan from skyvern.webeye.actions.caching import retrieve_action_plan
from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
from skyvern.webeye.actions.parse_actions import parse_actions, parse_anthropic_actions, parse_cua_actions from skyvern.webeye.actions.parse_actions import (
parse_actions,
parse_anthropic_actions,
parse_cua_actions,
parse_ui_tars_actions,
)
from skyvern.webeye.actions.responses import ActionResult, ActionSuccess from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
@@ -393,9 +399,18 @@ class ForgeAgent:
llm_key=llm_key or settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True llm_key=llm_key or settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True
) )
if engine == RunEngine.ui_tars and not llm_caller:
# see if the llm_caller is already set in memory
llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
if not llm_caller:
# create a new UI-TARS llm_caller
llm_key = task.llm_key or settings.UI_TARS_LLM_KEY
llm_caller = UITarsLLMCaller(llm_key=llm_key, screenshot_scaling_enabled=True)
llm_caller.initialize_conversation(task)
# TODO: remove the code after migrating everything to llm callers # TODO: remove the code after migrating everything to llm callers
# currently, only anthropic cua tasks use llm_caller # currently, only anthropic cua and ui_tars tasks use llm_caller
if engine == RunEngine.anthropic_cua and llm_caller: if engine in [RunEngine.anthropic_cua, RunEngine.ui_tars] and llm_caller:
LLMCallerManager.set_llm_caller(task.task_id, llm_caller) LLMCallerManager.set_llm_caller(task.task_id, llm_caller)
step, detailed_output = await self.agent_step( step, detailed_output = await self.agent_step(
@@ -550,6 +565,7 @@ class ForgeAgent:
complete_verification=complete_verification, complete_verification=complete_verification,
engine=engine, engine=engine,
cua_response=cua_response_param, cua_response=cua_response_param,
llm_caller=llm_caller,
) )
elif settings.execute_all_steps() and next_step: elif settings.execute_all_steps() and next_step:
return await self.execute_step( return await self.execute_step(
@@ -563,6 +579,7 @@ class ForgeAgent:
complete_verification=complete_verification, complete_verification=complete_verification,
engine=engine, engine=engine,
cua_response=cua_response_param, cua_response=cua_response_param,
llm_caller=llm_caller,
) )
else: else:
LOG.info( LOG.info(
@@ -854,6 +871,15 @@ class ForgeAgent:
scraped_page=scraped_page, scraped_page=scraped_page,
llm_caller=llm_caller, llm_caller=llm_caller,
) )
elif engine == RunEngine.ui_tars:
assert llm_caller is not None
actions = await self._generate_ui_tars_actions(
task=task,
step=step,
scraped_page=scraped_page,
llm_caller=llm_caller,
)
else: else:
using_cached_action_plan = False using_cached_action_plan = False
if not task.navigation_goal and not isinstance(task_block, ValidationBlock): if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
@@ -1483,6 +1509,56 @@ class ForgeAgent:
) )
return actions return actions
async def _generate_ui_tars_actions(
self,
task: Task,
step: Step,
scraped_page: ScrapedPage,
llm_caller: LLMCaller,
) -> list[Action]:
"""Generate actions using UI-TARS (Seed1.5-VL) model through the LLMCaller pattern."""
LOG.info(
"UI-TARS action generation starts",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
)
# Ensure we have a UITarsLLMCaller instance
if not isinstance(llm_caller, UITarsLLMCaller):
raise ValueError(f"Expected UITarsLLMCaller, got {type(llm_caller)}")
# Add the current screenshot to conversation
if scraped_page.screenshots:
llm_caller.add_screenshot(scraped_page.screenshots[0])
else:
LOG.error("No screenshots found, skipping UI-TARS action generation")
raise ValueError("No screenshots found, skipping UI-TARS action generation")
# Generate response using the LLMCaller
response_content = await llm_caller.generate_ui_tars_response(step)
LOG.info(f"UI-TARS raw response: {response_content}")
window_dimension = (
cast(Resolution, scraped_page.window_dimension)
if scraped_page.window_dimension
else Resolution(width=1920, height=1080)
)
LOG.info(f"UI-TARS browser window dimension: {window_dimension}")
actions = await parse_ui_tars_actions(task, step, response_content, window_dimension)
LOG.info(
"UI-TARS action generation completed",
task_id=task.task_id,
step_id=step.step_id,
actions_count=len(actions),
)
return actions
async def complete_verify( async def complete_verify(
self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> CompleteVerifyResult: ) -> CompleteVerifyResult:
@@ -2105,6 +2181,7 @@ class ForgeAgent:
return return
await self.async_operation_pool.remove_task(task.task_id) await self.async_operation_pool.remove_task(task.task_id)
await self.cleanup_browser_and_create_artifacts( await self.cleanup_browser_and_create_artifacts(
close_browser_on_completion, last_step, task, browser_session_id=browser_session_id close_browser_on_completion, last_step, task, browser_session_id=browser_session_id
) )

View File

@@ -46,6 +46,14 @@ ANTHROPIC_CLIENT = AsyncAnthropic(api_key=SettingsManager.get_settings().ANTHROP
if SettingsManager.get_settings().ENABLE_BEDROCK_ANTHROPIC: if SettingsManager.get_settings().ENABLE_BEDROCK_ANTHROPIC:
ANTHROPIC_CLIENT = AsyncAnthropicBedrock() ANTHROPIC_CLIENT = AsyncAnthropicBedrock()
# Add UI-TARS client setup
UI_TARS_CLIENT = None
if SettingsManager.get_settings().ENABLE_UI_TARS:
UI_TARS_CLIENT = AsyncOpenAI(
api_key=SettingsManager.get_settings().UI_TARS_API_KEY,
base_url=SettingsManager.get_settings().UI_TARS_API_BASE,
)
SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler( SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(
SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY
) )

View File

@@ -0,0 +1,37 @@
{#
SPDX-License-Identifier: Apache-2.0
Adapted from:
https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py
Licensed under the Apache License, Version 2.0
This prompt is used for the UI-TARS agent.
#}
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space
click(point='<point>x1 y1</point>')
left_double(point='<point>x1 y1</point>')
right_single(point='<point>x1 y1</point>')
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the `direction` side.
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
- Use {{language}} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
## User Instruction
{{instruction}}

View File

@@ -2,7 +2,7 @@ import dataclasses
import json import json
import time import time
from asyncio import CancelledError from asyncio import CancelledError
from typing import Any from typing import Any, AsyncIterator
import litellm import litellm
import structlog import structlog
@@ -10,6 +10,7 @@ from anthropic import NOT_GIVEN
from anthropic.types.beta.beta_message import BetaMessage as AnthropicMessage from anthropic.types.beta.beta_message import BetaMessage as AnthropicMessage
from jinja2 import Template from jinja2 import Template
from litellm.utils import CustomStreamWrapper, ModelResponse from litellm.utils import CustomStreamWrapper, ModelResponse
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
from pydantic import BaseModel from pydantic import BaseModel
from skyvern.config import settings from skyvern.config import settings
@@ -23,6 +24,7 @@ from skyvern.forge.sdk.api.llm.exceptions import (
LLMProviderErrorRetryableTask, LLMProviderErrorRetryableTask,
) )
from skyvern.forge.sdk.api.llm.models import LLMAPIHandler, LLMConfig, LLMRouterConfig, dummy_llm_api_handler from skyvern.forge.sdk.api.llm.models import LLMAPIHandler, LLMConfig, LLMRouterConfig, dummy_llm_api_handler
from skyvern.forge.sdk.api.llm.ui_tars_response import UITarsResponse
from skyvern.forge.sdk.api.llm.utils import llm_messages_builder, llm_messages_builder_with_history, parse_api_response from skyvern.forge.sdk.api.llm.utils import llm_messages_builder, llm_messages_builder_with_history, parse_api_response
from skyvern.forge.sdk.artifact.models import ArtifactType from skyvern.forge.sdk.artifact.models import ArtifactType
from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.core import skyvern_context
@@ -744,10 +746,14 @@ class LLMCaller:
tools: list | None = None, tools: list | None = None,
timeout: float = settings.LLM_CONFIG_TIMEOUT, timeout: float = settings.LLM_CONFIG_TIMEOUT,
**active_parameters: dict[str, Any], **active_parameters: dict[str, Any],
) -> ModelResponse | CustomStreamWrapper | AnthropicMessage: ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | Any:
if self.llm_key and "ANTHROPIC" in self.llm_key: if self.llm_key and "ANTHROPIC" in self.llm_key:
return await self._call_anthropic(messages, tools, timeout, **active_parameters) return await self._call_anthropic(messages, tools, timeout, **active_parameters)
# Route UI-TARS models to custom handler instead of LiteLLM
if self.llm_key and "UI_TARS" in self.llm_key:
return await self._call_ui_tars(messages, tools, timeout, **active_parameters)
return await litellm.acompletion( return await litellm.acompletion(
model=self.llm_config.model_name, messages=messages, tools=tools, timeout=timeout, **active_parameters model=self.llm_config.model_name, messages=messages, tools=tools, timeout=timeout, **active_parameters
) )
@@ -790,8 +796,97 @@ class LLMCaller:
) )
return response return response
async def get_call_stats(self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage) -> LLMCallStats: async def _call_ui_tars(
self,
messages: list[dict[str, Any]],
tools: list | None = None,
timeout: float = settings.LLM_CONFIG_TIMEOUT,
**active_parameters: dict[str, Any],
) -> Any:
"""Custom UI-TARS API call using OpenAI client with VolcEngine endpoint."""
max_tokens = active_parameters.get("max_completion_tokens") or active_parameters.get("max_tokens") or 400
model_name = self.llm_config.model_name
if not app.UI_TARS_CLIENT:
raise ValueError(
"UI_TARS_CLIENT not initialized. Please ensure ENABLE_UI_TARS=true and UI_TARS_API_KEY is set."
)
LOG.info(
"UI-TARS request",
model_name=model_name,
timeout=timeout,
messages_length=len(messages),
)
# Use the UI-TARS client (which is OpenAI-compatible with VolcEngine)
chat_completion: AsyncIterator[ChatCompletionChunk] = await app.UI_TARS_CLIENT.chat.completions.create(
model=model_name,
messages=messages,
top_p=None,
temperature=active_parameters.get("temperature", 0.0),
max_tokens=max_tokens,
stream=True,
seed=None,
stop=None,
frequency_penalty=None,
presence_penalty=None,
timeout=timeout,
)
# Aggregate streaming response like in ByteDance example
response_content = ""
async for message in chat_completion:
if message.choices[0].delta.content:
response_content += message.choices[0].delta.content
response = UITarsResponse(response_content, model_name)
LOG.info(
"UI-TARS response",
model_name=model_name,
response_length=len(response_content),
timeout=timeout,
)
return response
async def get_call_stats(
self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | dict[str, Any] | Any
) -> LLMCallStats:
empty_call_stats = LLMCallStats() empty_call_stats = LLMCallStats()
# Handle UI-TARS response (UITarsResponse object from _call_ui_tars)
if hasattr(response, "usage") and hasattr(response, "choices") and hasattr(response, "model"):
usage = response.usage
# Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
llm_cost = input_token_cost + output_token_cost
return LLMCallStats(
llm_cost=llm_cost,
input_tokens=usage.get("prompt_tokens", 0),
output_tokens=usage.get("completion_tokens", 0),
cached_tokens=0, # UI-TARS doesn't have cached tokens
reasoning_tokens=0,
)
# Handle UI-TARS response (dict format - fallback)
if isinstance(response, dict) and "choices" in response and "usage" in response:
usage = response["usage"]
# Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
llm_cost = input_token_cost + output_token_cost
return LLMCallStats(
llm_cost=llm_cost,
input_tokens=usage.get("prompt_tokens", 0),
output_tokens=usage.get("completion_tokens", 0),
cached_tokens=0, # UI-TARS doesn't have cached tokens
reasoning_tokens=0,
)
if isinstance(response, AnthropicMessage): if isinstance(response, AnthropicMessage):
usage = response.usage usage = response.usage
input_token_cost = (3.0 / 1000000) * usage.input_tokens input_token_cost = (3.0 / 1000000) * usage.input_tokens

View File

@@ -568,6 +568,18 @@ if settings.ENABLE_AZURE_O3:
max_completion_tokens=100000, max_completion_tokens=100000,
), ),
) )
if settings.ENABLE_UI_TARS:
LLMConfigRegistry.register_config(
"UI_TARS_SEED1_5_VL",
LLMConfig(
settings.UI_TARS_MODEL,
["UI_TARS_API_KEY"],
supports_vision=True,
add_assistant_prefix=False,
max_tokens=400,
temperature=0.0,
),
)
if settings.ENABLE_GEMINI: if settings.ENABLE_GEMINI:
LLMConfigRegistry.register_config( LLMConfigRegistry.register_config(
@@ -630,6 +642,16 @@ if settings.ENABLE_GEMINI:
max_completion_tokens=65536, max_completion_tokens=65536,
), ),
) )
LLMConfigRegistry.register_config(
"GEMINI_2.5_FLASH_PREVIEW",
LLMConfig(
"gemini/gemini-2.5-flash-preview-05-20",
["GEMINI_API_KEY"],
supports_vision=True,
add_assistant_prefix=False,
max_completion_tokens=65536,
),
)
if settings.ENABLE_NOVITA: if settings.ENABLE_NOVITA:

View File

@@ -0,0 +1,200 @@
#
# SPDX-License-Identifier: Apache-2.0
# Code partially adapted from:
# https://github.com/ByteDance-Seed/Seed1.5-VL/blob/main/GUI/gui.ipynb
#
# Licensed under the Apache License, Version 2.0
#
# For managing the conversation history of the UI-TARS agent.
#
"""
UI-TARS LLM Caller that follows the standard LLMCaller pattern.
"""
import base64
from io import BytesIO
from typing import Any, Dict
import structlog
from PIL import Image
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMCaller
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.utils.image_resizer import Resolution
LOG = structlog.get_logger()
def _build_system_prompt(instruction: str, language: str = "English") -> str:
"""Build system prompt for UI-TARS using the prompt engine."""
return prompt_engine.load_prompt("ui-tars-system-prompt", language=language, instruction=instruction)
def _is_image_message(message: Dict[str, Any]) -> bool:
"""Check if message contains an image."""
return (
message.get("role") == "user"
and isinstance(message.get("content"), list)
and any(item.get("type") == "image_url" for item in message["content"])
)
class UITarsLLMCaller(LLMCaller):
"""
UI-TARS specific LLM caller that manages conversation history.
Follows the established LLMCaller pattern used by Anthropic CUA.
"""
def __init__(self, llm_key: str, screenshot_scaling_enabled: bool = False):
super().__init__(llm_key, screenshot_scaling_enabled)
self.max_history_images = 5
self._conversation_initialized = False
def initialize_conversation(self, task: Task) -> None:
"""Initialize conversation with system prompt for the given task."""
if not self._conversation_initialized:
# Handle None case for navigation_goal
instruction = task.navigation_goal or "Default navigation task"
system_prompt = _build_system_prompt(instruction)
self.message_history = [{"role": "user", "content": system_prompt}]
self._conversation_initialized = True
LOG.debug("Initialized UI-TARS conversation", task_id=task.task_id)
def add_screenshot(self, screenshot_bytes: bytes) -> None:
"""Add screenshot to conversation history."""
if not screenshot_bytes:
return
# Convert to PIL Image to get format
image = Image.open(BytesIO(screenshot_bytes))
image_format = self._get_image_format_from_pil(image)
screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
# Add image message
image_message = {
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/{image_format};base64,{screenshot_b64}"}}
],
}
self.message_history.append(image_message)
self._maintain_history_limit()
LOG.debug("Added screenshot to conversation", total_messages=len(self.message_history))
def add_assistant_response(self, response: str) -> None:
"""Add assistant response to conversation history."""
self.message_history.append({"role": "assistant", "content": response})
LOG.debug("Added assistant response to conversation")
def _maintain_history_limit(self) -> None:
"""Maintain history limit: keep system prompt + all assistant responses + last N screenshots."""
image_count = self._count_image_messages()
if image_count <= self.max_history_images:
return
# Ensure we have a system prompt (first message should be user with string content)
if (
not self.message_history
or self.message_history[0]["role"] != "user"
or not isinstance(self.message_history[0]["content"], str)
):
LOG.error("Conversation history corrupted - missing system prompt")
return
# Remove oldest screenshots only (keep system prompt and all assistant responses)
removed_count = 0
images_to_remove = image_count - self.max_history_images
i = 1 # Start after system prompt (index 0)
while i < len(self.message_history) and removed_count < images_to_remove:
message = self.message_history[i]
if _is_image_message(message):
# Remove only the screenshot message, keep all assistant responses
self.message_history.pop(i)
removed_count += 1
# Don't increment i since we removed an element
else:
i += 1
LOG.debug(
f"Maintained history limit, removed {removed_count} old images, "
f"current messages: {len(self.message_history)}"
)
def _count_image_messages(self) -> int:
"""Count existing image messages in the conversation history."""
count = 0
for message in self.message_history:
if _is_image_message(message):
count += 1
return count
def _get_image_format_from_pil(self, image: Image.Image) -> str:
"""Extract and validate image format from PIL Image object."""
format_str = image.format.lower() if image.format else "png"
if format_str not in ["jpg", "jpeg", "png", "webp"]:
return "png" # Default to PNG for unsupported formats
return format_str
async def call(
self,
prompt: str | None = None,
prompt_name: str | None = None,
step: Step | None = None,
task_v2: TaskV2 | None = None,
thought: Thought | None = None,
ai_suggestion: AISuggestion | None = None,
screenshots: list[bytes] | None = None,
parameters: dict[str, Any] | None = None,
tools: list[Any] | None = None,
use_message_history: bool = False,
raw_response: bool = False,
window_dimension: Resolution | None = None,
**extra_parameters: Any,
) -> dict[str, Any]:
"""Override call method to use standard LLM routing instead of direct LiteLLM."""
# Use raw_response=True to bypass JSON parsing since UI-TARS returns plain text
response = await super().call(
prompt=prompt,
prompt_name=prompt_name,
step=step,
task_v2=task_v2,
thought=thought,
ai_suggestion=ai_suggestion,
screenshots=screenshots,
parameters=parameters,
tools=tools,
use_message_history=True, # Use message history for UI-TARS
raw_response=True, # Bypass JSON parsing - UI-TARS returns plain text
window_dimension=window_dimension,
**extra_parameters,
)
# Extract content from the raw response
if isinstance(response, dict) and "choices" in response:
content = response["choices"][0]["message"]["content"]
return {"content": content}
else:
# Fallback for unexpected response format
return {"content": str(response)}
async def generate_ui_tars_response(self, step: Step) -> str:
"""Generate UI-TARS response using the overridden call method."""
response = await self.call(step=step)
content = response.get("content", "").strip()
# Add the response to conversation history
self.add_assistant_response(content)
return content

View File

@@ -0,0 +1,66 @@
"""UI-TARS response model that mimics the ModelResponse interface."""
import json
from typing import Any
class UITarsResponse:
"""A response object that mimics the ModelResponse interface for UI-TARS API responses."""
def __init__(self, content: str, model: str):
# Create choice objects with proper nested structure for parse_api_response
class Message:
def __init__(self, content: str):
self.content = content
self.role = "assistant"
class Choice:
def __init__(self, content: str):
self.message = Message(content)
self.choices = [Choice(content)]
self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
self.model = model
self.object = "chat.completion"
def model_dump_json(self, indent: int = 2) -> str:
"""Provide model_dump_json compatibility for artifact creation."""
return json.dumps(
{
"choices": [
{
"message": {
"content": self.choices[0].message.content,
"role": self.choices[0].message.role,
}
}
],
"usage": self.usage,
"model": self.model,
"object": self.object,
},
indent=indent,
)
def model_dump(self, exclude_none: bool = True) -> dict:
"""Provide model_dump compatibility for raw_response."""
return {
"choices": [
{"message": {"content": self.choices[0].message.content, "role": self.choices[0].message.role}}
],
"usage": self.usage,
"model": self.model,
"object": self.object,
}
def get(self, key: str, default: Any = None) -> Any:
"""Provide dict-like access for compatibility."""
return getattr(self, key, default)
def __getitem__(self, key: str) -> Any:
"""Provide dict-like access for compatibility."""
return getattr(self, key)
def __contains__(self, key: str) -> bool:
"""Provide dict-like access for compatibility."""
return hasattr(self, key)

View File

@@ -100,6 +100,8 @@ class BackgroundTaskExecutor(AsyncExecutor):
engine = RunEngine.openai_cua engine = RunEngine.openai_cua
elif run_obj and run_obj.task_run_type == RunType.anthropic_cua: elif run_obj and run_obj.task_run_type == RunType.anthropic_cua:
engine = RunEngine.anthropic_cua engine = RunEngine.anthropic_cua
elif run_obj and run_obj.task_run_type == RunType.ui_tars:
engine = RunEngine.ui_tars
context: SkyvernContext = skyvern_context.ensure_context() context: SkyvernContext = skyvern_context.ensure_context()
context.task_id = task.task_id context.task_id = task.task_id

View File

@@ -182,6 +182,7 @@ class RunType(StrEnum):
workflow_run = "workflow_run" workflow_run = "workflow_run"
openai_cua = "openai_cua" openai_cua = "openai_cua"
anthropic_cua = "anthropic_cua" anthropic_cua = "anthropic_cua"
ui_tars = "ui_tars"
class RunEngine(StrEnum): class RunEngine(StrEnum):
@@ -189,10 +190,11 @@ class RunEngine(StrEnum):
skyvern_v2 = "skyvern-2.0" skyvern_v2 = "skyvern-2.0"
openai_cua = "openai-cua" openai_cua = "openai-cua"
anthropic_cua = "anthropic-cua" anthropic_cua = "anthropic-cua"
ui_tars = "ui-tars"
CUA_ENGINES = [RunEngine.openai_cua, RunEngine.anthropic_cua] CUA_ENGINES = [RunEngine.openai_cua, RunEngine.anthropic_cua, RunEngine.ui_tars]
CUA_RUN_TYPES = [RunType.openai_cua, RunType.anthropic_cua] CUA_RUN_TYPES = [RunType.openai_cua, RunType.anthropic_cua, RunType.ui_tars]
class RunStatus(StrEnum): class RunStatus(StrEnum):
@@ -373,8 +375,8 @@ class BaseRunResponse(BaseModel):
class TaskRunResponse(BaseRunResponse): class TaskRunResponse(BaseRunResponse):
run_type: Literal[RunType.task_v1, RunType.task_v2, RunType.openai_cua, RunType.anthropic_cua] = Field( run_type: Literal[RunType.task_v1, RunType.task_v2, RunType.openai_cua, RunType.anthropic_cua, RunType.ui_tars] = (
description="Types of a task run - task_v1, task_v2, openai_cua, anthropic_cua" Field(description="Types of a task run - task_v1, task_v2, openai_cua, anthropic_cua, ui_tars")
) )
run_request: TaskRunRequest | None = Field( run_request: TaskRunRequest | None = Field(
default=None, description="The original request parameters used to start this task run" default=None, description="The original request parameters used to start this task run"

View File

@@ -24,6 +24,7 @@ async def get_run_response(run_id: str, organization_id: str | None = None) -> R
run.task_run_type == RunType.task_v1 run.task_run_type == RunType.task_v1
or run.task_run_type == RunType.openai_cua or run.task_run_type == RunType.openai_cua
or run.task_run_type == RunType.anthropic_cua or run.task_run_type == RunType.anthropic_cua
or run.task_run_type == RunType.ui_tars
): ):
# fetch task v1 from db and transform to task run response # fetch task v1 from db and transform to task run response
try: try:
@@ -37,6 +38,8 @@ async def get_run_response(run_id: str, organization_id: str | None = None) -> R
run_engine = RunEngine.openai_cua run_engine = RunEngine.openai_cua
elif run.task_run_type == RunType.anthropic_cua: elif run.task_run_type == RunType.anthropic_cua:
run_engine = RunEngine.anthropic_cua run_engine = RunEngine.anthropic_cua
elif run.task_run_type == RunType.ui_tars:
run_engine = RunEngine.ui_tars
return TaskRunResponse( return TaskRunResponse(
run_id=run.run_id, run_id=run.run_id,
@@ -129,7 +132,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
detail=f"Run not found {run_id}", detail=f"Run not found {run_id}",
) )
if run.task_run_type in [RunType.task_v1, RunType.openai_cua, RunType.anthropic_cua]: if run.task_run_type in [RunType.task_v1, RunType.openai_cua, RunType.anthropic_cua, RunType.ui_tars]:
await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key) await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
elif run.task_run_type == RunType.task_v2: elif run.task_run_type == RunType.task_v2:
await cancel_task_v2(run_id, organization_id=organization_id) await cancel_task_v2(run_id, organization_id=organization_id)
@@ -152,7 +155,7 @@ async def retry_run_webhook(run_id: str, organization_id: str | None = None, api
detail=f"Run not found {run_id}", detail=f"Run not found {run_id}",
) )
if run.task_run_type in [RunType.task_v1, RunType.openai_cua, RunType.anthropic_cua]: if run.task_run_type in [RunType.task_v1, RunType.openai_cua, RunType.anthropic_cua, RunType.ui_tars]:
task = await app.DATABASE.get_task(run_id, organization_id=organization_id) task = await app.DATABASE.get_task(run_id, organization_id=organization_id)
if not task: if not task:
raise TaskNotFound(task_id=run_id) raise TaskNotFound(task_id=run_id)

View File

@@ -90,6 +90,8 @@ async def run_task(
run_type = RunType.openai_cua run_type = RunType.openai_cua
elif engine == RunEngine.anthropic_cua: elif engine == RunEngine.anthropic_cua:
run_type = RunType.anthropic_cua run_type = RunType.anthropic_cua
elif engine == RunEngine.ui_tars:
run_type = RunType.ui_tars
await app.DATABASE.create_task_run( await app.DATABASE.create_task_run(
task_run_type=run_type, task_run_type=run_type,
organization_id=organization.organization_id, organization_id=organization.organization_id,

View File

@@ -1745,6 +1745,9 @@ async def handle_keypress_action(
updated_keys.append("Escape") updated_keys.append("Escape")
elif key_lower_case == "alt": elif key_lower_case == "alt":
updated_keys.append("Alt") updated_keys.append("Alt")
elif key_lower_case.startswith("f") and key_lower_case[1:].isdigit():
# Handle function keys: f1 -> F1, f5 -> F5, etc.
updated_keys.append(key_lower_case.upper())
else: else:
updated_keys.append(key) updated_keys.append(key)
keypress_str = "+".join(updated_keys) keypress_str = "+".join(updated_keys)

View File

@@ -809,3 +809,415 @@ async def generate_cua_fallback_actions(
action.step_order = step.order action.step_order = step.order
action.action_order = 0 action.action_order = 0
return [action] return [action]
async def parse_ui_tars_actions(
    task: Task,
    step: Step,
    response_content: str,
    browser_window_dimension: Resolution,
) -> list[Action]:
    """Translate a raw UI-TARS model response into a list of Skyvern actions.

    Each parsed action is converted individually: a failure on one action is
    logged and skipped so the remaining actions can still be executed. If the
    response cannot be parsed at all, an empty list is returned.
    """
    try:
        structured_actions = _parse_ui_tars_response(response_content, browser_window_dimension)
        skyvern_actions: list[Action] = []
        for order, structured_action in enumerate(structured_actions):
            try:
                converted = _create_ui_tars_action(structured_action, task, step, browser_window_dimension, order)
            except Exception:
                LOG.exception(
                    "Failed to create UI-TARS action",
                    task_id=task.task_id,
                    step_id=step.step_id,
                    parsed_action=structured_action,
                )
                continue
            if converted:
                skyvern_actions.append(converted)
        if not skyvern_actions:
            LOG.warning(
                "No valid actions generated from UI-TARS response",
                task_id=task.task_id,
                step_id=step.step_id,
                response_preview=response_content[:200],
            )
        return skyvern_actions
    except Exception:
        LOG.exception(
            "Failed to parse UI-TARS actions",
            task_id=task.task_id,
            step_id=step.step_id,
            response_content=response_content[:200],
        )
        return []
def _parse_ui_tars_response(response_content: str, browser_window_dimension: Resolution) -> list[dict[str, Any]]:
    """Parse UI-TARS response text into structured action data.
    Extracts essential parsing logic from action_parser.py without the complex coordinate transformations.

    Returns a list of dicts, each with keys ``action_type``, ``action_inputs``,
    ``thought`` and ``browser_window_dimension``.
    Raises ValueError when the response contains no ``Action:`` section.
    """
    import re
    text = response_content.strip()
    # Convert point format to coordinates if needed
    # (<point>x y</point> markup becomes a "(x,y)" tuple string)
    if "<point>" in text:
        text = _convert_point_to_coordinates(text)
    # Normalize parameter names
    # ("start_point="/"end_point=" are rewritten first so the bare "point="
    # replacement below only hits remaining standalone occurrences)
    text = text.replace("start_point=", "start_box=")
    text = text.replace("end_point=", "end_box=")
    text = text.replace("point=", "start_box=")
    # Extract thought/reasoning
    # First matching pattern wins; "thought" stays None if none match.
    thought = None
    thought_patterns = [
        r"Thought: (.+?)(?=\s*Action: |$)",
        r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action: |$)",
        r"Action_Summary: (.+?)(?=\s*Action: |$)",
    ]
    for pattern in thought_patterns:
        thought_match = re.search(pattern, text, re.DOTALL)
        if thought_match:
            if len(thought_match.groups()) == 1:
                thought = thought_match.group(1).strip()
            elif len(thought_match.groups()) == 2:
                thought = thought_match.group(2).strip()  # Use Action_Summary
            break
    if "Action:" not in text:
        raise ValueError("No Action section found in UI-TARS response")
    # Extract action string (everything after the last "Action: " marker)
    action_str = text.split("Action: ")[-1]
    # Split multiple actions
    # NOTE: splitting on ")\n\n" strips the closing paren from every part
    # except the last; it is re-appended below.
    action_parts = action_str.split(")\n\n")
    all_actions = []
    for action_part in action_parts:
        action_part = action_part.strip()
        if not action_part:
            continue
        # Handle type action with content specially
        # (typed text may contain newlines/quotes that would otherwise break
        # the generic splitting and ast-based parsing)
        if "type(content" in action_part:
            if not action_part.endswith(")"):
                action_part += ")"
            # Extract content from type action
            pattern = r"type\(content='(.*?)'\)"
            match = re.search(pattern, action_part)
            if match:
                content = match.group(1)
                # Escape single quotes in content
                # so the rebuilt call still parses as a Python expression
                content = content.replace("'", "\\'")
                action_part = f"type(content='{content}')"
        if not action_part.endswith(")"):
            action_part += ")"
        all_actions.append(action_part)
    # Parse each action
    # Individual failures are logged and skipped; valid actions are annotated
    # with the shared thought and the browser window dimensions.
    parsed_actions = []
    for action_str in all_actions:
        try:
            parsed_action = _parse_single_action(action_str)
            if parsed_action:
                parsed_action["thought"] = thought
                parsed_action["browser_window_dimension"] = browser_window_dimension
                parsed_actions.append(parsed_action)
        except Exception:
            LOG.warning(
                "Failed to parse individual UI-TARS action",
                action_str=action_str,
                exc_info=True,
            )
            continue
    return parsed_actions
def _parse_single_action(action_str: str) -> dict[str, Any] | None:
"""Parse a single action string into structured data."""
import ast
try:
# Clean up the action string
action_str = action_str.replace("\n", "\\n").strip()
# Parse as Python expression
node = ast.parse(action_str, mode="eval")
if not isinstance(node, ast.Expression) or not isinstance(node.body, ast.Call):
return None
call = node.body
# Get function name
if isinstance(call.func, ast.Name):
func_name = call.func.id
elif isinstance(call.func, ast.Attribute):
func_name = call.func.attr
else:
return None
# Get arguments
action_inputs = {}
for kw in call.keywords:
if kw.arg and isinstance(kw.value, (ast.Constant, ast.Str)):
if isinstance(kw.value, ast.Constant):
value = kw.value.value
else: # ast.Str for older Python versions
value = kw.value.s
action_inputs[kw.arg] = value
return {
"action_type": func_name,
"action_inputs": action_inputs,
}
except Exception:
LOG.debug(f"Failed to parse action string: {action_str}", exc_info=True)
return None
def _convert_point_to_coordinates(text: str) -> str:
"""Convert <point>x y</point> format to (x,y) format."""
import re
from typing import Match
pattern = r"<point>(\d+)\s+(\d+)</point>"
def replace_match(match: Match[str]) -> str:
x, y = map(int, match.groups())
return f"({x},{y})"
return re.sub(pattern, replace_match, text)
def _create_ui_tars_action(
    parsed_action: dict[str, Any],
    task: Task,
    step: Step,
    browser_window_dimension: Resolution,
    action_order: int,
) -> Action | None:
    """Create a Skyvern action from parsed UI-TARS data.

    Maps one parsed UI-TARS action dict (as produced by
    ``_parse_ui_tars_response``) onto the corresponding Skyvern action.
    Returns None when the action type is unsupported or when required
    inputs (coordinates, text content, key combo) are missing or cannot
    be parsed.
    """
    action_type = parsed_action.get("action_type", "")
    action_inputs = parsed_action.get("action_inputs", {})
    # NOTE(review): "thought" may be None (not "") when the response had no
    # Thought/Action_Summary section — confirm downstream fields accept None.
    thought = parsed_action.get("thought", "")
    # Constructor kwargs shared by every Skyvern action built below.
    base_params = {
        "reasoning": thought,
        "intention": thought,
        "organization_id": task.organization_id,
        "workflow_run_id": task.workflow_run_id,
        "task_id": task.task_id,
        "step_id": step.step_id,
        "step_order": step.order,
        "action_order": action_order,
    }
    # Single left click at the resolved absolute coordinates.
    if action_type == "click":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            response=f"Click at ({x}, {y})",
            **base_params,
        )
    # Double left click (repeat=2).
    elif action_type == "left_double":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            button="left",
            repeat=2,
            response=f"Double click at ({x}, {y})",
            **base_params,
        )
    # Single right click.
    elif action_type == "right_single":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            button="right",
            response=f"Right click at ({x}, {y})",
            **base_params,
        )
    # Type text; empty content is rejected. The response string truncates
    # the content preview to 50 characters.
    elif action_type == "type":
        content = action_inputs.get("content", "")
        if not content:
            return None
        return InputTextAction(
            element_id="",
            text=content,
            response=f"Type: {content[:50]}{'...' if len(content) > 50 else ''}",
            **base_params,
        )
    # Drag (or select) from start_box to end_box; all four coordinates must
    # resolve or the action is dropped.
    elif action_type in ["drag", "select"]:
        start_x, start_y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        end_x, end_y = _extract_ui_tars_coordinates(action_inputs.get("end_box", ""), browser_window_dimension)
        if None in (start_x, start_y, end_x, end_y):
            return None
        return DragAction(
            start_x=start_x,
            start_y=start_y,
            path=[(end_x, end_y)],
            response=f"Drag from ({start_x}, {start_y}) to ({end_x}, {end_y})",
            **base_params,
        )
    # Keyboard shortcut; accepts either "key" or "hotkey" input, keys are
    # whitespace-separated (e.g. "ctrl c").
    elif action_type == "hotkey":
        key_combo = action_inputs.get("key", action_inputs.get("hotkey", ""))
        if not key_combo:
            return None
        keys = key_combo.split()
        return KeypressAction(
            keys=keys,
            response=f"Hotkey: {key_combo}",
            **base_params,
        )
    # Scroll 300px in the given direction at the given point, falling back
    # to the viewport center when no coordinates resolve. Unknown directions
    # default to scrolling down.
    elif action_type == "scroll":
        direction = action_inputs.get("direction", "down").lower()
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            # Use center of screen as fallback
            x = browser_window_dimension["width"] // 2
            y = browser_window_dimension["height"] // 2
        scroll_amount = 300
        if direction == "down":
            scroll_x, scroll_y = 0, scroll_amount
        elif direction == "up":
            scroll_x, scroll_y = 0, -scroll_amount
        elif direction == "right":
            scroll_x, scroll_y = scroll_amount, 0
        elif direction == "left":
            scroll_x, scroll_y = -scroll_amount, 0
        else:
            scroll_x, scroll_y = 0, scroll_amount
        return ScrollAction(
            element_id="",
            x=x,
            y=y,
            scroll_x=scroll_x,
            scroll_y=scroll_y,
            response=f"Scroll {direction} at ({x}, {y})",
            **base_params,
        )
    # Fixed 5-second wait.
    elif action_type == "wait":
        return WaitAction(
            seconds=5,
            **base_params,
        )
    # Task completion signalled by the model.
    elif action_type == "finished":
        return CompleteAction(
            data_extraction_goal=task.data_extraction_goal,
            verified=True,  # UI-TARS has already determined completion, skip Skyvern validation
            **base_params,
        )
    else:
        LOG.warning(f"Unsupported UI-TARS action type: {action_type}")
        return None
def _extract_ui_tars_coordinates(box_str: str, browser_window_dimension: Resolution) -> tuple[int | None, int | None]:
    """Extract coordinates from UI-TARS box format with proper coordinate conversion.

    Accepts either a direct point like ``"(450,320)"`` or a bounding box like
    ``"[x1, y1, x2, y2]"`` (reduced to its center point). UI-TARS coordinates
    need to be divided by 1000 to convert from the model's output format to
    relative coordinates (0-1 range), then multiplied by the browser window
    dimensions to get absolute pixel coordinates.

    Returns (None, None) when the string is empty or cannot be parsed.
    """
    import ast
    if not box_str:
        return None, None
    try:
        # Parse coordinates from string format like "(450,320)" or "[0.5, 0.3, 0.5, 0.3]"
        coords = ast.literal_eval(box_str)
        if not isinstance(coords, (list, tuple)):
            return None, None
        if len(coords) == 2:
            # Direct coordinates like (450, 320) or (0.5, 0.3)
            x, y = coords
        elif len(coords) == 4:
            # Bounding box format [x1, y1, x2, y2] - take center point
            x1, y1, x2, y2 = coords
            x = (x1 + x2) / 2
            y = (y1 + y2) / 2
        else:
            return None, None
        # UI-TARS specific coordinate conversion: values above 1 are on the
        # model's 0-1000 grid and need to be divided by 1000 first.
        if x > 1 or y > 1:
            original_x, original_y = x, y
            x = x / 1000.0
            y = y / 1000.0
            LOG.debug(f"Applied UI-TARS factor conversion: ({original_x}, {original_y}) -> ({x}, {y})")
        # Convert relative coordinates (0-1) to absolute screen coordinates
        if 0 <= x <= 1 and 0 <= y <= 1:
            abs_x = int(x * browser_window_dimension["width"])
            abs_y = int(y * browser_window_dimension["height"])
            LOG.debug(
                f"Converted to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
            )
            return abs_x, abs_y
        # Out-of-range (e.g. negative) values: truncate the raw values,
        # matching the previous behavior.
        return int(x), int(y)
    except Exception:
        LOG.debug(f"Failed to parse UI-TARS coordinates: {box_str}", exc_info=True)
        return None, None