Dorod-Sky/skyvern/webeye/actions/parse_actions.py

import ast
import re
from typing import Any, Dict, Match

import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError

from skyvern.constants import EXTRACT_ACTION_SCROLL_AMOUNT, SCROLL_AMOUNT_MULTIPLIER
from skyvern.exceptions import FailedToGetTOTPVerificationCode, NoTOTPVerificationCodeFound, UnsupportedActionType
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.forge.sdk.schemas.totp_codes import OTPType
from skyvern.services.otp_service import poll_otp_value, try_generate_totp_from_credential
from skyvern.utils.image_resizer import Resolution, scale_coordinates
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import (
    Action,
    CaptchaType,
    CheckboxAction,
    ClickAction,
    ClickContext,
    ClosePageAction,
    CompleteAction,
    DownloadFileAction,
    DragAction,
    GotoUrlAction,
    HoverAction,
    InputOrSelectContext,
    InputTextAction,
    KeypressAction,
    LeftMouseAction,
    MoveAction,
    NullAction,
    ScrollAction,
    SelectOption,
    SelectOptionAction,
    SolveCaptchaAction,
    TerminateAction,
    UploadFileAction,
    VerificationCodeAction,
    WaitAction,
)
from skyvern.webeye.scraper.scraped_page import ScrapedPage

# Module-level structlog logger used by the parsers in this file.
LOG = structlog.get_logger()


def parse_action(
    action: Dict[str, Any],
    scraped_page: ScrapedPage,
    data_extraction_goal: str | None = None,
    totp_code_required: bool = False,
) -> Action:
    """Convert a single raw action dict (as returned by the LLM) into a typed ``Action``.

    Args:
        action: Raw action payload. ``action_type`` selects the concrete action class; the
            target element is read from ``id`` (preferred) or ``element_id``.
        scraped_page: Page snapshot used to look up the element hash and element data.
        data_extraction_goal: Forwarded to ``CompleteAction`` only.
        totp_code_required: Forwarded to ``InputTextAction`` only.

    Returns:
        The parsed action. A ``NullAction`` is returned when ``action_type`` is missing or
        ``None``, or when a KEYPRESS action asks for a key outside the allowed set.

    Raises:
        UnsupportedActionType: If ``action_type`` is not the name of an ``ActionType`` member,
            or is a member this parser has no branch for.
        ValueError: If a SELECT_OPTION action has no ``option``, or the option carries none of
            ``label``/``value``/``index``.
        KeyError: If a field read with ``action[...]`` is absent (``text``, ``file_url``,
            ``file_name``, ``is_checked``), or ``captcha_type`` is not a ``CaptchaType`` name.
    """
    # "id" wins whenever the key is present, even if its value is None.
    if "id" in action:
        element_id = action["id"]
    else:
        element_id = action.get("element_id")

    skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None
    skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None

    reasoning = action.get("reasoning")
    confidence_float = action.get("confidence_float")
    # TODO: currently action intention and response are only used for Q&A actions, like input_text
    # When we start supporting click action, intention will be the reasoning for the click action (why take the action)
    intention = action.get("user_detail_query")
    response = action.get("user_detail_answer")

    base_action_dict = {
        "element_id": element_id,
        "skyvern_element_hash": skyvern_element_hash,
        "skyvern_element_data": skyvern_element_data,
        "reasoning": reasoning,
        "confidence_float": confidence_float,
        "intention": intention,
        "response": response,
    }
    input_or_select_context: InputOrSelectContext | None = None

    if action.get("action_type") is None:
        return NullAction(**base_action_dict)

    # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
    action_type_str = action["action_type"].upper()
    # Backward compat: map PRESS_ENTER to KEYPRESS (old prompt used PRESS_ENTER)
    if action_type_str == "PRESS_ENTER":
        action_type_str = "KEYPRESS"
    try:
        action_type = ActionType[action_type_str]
    except KeyError:
        # An action type the enum does not know is reported the same way as a known type
        # without a parser branch, instead of leaking a bare KeyError.
        raise UnsupportedActionType(action_type=action_type_str) from None

    if not action_type.is_web_action() and action_type != ActionType.SCROLL:
        # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
        # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
        # set for non-web actions. SCROLL needs element_id to target a specific scrollable container.
        base_action_dict["element_id"] = None

    if action_type == ActionType.TERMINATE:
        return TerminateAction(**base_action_dict, errors=action.get("errors", []))

    if action_type == ActionType.CLICK:
        click_context = action.get("click_context", None)
        if click_context:
            click_context = ClickContext.model_validate(click_context)
        return ClickAction(
            **base_action_dict,
            file_url=action.get("file_url"),
            download=action.get("download", False),
            click_context=click_context,
        )

    if action_type == ActionType.INPUT_TEXT:
        context_dict = action.get("context", {})
        if context_dict:
            # Note: this writes "intention" into the caller's `action["context"]` dict.
            context_dict["intention"] = intention
            input_or_select_context = InputOrSelectContext.model_validate(context_dict)
        return InputTextAction(
            **base_action_dict,
            text=action["text"],
            input_or_select_context=input_or_select_context,
            totp_code_required=totp_code_required,
        )

    if action_type == ActionType.UPLOAD_FILE:
        # TODO: see if the element is a file input element. if it's not, convert this action into a click action
        return UploadFileAction(
            **base_action_dict,
            file_url=action["file_url"],
        )

    # This action is not used in the current implementation. Click actions are used instead.
    if action_type == ActionType.DOWNLOAD_FILE:
        return DownloadFileAction(**base_action_dict, file_name=action["file_name"])

    if action_type == ActionType.SELECT_OPTION:
        # `.get` so that a missing key and an explicit null both raise the ValueError below.
        option = action.get("option")
        if option is None:
            raise ValueError("SelectOptionAction requires an 'option' field")

        context_dict = action.get("context", {})
        if context_dict:
            # Note: this writes "intention" into the caller's `action["context"]` dict.
            context_dict["intention"] = intention
            input_or_select_context = InputOrSelectContext.model_validate(context_dict)

        label = option.get("label")
        value = option.get("value")
        index = option.get("index")
        if label is None and value is None and index is None:
            raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
        return SelectOptionAction(
            **base_action_dict,
            option=SelectOption(
                label=label,
                value=value,
                index=index,
            ),
            input_or_select_context=input_or_select_context,
            download=action.get("download", False),
        )

    if action_type == ActionType.CHECKBOX:
        return CheckboxAction(
            **base_action_dict,
            is_checked=action["is_checked"],
        )

    if action_type == ActionType.WAIT:
        return WaitAction(**base_action_dict)

    if action_type == ActionType.HOVER:
        # `or 0` also covers an explicit null for hold_seconds.
        return HoverAction(**base_action_dict, hold_seconds=action.get("hold_seconds", 0) or 0)

    if action_type == ActionType.COMPLETE:
        return CompleteAction(
            **base_action_dict,
            data_extraction_goal=data_extraction_goal,
            errors=action.get("errors", []),
        )

    # NOTE(review): this compares an ActionType member against the literal "null"; it only
    # matches if some member compares equal to that string — confirm this branch is reachable.
    if action_type == "null":
        return NullAction(**base_action_dict)

    if action_type == ActionType.SOLVE_CAPTCHA:
        captcha_type: str | None = action.get("captcha_type")
        return SolveCaptchaAction(
            **base_action_dict, captcha_type=CaptchaType[captcha_type.upper()] if captcha_type else None
        )

    if action_type == ActionType.KEYPRESS:
        # KEYPRESS is a global keyboard action, not element-targeted
        base_action_dict["skyvern_element_hash"] = None
        base_action_dict["skyvern_element_data"] = None
        # Support both "key" (single key from prompt) and "keys" (list, from code/legacy)
        # Limited to navigation/submission keys to prevent misuse on regular form fields
        allowed_keys = {"Enter", "Tab", "Escape", "ArrowDown", "ArrowUp"}
        key = action.get("key")
        if key:
            if key not in allowed_keys:
                LOG.warning("KEYPRESS action has unsupported key, skipping action", key=key)
                return NullAction(**base_action_dict)
            keys = [key]
        else:
            keys = action.get("keys", ["Enter"])
        return KeypressAction(**base_action_dict, keys=keys)

    if action_type == ActionType.SCROLL:
        # SCROLL from extract-action prompt provides a direction and optionally an element_id
        # for the scrollable container. Convert direction to scroll_x/scroll_y pixel values.
        base_action_dict["skyvern_element_hash"] = None
        base_action_dict["skyvern_element_data"] = None
        # `or "down"` keeps an explicit null direction from crashing on `.lower()`.
        direction = (action.get("direction") or "down").lower()
        if direction not in ("up", "down"):
            LOG.warning("SCROLL action has unexpected direction, defaulting to down", direction=direction)
            direction = "down"
        scroll_x = 0
        scroll_y = -EXTRACT_ACTION_SCROLL_AMOUNT if direction == "up" else EXTRACT_ACTION_SCROLL_AMOUNT
        return ScrollAction(**base_action_dict, scroll_x=scroll_x, scroll_y=scroll_y)

    if action_type == ActionType.CLOSE_PAGE:
        return ClosePageAction(**base_action_dict)

    raise UnsupportedActionType(action_type=action_type)


def parse_actions(
    task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]]
) -> list[Action]:
    """Parse every raw action of an LLM response into typed actions for one step.

    Each entry of ``json_response`` is passed to ``parse_action`` and stamped with the task,
    step and ordering identifiers. Entries that fail to parse are logged and skipped, so the
    returned list can be shorter than the input.

    Args:
        task: Task the actions belong to; supplies the ids and the data extraction goal.
        step_id: Id of the step being parsed.
        step_order: Order of that step.
        scraped_page: Page snapshot handed to ``parse_action`` and used for the element check.
        json_response: Raw action dicts from the LLM.

    Returns:
        The successfully parsed actions, in input order.
    """
    parsed_actions: list[Action] = []
    # A truthy TOTP entry for this task in the current context flags text inputs accordingly.
    needs_totp = bool(skyvern_context.ensure_context().totp_codes.get(task.task_id))

    for position, raw_action in enumerate(json_response):
        try:
            parsed = parse_action(
                action=raw_action,
                scraped_page=scraped_page,
                data_extraction_goal=task.data_extraction_goal,
                totp_code_required=needs_totp,
            )
            # Stamp bookkeeping fields in a fixed order.
            for field_name, field_value in (
                ("organization_id", task.organization_id),
                ("workflow_run_id", task.workflow_run_id),
                ("task_id", task.task_id),
                ("step_id", step_id),
                ("step_order", step_order),
                ("action_order", position),
            ):
                setattr(parsed, field_name, field_value)

            if isinstance(parsed, TerminateAction):
                # Logged before the append, so `actions` holds only the preceding actions.
                LOG.warning(
                    "Agent decided to terminate",
                    task_id=task.task_id,
                    llm_response=json_response,
                    reasoning=parsed.reasoning,
                    actions=parsed_actions,
                )
            parsed_actions.append(parsed)
        except UnsupportedActionType:
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                raw_action=raw_action,
                exc_info=True,
            )
        except (ValidationError, ValueError):
            LOG.warning(
                "Invalid action",
                task_id=task.task_id,
                raw_action=raw_action,
                exc_info=True,
            )
        except Exception:
            LOG.error(
                "Failed to marshal action",
                task_id=task.task_id,
                raw_action=raw_action,
                exc_info=True,
            )

    ############################ This part of code might not be needed ############################
    # Reason #1. validation can be done in the action handler rather than in the parser
    # Reason #2. there is no need to validate whether the element_id has a hash;
    # without a hash we can fall back to normal operation
    referenced_ids = [parsed.element_id for parsed in parsed_actions if parsed.element_id]
    unknown_ids = [
        referenced_id for referenced_id in referenced_ids if referenced_id not in scraped_page.id_to_element_hash
    ]
    if unknown_ids:
        LOG.warning(
            "Missing elements in scraped page",
            task_id=task.task_id,
            missing_element_ids=unknown_ids,
            all_element_ids=referenced_ids,
        )
    ############################ This part of code might not be needed ############################
    return parsed_actions


async def parse_cua_actions(
    task: Task,
    step: Step,
    response: OpenAIResponse,
) -> list[Action]:
    computer_calls = [item for item in response.output if item.type == "computer_call"]
    reasonings = [item for item in response.output if item.type == "reasoning"]
    assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
    actions: list[Action] = []
    for idx, computer_call in enumerate(computer_calls):
        cua_action = computer_call.action
        action_type = cua_action.type
        try:
            reasoning = None
            if idx < len(reasonings):
                try:
                    reasoning = reasonings[idx].summary[0].text
                except Exception:
                    LOG.exception(
                        "Failed to parse reasoning",
                        task_id=task.task_id,
                        step_id=step.step_id,
                        step_order=step.order,
                        action_order=idx,
                    )

            match action_type:
                case "click":
                    button = cua_action.button
                    if button != "left" and button != "right":
                        button = "left"
                    reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
                    action = ClickAction(
                        element_id="",
                        x=cua_action.x,
                        y=cua_action.y,
                        button=button,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=f"Click at: ({cua_action.x}, {cua_action.y})",
                    )
                case "scroll":
                    reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
                    action = ScrollAction(
                        element_id="",
                        x=cua_action.x,
                        y=cua_action.y,
                        scroll_x=cua_action.scroll_x,
                        scroll_y=cua_action.scroll_y,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
                    )
                case "keypress":
                    reasoning_str = f"Press keys: {cua_action.keys}"
                    if len(cua_action.keys) == 1:
                        reasoning_str = f"Press the '{cua_action.keys[0]}' key"
                    reasoning = reasoning or reasoning_str
                    action = KeypressAction(
                        element_id="",
                        keys=cua_action.keys,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=str(cua_action.keys),
                    )
                case "type":
                    action = InputTextAction(
                        element_id="",
                        text=cua_action.text,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=cua_action.text,
                    )
                case "wait":
                    action = WaitAction(
                        seconds=5,
                        reasoning=reasoning,
                        intention=reasoning,
                    )
                case "move":
                    response = f"Move mouse to: ({cua_action.x}, {cua_action.y})"
                    reasoning = reasoning or response
                    action = MoveAction(
                        x=cua_action.x,
                        y=cua_action.y,
                        reasoning=reasoning,
                        intention=reasoning,
                    )
                case "drag":
                    whole_path = cua_action.path
                    if not whole_path or len(whole_path) < 2:
                        LOG.warning(
                            "Invalid drag action",
                            task_id=task.task_id,
                            step_id=step.step_id,
                            step_order=step.order,
                            action_order=idx,
                            whole_path=whole_path,
                        )
                        action = WaitAction(
                            seconds=5,
                            reasoning=reasoning,
                            intention=reasoning,
                        )
                    else:
                        # ActionDragPath objects have x and y attributes
                        start_x, start_y = whole_path[0].x, whole_path[0].y
                        reasoning = reasoning or f"Drag action path: {whole_path}"
                        action = DragAction(
                            start_x=start_x,
                            start_y=start_y,
                            path=[(p.x, p.y) for p in whole_path[1:]],
                            reasoning=reasoning,
                            intention=reasoning,
                        )
                case "screenshot":
                    action = NullAction(
                        reasoning=reasoning,
                        intention=reasoning,
                    )
                case _:
                    raise ValueError(f"Unsupported action type: {action_type}")
            action.organization_id = task.organization_id
            action.workflow_run_id = task.workflow_run_id
            action.task_id = task.task_id
            action.step_id = step.step_id
            action.step_order = step.order
            action.action_order = idx
            actions.append(action)
        except Exception:
            LOG.exception(
                "Failed to parse action",
                task_id=task.task_id,
                step_id=step.step_id,
                step_order=step.order,
                action_order=idx,
            )
            break
    if not actions:
        LOG.info(
            "Empty action returned by CUA",
            task_id=task.task_id,
            step_id=step.step_id,
            organization_id=task.organization_id,
            workflow_run_id=task.workflow_run_id,
            response=response.model_dump(),
        )
        reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
        assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
        actions = await generate_cua_fallback_actions(task, step, assistant_message, reasoning)
    return actions


async def parse_anthropic_actions(
    task: Task,
    step: Step,
    assistant_content: list[dict[str, Any]],
    browser_window_dimension: Resolution,
    screenshot_resize_target_dimension: Resolution,
) -> list[Action]:
    """Translate Anthropic ``computer`` tool calls into Skyvern actions.

    Coordinates in tool calls are in resized-screenshot space and are scaled to the browser
    window for every action kind. Tool calls with missing required input are logged and
    skipped; the first tool call that raises stops parsing, keeping what was parsed so far.
    When nothing could be parsed, the first thinking block and first text block are handed to
    ``generate_cua_fallback_actions``.

    Args:
        task: Task the actions belong to; supplies organization, workflow run and task ids.
        step: Step the actions belong to; supplies the step id and order.
        assistant_content: Content blocks of the assistant message; blocks of type
            ``tool_use`` (name ``computer``), ``thinking`` and ``text`` are read.
        browser_window_dimension: Resolution that coordinates are scaled to.
        screenshot_resize_target_dimension: Resolution that tool-call coordinates are given in.

    Returns:
        The parsed actions, or the fallback actions when none could be parsed.
    """
    tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
    reasonings = [block for block in assistant_content if block["type"] == "thinking"]
    LOG.info("Anthropic tool calls", tool_calls=tool_calls, reasonings=reasonings, assistant_content=assistant_content)
    if len(reasonings) > 1:
        LOG.warning(
            "Anthropic CUA: multiple reasonings in assistant content",
            task_id=task.task_id,
            step_id=step.step_id,
            assistant_content=assistant_content,
        )
    # Only the first thinking block is used; it is shared by every tool call.
    thinking = reasonings[0]["thinking"] if reasonings else None

    def to_browser_coordinates(coordinate: tuple[int, int] | list[int]) -> tuple[int, int]:
        # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
        return validate_and_get_coordinates(coordinate, screenshot_resize_target_dimension, browser_window_dimension)

    actions: list[Action] = []
    for idx, tool_call in enumerate(tool_calls):
        try:
            tool_call_id = tool_call["id"]
            tool_call_input = tool_call.get("input")
            if not tool_call_input:
                continue
            action = tool_call_input["action"]
            # Bookkeeping fields shared by every action built from this tool call.
            common_fields: dict[str, Any] = {
                "organization_id": task.organization_id,
                "workflow_run_id": task.workflow_run_id,
                "task_id": task.task_id,
                "step_id": step.step_id,
                "step_order": step.order,
                "action_order": idx,
                "tool_call_id": tool_call_id,
            }
            if action == "mouse_move":
                coordinate = tool_call_input.get("coordinate")
                if not coordinate:
                    LOG.warning(
                        "Anthropic CUA error: mouse move action has no coordinate",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                response = f"Move mouse to: ({x}, {y})"
                # The fallback description applies to this action only; it must not leak into
                # the reasoning of later tool calls.
                reasoning = thinking or response
                actions.append(
                    # TODO: add response by adding specifying the element to move to
                    MoveAction(
                        x=x,
                        y=y,
                        reasoning=reasoning,
                        intention=reasoning,
                        **common_fields,
                    )
                )
            elif action in ["left_click", "double_click", "triple_click", "right_click"]:
                coordinate = tool_call_input.get("coordinate")
                if not coordinate and idx >= 1:
                    # A click without its own coordinate reuses the preceding mouse_move target.
                    prev_tool_call_input = tool_calls[idx - 1].get("input")
                    if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
                        coordinate = prev_tool_call_input.get("coordinate")

                if not coordinate:
                    LOG.warning(
                        "Anthropic CUA error: left click action has no coordinate and it doesn't have mouse_move before it",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                repeat = {"double_click": 2, "triple_click": 3}.get(action, 1)
                button = "right" if action == "right_click" else "left"
                response = f"Click at: ({x}, {y})"
                reasoning = thinking or response
                actions.append(
                    ClickAction(
                        element_id="",
                        x=x,
                        y=y,
                        button=button,
                        repeat=repeat,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        **common_fields,
                    )
                )
            elif action == "type":
                text = tool_call_input.get("text")
                if not text:
                    LOG.warning(
                        "Anthropic CUA error: type action has no text",
                        tool_call=tool_call,
                    )
                    continue
                actions.append(
                    InputTextAction(
                        element_id="",
                        text=text,
                        reasoning=thinking,
                        intention=thinking,
                        response=text,
                        **common_fields,
                    )
                )
            elif action in ["key", "hold_key"]:
                text = tool_call_input.get("text", "")
                if not text:
                    LOG.warning(
                        "Anthropic CUA error: key action has no text",
                        tool_call=tool_call,
                    )
                    continue
                # Key combinations arrive as a "+"-joined string.
                keys = text.split("+")
                hold = action == "hold_key"
                duration = tool_call_input.get("duration", 0)
                if hold:
                    response = f"Hold keys for {duration} seconds: {text}"
                else:
                    response = f"Press keys: {text}"
                reasoning = thinking or response
                actions.append(
                    KeypressAction(
                        element_id="",
                        keys=keys,
                        hold=hold,
                        duration=duration,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        **common_fields,
                    )
                )
            elif action == "screenshot":
                actions.append(
                    NullAction(
                        reasoning=thinking,
                        intention=thinking,
                        **common_fields,
                    )
                )
            elif action == "scroll":
                x, y = None, None
                coordinate = tool_call_input.get("coordinate")
                if coordinate:
                    x, y = to_browser_coordinates(coordinate)
                scroll_direction = tool_call_input.get("scroll_direction")
                scroll_amount = tool_call_input.get("scroll_amount")
                if scroll_direction not in ("up", "down", "left", "right"):
                    LOG.warning(
                        "Anthropic CUA error: unsupported scroll direction",
                        tool_call=tool_call,
                    )
                    continue
                if scroll_amount is None:
                    # Skip instead of raising, so the remaining tool calls are still parsed.
                    LOG.warning(
                        "Anthropic CUA error: scroll action has no scroll amount",
                        tool_call=tool_call,
                    )
                    continue
                if scroll_direction == "up":
                    scroll_x = 0
                    scroll_y = -scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                elif scroll_direction == "down":
                    scroll_x = 0
                    scroll_y = scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                elif scroll_direction == "left":
                    scroll_x = -scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                    scroll_y = 0
                else:
                    scroll_x = scroll_amount * SCROLL_AMOUNT_MULTIPLIER
                    scroll_y = 0
                response = f"Scroll by: ({scroll_x}, {scroll_y})"
                reasoning = thinking or response
                actions.append(
                    ScrollAction(
                        element_id="",
                        x=x,
                        y=y,
                        scroll_x=scroll_x,
                        scroll_y=scroll_y,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        **common_fields,
                    )
                )
            elif action in ["left_mouse_down", "left_mouse_up"]:
                coordinate = tool_call_input.get("coordinate")
                x, y = None, None
                if coordinate:
                    x, y = to_browser_coordinates(coordinate)
                direction = "down" if action == "left_mouse_down" else "up"
                response = f"Left mouse {direction} at: ({x}, {y})"
                reasoning = thinking or response
                actions.append(
                    LeftMouseAction(
                        x=x,
                        y=y,
                        direction=direction,
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        **common_fields,
                    )
                )
            elif action == "left_click_drag":
                coordinate = tool_call_input.get("coordinate")
                start_coordinate = tool_call_input.get("start_coordinate")
                LOG.info(
                    "Anthropic CUA left click drag action", coordinate=coordinate, start_coordinate=start_coordinate
                )
                if not coordinate or not start_coordinate:
                    LOG.warning(
                        "Anthropic CUA error: left click drag action has no coordinate or start coordinate",
                        tool_call=tool_call,
                    )
                    continue
                x, y = to_browser_coordinates(coordinate)
                start_x, start_y = to_browser_coordinates(start_coordinate)
                response = f"Drag from ({start_x}, {start_y}) to ({x}, {y})"
                reasoning = thinking or response
                actions.append(
                    DragAction(
                        start_x=start_x,
                        start_y=start_y,
                        path=[(x, y)],
                        reasoning=reasoning,
                        intention=reasoning,
                        response=response,
                        **common_fields,
                    )
                )
            elif action == "wait":
                duration = tool_call_input.get("duration", 5)
                actions.append(
                    WaitAction(
                        seconds=duration,
                        reasoning=thinking,
                        intention=thinking,
                        response=f"Wait for {duration} seconds",
                        **common_fields,
                    )
                )
            else:
                LOG.error(
                    "Anthropic CUA error: unsupported action",
                    tool_call=tool_call,
                )
        except Exception:
            LOG.exception(
                "Anthropic CUA error: failed to parse action",
                task_id=task.task_id,
                step_id=step.step_id,
                tool_call=tool_call,
            )
            # Stop at the first failure; later tool calls are not parsed.
            break
    if not actions:
        assistant_messages = [block for block in assistant_content if block["type"] == "text"]
        assistant_message = assistant_messages[0]["text"] if assistant_messages else None
        actions = await generate_cua_fallback_actions(task, step, assistant_message, thinking)
    return actions


# function from anthropic's quickstart guide
# https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L214C1-L221C1
def validate_and_get_coordinates(
    coordinate: tuple[int, int] | list[int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Validate an ``(x, y)`` pair and rescale it between two resolutions.

    Args:
        coordinate: Two-element sequence of non-negative ints.
        current_dimension: Resolution the coordinate is currently expressed in.
        target_dimension: Resolution the coordinate is rescaled to.

    Returns:
        Whatever ``scale_coordinates`` returns for the validated pair.

    Raises:
        ValueError: If ``coordinate`` does not have exactly two elements, or
            any element is not a non-negative ``int``.
    """
    if len(coordinate) != 2:
        raise ValueError(f"{coordinate} must be a tuple of length 2")

    for component in coordinate:
        if not isinstance(component, int) or component < 0:
            raise ValueError(f"{coordinate} must be a tuple of non-negative ints")

    point = (coordinate[0], coordinate[1])
    return scale_coordinates(point, current_dimension, target_dimension)


async def generate_cua_fallback_actions(
    task: Task,
    step: Step,
    assistant_message: str | None,
    reasoning: str | None,
) -> list[Action]:
    """Ask the LLM to pick a single fallback action when no CUA action was parsed.

    Renders the ``cua-fallback-action`` prompt from the task's navigation goal and
    the assistant's message/reasoning, sends it through ``app.LLM_API_HANDLER``
    and maps the ``"action"`` field of the response onto one Skyvern action.

    Args:
        task: Task being executed; supplies the goals, TOTP settings and the ids
            stamped onto the returned action.
        step: Current step; supplies ``step_id`` and ``order`` for the action.
        assistant_message: Text passed to the prompt; also stored as the task's
            ``extracted_information`` when the LLM answers ``complete``.
        reasoning: Reasoning passed to the prompt and used as the action's
            ``reasoning`` / ``intention`` (it may be extended with an OTP
            failure note or replaced by a default message below).

    Returns:
        A list containing exactly one action with ``action_order`` 0.
    """
    fallback_action_prompt = prompt_engine.load_prompt(
        "cua-fallback-action",
        navigation_goal=task.navigation_goal,
        assistant_message=assistant_message,
        assistant_reasoning=reasoning,
    )

    action_response = await app.LLM_API_HANDLER(
        prompt=fallback_action_prompt,
        prompt_name="cua-fallback-action",
        step=step,
    )
    LOG.info("Fallback action response", action_response=action_response)
    skyvern_action_type = action_response.get("action")
    useful_information = action_response.get("useful_information")

    # Default to a 5-second wait. Any action type not handled by the branches below
    # (per the original note, the prompt offers a catch-all 'other' action so the LLM
    # does not invent unreasonable actions) leaves this wait in place.
    action = WaitAction(
        seconds=5,
        reasoning=reasoning,
        intention=reasoning,
    )
    if skyvern_action_type == "complete":
        LOG.info(
            "Updating task with useful information",
            task_id=task.task_id,
            organization_id=task.organization_id,
            useful_information=useful_information,
            assistant_message=assistant_message,
            reasoning=reasoning,
        )
        # NOTE(review): `useful_information` is only logged; the value persisted is
        # `assistant_message`. Confirm this is intentional.
        await app.DATABASE.update_task(
            task.task_id,
            organization_id=task.organization_id,
            extracted_information=assistant_message,
        )
        action = CompleteAction(
            reasoning=reasoning,
            intention=reasoning,
            verified=True,
            data_extraction_goal=task.data_extraction_goal,
        )
    elif skyvern_action_type == "terminate":
        action = TerminateAction(
            reasoning=reasoning,
            intention=reasoning,
        )
    elif skyvern_action_type == "solve_captcha":
        action = SolveCaptchaAction(
            reasoning=reasoning,
            intention=reasoning,
        )
    elif skyvern_action_type == "get_magic_link":
        # Polling is only attempted when a TOTP source (verification URL or identifier)
        # and an organization id are present; otherwise the task is terminated.
        if (task.totp_verification_url or task.totp_identifier) and task.organization_id:
            LOG.info(
                "Getting magic link for CUA",
                task_id=task.task_id,
                organization_id=task.organization_id,
                workflow_run_id=task.workflow_run_id,
                totp_verification_url=task.totp_verification_url,
                totp_identifier=task.totp_identifier,
            )
            try:
                otp_value = await poll_otp_value(
                    organization_id=task.organization_id,
                    task_id=task.task_id,
                    workflow_run_id=task.workflow_run_id,
                    totp_verification_url=task.totp_verification_url,
                    totp_identifier=task.totp_identifier,
                )
                # A missing value or a value of a different OTP type is routed through
                # the same "not found" handler below.
                if not otp_value or otp_value.get_otp_type() != OTPType.MAGIC_LINK:
                    raise NoTOTPVerificationCodeFound()
                magic_link = otp_value.value
                reasoning = reasoning or "Received magic link. Navigating to the magic link URL to verify the login"
                action = GotoUrlAction(
                    url=magic_link,
                    reasoning=reasoning,
                    intention=reasoning,
                    is_magic_link=True,
                )
            except NoTOTPVerificationCodeFound:
                # Append the failure note to any existing reasoning, then terminate.
                reasoning_suffix = "No magic link found"
                reasoning = f"{reasoning}. {reasoning_suffix}" if reasoning else reasoning_suffix
                action = TerminateAction(
                    reasoning=reasoning,
                    intention=reasoning,
                )
            except FailedToGetTOTPVerificationCode as e:
                reasoning_suffix = f"Failed to get magic link. Reason: {e.reason}"
                reasoning = f"{reasoning}. {reasoning_suffix}" if reasoning else reasoning_suffix
                action = TerminateAction(
                    reasoning=reasoning,
                    intention=reasoning,
                )
        else:
            action = TerminateAction(
                reasoning=reasoning,
                intention=reasoning,
            )

    elif skyvern_action_type == "get_verification_code":
        # Try credential TOTP first (highest priority, doesn't need totp_url/totp_identifier)
        otp_value = try_generate_totp_from_credential(task.workflow_run_id)
        # Fall back to webhook/totp_identifier
        if not otp_value and task.organization_id:
            LOG.info(
                "Getting verification code for CUA",
                task_id=task.task_id,
                organization_id=task.organization_id,
                workflow_run_id=task.workflow_run_id,
                totp_verification_url=task.totp_verification_url,
                totp_identifier=task.totp_identifier,
            )
            try:
                otp_value = await poll_otp_value(
                    organization_id=task.organization_id,
                    task_id=task.task_id,
                    workflow_run_id=task.workflow_run_id,
                    totp_verification_url=task.totp_verification_url,
                    totp_identifier=task.totp_identifier,
                )
            except NoTOTPVerificationCodeFound:
                # Polling failures only annotate the reasoning here; `otp_value` stays
                # falsy, so the check below produces the TerminateAction.
                reasoning_suffix = "No verification code found"
                reasoning = f"{reasoning}. {reasoning_suffix}" if reasoning else reasoning_suffix
            except FailedToGetTOTPVerificationCode as e:
                reasoning_suffix = f"Failed to get verification code. Reason: {e.reason}"
                reasoning = f"{reasoning}. {reasoning_suffix}" if reasoning else reasoning_suffix

        # Only a TOTP-typed value is entered as a verification code; anything else
        # (no value, or a different OTP type) terminates the task.
        if otp_value and otp_value.get_otp_type() == OTPType.TOTP:
            verification_code = otp_value.value
            reasoning = reasoning or f"Received verification code: {verification_code}"
            action = VerificationCodeAction(
                verification_code=verification_code,
                reasoning=reasoning,
                intention=reasoning,
            )
        else:
            action = TerminateAction(
                reasoning=reasoning,
                intention=reasoning,
            )

    # Stamp task/step identifiers on whichever action was selected above.
    action.organization_id = task.organization_id
    action.workflow_run_id = task.workflow_run_id
    action.task_id = task.task_id
    action.step_id = step.step_id
    action.step_order = step.order
    action.action_order = 0
    return [action]


async def parse_ui_tars_actions(
    task: Task,
    step: Step,
    response_content: str,
    browser_window_dimension: Resolution,
) -> list[Action]:
    """Turn a raw UI-TARS response into a list of Skyvern actions.

    Each parsed action is converted independently: one that raises or yields
    nothing is logged/skipped and the rest are still returned. Any failure
    outside the per-action conversion is logged and produces an empty list.
    """
    try:
        converted: list[Action] = []
        raw_actions = _parse_ui_tars_response(response_content, browser_window_dimension)

        for position, raw_action in enumerate(raw_actions):
            try:
                built = _create_ui_tars_action(raw_action, task, step, browser_window_dimension, position)
                if built:
                    converted.append(built)
            except Exception:
                # A single bad action must not discard the others.
                LOG.exception(
                    "Failed to create UI-TARS action",
                    task_id=task.task_id,
                    step_id=step.step_id,
                    parsed_action=raw_action,
                )

        if len(converted) == 0:
            LOG.warning(
                "No valid actions generated from UI-TARS response",
                task_id=task.task_id,
                step_id=step.step_id,
                response_preview=response_content[:200],
            )

        return converted

    except Exception:
        LOG.exception(
            "Failed to parse UI-TARS actions",
            task_id=task.task_id,
            step_id=step.step_id,
            response_content=response_content[:200],
        )
        return []


def _parse_ui_tars_response(response_content: str, browser_window_dimension: Resolution) -> list[dict[str, Any]]:
    """Parse UI-TARS response text into structured action data.

    Extracts essential parsing logic from action_parser.py without the complex coordinate transformations.

    Args:
        response_content: Raw model output, expected to contain an ``Action:`` section
            optionally preceded by ``Thought:`` / ``Reflection:`` / ``Action_Summary:``.
        browser_window_dimension: Attached unchanged to every parsed action under the
            ``"browser_window_dimension"`` key.

    Returns:
        One dict per successfully parsed action, holding ``"action_type"``,
        ``"action_inputs"``, ``"thought"`` and ``"browser_window_dimension"``.
        Actions that cannot be parsed are skipped.

    Raises:
        ValueError: If the response contains no ``Action:`` marker.
    """
    text = response_content.strip()

    # Convert point format to coordinates if needed
    if "<point>" in text:
        text = _convert_point_to_coordinates(text)

    # Normalize parameter names. The specific names are rewritten first so the
    # generic "point=" rule only touches bare `point=` arguments.
    text = text.replace("start_point=", "start_box=")
    text = text.replace("end_point=", "end_box=")
    text = text.replace("point=", "start_box=")

    # Extract thought/reasoning; the first pattern that matches wins.
    thought = None
    thought_patterns = [
        r"Thought: (.+?)(?=\s*Action: |$)",
        r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action: |$)",
        r"Action_Summary: (.+?)(?=\s*Action: |$)",
    ]

    for pattern in thought_patterns:
        thought_match = re.search(pattern, text, re.DOTALL)
        if thought_match:
            if len(thought_match.groups()) == 1:
                thought = thought_match.group(1).strip()
            elif len(thought_match.groups()) == 2:
                thought = thought_match.group(2).strip()  # Use Action_Summary
            break

    if "Action:" not in text:
        raise ValueError("No Action section found in UI-TARS response")

    # Extract action string
    action_str = text.split("Action: ")[-1]

    # Split multiple actions; the separator swallows each action's closing
    # parenthesis, which is restored below.
    action_parts = action_str.split(")\n\n")
    all_actions = []

    for action_part in action_parts:
        action_part = action_part.strip()
        if not action_part:
            continue

        # Handle type action with content specially
        if "type(content" in action_part:
            if not action_part.endswith(")"):
                action_part += ")"
            # Extract content from type action
            pattern = r"type\(content='(.*?)'\)"
            match = re.search(pattern, action_part)
            if match:
                # Escape only the single quotes that are not already escaped.
                # Escaping an existing \' again would yield \\' which closes the
                # string literal early and makes the action unparseable.
                content = re.sub(r"(?<!\\)'", r"\\'", match.group(1))
                action_part = f"type(content='{content}')"

        if not action_part.endswith(")"):
            action_part += ")"

        all_actions.append(action_part)

    # Parse each action
    parsed_actions = []
    for action_str in all_actions:
        try:
            parsed_action = _parse_single_action(action_str)
            if parsed_action:
                parsed_action["thought"] = thought
                parsed_action["browser_window_dimension"] = browser_window_dimension
                parsed_actions.append(parsed_action)
        except Exception:
            LOG.warning(
                "Failed to parse individual UI-TARS action",
                action_str=action_str,
                exc_info=True,
            )
            continue

    return parsed_actions


def _parse_single_action(action_str: str) -> dict[str, Any] | None:
    """Parse a single action string into structured data."""

    try:
        # Clean up the action string
        action_str = action_str.replace("\n", "\\n").strip()

        # Parse as Python expression
        node = ast.parse(action_str, mode="eval")
        if not isinstance(node, ast.Expression) or not isinstance(node.body, ast.Call):
            return None

        call = node.body

        # Get function name
        if isinstance(call.func, ast.Name):
            func_name = call.func.id
        elif isinstance(call.func, ast.Attribute):
            func_name = call.func.attr
        else:
            return None

        # Get arguments
        action_inputs = {}
        for kw in call.keywords:
            if kw.arg and isinstance(kw.value, (ast.Constant, ast.Str)):
                if isinstance(kw.value, ast.Constant):
                    value = kw.value.value
                else:  # ast.Str for older Python versions
                    value = kw.value.s
                action_inputs[kw.arg] = value

        return {
            "action_type": func_name,
            "action_inputs": action_inputs,
        }

    except Exception:
        LOG.debug(f"Failed to parse action string: {action_str}", exc_info=True)
        return None


def _convert_point_to_coordinates(text: str) -> str:
    """Rewrite every ``<point>x y</point>`` tag in ``text`` as ``(x,y)``.

    Both numbers are passed through ``int`` so leading zeros are dropped.
    Tags whose contents are not two whitespace-separated digit runs are left
    untouched.
    """
    return re.sub(
        r"<point>(\d+)\s+(\d+)</point>",
        lambda found: f"({int(found.group(1))},{int(found.group(2))})",
        text,
    )


def _create_ui_tars_action(
    parsed_action: dict[str, Any],
    task: Task,
    step: Step,
    browser_window_dimension: Resolution,
    action_order: int,
) -> Action | None:
    """Create a Skyvern action from parsed UI-TARS data."""
    action_type = parsed_action.get("action_type", "")
    action_inputs = parsed_action.get("action_inputs", {})
    thought = parsed_action.get("thought", "")

    base_params = {
        "reasoning": thought,
        "intention": thought,
        "organization_id": task.organization_id,
        "workflow_run_id": task.workflow_run_id,
        "task_id": task.task_id,
        "step_id": step.step_id,
        "step_order": step.order,
        "action_order": action_order,
    }

    if action_type == "click":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            response=f"Click at ({x}, {y})",
            **base_params,
        )

    elif action_type == "left_double":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            button="left",
            repeat=2,
            response=f"Double click at ({x}, {y})",
            **base_params,
        )

    elif action_type == "right_single":
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            return None
        return ClickAction(
            element_id="",
            x=x,
            y=y,
            button="right",
            response=f"Right click at ({x}, {y})",
            **base_params,
        )

    elif action_type == "type":
        content = action_inputs.get("content", "")
        if not content:
            return None
        return InputTextAction(
            element_id="",
            text=content,
            response=f"Type: {content[:50]}{'...' if len(content) > 50 else ''}",
            **base_params,
        )

    elif action_type in ["drag", "select"]:
        start_x, start_y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        end_x, end_y = _extract_ui_tars_coordinates(action_inputs.get("end_box", ""), browser_window_dimension)
        if None in (start_x, start_y, end_x, end_y):
            return None
        return DragAction(
            start_x=start_x,
            start_y=start_y,
            path=[(end_x, end_y)],
            response=f"Drag from ({start_x}, {start_y}) to ({end_x}, {end_y})",
            **base_params,
        )

    elif action_type == "hotkey":
        key_combo = action_inputs.get("key", action_inputs.get("hotkey", ""))
        if not key_combo:
            return None
        keys = key_combo.split()
        return KeypressAction(
            keys=keys,
            response=f"Hotkey: {key_combo}",
            **base_params,
        )

    elif action_type == "scroll":
        direction = action_inputs.get("direction", "down").lower()
        x, y = _extract_ui_tars_coordinates(action_inputs.get("start_box", ""), browser_window_dimension)
        if x is None or y is None:
            # Use center of screen as fallback
            x = browser_window_dimension["width"] // 2
            y = browser_window_dimension["height"] // 2

        scroll_amount = 300
        if direction == "down":
            scroll_x, scroll_y = 0, scroll_amount
        elif direction == "up":
            scroll_x, scroll_y = 0, -scroll_amount
        elif direction == "right":
            scroll_x, scroll_y = scroll_amount, 0
        elif direction == "left":
            scroll_x, scroll_y = -scroll_amount, 0
        else:
            scroll_x, scroll_y = 0, scroll_amount

        return ScrollAction(
            element_id="",
            x=x,
            y=y,
            scroll_x=scroll_x,
            scroll_y=scroll_y,
            response=f"Scroll {direction} at ({x}, {y})",
            **base_params,
        )

    elif action_type == "wait":
        return WaitAction(
            seconds=5,
            **base_params,
        )

    elif action_type == "finished":
        return CompleteAction(
            data_extraction_goal=task.data_extraction_goal,
            verified=True,  # UI-TARS has already determined completion, skip Skyvern validation
            **base_params,
        )

    else:
        LOG.warning(f"Unsupported UI-TARS action type: {action_type}")
        return None


def _extract_ui_tars_coordinates(box_str: str, browser_window_dimension: Resolution) -> tuple[int | None, int | None]:
    """Turn a UI-TARS box string into absolute pixel coordinates.

    Accepts a literal two-value point such as ``"(450,320)"`` or a four-value
    bounding box ``"[x1, y1, x2, y2]"`` (its center is used). If either value
    exceeds 1, both are divided by 1000; a point that then lies within 0-1 on
    both axes is multiplied by the window width/height. A point outside that
    range is returned truncated to ints without scaling.

    Returns:
        ``(x, y)`` as ints, or ``(None, None)`` when ``box_str`` is empty, is
        not a list/tuple of two or four items, or cannot be evaluated.
    """
    if not box_str:
        return None, None

    try:
        # Parse coordinates from string format like "(450,320)" or "[0.5, 0.3, 0.5, 0.3]"
        parsed = ast.literal_eval(box_str)

        if not isinstance(parsed, (list, tuple)):
            return None, None

        arity = len(parsed)
        if arity == 2:
            # Direct coordinates like (450, 320) or (0.5, 0.3)
            is_bbox = False
            x, y = parsed
        elif arity == 4:
            # Bounding box format [x1, y1, x2, y2] - take center point
            is_bbox = True
            left, top, right, bottom = parsed
            x = (left + right) / 2
            y = (top + bottom) / 2
        else:
            return None, None

        # Values above 1 are treated as UI-TARS output needing the /1000 factor.
        if x > 1 or y > 1:
            raw_x, raw_y = x, y
            x = x / 1000.0
            y = y / 1000.0
            if is_bbox:
                LOG.debug(f"Applied UI-TARS factor conversion to bbox center: ({raw_x}, {raw_y}) -> ({x}, {y})")
            else:
                LOG.debug(f"Applied UI-TARS factor conversion: ({raw_x}, {raw_y}) -> ({x}, {y})")

        # Anything outside the relative 0-1 range is returned as-is (truncated).
        if not (0 <= x <= 1 and 0 <= y <= 1):
            return int(x), int(y)

        # Convert relative coordinates (0-1) to absolute screen coordinates
        abs_x = int(x * browser_window_dimension["width"])
        abs_y = int(y * browser_window_dimension["height"])
        if is_bbox:
            LOG.debug(
                f"Converted bbox center to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
            )
        else:
            LOG.debug(
                f"Converted to absolute coordinates: ({abs_x}, {abs_y}) for screen {browser_window_dimension['width']}x{browser_window_dimension['height']}"
            )
        return abs_x, abs_y

    except Exception:
        LOG.debug(f"Failed to parse UI-TARS coordinates: {box_str}", exc_info=True)
        return None, None