From 0b47dff89d0a57877f686a76f32cc0640e2fe6ce Mon Sep 17 00:00:00 2001
From: LawyZheng <lawyzheng1106@gmail.com>
Date: Thu, 20 Nov 2025 02:24:00 +0800
Subject: [PATCH] fix cua engine: return raw response before parsing (#4036)

---
 .../forge/sdk/api/llm/api_handler_factory.py  | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py
index 547ccfe2..f16dbe3a 100644
--- a/skyvern/forge/sdk/api/llm/api_handler_factory.py
+++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py
@@ -1121,6 +1121,31 @@ class LLMCaller:
                 thought_cost=call_stats.llm_cost,
             )
 
+        organization_id = organization_id or (
+            step.organization_id if step else (thought.organization_id if thought else None)
+        )
+        # Track LLM API handler duration, token counts, and cost
+        duration_seconds = time.perf_counter() - start_time
+        LOG.info(
+            "LLM API handler duration metrics",
+            llm_key=self.llm_key,
+            prompt_name=prompt_name,
+            model=self.llm_config.model_name,
+            duration_seconds=duration_seconds,
+            step_id=step.step_id if step else None,
+            thought_id=thought.observer_thought_id if thought else None,
+            organization_id=organization_id,
+            input_tokens=call_stats.input_tokens if call_stats and call_stats.input_tokens else None,
+            output_tokens=call_stats.output_tokens if call_stats and call_stats.output_tokens else None,
+            reasoning_tokens=call_stats.reasoning_tokens if call_stats and call_stats.reasoning_tokens else None,
+            cached_tokens=call_stats.cached_tokens if call_stats and call_stats.cached_tokens else None,
+            llm_cost=call_stats.llm_cost if call_stats and call_stats.llm_cost else None,
+        )
+
+        # CUA engine LLM calls consume the raw response, so return it here and skip the parsing below.
+        if raw_response:
+            return response.model_dump(exclude_none=True)
+
         parsed_response = parse_api_response(response, self.llm_config.add_assistant_prefix)
         parsed_response_json = json.dumps(parsed_response, indent=2)
         if step and not is_speculative_step:
@@ -1149,27 +1174,6 @@ class LLMCaller:
                     ai_suggestion=ai_suggestion,
                 )
 
-        organization_id = organization_id or (
-            step.organization_id if step else (thought.organization_id if thought else None)
-        )
-        # Track LLM API handler duration, token counts, and cost
-        duration_seconds = time.perf_counter() - start_time
-        LOG.info(
-            "LLM API handler duration metrics",
-            llm_key=self.llm_key,
-            prompt_name=prompt_name,
-            model=self.llm_config.model_name,
-            duration_seconds=duration_seconds,
-            step_id=step.step_id if step else None,
-            thought_id=thought.observer_thought_id if thought else None,
-            organization_id=organization_id,
-            input_tokens=call_stats.input_tokens if call_stats and call_stats.input_tokens else None,
-            output_tokens=call_stats.output_tokens if call_stats and call_stats.output_tokens else None,
-            reasoning_tokens=call_stats.reasoning_tokens if call_stats and call_stats.reasoning_tokens else None,
-            cached_tokens=call_stats.cached_tokens if call_stats and call_stats.cached_tokens else None,
-            llm_cost=call_stats.llm_cost if call_stats and call_stats.llm_cost else None,
-        )
-
         if step and is_speculative_step:
             step.speculative_llm_metadata = SpeculativeLLMMetadata(
                 prompt=llm_prompt_value,
@@ -1187,9 +1191,6 @@ class LLMCaller:
                 llm_cost=call_stats.llm_cost,
             )
 
-        if raw_response:
-            return response.model_dump(exclude_none=True)
-
         return parsed_response
 
     def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution: