From db7f1abeab4cdad0304e81b28495afdc94b624a5 Mon Sep 17 00:00:00 2001
From: Shuchang Zheng <wintonzheng0325@gmail.com>
Date: Sat, 12 Apr 2025 20:55:38 -0700
Subject: [PATCH] shu/fallback to skyvern actions when cua returns no action
 (#2141)

---
 skyvern/forge/agent.py                        |  2 +-
 .../prompts/skyvern/cua-fallback-action.j2    | 21 +++++++
 skyvern/forge/sdk/routes/agent_protocol.py    |  2 +
 skyvern/services/run_service.py               |  2 +-
 skyvern/webeye/actions/parse_actions.py       | 59 +++++++++++++++----
 5 files changed, 72 insertions(+), 14 deletions(-)
 create mode 100644 skyvern/forge/prompts/skyvern/cua-fallback-action.j2

diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
index 94f14da1..5e63872a 100644
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -1319,7 +1319,7 @@ class ForgeAgent:
             incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
         )
 
-        return parse_cua_actions(task, step, current_response), current_response
+        return await parse_cua_actions(task, step, current_response), current_response
 
     @staticmethod
     async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
diff --git a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
new file mode 100644
index 00000000..78b9589e
--- /dev/null
+++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
@@ -0,0 +1,21 @@
+The user is trying to achieve a goal in the browser, assisted by a browser AI assistant.
+
+According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take on the website.
+
+Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
+- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
+- complete: the user goal has been achieved
+- terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts
+- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
+- other: the assistant is asking the user to do something else
+
+Return the action to take next in the following JSON format:
+{
+    "action": str // complete, terminate, solve_captcha, get_verification_code, other
+}
+
+User goal: {{ navigation_goal }}
+
+Assistant reasoning: {{ assistant_reasoning }}
+
+Assistant message: {{ assistant_message }}
\ No newline at end of file
diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py
index b63f83e3..15bbf5ac 100644
--- a/skyvern/forge/sdk/routes/agent_protocol.py
+++ b/skyvern/forge/sdk/routes/agent_protocol.py
@@ -1479,6 +1479,8 @@ async def run_task(
         )
         url = url or task_generation.url
         navigation_goal = task_generation.navigation_goal or run_request.prompt
+        if run_request.engine == RunEngine.openai_cua:
+            navigation_goal = run_request.prompt
         navigation_payload = task_generation.navigation_payload
         data_extraction_goal = task_generation.data_extraction_goal
         data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
diff --git a/skyvern/services/run_service.py b/skyvern/services/run_service.py
index 7022f401..247d202a 100644
--- a/skyvern/services/run_service.py
+++ b/skyvern/services/run_service.py
@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
             detail=f"Run not found {run_id}",
         )
 
-    if run.task_run_type == RunType.task_v1:
+    if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
         await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
     elif run.task_run_type == RunType.task_v2:
         await cancel_task_v2(run_id, organization_id=organization_id)
diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py
index 1d2f2802..64e4f7f4 100644
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import ValidationError
 
 from skyvern.exceptions import UnsupportedActionType
+from skyvern.forge import app
+from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.tasks import Task
 from skyvern.webeye.actions.actions import (
@@ -200,13 +202,14 @@ def parse_actions(
     return actions
 
 
-def parse_cua_actions(
+async def parse_cua_actions(
     task: Task,
     step: Step,
     response: OpenAIResponse,
 ) -> list[Action]:
     computer_calls = [item for item in response.output if item.type == "computer_call"]
     reasonings = [item for item in response.output if item.type == "reasoning"]
+    assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
     actions: list[Action] = []
     for idx, computer_call in enumerate(computer_calls):
         cua_action = computer_call.action
@@ -305,16 +308,48 @@ def parse_cua_actions(
             workflow_run_id=task.workflow_run_id,
             response=response.dict(),
         )
-        complete_action = CompleteAction(
-            reasoning="No more actions to take",
-            verified=True,
-            data_extraction_goal=task.data_extraction_goal,
-            organization_id=task.organization_id,
-            workflow_run_id=task.workflow_run_id,
-            task_id=task.task_id,
-            step_id=step.step_id,
-            step_order=step.order,
-            action_order=0,
+        reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
+        assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
+        fallback_action_prompt = prompt_engine.load_prompt(
+            "cua-fallback-action",
+            navigation_goal=task.navigation_goal,
+            assistant_message=assistant_message,
+            assistant_reasoning=reasoning,
         )
-        return [complete_action]
+
+        action_response = await app.LLM_API_HANDLER(
+            prompt=fallback_action_prompt,
+            prompt_name="cua-fallback-action",
+        )
+        skyvern_action_type = action_response.get("action")
+        action = WaitAction(
+            seconds=5,
+            reasoning=reasoning,
+            intention=reasoning,
+        )
+        if skyvern_action_type == "complete":
+            action = CompleteAction(
+                reasoning=reasoning,
+                intention=reasoning,
+                verified=False,
+                data_extraction_goal=task.data_extraction_goal,
+            )
+        elif skyvern_action_type == "terminate":
+            action = TerminateAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        elif skyvern_action_type == "solve_captcha":
+            action = SolveCaptchaAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        elif skyvern_action_type == "get_verification_code":
+            # Currently we don't support verification code
+            # TODO: handle verification code by fetching the code and send it to CUA
+            action = TerminateAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        return [action]
     return actions