shu/fallback to skyvern actions when cua returns no action (#2141)

2025-04-12 20:55:38 -07:00
parent e8efcc0bd7
commit db7f1abeab
5 changed files with 72 additions and 14 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -1319,7 +1319,7 @@ class ForgeAgent:
            incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
        )

-        return parse_cua_actions(task, step, current_response), current_response
+        return await parse_cua_actions(task, step, current_response), current_response

    @staticmethod
    async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
--- a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
+++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
@@ -0,0 +1,21 @@
+The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
+
+According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
+
+Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
+- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
+- complete: the user goal has been achieved
+- terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts
+- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
+- other: the assistant is asking the user to do something else
+
+Return the action to take next in the following JSON format:
+{
+    "action": str // complete, terminate, solve_captcha, get_verification_code, other
+}
+
+User goal: {{ navigation_goal }}
+
+Assistant reasoning: {{ assistant_reasoning }}
+
+Assistant message: {{ assistant_message }}
--- a/skyvern/forge/sdk/routes/agent_protocol.py
+++ b/skyvern/forge/sdk/routes/agent_protocol.py
@@ -1479,6 +1479,8 @@ async def run_task(
        )
        url = url or task_generation.url
        navigation_goal = task_generation.navigation_goal or run_request.prompt
+        if run_request.engine == RunEngine.openai_cua:
+            navigation_goal = run_request.prompt
        navigation_payload = task_generation.navigation_payload
        data_extraction_goal = task_generation.data_extraction_goal
        data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
--- a/skyvern/services/run_service.py
+++ b/skyvern/services/run_service.py
@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
            detail=f"Run not found {run_id}",
        )

-    if run.task_run_type == RunType.task_v1:
+    if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
        await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
    elif run.task_run_type == RunType.task_v2:
        await cancel_task_v2(run_id, organization_id=organization_id)
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import ValidationError

 from skyvern.exceptions import UnsupportedActionType
+from skyvern.forge import app
+from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.tasks import Task
 from skyvern.webeye.actions.actions import (
@@ -200,13 +202,14 @@ def parse_actions(
    return actions


-def parse_cua_actions(
+async def parse_cua_actions(
    task: Task,
    step: Step,
    response: OpenAIResponse,
 ) -> list[Action]:
    computer_calls = [item for item in response.output if item.type == "computer_call"]
    reasonings = [item for item in response.output if item.type == "reasoning"]
+    assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
    actions: list[Action] = []
    for idx, computer_call in enumerate(computer_calls):
        cua_action = computer_call.action
@@ -305,16 +308,48 @@ def parse_cua_actions(
            workflow_run_id=task.workflow_run_id,
            response=response.dict(),
        )
-        complete_action = CompleteAction(
-            reasoning="No more actions to take",
-            verified=True,
-            data_extraction_goal=task.data_extraction_goal,
-            organization_id=task.organization_id,
-            workflow_run_id=task.workflow_run_id,
-            task_id=task.task_id,
-            step_id=step.step_id,
-            step_order=step.order,
-            action_order=0,
+        reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
+        assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
+        fallback_action_prompt = prompt_engine.load_prompt(
+            "cua-fallback-action",
+            navigation_goal=task.navigation_goal,
+            assistant_message=assistant_message,
+            assistant_reasoning=reasoning,
        )
-        return [complete_action]
+
+        action_response = await app.LLM_API_HANDLER(
+            prompt=fallback_action_prompt,
+            prompt_name="cua-fallback-action",
+        )
+        skyvern_action_type = action_response.get("action")
+        action = WaitAction(
+            seconds=5,
+            reasoning=reasoning,
+            intention=reasoning,
+        )
+        if skyvern_action_type == "complete":
+            action = CompleteAction(
+                reasoning=reasoning,
+                intention=reasoning,
+                verified=False,
+                data_extraction_goal=task.data_extraction_goal,
+            )
+        elif skyvern_action_type == "terminate":
+            action = TerminateAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        elif skyvern_action_type == "solve_captcha":
+            action = SolveCaptchaAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        elif skyvern_action_type == "get_verification_code":
+            # Currently we don't support verification code
+            # TODO: handle verification code by fetching the code and send it to CUA
+            action = TerminateAction(
+                reasoning=reasoning,
+                intention=reasoning,
+            )
+        return [action]
    return actions