shu/fallback to skyvern actions when cua returns no action (#2141)

2025-04-12 20:55:38 -07:00
parent e8efcc0bd7
commit db7f1abeab
5 changed files with 72 additions and 14 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -1319,7 +1319,7 @@ class ForgeAgent:
            incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
        )
-        return parse_cua_actions(task, step, current_response), current_response
+        return await parse_cua_actions(task, step, current_response), current_response
    @staticmethod
    async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
--- a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
+++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
@@ -0,0 +1,21 @@
 The user is trying to achieve a goal in the browser, assisted by a browser AI assistant.
 According to the AI assistant's feedback, including its reasoning and its message, there's no immediate action the assistant can take on the website.
 Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
 - solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
 - complete: the user goal has been achieved
 - terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials, or incorrect credentials were provided; a form needs to be filled in and a required field is missing. 2) The site is stuck or not loading after multiple attempts
 - get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
 - other: the assistant is asking the user to do something else
 Return the action to take next in the following JSON format:
 {
    "action": str // complete, terminate, solve_captcha, get_verification_code, other
 }
 User goal: {{ navigation_goal }}
 Assistant reasoning: {{ assistant_reasoning }}
 Assistant message: {{ assistant_message }}
--- a/skyvern/forge/sdk/routes/agent_protocol.py
+++ b/skyvern/forge/sdk/routes/agent_protocol.py
@@ -1479,6 +1479,8 @@ async def run_task(
        )
        url = url or task_generation.url
        navigation_goal = task_generation.navigation_goal or run_request.prompt
        if run_request.engine == RunEngine.openai_cua:
            navigation_goal = run_request.prompt
        navigation_payload = task_generation.navigation_payload
        data_extraction_goal = task_generation.data_extraction_goal
        data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
--- a/skyvern/services/run_service.py
+++ b/skyvern/services/run_service.py
@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
            detail=f"Run not found {run_id}",
        )
-    if run.task_run_type == RunType.task_v1:
+    if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
        await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
    elif run.task_run_type == RunType.task_v2:
        await cancel_task_v2(run_id, organization_id=organization_id)
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import ValidationError
 from skyvern.exceptions import UnsupportedActionType
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.tasks import Task
 from skyvern.webeye.actions.actions import (
@@ -200,13 +202,14 @@ def parse_actions(
    return actions
-def parse_cua_actions(
+async def parse_cua_actions(
    task: Task,
    step: Step,
    response: OpenAIResponse,
 ) -> list[Action]:
    computer_calls = [item for item in response.output if item.type == "computer_call"]
    reasonings = [item for item in response.output if item.type == "reasoning"]
    assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
    actions: list[Action] = []
    for idx, computer_call in enumerate(computer_calls):
        cua_action = computer_call.action
@@ -305,16 +308,48 @@ def parse_cua_actions(
            workflow_run_id=task.workflow_run_id,
            response=response.dict(),
        )
-        complete_action = CompleteAction(
+        reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
-            reasoning="No more actions to take",
+        assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
-            verified=True,
+        fallback_action_prompt = prompt_engine.load_prompt(
-            data_extraction_goal=task.data_extraction_goal,
+            "cua-fallback-action",
-            organization_id=task.organization_id,
+            navigation_goal=task.navigation_goal,
-            workflow_run_id=task.workflow_run_id,
+            assistant_message=assistant_message,
-            task_id=task.task_id,
+            assistant_reasoning=reasoning,
            step_id=step.step_id,
            step_order=step.order,
            action_order=0,
        )
-        return [complete_action]
+
        action_response = await app.LLM_API_HANDLER(
            prompt=fallback_action_prompt,
            prompt_name="cua-fallback-action",
        )
        skyvern_action_type = action_response.get("action")
        action = WaitAction(
            seconds=5,
            reasoning=reasoning,
            intention=reasoning,
        )
        if skyvern_action_type == "complete":
            action = CompleteAction(
                reasoning=reasoning,
                intention=reasoning,
                verified=False,
                data_extraction_goal=task.data_extraction_goal,
            )
        elif skyvern_action_type == "terminate":
            action = TerminateAction(
                reasoning=reasoning,
                intention=reasoning,
            )
        elif skyvern_action_type == "solve_captcha":
            action = SolveCaptchaAction(
                reasoning=reasoning,
                intention=reasoning,
            )
        elif skyvern_action_type == "get_verification_code":
            # Currently we don't support verification code
            # TODO: handle verification code by fetching the code and send it to CUA
            action = TerminateAction(
                reasoning=reasoning,
                intention=reasoning,
            )
        return [action]
    return actions