From db7f1abeab4cdad0304e81b28495afdc94b624a5 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Sat, 12 Apr 2025 20:55:38 -0700 Subject: [PATCH] shu/fallback to skyvern actions when cua returns no action (#2141) --- skyvern/forge/agent.py | 2 +- .../prompts/skyvern/cua-fallback-action.j2 | 21 +++++++ skyvern/forge/sdk/routes/agent_protocol.py | 2 + skyvern/services/run_service.py | 2 +- skyvern/webeye/actions/parse_actions.py | 59 +++++++++++++++---- 5 files changed, 72 insertions(+), 14 deletions(-) create mode 100644 skyvern/forge/prompts/skyvern/cua-fallback-action.j2 diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 94f14da1..5e63872a 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1319,7 +1319,7 @@ class ForgeAgent: incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None, ) - return parse_cua_actions(task, step, current_response), current_response + return await parse_cua_actions(task, step, current_response), current_response @staticmethod async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult: diff --git a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 new file mode 100644 index 00000000..78b9589e --- /dev/null +++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 @@ -0,0 +1,21 @@ +The user is trying to achieve a goal in the browser assisted by a browser AI assistant. + +According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website. + +Help the user decide what to do next based on the assistant's message. Here's the list of available actions: +- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha +- complete: the user goal has been achieved +- terminate: the user goal cannot be achieved. Terminate the task. 
Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts +- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code) +- other: the assistant is asking the user to do something else + +Return the action to take next in the following JSON format: +{ + "action": str // complete, terminate, solve_captcha, get_verification_code +} + +User goal: {{ navigation_goal }} + +Assistant reasoning: {{ assistant_reasoning }} + +Assistant message: {{ assistant_message }} \ No newline at end of file diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py index b63f83e3..15bbf5ac 100644 --- a/skyvern/forge/sdk/routes/agent_protocol.py +++ b/skyvern/forge/sdk/routes/agent_protocol.py @@ -1479,6 +1479,8 @@ async def run_task( ) url = url or task_generation.url navigation_goal = task_generation.navigation_goal or run_request.prompt + if run_request.engine == RunEngine.openai_cua: + navigation_goal = run_request.prompt navigation_payload = task_generation.navigation_payload data_extraction_goal = task_generation.data_extraction_goal data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema diff --git a/skyvern/services/run_service.py b/skyvern/services/run_service.py index 7022f401..247d202a 100644 --- a/skyvern/services/run_service.py +++ b/skyvern/services/run_service.py @@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s detail=f"Run not found {run_id}", ) - if run.task_run_type == RunType.task_v1: + if run.task_run_type in [RunType.task_v1, RunType.openai_cua]: await cancel_task_v1(run_id, 
organization_id=organization_id, api_key=api_key) elif run.task_run_type == RunType.task_v2: await cancel_task_v2(run_id, organization_id=organization_id) diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 1d2f2802..64e4f7f4 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse from pydantic import ValidationError from skyvern.exceptions import UnsupportedActionType +from skyvern.forge import app +from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.models import Step from skyvern.forge.sdk.schemas.tasks import Task from skyvern.webeye.actions.actions import ( @@ -200,13 +202,14 @@ def parse_actions( return actions -def parse_cua_actions( +async def parse_cua_actions( task: Task, step: Step, response: OpenAIResponse, ) -> list[Action]: computer_calls = [item for item in response.output if item.type == "computer_call"] reasonings = [item for item in response.output if item.type == "reasoning"] + assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"] actions: list[Action] = [] for idx, computer_call in enumerate(computer_calls): cua_action = computer_call.action @@ -305,16 +308,48 @@ def parse_cua_actions( workflow_run_id=task.workflow_run_id, response=response.dict(), ) - complete_action = CompleteAction( - reasoning="No more actions to take", - verified=True, - data_extraction_goal=task.data_extraction_goal, - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=0, + reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None + assistant_message = assistant_messages[0].content[0].text if assistant_messages else None + fallback_action_prompt = prompt_engine.load_prompt( + 
"cua-fallback-action", + navigation_goal=task.navigation_goal, + assistant_message=assistant_message, + assistant_reasoning=reasoning, ) - return [complete_action] + + action_response = await app.LLM_API_HANDLER( + prompt=fallback_action_prompt, + prompt_name="cua-fallback-action", + ) + skyvern_action_type = action_response.get("action") + action = WaitAction( + seconds=5, + reasoning=reasoning, + intention=reasoning, + ) + if skyvern_action_type == "complete": + action = CompleteAction( + reasoning=reasoning, + intention=reasoning, + verified=False, + data_extraction_goal=task.data_extraction_goal, + ) + elif skyvern_action_type == "terminate": + action = TerminateAction( + reasoning=reasoning, + intention=reasoning, + ) + elif skyvern_action_type == "solve_captcha": + action = SolveCaptchaAction( + reasoning=reasoning, + intention=reasoning, + ) + elif skyvern_action_type == "get_verification_code": + # Currently we don't support verification code + # TODO: handle verification code by fetching the code and send it to CUA + action = TerminateAction( + reasoning=reasoning, + intention=reasoning, + ) + return [action] return actions