shu/fallback to skyvern actions when cua returns no action (#2141)

This commit is contained in:
Shuchang Zheng
2025-04-12 20:55:38 -07:00
committed by GitHub
parent e8efcc0bd7
commit db7f1abeab
5 changed files with 72 additions and 14 deletions

View File

@@ -1319,7 +1319,7 @@ class ForgeAgent:
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
)
return parse_cua_actions(task, step, current_response), current_response
return await parse_cua_actions(task, step, current_response), current_response
@staticmethod
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:

View File

@@ -0,0 +1,21 @@
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take on the website.
Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
- complete: the user goal has been achieved
- terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts.
- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
- other: the assistant is asking the user to do something else
Return the action to take next in the following JSON format:
{
"action": str // complete, terminate, solve_captcha, get_verification_code, other
}
User goal: {{ navigation_goal }}
Assistant reasoning: {{ assistant_reasoning }}
Assistant message: {{ assistant_message }}

View File

@@ -1479,6 +1479,8 @@ async def run_task(
)
url = url or task_generation.url
navigation_goal = task_generation.navigation_goal or run_request.prompt
if run_request.engine == RunEngine.openai_cua:
navigation_goal = run_request.prompt
navigation_payload = task_generation.navigation_payload
data_extraction_goal = task_generation.data_extraction_goal
data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema

View File

@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
detail=f"Run not found {run_id}",
)
if run.task_run_type == RunType.task_v1:
if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
elif run.task_run_type == RunType.task_v2:
await cancel_task_v2(run_id, organization_id=organization_id)

View File

@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.exceptions import UnsupportedActionType
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.webeye.actions.actions import (
@@ -200,13 +202,14 @@ def parse_actions(
return actions
def parse_cua_actions(
async def parse_cua_actions(
task: Task,
step: Step,
response: OpenAIResponse,
) -> list[Action]:
computer_calls = [item for item in response.output if item.type == "computer_call"]
reasonings = [item for item in response.output if item.type == "reasoning"]
assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
actions: list[Action] = []
for idx, computer_call in enumerate(computer_calls):
cua_action = computer_call.action
@@ -305,16 +308,48 @@ def parse_cua_actions(
workflow_run_id=task.workflow_run_id,
response=response.dict(),
)
complete_action = CompleteAction(
reasoning="No more actions to take",
verified=True,
data_extraction_goal=task.data_extraction_goal,
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=0,
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
fallback_action_prompt = prompt_engine.load_prompt(
"cua-fallback-action",
navigation_goal=task.navigation_goal,
assistant_message=assistant_message,
assistant_reasoning=reasoning,
)
return [complete_action]
action_response = await app.LLM_API_HANDLER(
prompt=fallback_action_prompt,
prompt_name="cua-fallback-action",
)
skyvern_action_type = action_response.get("action")
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
if skyvern_action_type == "complete":
action = CompleteAction(
reasoning=reasoning,
intention=reasoning,
verified=False,
data_extraction_goal=task.data_extraction_goal,
)
elif skyvern_action_type == "terminate":
action = TerminateAction(
reasoning=reasoning,
intention=reasoning,
)
elif skyvern_action_type == "solve_captcha":
action = SolveCaptchaAction(
reasoning=reasoning,
intention=reasoning,
)
elif skyvern_action_type == "get_verification_code":
# Currently we don't support verification code
# TODO: handle verification code by fetching the code and send it to CUA
action = TerminateAction(
reasoning=reasoning,
intention=reasoning,
)
return [action]
return actions