shu/fallback to skyvern actions when cua returns no action (#2141)
This commit is contained in:
@@ -1319,7 +1319,7 @@ class ForgeAgent:
|
||||
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
||||
)
|
||||
|
||||
return parse_cua_actions(task, step, current_response), current_response
|
||||
return await parse_cua_actions(task, step, current_response), current_response
|
||||
|
||||
@staticmethod
|
||||
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
|
||||
|
||||
21
skyvern/forge/prompts/skyvern/cua-fallback-action.j2
Normal file
21
skyvern/forge/prompts/skyvern/cua-fallback-action.j2
Normal file
@@ -0,0 +1,21 @@
|
||||
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
|
||||
|
||||
According to the AI assistant's feedback, including its reasoning and its message, there's no immediate action the assistant can take on the website.
|
||||
|
||||
Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
|
||||
- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
|
||||
- complete: the user goal has been achieved
|
||||
- terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts
|
||||
- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
|
||||
- other: the assistant is asking the user to do something else
|
||||
|
||||
Return the action to take next in the following JSON format:
|
||||
{
|
||||
"action": str // complete, terminate, solve_captcha, get_verification_code
|
||||
}
|
||||
|
||||
User goal: {{ navigation_goal }}
|
||||
|
||||
Assistant reasoning: {{ assistant_reasoning }}
|
||||
|
||||
Assistant message: {{ assistant_message }}
|
||||
@@ -1479,6 +1479,8 @@ async def run_task(
|
||||
)
|
||||
url = url or task_generation.url
|
||||
navigation_goal = task_generation.navigation_goal or run_request.prompt
|
||||
if run_request.engine == RunEngine.openai_cua:
|
||||
navigation_goal = run_request.prompt
|
||||
navigation_payload = task_generation.navigation_payload
|
||||
data_extraction_goal = task_generation.data_extraction_goal
|
||||
data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
|
||||
|
||||
@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
|
||||
detail=f"Run not found {run_id}",
|
||||
)
|
||||
|
||||
if run.task_run_type == RunType.task_v1:
|
||||
if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
|
||||
await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
|
||||
elif run.task_run_type == RunType.task_v2:
|
||||
await cancel_task_v2(run_id, organization_id=organization_id)
|
||||
|
||||
@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import ValidationError
|
||||
|
||||
from skyvern.exceptions import UnsupportedActionType
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.models import Step
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.webeye.actions.actions import (
|
||||
@@ -200,13 +202,14 @@ def parse_actions(
|
||||
return actions
|
||||
|
||||
|
||||
def parse_cua_actions(
|
||||
async def parse_cua_actions(
|
||||
task: Task,
|
||||
step: Step,
|
||||
response: OpenAIResponse,
|
||||
) -> list[Action]:
|
||||
computer_calls = [item for item in response.output if item.type == "computer_call"]
|
||||
reasonings = [item for item in response.output if item.type == "reasoning"]
|
||||
assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
|
||||
actions: list[Action] = []
|
||||
for idx, computer_call in enumerate(computer_calls):
|
||||
cua_action = computer_call.action
|
||||
@@ -305,16 +308,48 @@ def parse_cua_actions(
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
response=response.dict(),
|
||||
)
|
||||
complete_action = CompleteAction(
|
||||
reasoning="No more actions to take",
|
||||
verified=True,
|
||||
data_extraction_goal=task.data_extraction_goal,
|
||||
organization_id=task.organization_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=0,
|
||||
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
||||
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
||||
fallback_action_prompt = prompt_engine.load_prompt(
|
||||
"cua-fallback-action",
|
||||
navigation_goal=task.navigation_goal,
|
||||
assistant_message=assistant_message,
|
||||
assistant_reasoning=reasoning,
|
||||
)
|
||||
return [complete_action]
|
||||
|
||||
action_response = await app.LLM_API_HANDLER(
|
||||
prompt=fallback_action_prompt,
|
||||
prompt_name="cua-fallback-action",
|
||||
)
|
||||
skyvern_action_type = action_response.get("action")
|
||||
action = WaitAction(
|
||||
seconds=5,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
if skyvern_action_type == "complete":
|
||||
action = CompleteAction(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
verified=False,
|
||||
data_extraction_goal=task.data_extraction_goal,
|
||||
)
|
||||
elif skyvern_action_type == "terminate":
|
||||
action = TerminateAction(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
elif skyvern_action_type == "solve_captcha":
|
||||
action = SolveCaptchaAction(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
elif skyvern_action_type == "get_verification_code":
|
||||
# Currently we don't support verification code
|
||||
# TODO: handle verification code by fetching the code and send it to CUA
|
||||
action = TerminateAction(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
return [action]
|
||||
return actions
|
||||
|
||||
Reference in New Issue
Block a user