shu/fallback to skyvern actions when cua returns no action (#2141)
This commit is contained in:
@@ -1319,7 +1319,7 @@ class ForgeAgent:
|
|||||||
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
return parse_cua_actions(task, step, current_response), current_response
|
return await parse_cua_actions(task, step, current_response), current_response
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
|
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
|
||||||
|
|||||||
21
skyvern/forge/prompts/skyvern/cua-fallback-action.j2
Normal file
21
skyvern/forge/prompts/skyvern/cua-fallback-action.j2
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The user is trying to achieve a goal in the browser, assisted by a browser AI assistant.
|
||||||
|
|
||||||
|
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
|
||||||
|
|
||||||
|
Help the user decide what to do next based on the assistant's message. Here's the list of available actions:
|
||||||
|
- solve_captcha: the task is blocked by captcha and the assistant is asking the user to solve the captcha
|
||||||
|
- complete: the user goal has been achieved
|
||||||
|
- terminate: the user goal cannot be achieved. Terminate the task. Examples: 1) there's not enough data provided to achieve the goal and the assistant is asking the user to provide more information. For example: login is required and the user has not provided the login credentials or incorrect credentials are provided; a form needs to be filled and a required field is missing. 2) The site is stuck or not loading after multiple attempts
|
||||||
|
- get_verification_code: the assistant is asking the user to provide a verification code (2FA, MFA or TOTP code)
|
||||||
|
- other: the assistant is asking the user to do something else
|
||||||
|
|
||||||
|
Return the action to take next in the following JSON format:
|
||||||
|
{
|
||||||
|
"action": str // complete, terminate, solve_captcha, get_verification_code, other
|
||||||
|
}
|
||||||
|
|
||||||
|
User goal: {{ navigation_goal }}
|
||||||
|
|
||||||
|
Assistant reasoning: {{ assistant_reasoning }}
|
||||||
|
|
||||||
|
Assistant message: {{ assistant_message }}
|
||||||
@@ -1479,6 +1479,8 @@ async def run_task(
|
|||||||
)
|
)
|
||||||
url = url or task_generation.url
|
url = url or task_generation.url
|
||||||
navigation_goal = task_generation.navigation_goal or run_request.prompt
|
navigation_goal = task_generation.navigation_goal or run_request.prompt
|
||||||
|
if run_request.engine == RunEngine.openai_cua:
|
||||||
|
navigation_goal = run_request.prompt
|
||||||
navigation_payload = task_generation.navigation_payload
|
navigation_payload = task_generation.navigation_payload
|
||||||
data_extraction_goal = task_generation.data_extraction_goal
|
data_extraction_goal = task_generation.data_extraction_goal
|
||||||
data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
|
data_extraction_schema = data_extraction_schema or task_generation.extracted_information_schema
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ async def cancel_run(run_id: str, organization_id: str | None = None, api_key: s
|
|||||||
detail=f"Run not found {run_id}",
|
detail=f"Run not found {run_id}",
|
||||||
)
|
)
|
||||||
|
|
||||||
if run.task_run_type == RunType.task_v1:
|
if run.task_run_type in [RunType.task_v1, RunType.openai_cua]:
|
||||||
await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
|
await cancel_task_v1(run_id, organization_id=organization_id, api_key=api_key)
|
||||||
elif run.task_run_type == RunType.task_v2:
|
elif run.task_run_type == RunType.task_v2:
|
||||||
await cancel_task_v2(run_id, organization_id=organization_id)
|
await cancel_task_v2(run_id, organization_id=organization_id)
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ from openai.types.responses.response import Response as OpenAIResponse
|
|||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
from skyvern.exceptions import UnsupportedActionType
|
from skyvern.exceptions import UnsupportedActionType
|
||||||
|
from skyvern.forge import app
|
||||||
|
from skyvern.forge.prompts import prompt_engine
|
||||||
from skyvern.forge.sdk.models import Step
|
from skyvern.forge.sdk.models import Step
|
||||||
from skyvern.forge.sdk.schemas.tasks import Task
|
from skyvern.forge.sdk.schemas.tasks import Task
|
||||||
from skyvern.webeye.actions.actions import (
|
from skyvern.webeye.actions.actions import (
|
||||||
@@ -200,13 +202,14 @@ def parse_actions(
|
|||||||
return actions
|
return actions
|
||||||
|
|
||||||
|
|
||||||
def parse_cua_actions(
|
async def parse_cua_actions(
|
||||||
task: Task,
|
task: Task,
|
||||||
step: Step,
|
step: Step,
|
||||||
response: OpenAIResponse,
|
response: OpenAIResponse,
|
||||||
) -> list[Action]:
|
) -> list[Action]:
|
||||||
computer_calls = [item for item in response.output if item.type == "computer_call"]
|
computer_calls = [item for item in response.output if item.type == "computer_call"]
|
||||||
reasonings = [item for item in response.output if item.type == "reasoning"]
|
reasonings = [item for item in response.output if item.type == "reasoning"]
|
||||||
|
assistant_messages = [item for item in response.output if item.type == "message" and item.role == "assistant"]
|
||||||
actions: list[Action] = []
|
actions: list[Action] = []
|
||||||
for idx, computer_call in enumerate(computer_calls):
|
for idx, computer_call in enumerate(computer_calls):
|
||||||
cua_action = computer_call.action
|
cua_action = computer_call.action
|
||||||
@@ -305,16 +308,48 @@ def parse_cua_actions(
|
|||||||
workflow_run_id=task.workflow_run_id,
|
workflow_run_id=task.workflow_run_id,
|
||||||
response=response.dict(),
|
response=response.dict(),
|
||||||
)
|
)
|
||||||
complete_action = CompleteAction(
|
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
||||||
reasoning="No more actions to take",
|
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
||||||
verified=True,
|
fallback_action_prompt = prompt_engine.load_prompt(
|
||||||
data_extraction_goal=task.data_extraction_goal,
|
"cua-fallback-action",
|
||||||
organization_id=task.organization_id,
|
navigation_goal=task.navigation_goal,
|
||||||
workflow_run_id=task.workflow_run_id,
|
assistant_message=assistant_message,
|
||||||
task_id=task.task_id,
|
assistant_reasoning=reasoning,
|
||||||
step_id=step.step_id,
|
|
||||||
step_order=step.order,
|
|
||||||
action_order=0,
|
|
||||||
)
|
)
|
||||||
return [complete_action]
|
|
||||||
|
action_response = await app.LLM_API_HANDLER(
|
||||||
|
prompt=fallback_action_prompt,
|
||||||
|
prompt_name="cua-fallback-action",
|
||||||
|
)
|
||||||
|
skyvern_action_type = action_response.get("action")
|
||||||
|
action = WaitAction(
|
||||||
|
seconds=5,
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
)
|
||||||
|
if skyvern_action_type == "complete":
|
||||||
|
action = CompleteAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
verified=False,
|
||||||
|
data_extraction_goal=task.data_extraction_goal,
|
||||||
|
)
|
||||||
|
elif skyvern_action_type == "terminate":
|
||||||
|
action = TerminateAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
)
|
||||||
|
elif skyvern_action_type == "solve_captcha":
|
||||||
|
action = SolveCaptchaAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
)
|
||||||
|
elif skyvern_action_type == "get_verification_code":
|
||||||
|
# Currently we don't support verification code
|
||||||
|
# TODO: handle verification code by fetching the code and sending it to CUA
|
||||||
|
action = TerminateAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
)
|
||||||
|
return [action]
|
||||||
return actions
|
return actions
|
||||||
|
|||||||
Reference in New Issue
Block a user