CUA feature - Skyvern answers CUA questions and makes decisions when asked to (#2163)
This commit is contained in:
@@ -879,15 +879,6 @@ class ForgeAgent:
|
||||
status=StepStatus.failed,
|
||||
output=detailed_agent_step_output.to_agent_step_output(),
|
||||
)
|
||||
detailed_agent_step_output = DetailedAgentStepOutput(
|
||||
scraped_page=scraped_page,
|
||||
extract_action_prompt=extract_action_prompt,
|
||||
llm_response=json_response,
|
||||
actions=actions,
|
||||
action_results=[],
|
||||
actions_and_results=[],
|
||||
step_exception=None,
|
||||
)
|
||||
return step, detailed_agent_step_output
|
||||
|
||||
# Execute the actions
|
||||
@@ -1268,44 +1259,86 @@ class ForgeAgent:
|
||||
incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
|
||||
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
||||
)
|
||||
|
||||
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
|
||||
if not computer_calls:
|
||||
return [], previous_response
|
||||
|
||||
if not scraped_page.screenshots:
|
||||
return [], previous_response
|
||||
|
||||
last_call_id = computer_calls[-1].call_id
|
||||
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
|
||||
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
|
||||
reasonings = [item for item in previous_response.output if item.type == "reasoning"]
|
||||
assistant_messages = [
|
||||
item for item in previous_response.output if item.type == "message" and item.role == "assistant"
|
||||
]
|
||||
last_call_id = None
|
||||
if computer_calls:
|
||||
last_call_id = computer_calls[-1].call_id
|
||||
|
||||
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||
model="computer-use-preview",
|
||||
previous_response_id=previous_response.id,
|
||||
tools=[
|
||||
{
|
||||
"type": "computer_use_preview",
|
||||
"display_width": settings.BROWSER_WIDTH,
|
||||
"display_height": settings.BROWSER_HEIGHT,
|
||||
"environment": "browser",
|
||||
}
|
||||
],
|
||||
input=[
|
||||
{
|
||||
"call_id": last_call_id,
|
||||
"type": "computer_call_output",
|
||||
"output": {
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/png;base64,{screenshot_base64}",
|
||||
},
|
||||
}
|
||||
],
|
||||
reasoning={
|
||||
"generate_summary": "concise",
|
||||
},
|
||||
truncation="auto",
|
||||
temperature=0,
|
||||
)
|
||||
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
|
||||
if last_call_id is None:
|
||||
# try to address the conversation with the context we have
|
||||
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
||||
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
||||
skyvern_repsonse_prompt = load_prompt_with_elements(
|
||||
scraped_page=scraped_page,
|
||||
prompt_engine=prompt_engine,
|
||||
template_name="cua-answer-question",
|
||||
navigation_goal=task.navigation_goal,
|
||||
assistant_reasoning=reasoning,
|
||||
assistant_message=assistant_message,
|
||||
)
|
||||
skyvern_response = await app.LLM_API_HANDLER(
|
||||
prompt=skyvern_repsonse_prompt,
|
||||
prompt_name="cua-answer-question",
|
||||
step=step,
|
||||
screenshots=scraped_page.screenshots,
|
||||
)
|
||||
resp_content = skyvern_response.get("answer")
|
||||
if not resp_content:
|
||||
resp_content = "I don't know. Can you help me make the best decision to achieve the goal?"
|
||||
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||
model="computer-use-preview",
|
||||
previous_response_id=previous_response.id,
|
||||
tools=[
|
||||
{
|
||||
"type": "computer_use_preview",
|
||||
"display_width": settings.BROWSER_WIDTH,
|
||||
"display_height": settings.BROWSER_HEIGHT,
|
||||
"environment": "browser",
|
||||
}
|
||||
],
|
||||
input=[
|
||||
{"role": "user", "content": resp_content},
|
||||
],
|
||||
reasoning={"generate_summary": "concise"},
|
||||
truncation="auto",
|
||||
temperature=0,
|
||||
)
|
||||
else:
|
||||
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||
model="computer-use-preview",
|
||||
previous_response_id=previous_response.id,
|
||||
tools=[
|
||||
{
|
||||
"type": "computer_use_preview",
|
||||
"display_width": settings.BROWSER_WIDTH,
|
||||
"display_height": settings.BROWSER_HEIGHT,
|
||||
"environment": "browser",
|
||||
}
|
||||
],
|
||||
input=[
|
||||
{
|
||||
"call_id": last_call_id,
|
||||
"type": "computer_call_output",
|
||||
"output": {
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/png;base64,{screenshot_base64}",
|
||||
},
|
||||
}
|
||||
],
|
||||
reasoning={
|
||||
"generate_summary": "concise",
|
||||
},
|
||||
truncation="auto",
|
||||
temperature=0,
|
||||
)
|
||||
input_tokens = current_response.usage.input_tokens or 0
|
||||
output_tokens = current_response.usage.output_tokens or 0
|
||||
current_response.usage.total_tokens or 0
|
||||
|
||||
19
skyvern/forge/prompts/skyvern/cua-answer-question.j2
Normal file
19
skyvern/forge/prompts/skyvern/cua-answer-question.j2
Normal file
@@ -0,0 +1,19 @@
|
||||
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
|
||||
|
||||
According to the AI assistant's feedback, including reasoning and its message, there might be a question to answer or a decision to be made.
|
||||
|
||||
Help answer the question or make the decision based on the information provided. Try your best to help the user achieve the goal.
|
||||
|
||||
Reply in JSON format with the following keys:
|
||||
{
|
||||
"question_or_decision": str // What is the question to answer or what is the decision to make?
|
||||
"thought": str // Think step by step. What kind of feedback can best help the AI assistant move forward and achieve the goal?
|
||||
"enough_information": bool // Whether the information provided is enough to answer the question or make the decision.
|
||||
"answer": str // The answer to the question or the decision to make. Give the answer as if you're talking to the assistant.
|
||||
}
|
||||
|
||||
User goal: {{ navigation_goal }}
|
||||
|
||||
Assistant reasoning: {{ assistant_reasoning }}
|
||||
|
||||
Assistant message: {{ assistant_message }}
|
||||
@@ -1,4 +1,4 @@
|
||||
The user is trying to achieve a goal in the browser assisted by an browser AI assistant.
|
||||
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
|
||||
|
||||
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
|
||||
|
||||
|
||||
@@ -317,6 +317,11 @@ async def parse_cua_actions(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
case "screenshot":
|
||||
return NullAction(
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported action type: {action_type}")
|
||||
action.organization_id = task.organization_id
|
||||
|
||||
Reference in New Issue
Block a user