CUA feature - Skyvern answers CUA questions and makes decisions when asked to (#2163)
This commit is contained in:
@@ -879,15 +879,6 @@ class ForgeAgent:
|
|||||||
status=StepStatus.failed,
|
status=StepStatus.failed,
|
||||||
output=detailed_agent_step_output.to_agent_step_output(),
|
output=detailed_agent_step_output.to_agent_step_output(),
|
||||||
)
|
)
|
||||||
detailed_agent_step_output = DetailedAgentStepOutput(
|
|
||||||
scraped_page=scraped_page,
|
|
||||||
extract_action_prompt=extract_action_prompt,
|
|
||||||
llm_response=json_response,
|
|
||||||
actions=actions,
|
|
||||||
action_results=[],
|
|
||||||
actions_and_results=[],
|
|
||||||
step_exception=None,
|
|
||||||
)
|
|
||||||
return step, detailed_agent_step_output
|
return step, detailed_agent_step_output
|
||||||
|
|
||||||
# Execute the actions
|
# Execute the actions
|
||||||
@@ -1268,44 +1259,86 @@ class ForgeAgent:
|
|||||||
incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
|
incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
|
||||||
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
|
|
||||||
if not computer_calls:
|
|
||||||
return [], previous_response
|
|
||||||
|
|
||||||
if not scraped_page.screenshots:
|
if not scraped_page.screenshots:
|
||||||
return [], previous_response
|
return [], previous_response
|
||||||
|
|
||||||
last_call_id = computer_calls[-1].call_id
|
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
|
||||||
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
|
reasonings = [item for item in previous_response.output if item.type == "reasoning"]
|
||||||
|
assistant_messages = [
|
||||||
|
item for item in previous_response.output if item.type == "message" and item.role == "assistant"
|
||||||
|
]
|
||||||
|
last_call_id = None
|
||||||
|
if computer_calls:
|
||||||
|
last_call_id = computer_calls[-1].call_id
|
||||||
|
|
||||||
current_response = await app.OPENAI_CLIENT.responses.create(
|
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
|
||||||
model="computer-use-preview",
|
if last_call_id is None:
|
||||||
previous_response_id=previous_response.id,
|
# try to address the conversation with the context we have
|
||||||
tools=[
|
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
|
||||||
{
|
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
|
||||||
"type": "computer_use_preview",
|
skyvern_repsonse_prompt = load_prompt_with_elements(
|
||||||
"display_width": settings.BROWSER_WIDTH,
|
scraped_page=scraped_page,
|
||||||
"display_height": settings.BROWSER_HEIGHT,
|
prompt_engine=prompt_engine,
|
||||||
"environment": "browser",
|
template_name="cua-answer-question",
|
||||||
}
|
navigation_goal=task.navigation_goal,
|
||||||
],
|
assistant_reasoning=reasoning,
|
||||||
input=[
|
assistant_message=assistant_message,
|
||||||
{
|
)
|
||||||
"call_id": last_call_id,
|
skyvern_response = await app.LLM_API_HANDLER(
|
||||||
"type": "computer_call_output",
|
prompt=skyvern_repsonse_prompt,
|
||||||
"output": {
|
prompt_name="cua-answer-question",
|
||||||
"type": "input_image",
|
step=step,
|
||||||
"image_url": f"data:image/png;base64,{screenshot_base64}",
|
screenshots=scraped_page.screenshots,
|
||||||
},
|
)
|
||||||
}
|
resp_content = skyvern_response.get("answer")
|
||||||
],
|
if not resp_content:
|
||||||
reasoning={
|
resp_content = "I don't know. Can you help me make the best decision to achieve the goal?"
|
||||||
"generate_summary": "concise",
|
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||||
},
|
model="computer-use-preview",
|
||||||
truncation="auto",
|
previous_response_id=previous_response.id,
|
||||||
temperature=0,
|
tools=[
|
||||||
)
|
{
|
||||||
|
"type": "computer_use_preview",
|
||||||
|
"display_width": settings.BROWSER_WIDTH,
|
||||||
|
"display_height": settings.BROWSER_HEIGHT,
|
||||||
|
"environment": "browser",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
input=[
|
||||||
|
{"role": "user", "content": resp_content},
|
||||||
|
],
|
||||||
|
reasoning={"generate_summary": "concise"},
|
||||||
|
truncation="auto",
|
||||||
|
temperature=0,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||||
|
model="computer-use-preview",
|
||||||
|
previous_response_id=previous_response.id,
|
||||||
|
tools=[
|
||||||
|
{
|
||||||
|
"type": "computer_use_preview",
|
||||||
|
"display_width": settings.BROWSER_WIDTH,
|
||||||
|
"display_height": settings.BROWSER_HEIGHT,
|
||||||
|
"environment": "browser",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
input=[
|
||||||
|
{
|
||||||
|
"call_id": last_call_id,
|
||||||
|
"type": "computer_call_output",
|
||||||
|
"output": {
|
||||||
|
"type": "input_image",
|
||||||
|
"image_url": f"data:image/png;base64,{screenshot_base64}",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
reasoning={
|
||||||
|
"generate_summary": "concise",
|
||||||
|
},
|
||||||
|
truncation="auto",
|
||||||
|
temperature=0,
|
||||||
|
)
|
||||||
input_tokens = current_response.usage.input_tokens or 0
|
input_tokens = current_response.usage.input_tokens or 0
|
||||||
output_tokens = current_response.usage.output_tokens or 0
|
output_tokens = current_response.usage.output_tokens or 0
|
||||||
current_response.usage.total_tokens or 0
|
current_response.usage.total_tokens or 0
|
||||||
|
|||||||
19
skyvern/forge/prompts/skyvern/cua-answer-question.j2
Normal file
19
skyvern/forge/prompts/skyvern/cua-answer-question.j2
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
|
||||||
|
|
||||||
|
According to the AI assistant's feedback, including reasoning and its message, there might be a question to answer or a decision to be made.
|
||||||
|
|
||||||
|
Help answer the question or make the decision based on the information provided. Try your best to help the user achieve the goal.
|
||||||
|
|
||||||
|
Reply in JSON format with the following keys:
|
||||||
|
{
|
||||||
|
"question_or_decision": str // What is the question to answer or what is the decision to make?
|
||||||
|
"thought": str // Think step by step. What kind of feedback can best help the AI assistant move forward and achieve the goal?
|
||||||
|
"enough_information": bool // Whether the information provided is enough to answer the question or make the decision.
|
||||||
|
"answer": str // The answer to the question or the decision to make. Give the answer as if you're talking to the assistant.
|
||||||
|
}
|
||||||
|
|
||||||
|
User goal: {{ navigation_goal }}
|
||||||
|
|
||||||
|
Assistant reasoning: {{ assistant_reasoning }}
|
||||||
|
|
||||||
|
Assistant message: {{ assistant_message }}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
The user is trying to achieve a goal in the browser assisted by an browser AI assistant.
|
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
|
||||||
|
|
||||||
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
|
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
|
||||||
|
|
||||||
|
|||||||
@@ -317,6 +317,11 @@ async def parse_cua_actions(
|
|||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
intention=reasoning,
|
intention=reasoning,
|
||||||
)
|
)
|
||||||
|
case "screenshot":
|
||||||
|
return NullAction(
|
||||||
|
reasoning=reasoning,
|
||||||
|
intention=reasoning,
|
||||||
|
)
|
||||||
case _:
|
case _:
|
||||||
raise ValueError(f"Unsupported action type: {action_type}")
|
raise ValueError(f"Unsupported action type: {action_type}")
|
||||||
action.organization_id = task.organization_id
|
action.organization_id = task.organization_id
|
||||||
|
|||||||
Reference in New Issue
Block a user