CUA feature - Skyvern answers CUA questions and makes decisions when asked to (#2163)

2025-04-16 00:56:50 -07:00
parent 1386ac4780
commit dd5c0f2e9d
4 changed files with 101 additions and 44 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -879,15 +879,6 @@ class ForgeAgent:
                    status=StepStatus.failed,
                    output=detailed_agent_step_output.to_agent_step_output(),
                )
                detailed_agent_step_output = DetailedAgentStepOutput(
                    scraped_page=scraped_page,
                    extract_action_prompt=extract_action_prompt,
                    llm_response=json_response,
                    actions=actions,
                    action_results=[],
                    actions_and_results=[],
                    step_exception=None,
                )
                return step, detailed_agent_step_output
            # Execute the actions
@@ -1268,44 +1259,86 @@ class ForgeAgent:
                incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
                incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
            )
        computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
        if not computer_calls:
            return [], previous_response
        if not scraped_page.screenshots:
            return [], previous_response
-        last_call_id = computer_calls[-1].call_id
+        computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
-        screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
+        reasonings = [item for item in previous_response.output if item.type == "reasoning"]
        assistant_messages = [
            item for item in previous_response.output if item.type == "message" and item.role == "assistant"
        ]
        last_call_id = None
        if computer_calls:
            last_call_id = computer_calls[-1].call_id
-        current_response = await app.OPENAI_CLIENT.responses.create(
+        screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
-            model="computer-use-preview",
+        if last_call_id is None:
-            previous_response_id=previous_response.id,
+            # try to address the conversation with the context we have
-            tools=[
+            reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
-                {
+            assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
-                    "type": "computer_use_preview",
+            skyvern_repsonse_prompt = load_prompt_with_elements(
-                    "display_width": settings.BROWSER_WIDTH,
+                scraped_page=scraped_page,
-                    "display_height": settings.BROWSER_HEIGHT,
+                prompt_engine=prompt_engine,
-                    "environment": "browser",
+                template_name="cua-answer-question",
-                }
+                navigation_goal=task.navigation_goal,
-            ],
+                assistant_reasoning=reasoning,
-            input=[
+                assistant_message=assistant_message,
-                {
+            )
-                    "call_id": last_call_id,
+            skyvern_response = await app.LLM_API_HANDLER(
-                    "type": "computer_call_output",
+                prompt=skyvern_repsonse_prompt,
-                    "output": {
+                prompt_name="cua-answer-question",
-                        "type": "input_image",
+                step=step,
-                        "image_url": f"data:image/png;base64,{screenshot_base64}",
+                screenshots=scraped_page.screenshots,
-                    },
+            )
-                }
+            resp_content = skyvern_response.get("answer")
-            ],
+            if not resp_content:
-            reasoning={
+                resp_content = "I don't know. Can you help me make the best decision to achieve the goal?"
-                "generate_summary": "concise",
+            current_response = await app.OPENAI_CLIENT.responses.create(
-            },
+                model="computer-use-preview",
-            truncation="auto",
+                previous_response_id=previous_response.id,
-            temperature=0,
+                tools=[
-        )
+                    {
                        "type": "computer_use_preview",
                        "display_width": settings.BROWSER_WIDTH,
                        "display_height": settings.BROWSER_HEIGHT,
                        "environment": "browser",
                    }
                ],
                input=[
                    {"role": "user", "content": resp_content},
                ],
                reasoning={"generate_summary": "concise"},
                truncation="auto",
                temperature=0,
            )
        else:
            current_response = await app.OPENAI_CLIENT.responses.create(
                model="computer-use-preview",
                previous_response_id=previous_response.id,
                tools=[
                    {
                        "type": "computer_use_preview",
                        "display_width": settings.BROWSER_WIDTH,
                        "display_height": settings.BROWSER_HEIGHT,
                        "environment": "browser",
                    }
                ],
                input=[
                    {
                        "call_id": last_call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{screenshot_base64}",
                        },
                    }
                ],
                reasoning={
                    "generate_summary": "concise",
                },
                truncation="auto",
                temperature=0,
            )
        input_tokens = current_response.usage.input_tokens or 0
        output_tokens = current_response.usage.output_tokens or 0
        current_response.usage.total_tokens or 0
--- a/skyvern/forge/prompts/skyvern/cua-answer-question.j2
+++ b/skyvern/forge/prompts/skyvern/cua-answer-question.j2
@@ -0,0 +1,19 @@
 The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
 According to the AI assistant's feedback, including reasoning and its message, there might be a question to answer or a decision to be made.
 Help answer the question or make the decision based on the information provided. Try your best to help the user achieve the goal.
 Reply in JSON format with the following keys:
 {
    "question_or_decision": str // What is the question to answer or what is the decision to make?
    "thought": str // Think step by step. What kind of feedback can best help the AI assistant move forward and achieve the goal?
    "enough_information": bool // Whether the information provided is enough to answer the question or make the decision.
    "answer": str // The answer to the question or the decision to make. Give the answer as if you're talking to the assistant.
 }
 User goal: {{ navigation_goal }}
 Assistant reasoning: {{ assistant_reasoning }}
 Assistant message: {{ assistant_message }}
--- a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
+++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2
@@ -1,4 +1,4 @@
-The user is trying to achieve a goal in the browser assisted by an browser AI assistant.
+The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
 According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -317,6 +317,11 @@ async def parse_cua_actions(
                            reasoning=reasoning,
                            intention=reasoning,
                        )
                case "screenshot":
                    return NullAction(
                        reasoning=reasoning,
                        intention=reasoning,
                    )
                case _:
                    raise ValueError(f"Unsupported action type: {action_type}")
            action.organization_id = task.organization_id
`@@ -1,4 +1,4 @@`
	`The user is trying to achieve a goal in the browser assisted by an browser AI assistant.`	`The user is trying to achieve a goal in the browser assisted by a browser AI assistant.`

	`According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.`	`According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.`