CUA feature - skyvern answers CUA questions and makes decision if being asked to (#2163)

This commit is contained in:
Shuchang Zheng
2025-04-16 00:56:50 -07:00
committed by GitHub
parent 1386ac4780
commit dd5c0f2e9d
4 changed files with 101 additions and 44 deletions

View File

@@ -879,15 +879,6 @@ class ForgeAgent:
status=StepStatus.failed,
output=detailed_agent_step_output.to_agent_step_output(),
)
detailed_agent_step_output = DetailedAgentStepOutput(
scraped_page=scraped_page,
extract_action_prompt=extract_action_prompt,
llm_response=json_response,
actions=actions,
action_results=[],
actions_and_results=[],
step_exception=None,
)
return step, detailed_agent_step_output
# Execute the actions
@@ -1268,44 +1259,86 @@ class ForgeAgent:
incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
)
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
if not computer_calls:
return [], previous_response
if not scraped_page.screenshots:
return [], previous_response
last_call_id = computer_calls[-1].call_id
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
reasonings = [item for item in previous_response.output if item.type == "reasoning"]
assistant_messages = [
item for item in previous_response.output if item.type == "message" and item.role == "assistant"
]
last_call_id = None
if computer_calls:
last_call_id = computer_calls[-1].call_id
current_response = await app.OPENAI_CLIENT.responses.create(
model="computer-use-preview",
previous_response_id=previous_response.id,
tools=[
{
"type": "computer_use_preview",
"display_width": settings.BROWSER_WIDTH,
"display_height": settings.BROWSER_HEIGHT,
"environment": "browser",
}
],
input=[
{
"call_id": last_call_id,
"type": "computer_call_output",
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshot_base64}",
},
}
],
reasoning={
"generate_summary": "concise",
},
truncation="auto",
temperature=0,
)
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
if last_call_id is None:
# try to address the conversation with the context we have
reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
skyvern_repsonse_prompt = load_prompt_with_elements(
scraped_page=scraped_page,
prompt_engine=prompt_engine,
template_name="cua-answer-question",
navigation_goal=task.navigation_goal,
assistant_reasoning=reasoning,
assistant_message=assistant_message,
)
skyvern_response = await app.LLM_API_HANDLER(
prompt=skyvern_repsonse_prompt,
prompt_name="cua-answer-question",
step=step,
screenshots=scraped_page.screenshots,
)
resp_content = skyvern_response.get("answer")
if not resp_content:
resp_content = "I don't know. Can you help me make the best decision to achieve the goal?"
current_response = await app.OPENAI_CLIENT.responses.create(
model="computer-use-preview",
previous_response_id=previous_response.id,
tools=[
{
"type": "computer_use_preview",
"display_width": settings.BROWSER_WIDTH,
"display_height": settings.BROWSER_HEIGHT,
"environment": "browser",
}
],
input=[
{"role": "user", "content": resp_content},
],
reasoning={"generate_summary": "concise"},
truncation="auto",
temperature=0,
)
else:
current_response = await app.OPENAI_CLIENT.responses.create(
model="computer-use-preview",
previous_response_id=previous_response.id,
tools=[
{
"type": "computer_use_preview",
"display_width": settings.BROWSER_WIDTH,
"display_height": settings.BROWSER_HEIGHT,
"environment": "browser",
}
],
input=[
{
"call_id": last_call_id,
"type": "computer_call_output",
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshot_base64}",
},
}
],
reasoning={
"generate_summary": "concise",
},
truncation="auto",
temperature=0,
)
input_tokens = current_response.usage.input_tokens or 0
output_tokens = current_response.usage.output_tokens or 0
current_response.usage.total_tokens or 0

View File

@@ -0,0 +1,19 @@
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
According to the AI assistant's feedback, including reasoning and its message, there might be a question to answer or a decision to be made.
Help answer the question or make the decision based on the information provided. Try your best to help the user achieve the goal.
Reply in JSON format with the following keys:
{
"question_or_decision": str // What is the question to answer or what is the decision to make?
"thought": str // Think step by step. What kind of feedback can best help the AI assistant move forward and achieve the goal?
"enough_information": bool // Whether the information provided is enough to answer the question or make the decision.
"answer": str // The answer to the question or the decision to make. Give the answer as if you're talking to the assistant.
}
User goal: {{ navigation_goal }}
Assistant reasoning: {{ assistant_reasoning }}
Assistant message: {{ assistant_message }}

View File

@@ -1,4 +1,4 @@
The user is trying to achieve a goal in the browser assisted by an browser AI assistant.
The user is trying to achieve a goal in the browser assisted by a browser AI assistant.
According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website.

View File

@@ -317,6 +317,11 @@ async def parse_cua_actions(
reasoning=reasoning,
intention=reasoning,
)
case "screenshot":
return NullAction(
reasoning=reasoning,
intention=reasoning,
)
case _:
raise ValueError(f"Unsupported action type: {action_type}")
action.organization_id = task.organization_id