diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 3a85cf22..a4b208e9 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -879,15 +879,6 @@ class ForgeAgent: status=StepStatus.failed, output=detailed_agent_step_output.to_agent_step_output(), ) - detailed_agent_step_output = DetailedAgentStepOutput( - scraped_page=scraped_page, - extract_action_prompt=extract_action_prompt, - llm_response=json_response, - actions=actions, - action_results=[], - actions_and_results=[], - step_exception=None, - ) return step, detailed_agent_step_output # Execute the actions @@ -1268,44 +1259,86 @@ class ForgeAgent: incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None, incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None, ) - - computer_calls = [item for item in previous_response.output if item.type == "computer_call"] - if not computer_calls: - return [], previous_response - if not scraped_page.screenshots: return [], previous_response - last_call_id = computer_calls[-1].call_id - screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8") + computer_calls = [item for item in previous_response.output if item.type == "computer_call"] + reasonings = [item for item in previous_response.output if item.type == "reasoning"] + assistant_messages = [ + item for item in previous_response.output if item.type == "message" and item.role == "assistant" + ] + last_call_id = None + if computer_calls: + last_call_id = computer_calls[-1].call_id - current_response = await app.OPENAI_CLIENT.responses.create( - model="computer-use-preview", - previous_response_id=previous_response.id, - tools=[ - { - "type": "computer_use_preview", - "display_width": settings.BROWSER_WIDTH, - "display_height": settings.BROWSER_HEIGHT, - "environment": "browser", - } - ], - input=[ - { - "call_id": last_call_id, - "type": "computer_call_output", - "output": { - "type": "input_image", - "image_url": 
f"data:image/png;base64,{screenshot_base64}", - }, - } - ], - reasoning={ - "generate_summary": "concise", - }, - truncation="auto", - temperature=0, - ) + screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8") + if last_call_id is None: + # try to address the conversation with the context we have + reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None + assistant_message = assistant_messages[0].content[0].text if assistant_messages else None + skyvern_response_prompt = load_prompt_with_elements( + scraped_page=scraped_page, + prompt_engine=prompt_engine, + template_name="cua-answer-question", + navigation_goal=task.navigation_goal, + assistant_reasoning=reasoning, + assistant_message=assistant_message, + ) + skyvern_response = await app.LLM_API_HANDLER( + prompt=skyvern_response_prompt, + prompt_name="cua-answer-question", + step=step, + screenshots=scraped_page.screenshots, + ) + resp_content = skyvern_response.get("answer") + if not resp_content: + resp_content = "I don't know. Can you help me make the best decision to achieve the goal?" 
+ current_response = await app.OPENAI_CLIENT.responses.create( + model="computer-use-preview", + previous_response_id=previous_response.id, + tools=[ + { + "type": "computer_use_preview", + "display_width": settings.BROWSER_WIDTH, + "display_height": settings.BROWSER_HEIGHT, + "environment": "browser", + } + ], + input=[ + {"role": "user", "content": resp_content}, + ], + reasoning={"generate_summary": "concise"}, + truncation="auto", + temperature=0, + ) + else: + current_response = await app.OPENAI_CLIENT.responses.create( + model="computer-use-preview", + previous_response_id=previous_response.id, + tools=[ + { + "type": "computer_use_preview", + "display_width": settings.BROWSER_WIDTH, + "display_height": settings.BROWSER_HEIGHT, + "environment": "browser", + } + ], + input=[ + { + "call_id": last_call_id, + "type": "computer_call_output", + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + }, + } + ], + reasoning={ + "generate_summary": "concise", + }, + truncation="auto", + temperature=0, + ) input_tokens = current_response.usage.input_tokens or 0 output_tokens = current_response.usage.output_tokens or 0 current_response.usage.total_tokens or 0 diff --git a/skyvern/forge/prompts/skyvern/cua-answer-question.j2 b/skyvern/forge/prompts/skyvern/cua-answer-question.j2 new file mode 100644 index 00000000..f5208771 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/cua-answer-question.j2 @@ -0,0 +1,19 @@ +The user is trying to achieve a goal in the browser assisted by a browser AI assistant. + +According to the AI assistant's feedback, including reasoning and its message, there might be a question to answer or a decision to be made. + +Help answer the question or make the decision based on the information provided. Try your best to help the user achieve the goal. + +Reply in JSON format with the following keys: +{ + "question_or_decision": str // What is the question to answer or what is the decision to make? 
+ "thought": str // Think step by step. What kind of feedback can best help the AI assistant move forward and achieve the goal? + "enough_information": bool // Whether the information provided is enough to answer the question or make the decision. + "answer": str // The answer to the question or the decision to make. Give the answer as if you're talking to the assistant. +} + +User goal: {{ navigation_goal }} + +Assistant reasoning: {{ assistant_reasoning }} + +Assistant message: {{ assistant_message }} \ No newline at end of file diff --git a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 index 1e7dcf25..a6b13e27 100644 --- a/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 +++ b/skyvern/forge/prompts/skyvern/cua-fallback-action.j2 @@ -1,4 +1,4 @@ -The user is trying to achieve a goal in the browser assisted by an browser AI assistant. +The user is trying to achieve a goal in the browser assisted by a browser AI assistant. According to the AI assistant's feedback, including reasoning and its message, there's no immediate action the assistant can take in the website. diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index ff75f181..1e8d5276 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -317,6 +317,11 @@ async def parse_cua_actions( reasoning=reasoning, intention=reasoning, ) + case "screenshot": + return NullAction( + reasoning=reasoning, + intention=reasoning, + ) case _: raise ValueError(f"Unsupported action type: {action_type}") action.organization_id = task.organization_id