From 5b33f5e2210e8afdab1e27c73d840a567f7c9cf3 Mon Sep 17 00:00:00 2001
From: Shuchang Zheng <wintonzheng0325@gmail.com>
Date: Fri, 10 Jan 2025 00:39:13 -0800
Subject: [PATCH] observer completion improvement (#1529)

---
 skyvern/forge/prompts/skyvern/observer.j2        |  1 +
 .../prompts/skyvern/observer_check_completion.j2 |  5 +++--
 skyvern/forge/sdk/services/observer_service.py   | 16 ++++++++++++++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/skyvern/forge/prompts/skyvern/observer.j2 b/skyvern/forge/prompts/skyvern/observer.j2
index c9868031..0c81102a 100644
--- a/skyvern/forge/prompts/skyvern/observer.j2
+++ b/skyvern/forge/prompts/skyvern/observer.j2
@@ -14,6 +14,7 @@ Reply in JSON format with the following keys:
   "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
   "extraction_thought": str, // Think step by step. Should any information be extracted given the user goal? If yes, has all the information been extracted? If the user is searching for something, looking for information or specifically trying to extract information along side the goal, consider it an intention to extract information. Phrases like "find something", "show me something", "search something" and so on indicate the intention to extract information.
   "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
+  "current_information": str, // List the information extracted so far. Think about what information has been collected that's helpful and relevant to the user goal, and what information is missing, if any.
   "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted. If task history has no "extract" type, that means no data extraction has happened, return false. Null if the user goal does not require information extraction.
   "thoughts": str, // Think step by step. What has been done so far and what is the next reasonable mini goal a human can do foreseeably move towards the overall goal.
   "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved.
diff --git a/skyvern/forge/prompts/skyvern/observer_check_completion.j2 b/skyvern/forge/prompts/skyvern/observer_check_completion.j2
index 6a5bb5da..d3632dd5 100644
--- a/skyvern/forge/prompts/skyvern/observer_check_completion.j2
+++ b/skyvern/forge/prompts/skyvern/observer_check_completion.j2
@@ -1,13 +1,14 @@
-You're to assist the user to achieve the user goal in the web, given the user goal and the mini tasks that have been completed by the user along the way.
+You're to assist the user to achieve the user goal in the web. Given the user goal, the latest screenshot of the page and the mini tasks that have been completed by the user along the way, help decide whether the user goal has been achieved.
 
 Reply in JSON format with the following keys:
 {
   "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
   "extraction_thought": str, // Think step by step. Should any information be extracted given the user goal? If yes, has all the information been extracted? If the user is searching for something, looking for information or specifically trying to extract information along side the goal, consider it an intention to extract information. Phrases like "find something", "show me something", "search something" and so on indicate the intention to extract information.
   "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
+  "current_information": str, // List the information extracted so far. Think about what information has been collected that's helpful and relevant to the user goal, and what information is missing, if any.
   "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted. Null if the user goal does not require information extraction.
   "thoughts": str, // Think step by step. Would completing the tasks in the task history be good enough to achieve the user goal? If more tasks need to be completed to achieve the goal, what would be the next task?
-  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved.
+  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved. If info extraction is not required, use the task history, assisted by the screenshot, to decide if the user goal has been achieved.
 }
 
 User goal:
diff --git a/skyvern/forge/sdk/services/observer_service.py b/skyvern/forge/sdk/services/observer_service.py
index 75ddbb49..9eeb607a 100644
--- a/skyvern/forge/sdk/services/observer_service.py
+++ b/skyvern/forge/sdk/services/observer_service.py
@@ -517,7 +517,19 @@ async def run_observer_cruise_helper(
                 status=workflow_run.status,
             )
             break
-        if block_result.success is True and i == max_iterations - 1:
+        if block_result.success is True:
+            screenshots = []
+            try:
+                scraped_page = await scrape_website(
+                    browser_state,
+                    url,
+                    app.AGENT_FUNCTION.cleanup_element_tree_factory(),
+                    scrape_exclude=app.scrape_exclude,
+                )
+                screenshots = scraped_page.screenshots
+            except Exception:
+                LOG.warning("Failed to scrape the website for observer completion check")
+
             # validate completion only happens at the last iteration
             observer_completion_prompt = prompt_engine.load_prompt(
                 "observer_check_completion",
@@ -536,9 +548,9 @@ async def run_observer_cruise_helper(
             )
             completion_resp = await app.LLM_API_HANDLER(
                 prompt=observer_completion_prompt,
+                screenshots=screenshots,
                 observer_cruise=observer_thought,
             )
-            await _record_thought_screenshot(observer_thought=observer_thought, workflow_run_id=workflow_run_id)
             LOG.info(
                 "Observer completion check response",
                 completion_resp=completion_resp,