observer completion improvement (#1529)

2025-01-10 00:39:13 -08:00
parent a4744ed9f5
commit 5b33f5e221
3 changed files with 18 additions and 4 deletions
--- a/skyvern/forge/prompts/skyvern/observer.j2
+++ b/skyvern/forge/prompts/skyvern/observer.j2
@@ -14,6 +14,7 @@ Reply in JSON format with the following keys:
  "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
  "extraction_thought": str, // Think step by step. Should any information be extracted given the user goal? If yes, has all the information been extracted? If the user is searching for something, looking for information or specifically trying to extract information along side the goal, consider it an intention to extract information. Phrases like "find something", "show me something", "search something" and so on indicate the intention to extract information.
  "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
+  "current_information": str, // List the information extracted so far. Think what information has been collected that's helpful and relevant to the user goal, and what information is missing if any.
  "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted. If task history has no "extract" type, that means no data extraction has happened, return false. Null if the user goal does not require information extraction.
  "thoughts": str, // Think step by step. What has been done so far and what is the next reasonable mini goal a human can do foreseeably move towards the overall goal.
  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved.
--- a/skyvern/forge/prompts/skyvern/observer_check_completion.j2
+++ b/skyvern/forge/prompts/skyvern/observer_check_completion.j2
@@ -1,13 +1,14 @@
-You're to assist the user to achieve the user goal in the web, given the user goal and the mini tasks that have been completed by the user along the way.
+You're to assist the user to achieve the user goal in the web. Given the user goal, the latest screenshot of the page and the mini tasks that have been completed by the user along the way, help decide whether the user goal has been achieved.

 Reply in JSON format with the following keys:
 {
  "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
  "extraction_thought": str, // Think step by step. Should any information be extracted given the user goal? If yes, has all the information been extracted? If the user is searching for something, looking for information or specifically trying to extract information along side the goal, consider it an intention to extract information. Phrases like "find something", "show me something", "search something" and so on indicate the intention to extract information.
  "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
+  "current_information": str, // List the information extracted so far. Think what information has been collected that's helpful and relevant to the user goal, and what information is missing if any.
  "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted. Null if the user goal does not require information extraction.
  "thoughts": str, // Think step by step. Would completing the tasks in the task history be good enough to achieve the user goal? If more tasks need to be completed to achieve the goal, what would be the next task?
-  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved.
+  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved. If info extraction is not required, use the task history, assisted by the screenshot, to decide if the user goal has been achieved.
 }

 User goal:
--- a/skyvern/forge/sdk/services/observer_service.py
+++ b/skyvern/forge/sdk/services/observer_service.py
@@ -517,7 +517,19 @@ async def run_observer_cruise_helper(
                status=workflow_run.status,
            )
            break
-        if block_result.success is True and i == max_iterations - 1:
+        if block_result.success is True:
+            screenshots = []
+            try:
+                scraped_page = await scrape_website(
+                    browser_state,
+                    url,
+                    app.AGENT_FUNCTION.cleanup_element_tree_factory(),
+                    scrape_exclude=app.scrape_exclude,
+                )
+                screenshots = scraped_page.screenshots
+            except Exception:
+                LOG.warning("Failed to scrape the website for observer completion check")
+
            # validate completion only happens at the last iteration
            observer_completion_prompt = prompt_engine.load_prompt(
                "observer_check_completion",
@@ -536,9 +548,9 @@ async def run_observer_cruise_helper(
            )
            completion_resp = await app.LLM_API_HANDLER(
                prompt=observer_completion_prompt,
+                screenshots=screenshots,
                observer_cruise=observer_thought,
            )
-            await _record_thought_screenshot(observer_thought=observer_thought, workflow_run_id=workflow_run_id)
            LOG.info(
                "Observer completion check response",
                completion_resp=completion_resp,