introduce complete verification (#1201)

2024-11-15 14:03:01 +08:00
parent 54f793c797
commit e505671230
4 changed files with 91 additions and 36 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -49,6 +49,7 @@ from skyvern.webeye.actions.actions import (
    Action,
    ActionType,
    CompleteAction,
+    CompleteVerifyResult,
    DecisiveAction,
    UserDefinedError,
    WebAction,
@@ -923,57 +924,59 @@ class ForgeAgent:
            )
            return failed_step, detailed_agent_step_output.get_clean_detailed_output()

+    @staticmethod
+    async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
+        LOG.info(
+            "Checking if user goal is achieved after re-scraping the page",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        scraped_page_refreshed = await scraped_page.refresh()
+
+        # TODO: complete verification currently just reuses the "check-user-goal" prompt;
+        # a dedicated completion criterion may need to be designed in the future
+        verification_prompt = prompt_engine.load_prompt(
+            "check-user-goal",
+            navigation_goal=task.navigation_goal,
+            navigation_payload=task.navigation_payload,
+            elements=scraped_page_refreshed.build_element_tree(ElementTreeFormat.HTML),
+        )
+
+        # this prompt is critical to our agent so let's use the primary LLM API handler
+        verification_result = await app.LLM_API_HANDLER(
+            prompt=verification_prompt, step=step, screenshots=scraped_page_refreshed.screenshots
+        )
+        return CompleteVerifyResult.model_validate(verification_result)  # validation errors propagate to the caller
+
    @staticmethod
    async def check_user_goal_complete(
        page: Page, scraped_page: ScrapedPage, task: Task, step: Step
    ) -> CompleteAction | None:
        try:
-            LOG.info(
-                "Checking if user goal is achieved after re-scraping the page without screenshots",
-                task_id=task.task_id,
-                step_id=step.step_id,
-                workflow_run_id=task.workflow_run_id,
-            )
-            scraped_page_refreshed = await scraped_page.refresh()
-
-            verification_prompt = prompt_engine.load_prompt(
-                "check-user-goal",
-                navigation_goal=task.navigation_goal,
-                navigation_payload=task.navigation_payload,
-                elements=scraped_page_refreshed.build_element_tree(ElementTreeFormat.HTML),
+            verification_result = await app.agent.complete_verify(
+                page=page,
+                scraped_page=scraped_page,
+                task=task,
+                step=step,
            )

-            # this prompt is critical to our agent so let's use the primary LLM API handler
-            verification_response = await app.LLM_API_HANDLER(
-                prompt=verification_prompt, step=step, screenshots=scraped_page_refreshed.screenshots
-            )
-            if "user_goal_achieved" not in verification_response or "thoughts" not in verification_response:
-                LOG.error(
-                    "Invalid LLM response for user goal success verification, skipping verification",
-                    verification_response=verification_response,
-                    task_id=task.task_id,
-                    step_id=step.step_id,
-                    workflow_run_id=task.workflow_run_id,
-                )
-                return None
-
-            user_goal_achieved: bool = verification_response["user_goal_achieved"]
            # We don't want to return a complete action if the user goal is not achieved since we're checking at every step
-            if not user_goal_achieved:
+            if not verification_result.user_goal_achieved:
                return None

            return CompleteAction(
-                reasoning=verification_response["thoughts"],
+                reasoning=verification_result.thoughts,
                data_extraction_goal=task.data_extraction_goal,
+                verified=True,
            )

        except Exception:
-            LOG.error(
-                "LLM verification failed for complete action, skipping LLM verification",
+            LOG.exception(
+                "Failed to check user goal complete, skipping",
                task_id=task.task_id,
                step_id=step.step_id,
                workflow_run_id=task.workflow_run_id,
-                exc_info=True,
            )
            return None

--- a/skyvern/forge/prompts/skyvern/check-user-goal.j2
+++ b/skyvern/forge/prompts/skyvern/check-user-goal.j2
@@ -7,12 +7,19 @@ Make sure to ONLY return the JSON object in this format with no additional text
  "thoughts": str, // Think step by step. What information makes you believe whether user goal has completed or not. Use information you see on the site to explain.
  "user_goal_achieved": bool // True if the user goal has been completed, false otherwise.
 }
+```

 Elements on the page:
+```
 {{ elements }}
+```

 User Goal:
+```
 {{ navigation_goal }}
+```

 User Details:
+```
 {{ navigation_payload }}
+```
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -65,6 +65,15 @@ class SelectOption(BaseModel):
        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"


+class CompleteVerifyResult(BaseModel):  # validated LLM answer to the "check-user-goal" prompt
+    user_goal_achieved: bool  # True if the user goal has been completed, False otherwise
+    thoughts: str  # the LLM's step-by-step explanation for its verdict
+    page_info: str | None = None  # optional; purpose not visible in this change -- TODO confirm
+
+    def __repr__(self) -> str:
+        return f"CompleteVerifyResult(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, page_info={self.page_info})"
+
+
 class InputOrSelectContext(BaseModel):
    field: str | None = None
    is_required: bool | None = None
@@ -226,6 +235,7 @@ class TerminateAction(DecisiveAction):

 class CompleteAction(DecisiveAction):
    action_type: ActionType = ActionType.COMPLETE
+    verified: bool = False
    data_extraction_goal: str | None = None


--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1032,9 +1032,14 @@ async def handle_complete_action(
 ) -> list[ActionResult]:
    # If this action has a source_action_id, then we need to make sure if the goal is actually completed.
    if action.source_action_id:
-        LOG.info("CompleteAction has source_action_id, checking if goal is completed")
-        complete_action = await app.agent.check_user_goal_complete(page, scraped_page, task, step)
-        if complete_action is None:
+        LOG.info(
+            "CompleteAction has source_action_id, checking if goal is completed",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        verified_complete_action = await app.agent.check_user_goal_complete(page, scraped_page, task, step)
+        if verified_complete_action is None:
            return [
                ActionFailure(
                    exception=IllegitComplete(
@@ -1044,6 +1049,36 @@ async def handle_complete_action(
                    )
                )
            ]
+        action.verified = True
+
+    if not action.verified:
+        LOG.info(
+            "CompleteAction hasn't been verified, going to verify the user goal",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        try:
+            verification_result = await app.agent.complete_verify(page, scraped_page, task, step)
+        except Exception as e:
+            LOG.exception(
+                "Failed to verify the complete action",
+                task_id=task.task_id,
+                step_id=step.step_id,
+                workflow_run_id=task.workflow_run_id,
+            )
+            return [ActionFailure(exception=e)]
+
+        if not verification_result.user_goal_achieved:
+            return [ActionFailure(exception=IllegitComplete(data={"error": verification_result.thoughts}))]
+
+        LOG.info(
+            "CompleteAction has been verified successfully",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        action.verified = True

    extracted_data = None
    if action.data_extraction_goal: