Add complete action verification (#845)

2024-09-17 18:59:40 -07:00
parent 20154020dd
commit d19ff2bd69
3 changed files with 115 additions and 1 deletions
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -502,3 +502,9 @@ class FailToFindAutocompleteOption(SkyvernException):
        super().__init__(
            f"Can't find a suitable auto completion for the current value, maybe retry with another reasonable value. current_value={current_value}"
        )
 class IllegitComplete(SkyvernException):
    def __init__(self, data: dict | None = None) -> None:
        data_str = f", data={data}" if data else ""
        super().__init__(f"Illegit complete{data_str}")
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -50,7 +50,7 @@ from skyvern.webeye.actions.actions import (
    WebAction,
    parse_actions,
 )
-from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code
+from skyvern.webeye.actions.handler import ActionHandler, handle_complete_action, poll_verification_code
 from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
 from skyvern.webeye.actions.responses import ActionResult
 from skyvern.webeye.browser_factory import BrowserState
@@ -773,6 +773,36 @@ class ForgeAgent:
                step_retry=step.retry_index,
                action_results=action_results,
            )
            if app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
                "CHECK_USER_GOAL_SUCCESS_EVERY_STEP",
                task.workflow_run_id or task.task_id,
                properties={
                    "organization_id": task.organization_id,
                    "organization_created_at": str(organization.created_at) if organization else None,
                },
            ):
                LOG.info("Checking if user goal is achieved after re-scraping the page")
                # Check if navigation goal is achieved after re-scraping the page
                new_scraped_page = await self._scrape_with_type(
                    task=task,
                    step=step,
                    browser_state=browser_state,
                    scrape_type=ScrapeType.NORMAL,
                    organization=organization,
                )
                if new_scraped_page is None:
                    LOG.warning("Failed to scrape the page before checking user goal success, skipping check...")
                else:
                    working_page = await browser_state.get_working_page()
                    result_tuple = await self.check_user_goal_success(
                        page=working_page,
                        scraped_page=new_scraped_page,
                        task=task,
                        step=step,
                    )
                    if result_tuple is not None:
                        complete_action, action_results = result_tuple
                        detailed_agent_step_output.actions_and_results.append((complete_action, action_results))
            # If no action errors return the agent state and output
            completed_step = await self.update_step(
                step=step,
@@ -811,6 +841,55 @@ class ForgeAgent:
            )
            return failed_step, detailed_agent_step_output.get_clean_detailed_output()
    @staticmethod
    async def check_user_goal_success(
        page: Page, scraped_page: ScrapedPage, task: Task, step: Step
    ) -> tuple[CompleteAction, list[ActionResult]] | None:
        try:
            # Check if Skyvern already returned a complete action, if so, don't run verification
            if step.output and step.output.actions_and_results:
                for action, results in step.output.actions_and_results:
                    if isinstance(action, CompleteAction):
                        return None
            verification_prompt = prompt_engine.load_prompt(
                "check-user-goal",
                navigation_goal=task.navigation_goal,
                navigation_payload=task.navigation_payload,
                elements=scraped_page.build_element_tree(ElementTreeFormat.HTML),
            )
            screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)
            verification_llm_api_handler = app.SECONDARY_LLM_API_HANDLER
            verification_response = await verification_llm_api_handler(
                prompt=verification_prompt, step=step, screenshots=screenshots
            )
            if "user_goal_achieved" not in verification_response or "reasoning" not in verification_response:
                LOG.error(
                    "Invalid LLM response for user goal success verification, skipping verification",
                    verification_response=verification_response,
                )
                return None
            user_goal_achieved: bool = verification_response["user_goal_achieved"]
            complete_action = CompleteAction(
                reasoning=verification_response["reasoning"],
                data_extraction_goal=task.data_extraction_goal,
            )
            # We don't want to return a complete action if the user goal is not achieved since we're checking at every step
            if not user_goal_achieved:
                return None
            LOG.info("User goal achieved, executing complete action")
            action_results = await handle_complete_action(complete_action, page, scraped_page, task, step)
            return complete_action, action_results
        except Exception:
            LOG.error("LLM verification failed for complete action, skipping LLM verification", exc_info=True)
            return None
    async def record_artifacts_after_action(self, task: Task, step: Step, browser_state: BrowserState) -> None:
        working_page = await browser_state.get_working_page()
        if not working_page:
--- a/skyvern/forge/prompts/skyvern/check-user-goal.j2
+++ b/skyvern/forge/prompts/skyvern/check-user-goal.j2
@@ -0,0 +1,29 @@
 Based on the content of the screenshot and the elements on the page, determine whether the user goal has been successfully completed or not.
 The JSON object should be in this format:
 ```json
 {
  "reasoning": str, // Describe the state of the user goal and explain why it has been completed or not completed.
  "user_goal_achieved": bool // True if the user goal has been completed, False otherwise.
 }
 Make sure to ONLY return the JSON object, with no additional text before or after it. Do not make any assumptions based on the screenshot, return a response solely based on what you observe in the screenshot and nothing else.
 Examples:
 {
  "reasoning": "The screenshot shows a success message for a file upload field. Since the user's goal is to upload a file, it has been successfully completed.",
  "user_goal_achieved": true
 }
 {
  "reasoning": "The screenshot shows a job application form with fields. Since the user's goal is to submit a job application, it has not been successfully completed.",
  "user_goal_achieved": false
 }
 Elements on the page:
 {{ elements }}
 User Goal:
 {{ navigation_goal }}
 User Details:
 {{ navigation_payload }}