Add termination-aware complete verification experiment (SKY-6884) (#3948)

2025-11-07 18:53:51 -08:00
parent ea7361c9f2
commit ca958da6be
4 changed files with 252 additions and 28 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -89,7 +89,12 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, Tas
 from skyvern.forge.sdk.schemas.totp_codes import OTPType
 from skyvern.forge.sdk.trace import TraceManager
 from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
-from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
+from skyvern.forge.sdk.workflow.models.block import (
+    ActionBlock,
+    BaseTaskBlock,
+    FileDownloadBlock,
+    ValidationBlock,
+)
 from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
 from skyvern.schemas.runs import CUA_ENGINES, RunEngine
 from skyvern.schemas.steps import AgentStepOutput
@@ -1328,6 +1333,7 @@ class ForgeAgent:
                        scraped_page=scraped_page,
                        task=task,
                        step=step,
+                        task_block=task_block,
                    )
                    if complete_action is not None:
                        LOG.info("User goal achieved, executing complete action")
@@ -1785,7 +1791,7 @@ class ForgeAgent:
            return None

    async def complete_verify(
-        self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+        self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step, task_block: BaseTaskBlock | None = None
    ) -> CompleteVerifyResult:
        LOG.info(
            "Checking if user goal is achieved after re-scraping the page",
@@ -1803,13 +1809,47 @@ class ForgeAgent:
        if task.include_action_history_in_verification:
            actions_and_results_str = await self._get_action_results(task, current_step=step)

+        # Check if we should use the termination-aware prompt (experiment)
+        # Only enabled for file download blocks
+        use_termination_prompt = False
+        is_file_download_block = task_block is not None and isinstance(task_block, FileDownloadBlock)
+
+        if is_file_download_block:
+            try:
+                distinct_id = task.workflow_run_id if task.workflow_run_id else task.task_id
+                use_termination_prompt = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
+                    "USE_TERMINATION_AWARE_COMPLETE_VERIFICATION",
+                    distinct_id,
+                    properties={"organization_id": task.organization_id},
+                )
+                if use_termination_prompt:
+                    LOG.info(
+                        "Experiment enabled: using termination-aware complete verification prompt for file download block",
+                        task_id=task.task_id,
+                        workflow_run_id=task.workflow_run_id,
+                        organization_id=task.organization_id,
+                        block_type="file_download",
+                    )
+            except Exception as e:
+                LOG.warning(
+                    "Failed to check USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment; using legacy behavior",
+                    task_id=task.task_id,
+                    workflow_run_id=task.workflow_run_id,
+                    error=str(e),
+                )
+
+        # Select the appropriate template based on experiment
+        template_name = "check-user-goal-with-termination" if use_termination_prompt else "check-user-goal"
+        prompt_name = "check-user-goal-with-termination" if use_termination_prompt else "check-user-goal"
+
        verification_prompt = load_prompt_with_elements(
            element_tree_builder=scraped_page_refreshed,
            prompt_engine=prompt_engine,
-            template_name="check-user-goal",
+            template_name=template_name,
            navigation_goal=task.navigation_goal,
            navigation_payload=task.navigation_payload,
            complete_criterion=task.complete_criterion,
+            terminate_criterion=task.terminate_criterion,
            action_history=actions_and_results_str,
            local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
        )
@@ -1855,23 +1895,37 @@ class ForgeAgent:
            prompt=verification_prompt,
            step=step,
            screenshots=scraped_page_refreshed.screenshots,
-            prompt_name="check-user-goal",
+            prompt_name=prompt_name,
        )
        return CompleteVerifyResult.model_validate(verification_result)

    async def check_user_goal_complete(
-        self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
-    ) -> CompleteAction | None:
+        self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step, task_block: BaseTaskBlock | None = None
+    ) -> CompleteAction | TerminateAction | None:
        try:
            verification_result = await self.complete_verify(
                page=page,
                scraped_page=scraped_page,
                task=task,
                step=step,
+                task_block=task_block,
            )

+            # Check if we should terminate instead of complete
+            # Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
+            if verification_result.is_terminate:
+                LOG.warning(
+                    "Periodic verification determined task should terminate (termination-aware experiment)",
+                    workflow_run_id=task.workflow_run_id,
+                    thoughts=verification_result.thoughts,
+                    status=verification_result.status if verification_result.status else "legacy",
+                )
+                return TerminateAction(
+                    reasoning=verification_result.thoughts,
+                )
+
            # We don't want to return a complete action if the user goal is not achieved since we're checking at every step
-            if not verification_result.user_goal_achieved:
+            if not verification_result.is_complete:
                return None

            return CompleteAction(
@@ -3160,6 +3214,7 @@ class ForgeAgent:
                scraped_page=scraped_page,
                task=task,
                step=step,
+                task_block=task_block,
            ),
            name=f"verify_goal_{step.step_id}",
        )
@@ -3187,11 +3242,13 @@ class ForgeAgent:
            complete_action = None

        if complete_action is not None:
-            # Goal achieved! Cancel the pre-scraping task
+            # Goal achieved or should terminate! Cancel the pre-scraping task
+            is_terminate = isinstance(complete_action, TerminateAction)
            LOG.info(
-                "Parallel verification: goal achieved, cancelling pre-scraping",
+                "Parallel verification: goal achieved or termination required, cancelling pre-scraping",
                step_id=step.step_id,
                task_id=task.task_id,
+                is_terminate=is_terminate,
            )
            pre_scrape_task.cancel()
            try:
@@ -3201,22 +3258,73 @@ class ForgeAgent:
            except Exception:
                LOG.debug("Pre-scraping task cleanup failed", step_id=step.step_id, exc_info=True)

-            # Mark task as complete
-            # Note: Step is already marked as completed by agent_step
-            # We don't add the complete action to the step output since the step is already finalized
-            LOG.info(
-                "Parallel verification: goal achieved, marking task as complete",
-                step_id=step.step_id,
-                task_id=task.task_id,
-            )
-            last_step = await self.update_step(step, is_last=True)
-            extracted_information = await self.get_extracted_information_for_task(task)
-            await self.update_task(
-                task,
-                status=TaskStatus.completed,
-                extracted_information=extracted_information,
-            )
-            return True, last_step, None
+            working_page = page
+            if working_page is None:
+                working_page = await browser_state.must_get_working_page()
+
+            if step.output is None:
+                step.output = AgentStepOutput(action_results=[], actions_and_results=[], errors=[])
+            if step.output.action_results is None:
+                step.output.action_results = []
+            if step.output.actions_and_results is None:
+                step.output.actions_and_results = []
+
+            persisted_action = cast(Action, complete_action)
+            if isinstance(persisted_action, (CompleteAction, TerminateAction)):
+                persisted_action.organization_id = task.organization_id
+                persisted_action.workflow_run_id = task.workflow_run_id
+                persisted_action.task_id = task.task_id
+                persisted_action.step_id = step.step_id
+                persisted_action.step_order = step.order
+                persisted_action.action_order = len(step.output.actions_and_results)
+
+            action_results = await ActionHandler.handle_action(scraped_page, task, step, working_page, persisted_action)
+            await self.record_artifacts_after_action(task, step, browser_state, engine)
+            step.output.action_results.extend(action_results)
+            step.output.actions_and_results.append((persisted_action, action_results))
+            if isinstance(persisted_action, DecisiveAction) and persisted_action.errors:
+                step.output.errors.extend(persisted_action.errors)
+
+            if is_terminate:
+                # Mark task as terminated/failed
+                # Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
+                LOG.warning(
+                    "Parallel verification: termination required, marking task as terminated (termination-aware experiment)",
+                    step_id=step.step_id,
+                    task_id=task.task_id,
+                    reasoning=complete_action.reasoning,
+                )
+                last_step = await self.update_step(step, output=step.output, is_last=True)
+                task_errors = None
+                if isinstance(persisted_action, TerminateAction) and persisted_action.errors:
+                    task_errors = [error.model_dump() for error in persisted_action.errors]
+                failure_reason = persisted_action.reasoning
+                if isinstance(persisted_action, TerminateAction) and persisted_action.errors:
+                    failure_reason = "; ".join(error.reasoning for error in persisted_action.errors)
+                await self.update_task(
+                    task,
+                    status=TaskStatus.terminated,
+                    failure_reason=failure_reason,
+                    errors=task_errors,
+                )
+                return True, last_step, None
+            else:
+                # Mark task as complete
+                # Note: Step is already marked as completed by agent_step
+                # The complete action and its results were appended to step.output above, so persist that output here
+                LOG.info(
+                    "Parallel verification: goal achieved, marking task as complete",
+                    step_id=step.step_id,
+                    task_id=task.task_id,
+                )
+                last_step = await self.update_step(step, output=step.output, is_last=True)
+                extracted_information = await self.get_extracted_information_for_task(task)
+                await self.update_task(
+                    task,
+                    status=TaskStatus.completed,
+                    extracted_information=extracted_information,
+                )
+                return True, last_step, None
        else:
            # Goal not achieved - wait for pre-scraping to complete
            LOG.info(
--- a/skyvern/forge/prompts/skyvern/check-user-goal-with-termination.j2
+++ b/skyvern/forge/prompts/skyvern/check-user-goal-with-termination.j2
@@ -0,0 +1,54 @@
+You are here to help the user determine if the user has completed their goal on the web{{ " according to the complete criterion" if complete_criterion else "" }}. Use the content of the elements parsed from the page,{{ "" if without_screenshots else " the screenshots of the page," }} the user goal and user details to determine the status of the task.
+
+Make sure to ONLY return the JSON object in this format with no additional text before or after it:
+```json
+{
+  "page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
+  "thoughts": str, // Think step by step. Explain your reasoning for the status you selected.
+  "status": str // Must be one of three values: "complete", "terminate", or "continue". Use "complete" ONLY if the user goal has been fully achieved{{ " according to the complete criterion" if complete_criterion else "" }}. Use "terminate" ONLY if the goal CANNOT ever be achieved (e.g., a file doesn't exist, an error modal blocks progress permanently, or an explicit termination condition is met in the user goal). Use "continue" if the goal is not yet achieved but more steps could potentially achieve it (this is the most common case - use this when you need to wait, navigate, or try different actions).
+}
+```
+
+Important: Think carefully about the difference between "terminate" and "continue":
+- "terminate" = impossible to achieve, stop trying (e.g., "account does not exist", "file unavailable", permanent error)
+- "continue" = not done yet, but achievable with more steps (e.g., page is loading, need to click something, need to wait)
+
+User Goal:
+```
+{{ navigation_goal }}
+```
+
+User Details:
+```
+{{ navigation_payload }}
+```
+{% if complete_criterion %}
+Complete Criterion:
+```
+{{ complete_criterion }}
+```{% endif %}{% if terminate_criterion %}
+Terminate Criterion:
+```
+{{ terminate_criterion }}
+```{% endif %}
+{% if action_history %}
+Action History:
+```
+{{ action_history }}
+```
+{% endif %}{% if new_elements_ids %}
+IDs for emerging HTML elements
+```
+{{ new_elements_ids }}
+```
+{% endif %}
+Elements on the page:
+```
+{{ elements }}
+```
+
+Current datetime, ISO format:
+```
+{{ local_datetime }}
+```
+
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -29,13 +29,50 @@ class SelectOption(BaseModel):
        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"


+class VerificationStatus(StrEnum):
+    """Status of user goal verification."""
+
+    complete = "complete"  # Goal achieved successfully
+    terminate = "terminate"  # Goal cannot be achieved, stop trying
+    continue_step = "continue"  # Goal not yet achieved, continue with more steps
+
+
 class CompleteVerifyResult(BaseModel):
-    user_goal_achieved: bool
+    # New field: explicit status with three options (used when experiment is enabled)
+    status: VerificationStatus | None = None
+
+    # Legacy fields: for backward compatibility (used when experiment is disabled)
+    user_goal_achieved: bool = False
+    should_terminate: bool = False
+
    thoughts: str
    page_info: str | None = None

    def __repr__(self) -> str:
-        return f"CompleteVerifyResponse(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, page_info={self.page_info})"
+        if self.status:
+            return f"CompleteVerifyResult(status={self.status}, thoughts={self.thoughts}, page_info={self.page_info})"
+        return f"CompleteVerifyResult(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, should_terminate={self.should_terminate}, page_info={self.page_info})"
+
+    @property
+    def is_complete(self) -> bool:
+        """True if goal was achieved (supports both new and legacy formats)."""
+        if self.status:
+            return self.status == VerificationStatus.complete
+        return self.user_goal_achieved
+
+    @property
+    def is_terminate(self) -> bool:
+        """True if task should terminate (supports both new and legacy formats)."""
+        if self.status:
+            return self.status == VerificationStatus.terminate
+        return self.should_terminate
+
+    @property
+    def is_continue(self) -> bool:
+        """True if task should continue (supports both new and legacy formats)."""
+        if self.status:
+            return self.status == VerificationStatus.continue_step
+        return not self.user_goal_achieved and not self.should_terminate


 class InputOrSelectContext(BaseModel):
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1994,7 +1994,32 @@ async def handle_complete_action(
            )
            return [ActionFailure(exception=e)]

-        if not verification_result.user_goal_achieved:
+        # Check if we should terminate instead of complete
+        # Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
+        if verification_result.is_terminate:
+            LOG.warning(
+                "CompleteAction verification determined task should terminate instead (termination-aware experiment)",
+                workflow_run_id=task.workflow_run_id,
+                thoughts=verification_result.thoughts,
+                status=verification_result.status if verification_result.status else "legacy",
+            )
+            # Create a TerminateAction and execute it
+            terminate_action = actions.TerminateAction(
+                reasoning=verification_result.thoughts,
+                organization_id=action.organization_id,
+                workflow_run_id=action.workflow_run_id,
+                task_id=action.task_id,
+                step_id=action.step_id,
+                step_order=action.step_order,
+                action_order=action.action_order,
+            )
+            results = await handle_terminate_action(terminate_action, page, scraped_page, task, step)
+            action.action_type = ActionType.TERMINATE
+            action.reasoning = terminate_action.reasoning
+            action.errors = terminate_action.errors
+            return results
+
+        if not verification_result.is_complete:
            return [ActionFailure(exception=IllegitComplete(data={"error": verification_result.thoughts}))]

        LOG.info(