Add termination-aware complete verification experiment (SKY-6884) (#3948)

This commit is contained in:
pedrohsdb
2025-11-07 18:53:51 -08:00
committed by GitHub
parent ea7361c9f2
commit ca958da6be
4 changed files with 252 additions and 28 deletions

View File

@@ -89,7 +89,12 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, Tas
from skyvern.forge.sdk.schemas.totp_codes import OTPType
from skyvern.forge.sdk.trace import TraceManager
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
from skyvern.forge.sdk.workflow.models.block import (
ActionBlock,
BaseTaskBlock,
FileDownloadBlock,
ValidationBlock,
)
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
from skyvern.schemas.runs import CUA_ENGINES, RunEngine
from skyvern.schemas.steps import AgentStepOutput
@@ -1328,6 +1333,7 @@ class ForgeAgent:
scraped_page=scraped_page,
task=task,
step=step,
task_block=task_block,
)
if complete_action is not None:
LOG.info("User goal achieved, executing complete action")
@@ -1785,7 +1791,7 @@ class ForgeAgent:
return None
async def complete_verify(
self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step, task_block: BaseTaskBlock | None = None
) -> CompleteVerifyResult:
LOG.info(
"Checking if user goal is achieved after re-scraping the page",
@@ -1803,13 +1809,47 @@ class ForgeAgent:
if task.include_action_history_in_verification:
actions_and_results_str = await self._get_action_results(task, current_step=step)
# Check if we should use the termination-aware prompt (experiment)
# Only enabled for file download blocks
use_termination_prompt = False
is_file_download_block = task_block is not None and isinstance(task_block, FileDownloadBlock)
if is_file_download_block:
try:
distinct_id = task.workflow_run_id if task.workflow_run_id else task.task_id
use_termination_prompt = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
"USE_TERMINATION_AWARE_COMPLETE_VERIFICATION",
distinct_id,
properties={"organization_id": task.organization_id},
)
if use_termination_prompt:
LOG.info(
"Experiment enabled: using termination-aware complete verification prompt for file download block",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
organization_id=task.organization_id,
block_type="file_download",
)
except Exception as e:
LOG.warning(
"Failed to check USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment; using legacy behavior",
task_id=task.task_id,
workflow_run_id=task.workflow_run_id,
error=str(e),
)
# Select the appropriate template based on experiment
template_name = "check-user-goal-with-termination" if use_termination_prompt else "check-user-goal"
prompt_name = "check-user-goal-with-termination" if use_termination_prompt else "check-user-goal"
verification_prompt = load_prompt_with_elements(
element_tree_builder=scraped_page_refreshed,
prompt_engine=prompt_engine,
template_name="check-user-goal",
template_name=template_name,
navigation_goal=task.navigation_goal,
navigation_payload=task.navigation_payload,
complete_criterion=task.complete_criterion,
terminate_criterion=task.terminate_criterion,
action_history=actions_and_results_str,
local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
)
@@ -1855,23 +1895,37 @@ class ForgeAgent:
prompt=verification_prompt,
step=step,
screenshots=scraped_page_refreshed.screenshots,
prompt_name="check-user-goal",
prompt_name=prompt_name,
)
return CompleteVerifyResult.model_validate(verification_result)
async def check_user_goal_complete(
self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> CompleteAction | None:
self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step, task_block: BaseTaskBlock | None = None
) -> CompleteAction | TerminateAction | None:
try:
verification_result = await self.complete_verify(
page=page,
scraped_page=scraped_page,
task=task,
step=step,
task_block=task_block,
)
# Check if we should terminate instead of complete
# Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
if verification_result.is_terminate:
LOG.warning(
"Periodic verification determined task should terminate (termination-aware experiment)",
workflow_run_id=task.workflow_run_id,
thoughts=verification_result.thoughts,
status=verification_result.status if verification_result.status else "legacy",
)
return TerminateAction(
reasoning=verification_result.thoughts,
)
# We don't want to return a complete action if the user goal is not achieved since we're checking at every step
if not verification_result.user_goal_achieved:
if not verification_result.is_complete:
return None
return CompleteAction(
@@ -3160,6 +3214,7 @@ class ForgeAgent:
scraped_page=scraped_page,
task=task,
step=step,
task_block=task_block,
),
name=f"verify_goal_{step.step_id}",
)
@@ -3187,11 +3242,13 @@ class ForgeAgent:
complete_action = None
if complete_action is not None:
# Goal achieved! Cancel the pre-scraping task
# Goal achieved or should terminate! Cancel the pre-scraping task
is_terminate = isinstance(complete_action, TerminateAction)
LOG.info(
"Parallel verification: goal achieved, cancelling pre-scraping",
"Parallel verification: goal achieved or termination required, cancelling pre-scraping",
step_id=step.step_id,
task_id=task.task_id,
is_terminate=is_terminate,
)
pre_scrape_task.cancel()
try:
@@ -3201,22 +3258,73 @@ class ForgeAgent:
except Exception:
LOG.debug("Pre-scraping task cleanup failed", step_id=step.step_id, exc_info=True)
# Mark task as complete
# Note: Step is already marked as completed by agent_step
# We don't add the complete action to the step output since the step is already finalized
LOG.info(
"Parallel verification: goal achieved, marking task as complete",
step_id=step.step_id,
task_id=task.task_id,
)
last_step = await self.update_step(step, is_last=True)
extracted_information = await self.get_extracted_information_for_task(task)
await self.update_task(
task,
status=TaskStatus.completed,
extracted_information=extracted_information,
)
return True, last_step, None
working_page = page
if working_page is None:
working_page = await browser_state.must_get_working_page()
if step.output is None:
step.output = AgentStepOutput(action_results=[], actions_and_results=[], errors=[])
if step.output.action_results is None:
step.output.action_results = []
if step.output.actions_and_results is None:
step.output.actions_and_results = []
persisted_action = cast(Action, complete_action)
if isinstance(persisted_action, (CompleteAction, TerminateAction)):
persisted_action.organization_id = task.organization_id
persisted_action.workflow_run_id = task.workflow_run_id
persisted_action.task_id = task.task_id
persisted_action.step_id = step.step_id
persisted_action.step_order = step.order
persisted_action.action_order = len(step.output.actions_and_results)
action_results = await ActionHandler.handle_action(scraped_page, task, step, working_page, persisted_action)
await self.record_artifacts_after_action(task, step, browser_state, engine)
step.output.action_results.extend(action_results)
step.output.actions_and_results.append((persisted_action, action_results))
if isinstance(persisted_action, DecisiveAction) and persisted_action.errors:
step.output.errors.extend(persisted_action.errors)
if is_terminate:
# Mark task as terminated/failed
# Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
LOG.warning(
"Parallel verification: termination required, marking task as terminated (termination-aware experiment)",
step_id=step.step_id,
task_id=task.task_id,
reasoning=complete_action.reasoning,
)
last_step = await self.update_step(step, output=step.output, is_last=True)
task_errors = None
if isinstance(persisted_action, TerminateAction) and persisted_action.errors:
task_errors = [error.model_dump() for error in persisted_action.errors]
failure_reason = persisted_action.reasoning
if isinstance(persisted_action, TerminateAction) and persisted_action.errors:
failure_reason = "; ".join(error.reasoning for error in persisted_action.errors)
await self.update_task(
task,
status=TaskStatus.terminated,
failure_reason=failure_reason,
errors=task_errors,
)
return True, last_step, None
else:
# Mark task as complete
# Note: Step is already marked as completed by agent_step
# We don't add the complete action to the step output since the step is already finalized
LOG.info(
"Parallel verification: goal achieved, marking task as complete",
step_id=step.step_id,
task_id=task.task_id,
)
last_step = await self.update_step(step, output=step.output, is_last=True)
extracted_information = await self.get_extracted_information_for_task(task)
await self.update_task(
task,
status=TaskStatus.completed,
extracted_information=extracted_information,
)
return True, last_step, None
else:
# Goal not achieved - wait for pre-scraping to complete
LOG.info(

View File

@@ -0,0 +1,54 @@
You are here to help the user determine if the user has completed their goal on the web{{ " according to the complete criterion" if complete_criterion else "" }}. Use the content of the elements parsed from the page,{{ "" if without_screenshots else " the screenshots of the page," }} the user goal and user details to determine the status of the task.
Make sure to ONLY return the JSON object in this format with no additional text before or after it:
```json
{
"page_info": str, // Think step by step. Describe all the useful information in the page related to the user goal.
"thoughts": str, // Think step by step. Explain your reasoning for the status you selected.
"status": str // Must be one of three values: "complete", "terminate", or "continue". Use "complete" ONLY if the user goal has been fully achieved{{ " according to the complete criterion" if complete_criterion else "" }}. Use "terminate" ONLY if the goal CANNOT ever be achieved (e.g., a file doesn't exist, an error modal blocks progress permanently, or an explicit termination condition is met in the user goal). Use "continue" if the goal is not yet achieved but more steps could potentially achieve it (this is the most common case - use this when you need to wait, navigate, or try different actions).
}
```
Important: Think carefully about the difference between "terminate" and "continue":
- "terminate" = impossible to achieve, stop trying (e.g., "account does not exist", "file unavailable", permanent error)
- "continue" = not done yet, but achievable with more steps (e.g., page is loading, need to click something, need to wait)
User Goal:
```
{{ navigation_goal }}
```
User Details:
```
{{ navigation_payload }}
```
{% if complete_criterion %}
Complete Criterion:
```
{{ complete_criterion }}
```{% endif %}{% if terminate_criterion %}
Terminate Criterion:
```
{{ terminate_criterion }}
```{% endif %}
{% if action_history %}
Action History:
```
{{ action_history }}
```
{% endif %}{% if new_elements_ids %}
IDs for emerging HTML elements
```
{{ new_elements_ids }}
```
{% endif %}
Elements on the page:
```
{{ elements }}
```
Current datetime, ISO format:
```
{{ local_datetime }}
```

View File

@@ -29,13 +29,50 @@ class SelectOption(BaseModel):
return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
class VerificationStatus(StrEnum):
"""Status of user goal verification."""
complete = "complete" # Goal achieved successfully
terminate = "terminate" # Goal cannot be achieved, stop trying
continue_step = "continue" # Goal not yet achieved, continue with more steps
class CompleteVerifyResult(BaseModel):
user_goal_achieved: bool
# New field: explicit status with three options (used when experiment is enabled)
status: VerificationStatus | None = None
# Legacy fields: for backward compatibility (used when experiment is disabled)
user_goal_achieved: bool = False
should_terminate: bool = False
thoughts: str
page_info: str | None = None
def __repr__(self) -> str:
return f"CompleteVerifyResponse(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, page_info={self.page_info})"
if self.status:
return f"CompleteVerifyResult(status={self.status}, thoughts={self.thoughts}, page_info={self.page_info})"
return f"CompleteVerifyResult(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, should_terminate={self.should_terminate}, page_info={self.page_info})"
@property
def is_complete(self) -> bool:
"""True if goal was achieved (supports both new and legacy formats)."""
if self.status:
return self.status == VerificationStatus.complete
return self.user_goal_achieved
@property
def is_terminate(self) -> bool:
"""True if task should terminate (supports both new and legacy formats)."""
if self.status:
return self.status == VerificationStatus.terminate
return self.should_terminate
@property
def is_continue(self) -> bool:
"""True if task should continue (supports both new and legacy formats)."""
if self.status:
return self.status == VerificationStatus.continue_step
return not self.user_goal_achieved and not self.should_terminate
class InputOrSelectContext(BaseModel):

View File

@@ -1994,7 +1994,32 @@ async def handle_complete_action(
)
return [ActionFailure(exception=e)]
if not verification_result.user_goal_achieved:
# Check if we should terminate instead of complete
# Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
if verification_result.is_terminate:
LOG.warning(
"CompleteAction verification determined task should terminate instead (termination-aware experiment)",
workflow_run_id=task.workflow_run_id,
thoughts=verification_result.thoughts,
status=verification_result.status if verification_result.status else "legacy",
)
# Create a TerminateAction and execute it
terminate_action = actions.TerminateAction(
reasoning=verification_result.thoughts,
organization_id=action.organization_id,
workflow_run_id=action.workflow_run_id,
task_id=action.task_id,
step_id=action.step_id,
step_order=action.step_order,
action_order=action.action_order,
)
results = await handle_terminate_action(terminate_action, page, scraped_page, task, step)
action.action_type = ActionType.TERMINATE
action.reasoning = terminate_action.reasoning
action.errors = terminate_action.errors
return results
if not verification_result.is_complete:
return [ActionFailure(exception=IllegitComplete(data={"error": verification_result.thoughts}))]
LOG.info(