task v2 termination (#4589)

2026-01-30 14:46:34 -08:00
parent a503a19146
commit ee5cd014f3
4 changed files with 195 additions and 7 deletions
--- a/skyvern/forge/prompts/skyvern/task_v2.j2
+++ b/skyvern/forge/prompts/skyvern/task_v2.j2
@@ -6,6 +6,33 @@ You have access to the following task types to take actions:
 - loop: this task can be used to generate a list of planning sessions like this. When to use a loop task? Use loop when there are multiple parallel tasks you can do with the same goal. Each task in the loop has the same goal but with different objects/values/targets/variables. Use loop task when it's in a "breadth first search" situation where you can go through a list of values and execute the same task for each value. Examples:
  - When the goal is "Open up to 10 links from an ecomm search result page, and extract information like the price of each product.", loop task should be used to iterate through a list of links or URLs. In each iteration of the loop, the task will go to the linked page and trigger another planning session with the goal of extracting price information of the product
  - When the goal is "download 5 documents found on a page", loop task should be used to iterate through a list of document names. Each document will trigger another planning session to download the relative document
+{% if enable_termination %}
+
+You may also determine that the user goal is IMPOSSIBLE to achieve. Use termination ONLY when there is CLEAR, EXPLICIT, and UNAMBIGUOUS evidence that the goal cannot ever be accomplished. Be very conservative - when in doubt, continue trying.
+
+CRITICAL: Termination should be rare. Only terminate when the website EXPLICITLY tells you the action is impossible. Examples of when to terminate:
+- An explicit error message like "Account not found", "User does not exist", "No results found for [specific query]"
+- "Access denied" or "Unauthorized" errors after authentication was attempted
+- A 404 page that explicitly says the resource doesn't exist
+- Login failed with explicit "Invalid credentials" or "Wrong password" message (not just empty fields or validation errors)
+- "File not available", "Out of stock with no restock date", or "This item has been discontinued"
+- The website explicitly states: "This action cannot be performed" or "This feature is not available"
+
+Do NOT terminate when:
+- The page is still loading, blank, or shows a spinner
+- You need to navigate to find the right page or section
+- A captcha, verification step, or 2FA appeared (these can be solved)
+- A transient network error, timeout, or "try again later" message occurred
+- You simply haven't found the right element yet but it might exist elsewhere on the page or site
+- The task is difficult but not impossible
+- You're on a wrong page and need to navigate back
+- Search returned no results but you could try different search terms
+- A form submission failed but you could correct the input
+- You see a generic error without specific details about why it failed
+- The page structure is different than expected but might still contain the needed functionality
+
+When in doubt, DO NOT terminate. Try an alternative approach instead.
+{% endif %}

 MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.

@@ -16,10 +43,14 @@ Reply in JSON format with the following keys:
  "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
  "task_history_information": str, // Think step by step. In task history, what information has been collected that's helpful and relevant to the user goal, and what information is missing if any.
  "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted. If task history has no "extract" type, that means no data extraction has happened, return false. Null if the user goal does not require information extraction.
-  "thoughts": str, // Think step by step. What has been done so far and what is the next reasonable mini goal a human can do foreseeably move towards the overall goal.
+  "thoughts": str, // Think step by step. What has been done so far and what is the next reasonable mini goal a human can do foreseeably move towards the overall goal.{% if enable_termination %} If the goal appears impossible to achieve, explain why.{% endif %}
  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved.
-  "plan": str, // The mini goal to achieve to move towards the user goal. DO NOT come up or hallucinate any data that's not provided in the user goal. Be accurate and precise. Return null if the user goal has been achieved.
-  "task_type": str, // One of the available task types: navigate, extract, loop
+{% if enable_termination %}
+  "should_terminate": bool, // True ONLY if the user goal is definitively IMPOSSIBLE to achieve. Must have explicit evidence from the page. See termination guidelines above. False if there's any chance the goal can still be achieved.
+  "termination_reason": str, // If should_terminate is true, quote the EXACT error message or text from the page that proves impossibility. Be specific. Null if should_terminate is false.
+{% endif %}
+  "plan": str, // The mini goal to achieve to move towards the user goal. DO NOT come up or hallucinate any data that's not provided in the user goal. Be accurate and precise. Return null if the user goal has been achieved{% if enable_termination %} or if should_terminate is true{% endif %}.
+  "task_type": str, // One of the available task types: navigate, extract, loop. Null if user_goal_achieved is true{% if enable_termination %} or should_terminate is true{% endif %}.
  "loop_values": list[str], // a list of string values to iterate through for loop task. null if it's not a loop task
  "is_loop_value_link": bool, // true if the loop_values is a list of urls to go to before for each planning session inside the loop
 }
--- a/skyvern/forge/prompts/skyvern/task_v2_check_completion.j2
+++ b/skyvern/forge/prompts/skyvern/task_v2_check_completion.j2
@@ -1,4 +1,4 @@
-You're to assist the user to achieve the user goal in the web. Given the user goal, the latest screenshot of the page and the mini tasks that have been completed by the user along the way, help decide whether the user goal has been achieved.
+You're to assist the user to achieve the user goal in the web. Given the user goal, the latest screenshot of the page and the mini tasks that have been completed by the user along the way, help decide whether the user goal has been achieved{% if enable_termination %}, is impossible to achieve,{% endif %} or needs more steps.

 Reply in JSON format with the following keys:
 {
@@ -7,9 +7,24 @@ Reply in JSON format with the following keys:
  "require_extraction": bool, // True if the user goal requires information extraction. False otherwise.
  "task_history_information": str, // Think step by step. In task history, what information has been collected that's helpful and relevant to the user goal, and what information is missing if any.
  "information_extracted": optional[bool], // True if the needed information has been extracted. False if the needed information has not been extracted (no extract task in history). Null if the user goal does not require information extraction.
-  "thoughts": str, // Think step by step. Would completing the tasks in the task history be good enough to achieve the user goal? If more tasks need to be completed to achieve the goal, what would be the next task?
+  "thoughts": str, // Think step by step. Would completing the tasks in the task history be good enough to achieve the user goal? If more tasks need to be completed to achieve the goal, what would be the next task?{% if enable_termination %} If the goal appears impossible, explain why.{% endif %}
  "user_goal_achieved": bool, // True if the user goal has been completed, false otherwise. If the user wants to extract information and it has not been done, the user goal is not achieved. If info extraction is not required, use the task history, assisted by the screenshot to decide if the user goal has been achieved.
+{% if enable_termination %}
+  "should_terminate": bool, // True ONLY if there is CLEAR, EXPLICIT, UNAMBIGUOUS evidence that the goal is IMPOSSIBLE. The page must explicitly state the action cannot be done. False if there's ANY chance the goal can still be achieved.
+  "termination_reason": str // If should_terminate is true, quote the EXACT error message or text from the page that proves impossibility. Null if should_terminate is false.
+{% endif %}
 }
+{% if enable_termination %}
+
+CRITICAL - Be very conservative about termination:
+- Terminate ONLY when the page shows an EXPLICIT error message stating impossibility (e.g., "Account does not exist", "Product discontinued", "Access permanently denied")
+- Do NOT terminate when the page is loading, blank, or showing a generic error
+- Do NOT terminate when you simply haven't found the element yet
+- Do NOT terminate when a form failed but could be retried with different input
+- Do NOT terminate when you're on a wrong page that could be navigated away from
+
+When in doubt, set should_terminate to false and continue trying.
+{% endif %}

 User goal:
 ```
--- a/skyvern/forge/sdk/schemas/task_v2.py
+++ b/skyvern/forge/sdk/schemas/task_v2.py
@@ -117,14 +117,29 @@ class TaskV2(BaseModel):


 class ThoughtType(StrEnum):
+    """
+    Type of thought recorded during task execution.
+
+    Note: Stored as VARCHAR in the database (not a PostgreSQL ENUM), so new values
+    can be added without database migrations. See observer_thoughts.observer_thought_type column.
+    """
+
     plan = "plan"
     metadata = "metadata"
     user_goal_check = "user_goal_check"
     internal_plan = "internal_plan"
     failure_describe = "failure_describe"
+    termination = "termination"  # dedicated thought recorded when a task v2 run is terminated as impossible


 class ThoughtScenario(StrEnum):
+    """
+    Scenario in which a thought was generated.
+
+    Note: Stored as VARCHAR in the database (not a PostgreSQL ENUM), so new values
+    can be added without database migrations. See observer_thoughts.observer_thought_scenario column.
+    """
+
    generate_plan = "generate_plan"
    user_goal_check = "user_goal_check"
    failure_describe = "failure_describe"
@@ -133,6 +148,7 @@ class ThoughtScenario(StrEnum):
    extract_loop_values = "extract_loop_values"
    generate_task_in_loop = "generate_task_in_loop"
    generate_task = "generate_general_task"
+    termination = "termination"


 class Thought(BaseModel):
--- a/skyvern/services/task_v2_service.py
+++ b/skyvern/services/task_v2_service.py
@@ -146,6 +146,80 @@ async def _summarize_max_steps_failure_reason(
        return ""


+async def _handle_task_v2_termination(
+    task_v2_id: str,
+    organization_id: str,
+    workflow_run_id: str,
+    workflow_id: str,
+    workflow_permanent_id: str,
+    termination_reason: str | None,
+    iteration: int,
+    source: str | None = None,
+) -> TaskV2:
+    """
+    Record that a task v2 goal was judged impossible and mark the task as terminated.
+
+    A dedicated termination thought is created, annotated with the termination
+    details, and then the task itself is marked as terminated.
+
+    Args:
+        task_v2_id: ID of the task v2 being terminated
+        organization_id: ID of the organization the task belongs to
+        workflow_run_id: ID of the workflow run associated with the task
+        workflow_id: ID of the workflow associated with the task
+        workflow_permanent_id: Permanent ID of that workflow
+        termination_reason: Reason taken from the LLM response; a generic message
+            is substituted in the thought and failure reason when it is empty
+        iteration: Iteration number at which termination was decided
+        source: Optional label for the origin of the decision (e.g., "completion_check")
+
+    Returns:
+        The TaskV2 object returned by mark_task_v2_as_terminated
+    """
+    # Shown in the thought and stored as the failure reason when the LLM gave no reason.
+    reason_text = termination_reason or "Task goal is impossible to achieve"
+    origin = f" according to {source}" if source else ""
+
+    LOG.info(
+        f"Task v2 should terminate{origin} - goal is impossible to achieve",
+        iteration=iteration,
+        workflow_run_id=workflow_run_id,
+        termination_reason=termination_reason,
+    )
+
+    # The raw reason (possibly None) is kept in the output; only display fields use the fallback.
+    thought_output: dict[str, Any] = {
+        "should_terminate": True,
+        "termination_reason": termination_reason,
+        "iteration": iteration,
+        **({"source": source} if source else {}),
+    }
+
+    # A separate thought makes the termination visible in the UI.
+    recorded_thought = await app.DATABASE.create_thought(
+        task_v2_id=task_v2_id,
+        organization_id=organization_id,
+        workflow_run_id=workflow_run_id,
+        workflow_id=workflow_id,
+        workflow_permanent_id=workflow_permanent_id,
+        thought_type=ThoughtType.termination,
+        thought_scenario=ThoughtScenario.termination,
+        thought=reason_text,
+    )
+    await app.DATABASE.update_thought(
+        thought_id=recorded_thought.observer_thought_id,
+        organization_id=organization_id,
+        output=thought_output,
+    )
+
+    return await mark_task_v2_as_terminated(
+        task_v2_id=task_v2_id,
+        workflow_run_id=workflow_run_id,
+        organization_id=organization_id,
+        failure_reason=reason_text,
+    )
+
+
 async def initialize_task_v2(
    organization: Organization,
    user_prompt: str,
@@ -526,6 +600,16 @@ async def run_task_v2_helper(
        current_run_id,
        properties={"organization_id": organization_id, "task_url": task_v2.url},
    )
+    enable_task_v2_termination = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
+        "ENABLE_TASK_V2_TERMINATION",
+        current_run_id,
+        properties={"organization_id": organization_id, "task_url": task_v2.url},
+    )
+    LOG.info(
+        "Task v2 termination feature flag",
+        enable_task_v2_termination=enable_task_v2_termination,
+        organization_id=organization_id,
+    )
    skyvern_context.set(
        SkyvernContext(
            organization_id=organization_id,
@@ -702,6 +786,7 @@ async def run_task_v2_helper(
                user_goal=user_prompt,
                task_history=task_history,
                local_datetime=datetime.now(context.tz_info).isoformat(),
+                enable_termination=bool(enable_task_v2_termination),
            )
            thought = await app.DATABASE.create_thought(
                task_v2_id=task_v2_id,
@@ -730,6 +815,8 @@ async def run_task_v2_helper(
            )
            # see if the user goal has achieved or not
            user_goal_achieved = task_v2_response.get("user_goal_achieved", False)
+            should_terminate = task_v2_response.get("should_terminate", False)
+            termination_reason = task_v2_response.get("termination_reason")
            observation = task_v2_response.get("page_info", "")
            thoughts: str = task_v2_response.get("thoughts", "")
            plan = task_v2_response.get("plan", "")
@@ -741,7 +828,12 @@ async def run_task_v2_helper(
                thought=thoughts,
                observation=observation,
                answer=plan,
-                output={"task_type": task_type, "user_goal_achieved": user_goal_achieved},
+                output={
+                    "task_type": task_type,
+                    "user_goal_achieved": user_goal_achieved,
+                    "should_terminate": should_terminate,
+                    "termination_reason": termination_reason,
+                },
            )

            if user_goal_achieved is True:
@@ -763,6 +855,19 @@ async def run_task_v2_helper(
                    )
                break

+            # Only handle termination if the feature flag is enabled
+            if enable_task_v2_termination and should_terminate is True:
+                task_v2 = await _handle_task_v2_termination(
+                    task_v2_id=task_v2_id,
+                    organization_id=organization_id,
+                    workflow_run_id=workflow_run_id,
+                    workflow_id=workflow_id,
+                    workflow_permanent_id=workflow.workflow_permanent_id,
+                    termination_reason=termination_reason,
+                    iteration=i,
+                )
+                return workflow, workflow_run, task_v2
+
            if not plan:
                LOG.warning("No plan found in task v2 response", task_v2_response=task_v2_response)
                continue
@@ -925,6 +1030,7 @@ async def run_task_v2_helper(
                user_goal=user_prompt,
                task_history=task_history,
                local_datetime=datetime.now(context.tz_info).isoformat(),
+                enable_termination=bool(enable_task_v2_termination),
            )
            thought = await app.DATABASE.create_thought(
                task_v2_id=task_v2_id,
@@ -949,12 +1055,18 @@ async def run_task_v2_helper(
                task_history=task_history,
            )
            user_goal_achieved = completion_resp.get("user_goal_achieved", False)
+            should_terminate = completion_resp.get("should_terminate", False)
+            termination_reason = completion_resp.get("termination_reason")
            thought_content = completion_resp.get("thoughts", "")
            await app.DATABASE.update_thought(
                thought_id=thought.observer_thought_id,
                organization_id=organization_id,
                thought=thought_content,
-                output={"user_goal_achieved": user_goal_achieved},
+                output={
+                    "user_goal_achieved": user_goal_achieved,
+                    "should_terminate": should_terminate,
+                    "termination_reason": termination_reason,
+                },
            )
            if user_goal_achieved:
                LOG.info(
@@ -977,6 +1089,20 @@ async def run_task_v2_helper(
                    )
                break

+            # Only handle termination if the feature flag is enabled
+            if enable_task_v2_termination and should_terminate:
+                task_v2 = await _handle_task_v2_termination(
+                    task_v2_id=task_v2_id,
+                    organization_id=organization_id,
+                    workflow_run_id=workflow_run_id,
+                    workflow_id=workflow_id,
+                    workflow_permanent_id=workflow.workflow_permanent_id,
+                    termination_reason=termination_reason,
+                    iteration=i,
+                    source="completion_check",
+                )
+                return workflow, workflow_run, task_v2
+
        # total step number validation
        workflow_run_tasks = await app.DATABASE.get_tasks_by_workflow_run_id(workflow_run_id=workflow_run_id)
        total_step_count = await app.DATABASE.get_total_unique_step_order_count_by_task_ids(