Pedro/support_multi_field_6_digit_totp (#3622)

2025-10-06 16:37:35 -07:00
parent 7f6e5d2e36
commit 6fc56d9775
4 changed files with 415 additions and 8 deletions
--- a/skyvern/errors/errors.py
+++ b/skyvern/errors/errors.py
@@ -45,3 +45,8 @@ class GetTOTPVerificationCodeError(SkyvernDefinedError):
 class TimeoutGetTOTPVerificationCodeError(SkyvernDefinedError):
    error_code: str = "OTP_TIMEOUT"
    reasoning: str = "Timeout getting TOTP verification code."
 class TOTPExpiredError(SkyvernDefinedError):
    error_code: str = "OTP_EXPIRED"
    reasoning: str = "TOTP verification code has expired during multi-field input sequence."
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -1202,14 +1202,47 @@ class ForgeAgent:
                    # Do not verify the complete action when complete_verification is False
                    # set verified to True will skip the completion verification
                    action.verified = True
                # Pass TOTP secret to handler for multi-field TOTP sequences
                # Handler will generate TOTP at execution time
                if (
                    action.action_type == ActionType.INPUT_TEXT
                    and self._is_multi_field_totp_sequence(actions)
                    and (totp_secret := skyvern_context.ensure_context().totp_codes.get(f"{task.task_id}_secret"))
                ):
                    # Pass TOTP secret to handler for execution-time generation
                    action.totp_timing_info = {
                        "is_totp_sequence": True,
                        "action_index": action_idx,
                        "totp_secret": totp_secret,
                        "is_retry": step.retry_index > 0,
                    }
                results = await ActionHandler.handle_action(scraped_page, task, step, current_page, action)
                await app.AGENT_FUNCTION.post_action_execution()
                detailed_agent_step_output.actions_and_results[action_idx] = (
                    action,
                    results,
                )
-                # wait random time between actions to avoid detection
+
-                await asyncio.sleep(random.uniform(0.5, 1.0))
+                # Determine wait time between actions
                wait_time = random.uniform(0.5, 1.0)
                # For multi-field TOTP sequences, use zero delay between all digits for fast execution
                if action.action_type == ActionType.INPUT_TEXT and self._is_multi_field_totp_sequence(actions):
                    current_text = action.text if hasattr(action, "text") else None
                    if current_text and len(current_text) == 1 and current_text.isdigit():
                        # Zero delay between all TOTP digits for fast execution
                        wait_time = 0.0
                        LOG.debug(
                            "TOTP: zero delay for digit",
                            task_id=task.task_id,
                            action_idx=action_idx,
                            digit=current_text,
                        )
                await asyncio.sleep(wait_time)
                await self.record_artifacts_after_action(task, step, browser_state, engine)
                for result in results:
                    result.step_retry_number = step.retry_index
@@ -1306,6 +1339,21 @@ class ForgeAgent:
                action_results=action_results,
            )
            # Clean up TOTP cache after multi-field TOTP sequence completion
            if self._is_multi_field_totp_sequence(actions):
                context = skyvern_context.ensure_context()
                cache_key = f"{task.task_id}_totp_cache"
                if cache_key in context.totp_codes:
                    context.totp_codes.pop(cache_key)
                    LOG.debug(
                        "Cleaned up TOTP cache after multi-field sequence completion",
                        task_id=task.task_id,
                    )
                secret_key = f"{task.task_id}_secret"
                if secret_key in context.totp_codes:
                    context.totp_codes.pop(secret_key)
            # Check if Skyvern already returned a complete action, if so, don't run user goal check
            has_decisive_action = False
            if detailed_agent_step_output and detailed_agent_step_output.actions_and_results:
@@ -2065,7 +2113,7 @@ class ForgeAgent:
            await SkyvernFrame.evaluate(frame=page, expression="() => document.location.href") if page else starting_url
        )
        final_navigation_payload = self._build_navigation_payload(
-            task, expire_verification_code=expire_verification_code
+            task, expire_verification_code=expire_verification_code, step=step, scraped_page=scraped_page
        )
        task_type = task.task_type if task.task_type else TaskType.general
@@ -2167,12 +2215,124 @@ class ForgeAgent:
        return full_prompt, use_caching
    def _should_process_totp(self, scraped_page: ScrapedPage | None) -> bool:
        """Detect TOTP pages by checking for multiple input fields or verification keywords."""
        if not scraped_page:
            return False
        try:
            # Count input fields that could be for TOTP (more flexible than maxlength="1")
            input_fields = [
                element
                for element in scraped_page.elements
                if element.get("tagName", "").lower() == "input"
                and element.get("attributes", {}).get("type", "text").lower() in ["text", "number", "tel"]
            ]
            # Check for multiple input fields (potential multi-field TOTP)
            if len(input_fields) >= 4:
                # Additional check: look for patterns that suggest multi-field TOTP
                # Check if inputs are close together or have similar attributes
                has_maxlength_1 = any(elem.get("attributes", {}).get("maxlength") == "1" for elem in input_fields)
                # Check for input fields with numeric patterns (type="number", pattern for digits)
                has_numeric_patterns = any(
                    elem.get("attributes", {}).get("type") == "number"
                    or elem.get("attributes", {}).get("pattern", "").isdigit()
                    or "digit" in elem.get("attributes", {}).get("pattern", "").lower()
                    for elem in input_fields
                )
                if has_maxlength_1 or has_numeric_patterns:
                    return True
            # Check for TOTP-related keywords in page content
            page_text = scraped_page.html.lower() if scraped_page.html else ""
            totp_keywords = [
                "verification code",
                "authentication code",
                "security code",
                "2fa",
                "two-factor",
                "totp",
                "authenticator",
                "verification",
                "enter code",
                "verification number",
                "security number",
            ]
            keyword_matches = sum(1 for keyword in totp_keywords if keyword in page_text)
            # If we have multiple TOTP keywords and multiple input fields, likely TOTP
            if keyword_matches >= 2 and len(input_fields) >= 6:
                return True
            # Strong single keyword match with multiple inputs
            strong_keywords = ["verification code", "authentication code", "2fa", "two-factor"]
            if any(keyword in page_text for keyword in strong_keywords) and len(input_fields) >= 3:
                return True
            return False
        except Exception:
            return False
    def _is_multi_field_totp_sequence(self, actions: list) -> bool:
        """
        Check if the action sequence represents a multi-field TOTP input (6 single-digit fields).
        Args:
            actions: List of actions to analyze
        Returns:
            bool: True if this is a multi-field TOTP sequence
        """
        # Must have at least 4 actions (minimum for TOTP)
        if len(actions) < 4:
            return False
        # Check if we have multiple consecutive single-digit INPUT_TEXT actions
        consecutive_single_digits = 0
        max_consecutive = 0
        for action in actions:
            if (
                action.action_type == ActionType.INPUT_TEXT
                and hasattr(action, "text")
                and action.text
                and len(action.text) == 1
                and action.text.isdigit()
            ):
                consecutive_single_digits += 1
                max_consecutive = max(max_consecutive, consecutive_single_digits)
            else:
                # If we hit a non-single-digit action, reset consecutive counter
                consecutive_single_digits = 0
        # Consider it a multi-field TOTP if we have 4+ consecutive single-digit inputs
        # This is more reliable than just counting total single digits
        # We use 4+ as the threshold to avoid false positives with single TOTP fields
        is_multi_field_totp = max_consecutive >= 4
        if is_multi_field_totp:
            LOG.debug(
                "Detected multi-field TOTP sequence",
                max_consecutive=max_consecutive,
                total_actions=len(actions),
            )
        return is_multi_field_totp
    def _build_navigation_payload(
        self,
        task: Task,
        expire_verification_code: bool = False,
        step: Step | None = None,
        scraped_page: ScrapedPage | None = None,
    ) -> dict[str, Any] | list | str | None:
        final_navigation_payload = task.navigation_payload
        current_context = skyvern_context.ensure_context()
        verification_code = current_context.totp_codes.get(task.task_id)
        if (task.totp_verification_url or task.totp_identifier) and verification_code:
@@ -2190,6 +2350,32 @@ class ForgeAgent:
                )
            if expire_verification_code:
                current_context.totp_codes.pop(task.task_id)
        # Store TOTP secrets and provide placeholder TOTP for LLM to see format
        # Only when on a TOTP page to avoid premature processing
        if (
            task.workflow_run_id
            and step
            and isinstance(final_navigation_payload, dict)
            and self._should_process_totp(scraped_page)
        ):
            workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(task.workflow_run_id)
            for key, value in list(final_navigation_payload.items()):
                if isinstance(value, dict) and "totp" in value:
                    totp_placeholder = value.get("totp")
                    if totp_placeholder and isinstance(totp_placeholder, str):
                        totp_secret_key = workflow_run_context.totp_secret_value_key(totp_placeholder)
                        totp_secret = workflow_run_context.get_original_secret_value_or_none(totp_secret_key)
                        if totp_secret:
                            # Store TOTP secret for handler to use during execution
                            current_context = skyvern_context.ensure_context()
                            current_context.totp_codes[f"{task.task_id}_secret"] = totp_secret
                            # Send a placeholder TOTP for the LLM to see the format
                            final_navigation_payload[key]["totp"] = "123456"
        return final_navigation_payload
    async def _get_action_results(self, task: Task, current_step: Step | None = None) -> str:
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -88,6 +88,9 @@ class Action(BaseModel):
    is_checked: bool | None = None
    verified: bool = False
    # TOTP timing information for multi-field TOTP sequences
    totp_timing_info: dict[str, Any] | None = None
    created_at: datetime | None = None
    modified_at: datetime | None = None
    created_by: str | None = None
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -2,6 +2,7 @@ import asyncio
 import copy
 import json
 import os
 import time
 import urllib.parse
 import uuid
 from datetime import datetime, timedelta, timezone
@@ -10,6 +11,7 @@ from typing import Any, Awaitable, Callable, List
 import pyotp
 import structlog
 from playwright._impl._errors import Error as PlaywrightError
 from playwright.async_api import FileChooser, Frame, Locator, Page, TimeoutError
 from pydantic import BaseModel
@@ -22,6 +24,7 @@ from skyvern.constants import (
    REPO_ROOT_DIR,
    SKYVERN_ID_ATTR,
 )
 from skyvern.errors.errors import TOTPExpiredError
 from skyvern.exceptions import (
    DownloadFileMaxWaitingTime,
    EmptySelect,
@@ -929,6 +932,128 @@ async def handle_click_to_download_file_action(
    return [ActionSuccess(download_triggered=True)]
 # TOTP timing constants
 TOTP_TIME_STEP_SECONDS = 30
 TOTP_EXPIRY_THRESHOLD_SECONDS = 20
 async def _handle_multi_field_totp_sequence(
    timing_info: dict[str, Any],
    task: Task,
 ) -> list[ActionResult] | None:
    """
    Handle TOTP generation and caching for multi-field TOTP sequences.
    Returns:
        ActionFailure if TOTP handling failed, None if successful
    """
    action_index = timing_info["action_index"]
    cache_key = f"{task.task_id}_totp_cache"
    current_context = skyvern_context.ensure_context()
    if action_index == 0:
        # First digit: generate TOTP and cache it
        totp_secret = timing_info["totp_secret"]
        totp = pyotp.TOTP(totp_secret)
        # Check current TOTP expiry time
        current_time = int(time.time())
        current_totp_valid_until = ((current_time // TOTP_TIME_STEP_SECONDS) + 1) * TOTP_TIME_STEP_SECONDS
        seconds_until_expiry = current_totp_valid_until - current_time
        # If less than threshold seconds until expiry, use the next TOTP
        if seconds_until_expiry < TOTP_EXPIRY_THRESHOLD_SECONDS:
            # Force generation of next TOTP by advancing time
            next_time = current_totp_valid_until
            current_totp = totp.at(next_time)
            LOG.debug(
                "Using multi-field TOTP flow - using NEXT TOTP due to <20s expiry",
                task_id=task.task_id,
                action_idx=action_index,
                current_totp=totp.now(),
                next_totp=current_totp,
                seconds_until_expiry=seconds_until_expiry,
                is_retry=timing_info.get("is_retry", False),
            )
        else:
            # Use current TOTP
            current_totp = totp.now()
        current_context.totp_codes[cache_key] = current_totp
    else:
        # Subsequent digits: reuse cached TOTP
        current_totp = current_context.totp_codes.get(cache_key)
        if not current_totp:
            # TOTP cache missing for subsequent digit - this should not happen
            # If it does, something went wrong with the first digit, so fail the action
            LOG.error(
                "TOTP cache missing for subsequent digit - first digit may have failed",
                task_id=task.task_id,
                action_idx=action_index,
                cache_key=cache_key,
            )
            return [ActionFailure(TOTPExpiredError())]
        # Check if cached TOTP has expired
        totp_secret = timing_info["totp_secret"]
        totp = pyotp.TOTP(totp_secret)
        # Get current time and calculate TOTP expiry
        current_time = int(time.time())
        totp_valid_until = ((current_time // TOTP_TIME_STEP_SECONDS) + 1) * TOTP_TIME_STEP_SECONDS
        if current_time >= totp_valid_until:
            LOG.error(
                "Cached TOTP has expired during multi-field sequence",
                task_id=task.task_id,
                action_idx=action_index,
                current_time=current_time,
                totp_valid_until=totp_valid_until,
                cached_totp=current_totp,
            )
            return [ActionFailure(TOTPExpiredError())]
        LOG.debug(
            "Using multi-field TOTP flow - reusing cached TOTP",
            task_id=task.task_id,
            action_idx=action_index,
            totp=current_totp,
            current_time=current_time,
            totp_valid_until=totp_valid_until,
        )
    # Special handling for the 6th digit (action_index=5): wait if TOTP is not yet valid
    if action_index == 5:
        # Calculate when this TOTP becomes valid (valid_from time)
        # If we used the next TOTP window, valid_from is the start of that window
        totp_valid_from = totp_valid_until - TOTP_TIME_STEP_SECONDS
        if current_time < totp_valid_from:
            # TOTP is not yet valid, wait until it becomes valid
            wait_seconds = totp_valid_from - current_time
            LOG.debug(
                "6th digit: TOTP not yet valid, waiting until valid_from",
                task_id=task.task_id,
                action_idx=action_index,
                current_time=current_time,
                totp_valid_from=totp_valid_from,
                wait_seconds=wait_seconds,
                totp=current_totp,
            )
            await asyncio.sleep(wait_seconds)
            LOG.debug(
                "6th digit: Finished waiting, TOTP is now valid",
                task_id=task.task_id,
                action_idx=action_index,
            )
    return None  # Success
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
 async def handle_input_text_action(
    action: actions.InputTextAction,
@@ -954,9 +1079,17 @@ async def handle_input_text_action(
    # before filling text, we need to validate if the element can be filled if it's not one of COMMON_INPUT_TAGS
    tag_name = scraped_page.id_to_element_dict[action.element_id]["tagName"].lower()
-    text: str | None = await get_actual_value_of_parameter_if_secret(task, action.text)
+
-    if text is None:
+    # Check if this is multi-field TOTP first - if so, skip secret resolution
-        return [ActionFailure(FailedToFetchSecret())]
+    if action.totp_timing_info and action.totp_timing_info.get("is_totp_sequence"):
        # For multi-field TOTP, we'll set text directly in the TOTP logic below
        text: str = ""
    else:
        # For regular inputs, resolve secrets
        text_result = await get_actual_value_of_parameter_if_secret(task, action.text)
        if text_result is None:
            return [ActionFailure(FailedToFetchSecret())]
        text = text_result
    is_totp_value = (
        text == BitwardenConstants.TOTP or text == OnePasswordConstants.TOTP or text == AzureVaultConstants.TOTP
@@ -1198,6 +1331,32 @@ async def handle_input_text_action(
        await skyvern_element.input(text)
        return [ActionSuccess()]
    # Handle TOTP generation for multi-field TOTP sequences
    if action.totp_timing_info:
        timing_info = action.totp_timing_info
        if timing_info.get("is_totp_sequence"):
            result = await _handle_multi_field_totp_sequence(timing_info, task)
            if result is not None:
                return result  # Return ActionFailure if TOTP handling failed
            # Extract the digit for this action index
            current_totp = skyvern_context.ensure_context().totp_codes.get(f"{task.task_id}_totp_cache")
            action_index = timing_info["action_index"]
            if current_totp and len(current_totp) > action_index:
                digit = current_totp[action_index]
                action.text = digit
                # Also update the text variable that will be used later
                text = digit
            else:
                LOG.error(
                    "TOTP too short for action index",
                    task_id=task.task_id,
                    action_idx=action_index,
                    totp_length=len(current_totp) if current_totp else 0,
                )
                return [ActionFailure(TOTPExpiredError())]
    try:
        # TODO: not sure if this case will trigger auto-completion
        if tag_name not in COMMON_INPUT_TAGS:
@@ -1246,18 +1405,57 @@ async def handle_input_text_action(
        try:
            await skyvern_element.input_sequentially(text=text)
-        finally:
+
            incremental_element = await incremental_scraped.get_incremental_element_tree(
                clean_and_remove_element_tree_factory(
-                    task=task, step=step, check_filter_funcs=[check_existed_but_not_option_element_in_dom_factory(dom)]
+                    task=task,
                    step=step,
                    check_filter_funcs=[check_existed_but_not_option_element_in_dom_factory(dom)],
                ),
            )
            if len(incremental_element) > 0:
                auto_complete_hacky_flag = True
        except PlaywrightError as inc_error:
            # Handle Playwright-specific errors during incremental element processing (e.g., TOTP form auto-submit)
            error_message = str(inc_error).lower()
            if (
                "execution context was destroyed" in error_message
                or "navigation" in error_message
                or "target closed" in error_message
            ):
                # These are expected during page navigation/auto-submit, silently continue
                LOG.debug(
                    "Playwright error during incremental element processing (likely page navigation)",
                    task_id=task.task_id,
                    step_id=step.step_id,
                    error_type=type(inc_error).__name__,
                    error_message=error_message,
                )
            else:
                LOG.warning(
                    "Unexpected Playwright error during incremental element processing",
                    task_id=task.task_id,
                    step_id=step.step_id,
                    error_type=type(inc_error).__name__,
                    error_message=str(inc_error),
                )
        except Exception as inc_error:
            # Handle any other unexpected errors during incremental element processing
            LOG.warning(
                "Unexpected error during incremental element processing",
                task_id=task.task_id,
                step_id=step.step_id,
                error_type=type(inc_error).__name__,
                error_message=str(inc_error),
            )
        finally:
            # Always stop listening
            await incremental_scraped.stop_listen_dom_increment()
        return [ActionSuccess()]
    except Exception as e:
        # Handle any other unexpected errors during text input
        LOG.exception(
            "Failed to input the value or finish the auto completion",
            task_id=task.task_id,
@@ -3738,6 +3936,21 @@ async def _get_input_or_select_context(
    json_response = await app.PARSE_SELECT_LLM_API_HANDLER(
        prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
    )
    # Handle edge case where LLM returns list instead of dict
    if isinstance(json_response, list):
        LOG.warning(
            "LLM returned list instead of dict for input/select context parsing",
            step_id=step.step_id,
            original_response_type=type(json_response).__name__,
            original_response_length=len(json_response) if json_response else 0,
            first_item_type=type(json_response[0]).__name__ if json_response else None,
            first_item_keys=list(json_response[0].keys())
            if json_response and isinstance(json_response[0], dict)
            else None,
        )
        json_response = json_response[0] if json_response else {}
    json_response["intention"] = action.intention
    input_or_select_context = InputOrSelectContext.model_validate(json_response)
    LOG.info(