From 6fc56d977576472ce697805ad4428fe9b1138bf8 Mon Sep 17 00:00:00 2001 From: pedrohsdb Date: Mon, 6 Oct 2025 16:37:35 -0700 Subject: [PATCH] Pedro/support_multi_field_6_digit_totp (#3622) --- skyvern/errors/errors.py | 5 + skyvern/forge/agent.py | 192 ++++++++++++++++++++++++- skyvern/webeye/actions/actions.py | 3 + skyvern/webeye/actions/handler.py | 223 +++++++++++++++++++++++++++++- 4 files changed, 415 insertions(+), 8 deletions(-) diff --git a/skyvern/errors/errors.py b/skyvern/errors/errors.py index a8d339a6..af07f069 100644 --- a/skyvern/errors/errors.py +++ b/skyvern/errors/errors.py @@ -45,3 +45,8 @@ class GetTOTPVerificationCodeError(SkyvernDefinedError): class TimeoutGetTOTPVerificationCodeError(SkyvernDefinedError): error_code: str = "OTP_TIMEOUT" reasoning: str = "Timeout getting TOTP verification code." + + +class TOTPExpiredError(SkyvernDefinedError): + error_code: str = "OTP_EXPIRED" + reasoning: str = "TOTP verification code has expired during multi-field input sequence." diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 5221084e..393a506b 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1202,14 +1202,47 @@ class ForgeAgent: # Do not verify the complete action when complete_verification is False # set verified to True will skip the completion verification action.verified = True + + # Pass TOTP secret to handler for multi-field TOTP sequences + # Handler will generate TOTP at execution time + if ( + action.action_type == ActionType.INPUT_TEXT + and self._is_multi_field_totp_sequence(actions) + and (totp_secret := skyvern_context.ensure_context().totp_codes.get(f"{task.task_id}_secret")) + ): + # Pass TOTP secret to handler for execution-time generation + action.totp_timing_info = { + "is_totp_sequence": True, + "action_index": action_idx, + "totp_secret": totp_secret, + "is_retry": step.retry_index > 0, + } + results = await ActionHandler.handle_action(scraped_page, task, step, current_page, action) await app.AGENT_FUNCTION.post_action_execution() detailed_agent_step_output.actions_and_results[action_idx] = ( action, results, ) - # wait random time between actions to avoid detection - await asyncio.sleep(random.uniform(0.5, 1.0)) + + # Determine wait time between actions + wait_time = random.uniform(0.5, 1.0) + + # For multi-field TOTP sequences, use zero delay between all digits for fast execution + if action.action_type == ActionType.INPUT_TEXT and self._is_multi_field_totp_sequence(actions): + current_text = action.text if hasattr(action, "text") else None + + if current_text and len(current_text) == 1 and current_text.isdigit(): + # Zero delay between all TOTP digits for fast execution + wait_time = 0.0 + LOG.debug( + "TOTP: zero delay for digit", + task_id=task.task_id, + action_idx=action_idx, + digit=current_text, + ) + + await asyncio.sleep(wait_time) await self.record_artifacts_after_action(task, step, browser_state, engine) for result in results: result.step_retry_number = step.retry_index @@ -1306,6 +1339,21 @@ class ForgeAgent: action_results=action_results, ) + # Clean up TOTP cache after multi-field TOTP sequence completion + if self._is_multi_field_totp_sequence(actions): + context = skyvern_context.ensure_context() + cache_key = f"{task.task_id}_totp_cache" + if cache_key in context.totp_codes: + context.totp_codes.pop(cache_key) + LOG.debug( + "Cleaned up TOTP cache after multi-field sequence completion", + task_id=task.task_id, + ) + + secret_key = f"{task.task_id}_secret" + if secret_key in context.totp_codes: + context.totp_codes.pop(secret_key) + # Check if Skyvern already returned a complete action, if so, don't run user goal check has_decisive_action = False if detailed_agent_step_output and detailed_agent_step_output.actions_and_results: @@ -2065,7 +2113,7 @@ class ForgeAgent: await SkyvernFrame.evaluate(frame=page, expression="() => document.location.href") if page else starting_url ) final_navigation_payload = self._build_navigation_payload( - task, expire_verification_code=expire_verification_code + task, expire_verification_code=expire_verification_code, step=step, scraped_page=scraped_page ) task_type = task.task_type if task.task_type else TaskType.general @@ -2167,12 +2215,124 @@ class ForgeAgent: return full_prompt, use_caching + def _should_process_totp(self, scraped_page: ScrapedPage | None) -> bool: + """Detect TOTP pages by checking for multiple input fields or verification keywords.""" + if not scraped_page: + return False + + try: + # Count input fields that could be for TOTP (more flexible than maxlength="1") + input_fields = [ + element + for element in scraped_page.elements + if element.get("tagName", "").lower() == "input" + and element.get("attributes", {}).get("type", "text").lower() in ["text", "number", "tel"] + ] + + # Check for multiple input fields (potential multi-field TOTP) + if len(input_fields) >= 4: + # Additional check: look for patterns that suggest multi-field TOTP + # Check if inputs are close together or have similar attributes + has_maxlength_1 = any(elem.get("attributes", {}).get("maxlength") == "1" for elem in input_fields) + + # Check for input fields with numeric patterns (type="number", pattern for digits) + has_numeric_patterns = any( + elem.get("attributes", {}).get("type") == "number" + or elem.get("attributes", {}).get("pattern", "").isdigit() + or "digit" in elem.get("attributes", {}).get("pattern", "").lower() + for elem in input_fields + ) + + if has_maxlength_1 or has_numeric_patterns: + return True + + # Check for TOTP-related keywords in page content + page_text = scraped_page.html.lower() if scraped_page.html else "" + totp_keywords = [ + "verification code", + "authentication code", + "security code", + "2fa", + "two-factor", + "totp", + "authenticator", + "verification", + "enter code", + "verification number", + "security number", + ] + + keyword_matches = sum(1 for keyword in totp_keywords if keyword in page_text) + + # If we have multiple TOTP keywords and multiple input fields, likely TOTP + if keyword_matches >= 2 and len(input_fields) >= 6: + return True + + # Strong single keyword match with multiple inputs + strong_keywords = ["verification code", "authentication code", "2fa", "two-factor"] + if any(keyword in page_text for keyword in strong_keywords) and len(input_fields) >= 3: + return True + + return False + + except Exception: + return False + + def _is_multi_field_totp_sequence(self, actions: list) -> bool: + """ + Check if the action sequence represents a multi-field TOTP input (6 single-digit fields). + + Args: + actions: List of actions to analyze + + Returns: + bool: True if this is a multi-field TOTP sequence + """ + # Must have at least 4 actions (minimum for TOTP) + if len(actions) < 4: + return False + + # Check if we have multiple consecutive single-digit INPUT_TEXT actions + consecutive_single_digits = 0 + max_consecutive = 0 + + for action in actions: + if ( + action.action_type == ActionType.INPUT_TEXT + and hasattr(action, "text") + and action.text + and len(action.text) == 1 + and action.text.isdigit() + ): + consecutive_single_digits += 1 + max_consecutive = max(max_consecutive, consecutive_single_digits) + else: + # If we hit a non-single-digit action, reset consecutive counter + consecutive_single_digits = 0 + + # Consider it a multi-field TOTP if we have 4+ consecutive single-digit inputs + # This is more reliable than just counting total single digits + # We use 4+ as the threshold to avoid false positives with single TOTP fields + is_multi_field_totp = max_consecutive >= 4 + + if is_multi_field_totp: + LOG.debug( + "Detected multi-field TOTP sequence", + max_consecutive=max_consecutive, + total_actions=len(actions), + ) + + return is_multi_field_totp + def _build_navigation_payload( self, task: Task, expire_verification_code: bool = False, + step: Step | None = None, + scraped_page: ScrapedPage | None = None, ) -> dict[str, Any] | list | str | None: final_navigation_payload = task.navigation_payload + current_context = skyvern_context.ensure_context() verification_code = current_context.totp_codes.get(task.task_id) if (task.totp_verification_url or task.totp_identifier) and verification_code: @@ -2190,6 +2350,32 @@ class ForgeAgent: ) if expire_verification_code: current_context.totp_codes.pop(task.task_id) + + # Store TOTP secrets and provide placeholder TOTP for LLM to see format + # Only when on a TOTP page to avoid premature processing + if ( + task.workflow_run_id + and step + and isinstance(final_navigation_payload, dict) + and self._should_process_totp(scraped_page) + ): + workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(task.workflow_run_id) + + for key, value in list(final_navigation_payload.items()): + if isinstance(value, dict) and "totp" in value: + totp_placeholder = value.get("totp") + if totp_placeholder and isinstance(totp_placeholder, str): + totp_secret_key = workflow_run_context.totp_secret_value_key(totp_placeholder) + totp_secret = workflow_run_context.get_original_secret_value_or_none(totp_secret_key) + + if totp_secret: + # Store TOTP secret for handler to use during execution + current_context = skyvern_context.ensure_context() + current_context.totp_codes[f"{task.task_id}_secret"] = totp_secret + + # Send a placeholder TOTP for the LLM to see the format + final_navigation_payload[key]["totp"] = "123456" + return final_navigation_payload async def _get_action_results(self, task: Task, current_step: Step | None = None) -> str: diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 75bed40d..6f10ce73 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -88,6 +88,9 @@ class Action(BaseModel): is_checked: bool | None = None verified: bool = False + # TOTP timing information for multi-field TOTP sequences + totp_timing_info: dict[str, Any] | None = None + created_at: datetime | None = None modified_at: datetime | None = None created_by: str | None = None diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 8d99aa8e..1906c22e 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -2,6 +2,7 @@ import asyncio import copy import json import os +import time import urllib.parse import uuid from datetime import datetime, timedelta, timezone @@ -10,6 +11,7 @@ from typing import Any, Awaitable, Callable, List import pyotp import structlog +from playwright._impl._errors import Error as PlaywrightError from playwright.async_api import FileChooser, Frame, Locator, Page, TimeoutError from pydantic import BaseModel @@ -22,6 +24,7 @@ from skyvern.constants import ( REPO_ROOT_DIR, SKYVERN_ID_ATTR, ) +from skyvern.errors.errors import TOTPExpiredError from skyvern.exceptions import ( DownloadFileMaxWaitingTime, EmptySelect, @@ -929,6 +932,128 @@ async def handle_click_to_download_file_action( return [ActionSuccess(download_triggered=True)] +# TOTP timing constants +TOTP_TIME_STEP_SECONDS = 30 +TOTP_EXPIRY_THRESHOLD_SECONDS = 20 + + +async def _handle_multi_field_totp_sequence( + timing_info: dict[str, Any], + task: Task, +) -> list[ActionResult] | None: + """ + Handle TOTP generation and caching for multi-field TOTP sequences. + + Returns: + ActionFailure if TOTP handling failed, None if successful + """ + action_index = timing_info["action_index"] + cache_key = f"{task.task_id}_totp_cache" + current_context = skyvern_context.ensure_context() + + if action_index == 0: + # First digit: generate TOTP and cache it + totp_secret = timing_info["totp_secret"] + totp = pyotp.TOTP(totp_secret) + + # Check current TOTP expiry time + current_time = int(time.time()) + current_totp_valid_until = ((current_time // TOTP_TIME_STEP_SECONDS) + 1) * TOTP_TIME_STEP_SECONDS + seconds_until_expiry = current_totp_valid_until - current_time + + # If less than threshold seconds until expiry, use the next TOTP + if seconds_until_expiry < TOTP_EXPIRY_THRESHOLD_SECONDS: + # Force generation of next TOTP by advancing time + next_time = current_totp_valid_until + current_totp = totp.at(next_time) + + LOG.debug( + "Using multi-field TOTP flow - using NEXT TOTP due to <20s expiry", + task_id=task.task_id, + action_idx=action_index, + current_totp=totp.now(), + next_totp=current_totp, + seconds_until_expiry=seconds_until_expiry, + is_retry=timing_info.get("is_retry", False), + ) + else: + # Use current TOTP + current_totp = totp.now() + + current_context.totp_codes[cache_key] = current_totp + else: + # Subsequent digits: reuse cached TOTP + current_totp = current_context.totp_codes.get(cache_key) + if not current_totp: + # TOTP cache missing for subsequent digit - this should not happen + # If it does, something went wrong with the first digit, so fail the action + LOG.error( + "TOTP cache missing for subsequent digit - first digit may have failed", + task_id=task.task_id, + action_idx=action_index, + cache_key=cache_key, + ) + return [ActionFailure(TOTPExpiredError())] + + # Check if cached TOTP has expired + totp_secret = timing_info["totp_secret"] + totp = pyotp.TOTP(totp_secret) + + # Get current time and calculate TOTP expiry + current_time = int(time.time()) + totp_valid_until = ((current_time // TOTP_TIME_STEP_SECONDS) + 1) * TOTP_TIME_STEP_SECONDS + + if current_time >= totp_valid_until: + LOG.error( + "Cached TOTP has expired during multi-field sequence", + task_id=task.task_id, + action_idx=action_index, + current_time=current_time, + totp_valid_until=totp_valid_until, + cached_totp=current_totp, + ) + return [ActionFailure(TOTPExpiredError())] + + LOG.debug( + "Using multi-field TOTP flow - reusing cached TOTP", + task_id=task.task_id, + action_idx=action_index, + totp=current_totp, + current_time=current_time, + totp_valid_until=totp_valid_until, + ) + + # Special handling for the 6th digit (action_index=5): wait if TOTP is not yet valid + if action_index == 5: + # Calculate when this TOTP becomes valid (valid_from time) + # If we used the next TOTP window, valid_from is the start of that window + totp_valid_from = totp_valid_until - TOTP_TIME_STEP_SECONDS + + if current_time < totp_valid_from: + # TOTP is not yet valid, wait until it becomes valid + wait_seconds = totp_valid_from - current_time + + LOG.debug( + "6th digit: TOTP not yet valid, waiting until valid_from", + task_id=task.task_id, + action_idx=action_index, + current_time=current_time, + totp_valid_from=totp_valid_from, + wait_seconds=wait_seconds, + totp=current_totp, + ) + + await asyncio.sleep(wait_seconds) + + LOG.debug( + "6th digit: Finished waiting, TOTP is now valid", + task_id=task.task_id, + action_idx=action_index, + ) + + return None # Success + + @TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) async def handle_input_text_action( action: actions.InputTextAction, @@ -954,9 +1079,17 @@ async def handle_input_text_action( # before filling text, we need to validate if the element can be filled if it's not one of COMMON_INPUT_TAGS tag_name = scraped_page.id_to_element_dict[action.element_id]["tagName"].lower() - text: str | None = await get_actual_value_of_parameter_if_secret(task, action.text) - if text is None: - return [ActionFailure(FailedToFetchSecret())] + + # Check if this is multi-field TOTP first - if so, skip secret resolution + if action.totp_timing_info and action.totp_timing_info.get("is_totp_sequence"): + # For multi-field TOTP, we'll set text directly in the TOTP logic below + text: str = "" + else: + # For regular inputs, resolve secrets + text_result = await get_actual_value_of_parameter_if_secret(task, action.text) + if text_result is None: + return [ActionFailure(FailedToFetchSecret())] + text = text_result is_totp_value = ( text == BitwardenConstants.TOTP or text == OnePasswordConstants.TOTP or text == AzureVaultConstants.TOTP @@ -1198,6 +1331,32 @@ async def handle_input_text_action( await skyvern_element.input(text) return [ActionSuccess()] + # Handle TOTP generation for multi-field TOTP sequences + if action.totp_timing_info: + timing_info = action.totp_timing_info + if timing_info.get("is_totp_sequence"): + result = await _handle_multi_field_totp_sequence(timing_info, task) + if result is not None: + return result # Return ActionFailure if TOTP handling failed + + # Extract the digit for this action index + current_totp = skyvern_context.ensure_context().totp_codes.get(f"{task.task_id}_totp_cache") + action_index = timing_info["action_index"] + + if current_totp and len(current_totp) > action_index: + digit = current_totp[action_index] + action.text = digit + # Also update the text variable that will be used later + text = digit + else: + LOG.error( + "TOTP too short for action index", + task_id=task.task_id, + action_idx=action_index, + totp_length=len(current_totp) if current_totp else 0, + ) + return [ActionFailure(TOTPExpiredError())] + try: # TODO: not sure if this case will trigger auto-completion if tag_name not in COMMON_INPUT_TAGS: @@ -1246,18 +1405,57 @@ async def handle_input_text_action( try: await skyvern_element.input_sequentially(text=text) - finally: + incremental_element = await incremental_scraped.get_incremental_element_tree( clean_and_remove_element_tree_factory( - task=task, step=step, check_filter_funcs=[check_existed_but_not_option_element_in_dom_factory(dom)] + task=task, + step=step, + check_filter_funcs=[check_existed_but_not_option_element_in_dom_factory(dom)], ), ) if len(incremental_element) > 0: auto_complete_hacky_flag = True + except PlaywrightError as inc_error: + # Handle Playwright-specific errors during incremental element processing (e.g., TOTP form auto-submit) + error_message = str(inc_error).lower() + if ( + "execution context was destroyed" in error_message + or "navigation" in error_message + or "target closed" in error_message + ): + # These are expected during page navigation/auto-submit, silently continue + LOG.debug( + "Playwright error during incremental element processing (likely page navigation)", + task_id=task.task_id, + step_id=step.step_id, + error_type=type(inc_error).__name__, + error_message=error_message, + ) + else: + LOG.warning( + "Unexpected Playwright error during incremental element processing", + task_id=task.task_id, + step_id=step.step_id, + error_type=type(inc_error).__name__, + error_message=str(inc_error), + ) + except Exception as inc_error: + # Handle any other unexpected errors during incremental element processing + LOG.warning( + "Unexpected error during incremental element processing", + task_id=task.task_id, + step_id=step.step_id, + error_type=type(inc_error).__name__, + error_message=str(inc_error), + ) + finally: + # Always stop listening await incremental_scraped.stop_listen_dom_increment() return [ActionSuccess()] except Exception as e: + # Handle any other unexpected errors during text input + LOG.exception( "Failed to input the value or finish the auto completion", task_id=task.task_id, @@ -3738,6 +3936,21 @@ async def _get_input_or_select_context( json_response = await app.PARSE_SELECT_LLM_API_HANDLER( prompt=prompt, step=step, prompt_name="parse-input-or-select-context" ) + + # Handle edge case where LLM returns list instead of dict + if isinstance(json_response, list): + LOG.warning( + "LLM returned list instead of dict for input/select context parsing", + step_id=step.step_id, + original_response_type=type(json_response).__name__, + original_response_length=len(json_response) if json_response else 0, + first_item_type=type(json_response[0]).__name__ if json_response else None, + first_item_keys=list(json_response[0].keys()) + if json_response and isinstance(json_response[0], dict) + else None, + ) + json_response = json_response[0] if json_response else {} + json_response["intention"] = action.intention input_or_select_context = InputOrSelectContext.model_validate(json_response) LOG.info(