From a9c8a1d88943e79e2afd1a8e073271340e74a6a5 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Wed, 16 Apr 2025 17:11:01 -0700 Subject: [PATCH] CUA feature - verification code / 2FA (#2174) --- skyvern/forge/agent.py | 52 +++++++++++++++---------- skyvern/services/totp_service.py | 0 skyvern/webeye/actions/actions.py | 6 +++ skyvern/webeye/actions/handler.py | 19 +++++++++ skyvern/webeye/actions/parse_actions.py | 35 ++++++++++++++--- 5 files changed, 86 insertions(+), 26 deletions(-) create mode 100644 skyvern/services/totp_service.py diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index a4b208e9..b055596b 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1273,26 +1273,38 @@ class ForgeAgent: screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8") if last_call_id is None: - # try address the conversation with the context we have - reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None - assistant_message = assistant_messages[0].content[0].text if assistant_messages else None - skyvern_repsonse_prompt = load_prompt_with_elements( - scraped_page=scraped_page, - prompt_engine=prompt_engine, - template_name="cua-answer-question", - navigation_goal=task.navigation_goal, - assistant_reasoning=reasoning, - assistant_message=assistant_message, - ) - skyvern_response = await app.LLM_API_HANDLER( - prompt=skyvern_repsonse_prompt, - prompt_name="cua-answer-question", - step=step, - screenshots=scraped_page.screenshots, - ) - resp_content = skyvern_response.get("answer") - if not resp_content: - resp_content = "I don't know. Can you help me make the best decision to achieve the goal?" + current_context = skyvern_context.ensure_context() + resp_content = None + if task.task_id in current_context.totp_codes: + verification_code = current_context.totp_codes[task.task_id] + current_context.totp_codes.pop(task.task_id) + LOG.info( + "Using verification code from context", + task_id=task.task_id, + verification_code=verification_code, + ) + resp_content = f"Here is the verification code: {verification_code}" + else: + # try address the conversation with the context we have + reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None + assistant_message = assistant_messages[0].content[0].text if assistant_messages else None + skyvern_repsonse_prompt = load_prompt_with_elements( + scraped_page=scraped_page, + prompt_engine=prompt_engine, + template_name="cua-answer-question", + navigation_goal=task.navigation_goal, + assistant_reasoning=reasoning, + assistant_message=assistant_message, + ) + skyvern_response = await app.LLM_API_HANDLER( + prompt=skyvern_repsonse_prompt, + prompt_name="cua-answer-question", + step=step, + screenshots=scraped_page.screenshots, + ) + resp_content = skyvern_response.get("answer") + if not resp_content: + resp_content = "I don't know. Can you help me make the best decision to achieve the goal?" current_response = await app.OPENAI_CLIENT.responses.create( model="computer-use-preview", previous_response_id=previous_response.id, diff --git a/skyvern/services/totp_service.py b/skyvern/services/totp_service.py new file mode 100644 index 00000000..e69de29b diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 6cc4a064..9342fee3 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -33,6 +33,7 @@ class ActionType(StrEnum): TYPE = "type" MOVE = "move" DRAG = "drag" + VERIFICATION_CODE = "verification_code" def is_web_action(self) -> bool: return self in [ @@ -293,6 +294,11 @@ class DragAction(Action): path: list[tuple[int, int]] = [] +class VerificationCodeAction(Action): + action_type: ActionType = ActionType.VERIFICATION_CODE + verification_code: str + + class ScrapeResult(BaseModel): """ Scraped response from a webpage, including: diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index e7b2ba63..3b2186fd 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -1559,6 +1559,24 @@ async def handle_drag_action( return [ActionSuccess()] +async def handle_verification_code_action( + action: actions.VerificationCodeAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + LOG.info( + "Setting verification code in skyvern context", + task_id=task.task_id, + step_id=step.step_id, + verification_code=action.verification_code, + ) + current_context = skyvern_context.ensure_context() + current_context.totp_codes[task.task_id] = action.verification_code + return [ActionSuccess()] + + ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action) ActionHandler.register_action_type(ActionType.CLICK, handle_click_action) ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action) @@ -1574,6 +1592,7 @@ ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action) ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action) ActionHandler.register_action_type(ActionType.MOVE, handle_move_action) ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action) +ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action) async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 1e8d5276..853ab729 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -27,8 +27,10 @@ from skyvern.webeye.actions.actions import ( SolveCaptchaAction, TerminateAction, UploadFileAction, + VerificationCodeAction, WaitAction, ) +from skyvern.webeye.actions.handler import poll_verification_code from skyvern.webeye.scraper.scraper import ScrapedPage LOG = structlog.get_logger() @@ -400,12 +402,33 @@ async def parse_cua_actions( intention=reasoning, ) elif skyvern_action_type == "get_verification_code": - # Currently we don't support verification code - # TODO: handle verification code by fetching the code and send it to CUA - action = TerminateAction( - reasoning=reasoning, - intention=reasoning, - ) + if (task.totp_verification_url or task.totp_identifier) and task.organization_id: + LOG.info( + "Getting verification code for CUA", + task_id=task.task_id, + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + totp_verification_url=task.totp_verification_url, + totp_identifier=task.totp_identifier, + ) + verification_code = await poll_verification_code( + task.task_id, + task.organization_id, + workflow_run_id=task.workflow_run_id, + totp_verification_url=task.totp_verification_url, + totp_identifier=task.totp_identifier, + ) + reasoning = reasoning or f"Received verification code: {verification_code}" + action = VerificationCodeAction( + verification_code=verification_code, + reasoning=reasoning, + intention=reasoning, + ) + else: + action = TerminateAction( + reasoning=reasoning, + intention=reasoning, + ) action.organization_id = task.organization_id action.workflow_run_id = task.workflow_run_id