diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts index a869b559..53614521 100644 --- a/skyvern-frontend/src/api/types.ts +++ b/skyvern-frontend/src/api/types.ts @@ -244,6 +244,7 @@ export const ActionTypes = { Drag: "drag", LeftMouse: "left_mouse", GotoUrl: "goto_url", + ClosePage: "close_page", } as const; export type ActionType = (typeof ActionTypes)[keyof typeof ActionTypes]; @@ -269,6 +270,7 @@ export const ReadableActionTypes: { drag: "Drag", left_mouse: "Left Mouse", goto_url: "Goto URL", + close_page: "Close Page", }; export type Option = { diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 0461f0d5..150e24fa 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -218,6 +218,11 @@ class FailedToStopLoadingPage(SkyvernException): super().__init__(f"Failed to stop loading page url {url}. Error message: {error_message}") +class EmptyBrowserContext(SkyvernException): + def __init__(self) -> None: + super().__init__("Browser context is empty") + + class UnexpectedTaskStatus(SkyvernException): def __init__(self, task_id: str, status: str) -> None: super().__init__(f"Unexpected task status {status} for task {task_id}") diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 4c2ffc47..1221139f 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -2185,6 +2185,7 @@ class ForgeAgent: complete_criterion=task.complete_criterion.strip() if task.complete_criterion else None, terminate_criterion=task.terminate_criterion.strip() if task.terminate_criterion else None, parse_select_feature_enabled=context.enable_parse_select_in_extract, + has_magic_link_page=context.has_magic_link_page(task.task_id), ) # Store static prompt for caching and return dynamic prompt @@ -2214,6 +2215,7 @@ class ForgeAgent: complete_criterion=task.complete_criterion.strip() if task.complete_criterion else None, terminate_criterion=task.terminate_criterion.strip() if task.terminate_criterion else None, parse_select_feature_enabled=context.enable_parse_select_in_extract, + has_magic_link_page=context.has_magic_link_page(task.task_id), ) return full_prompt, use_caching @@ -3273,10 +3275,13 @@ class ForgeAgent: if not otp_value or otp_value.get_otp_type() != OTPType.MAGIC_LINK: return [] - # TODO: not sure whether all magic links can directly login + navigate to the homepage + # always open a new tab to navigate to the magic link + page = await browser_state.new_page() + context = skyvern_context.ensure_context() + context.add_magic_link_page(task.task_id, page) + return [ GotoUrlAction( - action_type=ActionType.GOTO_URL, reasoning="Navigating to the magic link URL to verify the login", intention="Navigating to the magic link URL to verify the login", url=otp_value.value, @@ -3286,6 +3291,7 @@ class ForgeAgent: step_id=step.step_id, step_order=step.order, action_order=0, + is_magic_link=True, ), ] diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 8bf9a874..9d3e9898 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -18,7 +18,7 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, describe the intention behind the action. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. + "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index 21a978d2..0eb9dd17 100644 --- a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from typing import Any from zoneinfo import ZoneInfo -from playwright.async_api import Frame +from playwright.async_api import Frame, Page @dataclass @@ -46,6 +46,15 @@ class SkyvernContext: script_run_parameters: dict[str, Any] = field(default_factory=dict) script_mode: bool = False ai_mode_override: str | None = None + + # magic link handling + # task_id is the key, page is the value + # we only consider the page is a magic link page in the same task scope + # for example, login block has a magic link page, + # but it will only be considered as a magic link page in the login block scope + # next blocks won't consider the page as a magic link page + magic_link_pages: dict[str, Page] = field(default_factory=dict) + """ Example output value: {"loop_value": "str", "output_parameter": "the key of the parameter", "output_value": Any} @@ -62,6 +71,19 @@ class SkyvernContext: if task_id in self.totp_codes: self.totp_codes.pop(task_id) + def add_magic_link_page(self, task_id: str, page: Page) -> None: + self.magic_link_pages[task_id] = page + + def has_magic_link_page(self, task_id: str) -> bool: + if task_id not in self.magic_link_pages: + return False + + page = self.magic_link_pages[task_id] + if page.is_closed(): + self.magic_link_pages.pop(task_id) + return False + return True + _context: ContextVar[SkyvernContext | None] = ContextVar( "Global context", diff --git a/skyvern/webeye/actions/action_types.py b/skyvern/webeye/actions/action_types.py index 9666b35c..fc92b2ec 100644 --- a/skyvern/webeye/actions/action_types.py +++ b/skyvern/webeye/actions/action_types.py @@ -17,6 +17,7 @@ class ActionType(StrEnum): TERMINATE = "terminate" COMPLETE = "complete" RELOAD_PAGE = "reload_page" + CLOSE_PAGE = "close_page" EXTRACT = "extract" VERIFICATION_CODE = "verification_code" diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index f4403709..a6229c66 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -125,6 +125,10 @@ class Action(BaseModel): return SolveCaptchaAction.model_validate(value) elif action_type is ActionType.RELOAD_PAGE: return ReloadPageAction.model_validate(value) + elif action_type is ActionType.GOTO_URL: + return GotoUrlAction.model_validate(value) + elif action_type is ActionType.CLOSE_PAGE: + return ClosePageAction.model_validate(value) else: raise ValueError(f"Unsupported action type: {action_type}") else: @@ -153,6 +157,11 @@ class ReloadPageAction(Action): action_type: ActionType = ActionType.RELOAD_PAGE +# TODO: right now, it's only enabled when there's magic link during login +class ClosePageAction(Action): + action_type: ActionType = ActionType.CLOSE_PAGE + + class ClickAction(WebAction): action_type: ActionType = ActionType.CLICK file_url: str | None = None @@ -263,6 +272,7 @@ class KeypressAction(Action): class GotoUrlAction(Action): action_type: ActionType = ActionType.GOTO_URL url: str + is_magic_link: bool = False # if True, shouldn't go to url directly when replaying the cache class MoveAction(Action): diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index a991aa67..f8ae2fd6 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -2180,6 +2180,18 @@ async def handle_goto_url_action( return [ActionSuccess()] +@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"]) +async def handle_close_page_action( + action: actions.ClosePageAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + await page.close(reason=action.reasoning) + return [ActionSuccess()] + + ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action) ActionHandler.register_action_type(ActionType.CLICK, handle_click_action) ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action) @@ -2198,6 +2210,7 @@ ActionHandler.register_action_type(ActionType.DRAG, handle_drag_action) ActionHandler.register_action_type(ActionType.VERIFICATION_CODE, handle_verification_code_action) ActionHandler.register_action_type(ActionType.LEFT_MOUSE, handle_left_mouse_action) ActionHandler.register_action_type(ActionType.GOTO_URL, handle_goto_url_action) +ActionHandler.register_action_type(ActionType.CLOSE_PAGE, handle_close_page_action) async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 70b420e4..4b0cf999 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -21,6 +21,7 @@ from skyvern.webeye.actions.actions import ( Action, CheckboxAction, ClickAction, + ClosePageAction, CompleteAction, DownloadFileAction, DragAction, @@ -168,6 +169,9 @@ def parse_action( if action_type == ActionType.SOLVE_CAPTCHA: return SolveCaptchaAction(**base_action_dict) + if action_type == ActionType.CLOSE_PAGE: + return ClosePageAction(**base_action_dict) + raise UnsupportedActionType(action_type=action_type) @@ -826,6 +830,7 @@ async def generate_cua_fallback_actions( url=magic_link, reasoning=reasoning, intention=reasoning, + is_magic_link=True, ) except NoTOTPVerificationCodeFound: reasoning_suffix = "No magic link found" diff --git a/skyvern/webeye/browser_factory.py b/skyvern/webeye/browser_factory.py index 7cc98d3d..a697ee55 100644 --- a/skyvern/webeye/browser_factory.py +++ b/skyvern/webeye/browser_factory.py @@ -25,6 +25,7 @@ from pydantic import BaseModel, PrivateAttr from skyvern.config import settings from skyvern.constants import BROWSER_CLOSE_TIMEOUT, BROWSER_DOWNLOAD_TIMEOUT, NAVIGATION_MAX_RETRY_TIME, SKYVERN_DIR from skyvern.exceptions import ( + EmptyBrowserContext, FailedToNavigateToUrl, FailedToReloadPage, FailedToStopLoadingPage, @@ -899,6 +900,11 @@ class BrowserState: LOG.exception(f"Error while stop loading the page: {repr(e)}") raise FailedToStopLoadingPage(url=page.url, error_message=repr(e)) + async def new_page(self) -> Page: + if self.browser_context is None: + raise EmptyBrowserContext() + return await self.browser_context.new_page() + async def reload_page(self) -> None: page = await self.__assert_page()