From 087275492fe2110f38484a11b07bd162ce724230 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Wed, 6 Nov 2024 10:15:47 +0800 Subject: [PATCH] download with file extension (#1142) --- skyvern/constants.py | 1 + skyvern/webeye/browser_factory.py | 66 ++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/skyvern/constants.py b/skyvern/constants.py index 1ae050df..85a2ec76 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -9,6 +9,7 @@ REPO_ROOT_DIR = SKYVERN_DIR.parent INPUT_TEXT_TIMEOUT = 120000 # 2 minutes PAGE_CONTENT_TIMEOUT = 300 # 5 mins BROWSER_CLOSE_TIMEOUT = 180 # 3 minute +BROWSER_DOWNLOAD_TIMEOUT = 600 # 10 minute # reserved fields for navigation payload SPECIAL_FIELD_VERIFICATION_CODE = "verification_code" diff --git a/skyvern/webeye/browser_factory.py b/skyvern/webeye/browser_factory.py index 7907d1dc..bdb5e356 100644 --- a/skyvern/webeye/browser_factory.py +++ b/skyvern/webeye/browser_factory.py @@ -6,15 +6,16 @@ import tempfile import time import uuid from datetime import datetime +from pathlib import Path from typing import Any, Awaitable, Callable, Protocol import aiofiles import structlog -from playwright.async_api import BrowserContext, ConsoleMessage, Error, Page, Playwright +from playwright.async_api import BrowserContext, ConsoleMessage, Download, Error, Page, Playwright from pydantic import BaseModel, PrivateAttr from skyvern.config import settings -from skyvern.constants import BROWSER_CLOSE_TIMEOUT, REPO_ROOT_DIR +from skyvern.constants import BROWSER_CLOSE_TIMEOUT, BROWSER_DOWNLOAD_TIMEOUT, REPO_ROOT_DIR from skyvern.exceptions import ( FailedToNavigateToUrl, FailedToReloadPage, @@ -68,6 +69,66 @@ def set_browser_console_log(browser_context: BrowserContext, browser_artifacts: browser_context.on("console", browser_console_log) +def set_download_file_listener(browser_context: BrowserContext, **kwargs: Any) -> None: + async def listen_to_download(download: Download) -> None: + try: + workflow_run_id = kwargs.get("workflow_run_id") + task_id = kwargs.get("task_id") + + async with asyncio.timeout(BROWSER_DOWNLOAD_TIMEOUT): + file_path = await download.path() + if file_path.suffix: + return + + LOG.info( + "No file extensions, going to add file extension automatically", + workflow_run_id=workflow_run_id, + task_id=task_id, + suggested_filename=download.suggested_filename, + url=download.url, + ) + suffix = Path(download.suggested_filename).suffix + if suffix: + LOG.info( + "Add extension according to suggested filename", + workflow_run_id=workflow_run_id, + task_id=task_id, + filepath=str(file_path) + suffix, + ) + file_path.rename(str(file_path) + suffix) + return + suffix = Path(download.url).suffix + if suffix: + LOG.info( + "Add extension according to download url", + workflow_run_id=workflow_run_id, + task_id=task_id, + filepath=str(file_path) + suffix, + ) + file_path.rename(str(file_path) + suffix) + return + # TODO: maybe should try to parse it from URL response + except asyncio.TimeoutError: + LOG.error( + "timeout to download file, going to cancel the download", + workflow_run_id=workflow_run_id, + task_id=task_id, + ) + await download.cancel() + + except Exception: + LOG.exception( + "Failed to add file extension name to downloaded file", + workflow_run_id=workflow_run_id, + task_id=task_id, + ) + + def listen_to_new_page(page: Page) -> None: + page.on("download", listen_to_download) + + browser_context.on("page", listen_to_new_page) + + class BrowserContextCreator(Protocol): def __call__( self, playwright: Playwright, **kwargs: dict[str, Any] @@ -145,6 +206,7 @@ class BrowserContextFactory: raise UnknownBrowserType(browser_type) browser_context, browser_artifacts, cleanup_func = await creator(playwright, **kwargs) set_browser_console_log(browser_context=browser_context, browser_artifacts=browser_artifacts) + set_download_file_listener(browser_context=browser_context, **kwargs) return browser_context, browser_artifacts, cleanup_func except Exception as e: if browser_context is not None: