Wait for files to be fully downloaded before completing the task (#1707)

2025-02-03 23:49:46 +08:00
parent 1ba225002b
commit b43f1bfec2
5 changed files with 81 additions and 24 deletions
--- a/skyvern/constants.py
+++ b/skyvern/constants.py
@@ -18,6 +18,7 @@ GET_DOWNLOADED_FILES_TIMEOUT = 30
 NAVIGATION_MAX_RETRY_TIME = 5
 AUTO_COMPLETION_POTENTIAL_VALUES_COUNT = 5
 DROPDOWN_MENU_MAX_DISTANCE = 100
 BROWSER_DOWNLOADING_SUFFIX = ".crdownload"
 # reserved fields for navigation payload
 SPECIAL_FIELD_VERIFICATION_CODE = "verification_code"
--- a/skyvern/exceptions.py
+++ b/skyvern/exceptions.py
@@ -1,3 +1,5 @@
 from pathlib import Path
 from fastapi import status
@@ -252,6 +254,12 @@ class DownloadFileMaxSizeExceeded(SkyvernException):
        super().__init__(f"Download file size exceeded the maximum allowed size of {max_size} MB.")
 class DownloadFileMaxWaitingTime(SkyvernException):
    """Error carrying the list of files that were still downloading when waiting gave up.

    The paths passed to the constructor are exposed as ``downloading_files`` so
    that handlers can report them.
    """

    def __init__(self, downloading_files: list[Path]) -> None:
        # Keep the attribute assignment ahead of the base-class call, as callers
        # read ``downloading_files`` back off the caught exception.
        self.downloading_files = downloading_files
        message = f"Long-time downloading files [{downloading_files}]."
        super().__init__(message)
 class NoFileDownloadTriggered(SkyvernException):
    """Error reporting that clicking an element did not trigger a file download.

    The message embeds the ``element_id`` given to the constructor.
    """
    def __init__(self, element_id: str) -> None:
        super().__init__(f"Clicking on element doesn't trigger the file download. element_id={element_id}")
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -16,6 +16,7 @@ from playwright.async_api import Page
 from skyvern import analytics
 from skyvern.config import settings
 from skyvern.constants import (
    BROWSER_DOWNLOADING_SUFFIX,
    GET_DOWNLOADED_FILES_TIMEOUT,
    SAVE_DOWNLOADED_FILES_TIMEOUT,
    SCRAPE_TYPE_ORDER,
@@ -24,6 +25,7 @@ from skyvern.constants import (
 )
 from skyvern.exceptions import (
    BrowserStateMissingPage,
    DownloadFileMaxWaitingTime,
    EmptyScrapePage,
    FailedToNavigateToUrl,
    FailedToParseActionInstruction,
@@ -45,7 +47,13 @@ from skyvern.exceptions import (
 from skyvern.forge import app
 from skyvern.forge.async_operations import AgentPhase, AsyncOperationPool
 from skyvern.forge.prompts import prompt_engine
-from skyvern.forge.sdk.api.files import get_path_for_workflow_download_directory, list_files_in_directory, rename_file
+from skyvern.forge.sdk.api.files import (
    get_path_for_workflow_download_directory,
    list_downloading_files_in_directory,
    list_files_in_directory,
    rename_file,
    wait_for_download_finished,
 )
 from skyvern.forge.sdk.artifact.models import ArtifactType
 from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.core.security import generate_skyvern_webhook_headers
@@ -375,12 +383,31 @@ class ForgeAgent:
            if task_block and task_block.complete_on_download and task.workflow_run_id:
                workflow_download_directory = get_path_for_workflow_download_directory(task.workflow_run_id)
                downloading_files: list[Path] = list_downloading_files_in_directory(workflow_download_directory)
                if len(downloading_files) > 0:
                    LOG.info(
                        "Detecting files are still downloading, waiting for files to be completely downloaded.",
                        downloading_files=downloading_files,
                        step_id=step.step_id,
                    )
                    try:
                        await wait_for_download_finished(downloading_files=downloading_files)
                    except DownloadFileMaxWaitingTime as e:
                        LOG.warning(
                            "There're several long-time downloading files, these files might be broken",
                            downloading_files=e.downloading_files,
                            task_id=task.task_id,
                            step_id=step.step_id,
                            workflow_run_id=task.workflow_run_id,
                        )
                list_files_after = list_files_in_directory(workflow_download_directory)
                if len(list_files_after) > len(list_files_before):
                    files_to_rename = list(set(list_files_after) - set(list_files_before))
                    for file in files_to_rename:
                        file_extension = Path(file).suffix
-                        if file_extension == ".crdownload":
+                        if file_extension == BROWSER_DOWNLOADING_SUFFIX:
                            LOG.warning(
                                "Detecting incompleted download file, skip the rename",
                                file=file,
--- a/skyvern/forge/sdk/api/files.py
+++ b/skyvern/forge/sdk/api/files.py
@@ -1,3 +1,4 @@
 import asyncio
 import hashlib
 import mimetypes
 import os
@@ -13,8 +14,8 @@ import structlog
 from multidict import CIMultiDictProxy
 from skyvern.config import settings
-from skyvern.constants import REPO_ROOT_DIR
+from skyvern.constants import BROWSER_DOWNLOAD_TIMEOUT, BROWSER_DOWNLOADING_SUFFIX, REPO_ROOT_DIR
-from skyvern.exceptions import DownloadFileMaxSizeExceeded
+from skyvern.exceptions import DownloadFileMaxSizeExceeded, DownloadFileMaxWaitingTime
 from skyvern.forge.sdk.api.aws import AsyncAWSClient
 LOG = structlog.get_logger()
@@ -158,6 +159,34 @@ def list_files_in_directory(directory: Path, recursive: bool = False) -> list[st
    return listed_files
 def list_downloading_files_in_directory(
    directory: Path, downloading_suffix: str = BROWSER_DOWNLOADING_SUFFIX
 ) -> list[Path]:
    """Return the files in ``directory`` whose suffix marks them as still downloading.

    Args:
        directory: Directory scanned through ``list_files_in_directory``.
        downloading_suffix: Suffix identifying an unfinished download;
            defaults to ``BROWSER_DOWNLOADING_SUFFIX``.

    Returns:
        The paths whose ``Path.suffix`` equals ``downloading_suffix``, in the
        order ``list_files_in_directory`` yields them; empty when none match.
    """
    candidates = (Path(entry) for entry in list_files_in_directory(directory))
    return [candidate for candidate in candidates if candidate.suffix == downloading_suffix]
 async def wait_for_download_finished(downloading_files: list[Path], timeout: float = BROWSER_DOWNLOAD_TIMEOUT) -> None:
    """Wait until none of the given in-progress download files exists on disk any more.

    The paths are polled once per second; a path that no longer exists is
    treated as finished and dropped from the pending set.
    NOTE(review): presumably the browser removes/renames the temporary file
    when the download completes - confirm against the browser behaviour.

    Args:
        downloading_files: Paths of files currently being downloaded. The list
            itself is not modified.
        timeout: Maximum number of seconds to wait; defaults to
            ``BROWSER_DOWNLOAD_TIMEOUT``.

    Raises:
        DownloadFileMaxWaitingTime: If some files still exist when ``timeout``
            elapses. Its ``downloading_files`` holds the paths that were still
            present at the last check.
    """
    pending: list[Path] = list(downloading_files)
    try:
        async with asyncio.timeout(timeout):
            while True:
                pending = [path for path in pending if path.exists()]
                if not pending:
                    # Return immediately: sleeping after the last file vanished would
                    # add a needless second and could let the timeout fire with
                    # nothing left to wait for.
                    return
                await asyncio.sleep(1)
    except asyncio.TimeoutError as e:
        raise DownloadFileMaxWaitingTime(downloading_files=pending) from e
 def get_number_of_files_in_directory(directory: Path, recursive: bool = False) -> int:
    """Count the entries that ``list_files_in_directory`` reports for ``directory``.

    ``recursive`` is forwarded unchanged to ``list_files_in_directory``.
    """
    listed = list_files_in_directory(directory, recursive)
    return len(listed)
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -17,12 +17,12 @@ from skyvern.config import settings
 from skyvern.constants import (
    AUTO_COMPLETION_POTENTIAL_VALUES_COUNT,
    BROWSER_DOWNLOAD_MAX_WAIT_TIME,
    BROWSER_DOWNLOAD_TIMEOUT,
    DROPDOWN_MENU_MAX_DISTANCE,
    REPO_ROOT_DIR,
    SKYVERN_ID_ATTR,
 )
 from skyvern.exceptions import (
    DownloadFileMaxWaitingTime,
    EmptySelect,
    ErrEmptyTweakValue,
    ErrFoundSelectableElement,
@@ -52,7 +52,13 @@ from skyvern.exceptions import (
 )
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
-from skyvern.forge.sdk.api.files import download_file, get_download_dir, list_files_in_directory
+from skyvern.forge.sdk.api.files import (
    download_file,
    get_download_dir,
    list_downloading_files_in_directory,
    list_files_in_directory,
    wait_for_download_finished,
 )
 from skyvern.forge.sdk.api.llm.exceptions import LLMProviderError
 from skyvern.forge.sdk.core.aiohttp_helper import aiohttp_post
 from skyvern.forge.sdk.core.security import generate_skyvern_signature
@@ -505,12 +511,7 @@ async def handle_click_to_download_file_action(
        return [ActionFailure(exception=NoFileDownloadTriggered(action.element_id))]
    # check whether any file is still downloading
-    downloading_files: list[Path] = []
+    downloading_files = list_downloading_files_in_directory(download_dir)
    for file in list_files_after:
        path = Path(file)
        if path.suffix == ".crdownload":
            downloading_files.append(path)
    if len(downloading_files) == 0:
        return [ActionSuccess(download_triggered=True)]
@@ -522,20 +523,11 @@ async def handle_click_to_download_file_action(
        workflow_run_id=task.workflow_run_id,
    )
    try:
-        async with asyncio.timeout(BROWSER_DOWNLOAD_TIMEOUT):
+        await wait_for_download_finished(downloading_files=downloading_files)
-            while len(downloading_files) > 0:
+    except DownloadFileMaxWaitingTime as e:
                new_downloading_files: list[Path] = []
                for path in downloading_files:
                    if not path.exists():
                        continue
                    new_downloading_files.append(path)
                downloading_files = new_downloading_files
                await asyncio.sleep(1)
    except asyncio.TimeoutError:
        LOG.warning(
            "There're several long-time downloading files, these files might be broken",
-            downloading_files=downloading_files,
+            downloading_files=e.downloading_files,
            task_id=task.task_id,
            step_id=step.step_id,
            workflow_run_id=task.workflow_run_id,