remove hard waiting time in scraping (#3195)

This commit is contained in:
LawyZheng
2025-08-15 02:24:59 +08:00
committed by GitHub
parent f971cf8e58
commit cac4792f38
3 changed files with 45 additions and 8 deletions

View File

@@ -749,3 +749,8 @@ class ScriptNotFound(SkyvernHTTPException):
class NoTOTPSecretFound(SkyvernException):
    """Raised when no TOTP secret is available (message: "No TOTP secret found")."""

    def __init__(self) -> None:
        super().__init__("No TOTP secret found")
class NoElementFound(SkyvernException):
    """Raised when scraping yields no elements on the page.

    Used by scrape_web_unsafe in place of a bare Exception when
    ``not elements and not support_empty_page``.
    """

    def __init__(self) -> None:
        super().__init__("No element found.")

View File

@@ -13,7 +13,13 @@ from pydantic import BaseModel, PrivateAttr
from skyvern.config import settings
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat
from skyvern.exceptions import (
FailedToTakeScreenshot,
NoElementFound,
ScrapingFailed,
ScrapingFailedBlankPage,
UnknownElementTreeFormat,
)
from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.settings_manager import SettingsManager
@@ -524,7 +530,7 @@ async def scrape_web_unsafe(
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 3,
wait_seconds: float = 0,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -549,10 +555,22 @@ async def scrape_web_unsafe(
if url == "about:blank" and not support_empty_page:
raise ScrapingFailedBlankPage()
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
await asyncio.sleep(wait_seconds)
try:
await page.wait_for_load_state("load", timeout=3000)
await SkyvernFrame.wait_for_animation_end(page)
except Exception:
LOG.warning("Failed to wait for load state, will continue scraping", exc_info=True)
if wait_seconds > 0:
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
if not elements and not support_empty_page:
LOG.warning("No elements found on the page, wait for 3 seconds and retry")
await asyncio.sleep(3)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -576,9 +594,9 @@ async def scrape_web_unsafe(
elements
)
# if there are no elements, fail the scraping
# if there are no elements, fail the scraping unless support_empty_page is True
if not elements and not support_empty_page:
raise Exception("No elements found on the page")
raise NoElementFound()
text_content = await get_frame_text(page.main_frame)

View File

@@ -161,8 +161,7 @@ async def _scrolling_screenshots_helper(
if mode == ScreenshotMode.DETAILED:
# wait until animation ends, which is triggered by scrolling
LOG.debug("Waiting for 2 seconds until animation ends.")
await asyncio.sleep(2)
await SkyvernFrame.wait_for_animation_end(skyvern_page.frame)
else:
if draw_boxes:
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
@@ -215,6 +214,21 @@ def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -
class SkyvernFrame:
@staticmethod
async def wait_for_animation_end(page: Page, timeout: float = 3000) -> None:
    """Wait until all animations on the page report finished, up to `timeout` ms.

    Polls ``document.getAnimations()`` in the browser via Playwright's
    ``wait_for_function`` until every animation's ``playState`` is
    ``'finished'``. Best-effort: any failure (including the timeout) is
    logged as a warning and swallowed so callers can continue scraping or
    screenshotting a possibly still-animating page.

    :param page: Playwright page (or page-like frame host) to evaluate in.
    :param timeout: Milliseconds to wait before giving up (Playwright units).
    """
    try:
        await page.wait_for_function(
            # NOTE(review): original indentation of this JS literal was lost in
            # the diff rendering; whitespace inside it is insignificant to JS.
            """
            () => {
                const animations = document.getAnimations();
                return animations.every(a => a.playState === 'finished');
            }
            """,
            timeout=timeout,
        )
    except Exception:
        # Deliberate broad catch: animation settling is an optimization,
        # never a hard requirement — log and proceed.
        LOG.warning("Failed to wait for animation end, but continue", exc_info=True)
@staticmethod
async def evaluate(
frame: Page | Frame,