remove hard waiting time in scraping (#3195)

This commit is contained in:
LawyZheng
2025-08-15 02:24:59 +08:00
committed by GitHub
parent f971cf8e58
commit cac4792f38
3 changed files with 45 additions and 8 deletions

View File

@@ -749,3 +749,8 @@ class ScriptNotFound(SkyvernHTTPException):
class NoTOTPSecretFound(SkyvernException): class NoTOTPSecretFound(SkyvernException):
def __init__(self) -> None: def __init__(self) -> None:
super().__init__("No TOTP secret found") super().__init__("No TOTP secret found")
class NoElementFound(SkyvernException):
    """Raised when scraping completes but no interactable elements were found on the page.

    Introduced so callers can distinguish an empty-page scrape from a generic
    failure (previously a bare ``Exception("No elements found on the page")``).
    """

    def __init__(self) -> None:
        super().__init__("No element found.")

View File

@@ -13,7 +13,13 @@ from pydantic import BaseModel, PrivateAttr
from skyvern.config import settings from skyvern.config import settings
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat from skyvern.exceptions import (
FailedToTakeScreenshot,
NoElementFound,
ScrapingFailed,
ScrapingFailedBlankPage,
UnknownElementTreeFormat,
)
from skyvern.forge.sdk.api.crypto import calculate_sha256 from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.settings_manager import SettingsManager from skyvern.forge.sdk.settings_manager import SettingsManager
@@ -524,7 +530,7 @@ async def scrape_web_unsafe(
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS, max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True, scroll: bool = True,
support_empty_page: bool = False, support_empty_page: bool = False,
wait_seconds: float = 3, wait_seconds: float = 0,
) -> ScrapedPage: ) -> ScrapedPage:
""" """
Asynchronous function that performs web scraping without any built-in error handling. This function is intended Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -549,10 +555,22 @@ async def scrape_web_unsafe(
if url == "about:blank" and not support_empty_page: if url == "about:blank" and not support_empty_page:
raise ScrapingFailedBlankPage() raise ScrapingFailedBlankPage()
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.") try:
await asyncio.sleep(wait_seconds) await page.wait_for_load_state("load", timeout=3000)
await SkyvernFrame.wait_for_animation_end(page)
except Exception:
LOG.warning("Failed to wait for load state, will continue scraping", exc_info=True)
if wait_seconds > 0:
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude) elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
if not elements and not support_empty_page:
LOG.warning("No elements found on the page, wait for 3 seconds and retry")
await asyncio.sleep(3)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree)) element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree)) element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -576,9 +594,9 @@ async def scrape_web_unsafe(
elements elements
) )
# if there are no elements, fail the scraping # if there are no elements, fail the scraping unless support_empty_page is True
if not elements and not support_empty_page: if not elements and not support_empty_page:
raise Exception("No elements found on the page") raise NoElementFound()
text_content = await get_frame_text(page.main_frame) text_content = await get_frame_text(page.main_frame)

View File

@@ -161,8 +161,7 @@ async def _scrolling_screenshots_helper(
if mode == ScreenshotMode.DETAILED: if mode == ScreenshotMode.DETAILED:
# wait until animation ends, which is triggered by scrolling # wait until animation ends, which is triggered by scrolling
LOG.debug("Waiting for 2 seconds until animation ends.") await SkyvernFrame.wait_for_animation_end(skyvern_page.frame)
await asyncio.sleep(2)
else: else:
if draw_boxes: if draw_boxes:
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index) await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
@@ -215,6 +214,21 @@ def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -
class SkyvernFrame: class SkyvernFrame:
@staticmethod
async def wait_for_animation_end(page: Page, timeout: float = 3000) -> None:
try:
await page.wait_for_function(
"""
() => {
const animations = document.getAnimations();
return animations.every(a => a.playState === 'finished');
}
""",
timeout=timeout,
)
except Exception:
LOG.warning("Failed to wait for animation end, but continue", exc_info=True)
@staticmethod @staticmethod
async def evaluate( async def evaluate(
frame: Page | Frame, frame: Page | Frame,