remove hard waiting time in scraping (#3195)

This commit is contained in:
LawyZheng
2025-08-15 02:24:59 +08:00
committed by GitHub
parent f971cf8e58
commit cac4792f38
3 changed files with 45 additions and 8 deletions

View File

@@ -749,3 +749,8 @@ class ScriptNotFound(SkyvernHTTPException):
class NoTOTPSecretFound(SkyvernException):
    """Raised when no TOTP secret is available (message: "No TOTP secret found")."""

    def __init__(self) -> None:
        super().__init__("No TOTP secret found")
class NoElementFound(SkyvernException):
    """Raised when scraping yields no elements on the page.

    Used by scrape_web_unsafe in place of a bare Exception when
    ``not elements and not support_empty_page``.
    """

    def __init__(self) -> None:
        super().__init__("No element found.")

View File

@@ -13,7 +13,13 @@ from pydantic import BaseModel, PrivateAttr
from skyvern.config import settings
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat
from skyvern.exceptions import (
FailedToTakeScreenshot,
NoElementFound,
ScrapingFailed,
ScrapingFailedBlankPage,
UnknownElementTreeFormat,
)
from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.settings_manager import SettingsManager
@@ -524,7 +530,7 @@ async def scrape_web_unsafe(
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 3,
wait_seconds: float = 0,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -549,10 +555,22 @@ async def scrape_web_unsafe(
if url == "about:blank" and not support_empty_page:
raise ScrapingFailedBlankPage()
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
await asyncio.sleep(wait_seconds)
try:
await page.wait_for_load_state("load", timeout=3000)
await SkyvernFrame.wait_for_animation_end(page)
except Exception:
LOG.warning("Failed to wait for load state, will continue scraping", exc_info=True)
if wait_seconds > 0:
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
if not elements and not support_empty_page:
LOG.warning("No elements found on the page, wait for 3 seconds and retry")
await asyncio.sleep(3)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -576,9 +594,9 @@ async def scrape_web_unsafe(
elements
)
# if there are no elements, fail the scraping
# if there are no elements, fail the scraping unless support_empty_page is True
if not elements and not support_empty_page:
raise Exception("No elements found on the page")
raise NoElementFound()
text_content = await get_frame_text(page.main_frame)

View File

@@ -161,8 +161,7 @@ async def _scrolling_screenshots_helper(
if mode == ScreenshotMode.DETAILED:
# wait until animation ends, which is triggered by scrolling
LOG.debug("Waiting for 2 seconds until animation ends.")
await asyncio.sleep(2)
await SkyvernFrame.wait_for_animation_end(skyvern_page.frame)
else:
if draw_boxes:
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
@@ -215,6 +214,21 @@ def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -
class SkyvernFrame:
@staticmethod
async def wait_for_animation_end(page: Page, timeout: float = 3000) -> None:
    """Wait until all animations on the page report finished, up to `timeout` ms.

    Polls ``document.getAnimations()`` in the browser via Playwright's
    ``wait_for_function`` until every animation's ``playState`` is
    ``'finished'``. Best-effort: any failure (including the timeout) is
    logged as a warning and swallowed so callers can continue scraping or
    screenshotting a possibly still-animating page.

    :param page: Playwright page (or page-like frame host) to evaluate in.
    :param timeout: Milliseconds to wait before giving up (Playwright units).
    """
    try:
        await page.wait_for_function(
            # NOTE(review): original indentation of this JS literal was lost in
            # the diff rendering; whitespace inside it is insignificant to JS.
            """
            () => {
                const animations = document.getAnimations();
                return animations.every(a => a.playState === 'finished');
            }
            """,
            timeout=timeout,
        )
    except Exception:
        # Deliberate broad catch: animation settling is an optimization,
        # never a hard requirement — log and proceed.
        LOG.warning("Failed to wait for animation end, but continue", exc_info=True)
@staticmethod
async def evaluate(
frame: Page | Frame,