remove hard waiting time in scraping (#3195)
This commit is contained in:
@@ -749,3 +749,8 @@ class ScriptNotFound(SkyvernHTTPException):
|
||||
class NoTOTPSecretFound(SkyvernException):
    """Error signalling that no TOTP secret was found."""

    def __init__(self) -> None:
        # The message is fixed; callers construct this exception with no arguments.
        message = "No TOTP secret found"
        super().__init__(message)
|
||||
|
||||
|
||||
class NoElementFound(SkyvernException):
    """Error signalling that no element was found."""

    def __init__(self) -> None:
        # The message is fixed; callers construct this exception with no arguments.
        message = "No element found."
        super().__init__(message)
|
||||
|
||||
@@ -13,7 +13,13 @@ from pydantic import BaseModel, PrivateAttr
|
||||
|
||||
from skyvern.config import settings
|
||||
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
|
||||
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat
|
||||
from skyvern.exceptions import (
|
||||
FailedToTakeScreenshot,
|
||||
NoElementFound,
|
||||
ScrapingFailed,
|
||||
ScrapingFailedBlankPage,
|
||||
UnknownElementTreeFormat,
|
||||
)
|
||||
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
||||
from skyvern.forge.sdk.core import skyvern_context
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
@@ -524,7 +530,7 @@ async def scrape_web_unsafe(
|
||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
support_empty_page: bool = False,
|
||||
wait_seconds: float = 3,
|
||||
wait_seconds: float = 0,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
||||
@@ -549,10 +555,22 @@ async def scrape_web_unsafe(
|
||||
if url == "about:blank" and not support_empty_page:
|
||||
raise ScrapingFailedBlankPage()
|
||||
|
||||
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
|
||||
await asyncio.sleep(wait_seconds)
|
||||
try:
|
||||
await page.wait_for_load_state("load", timeout=3000)
|
||||
await SkyvernFrame.wait_for_animation_end(page)
|
||||
except Exception:
|
||||
LOG.warning("Failed to wait for load state, will continue scraping", exc_info=True)
|
||||
|
||||
if wait_seconds > 0:
|
||||
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
|
||||
await asyncio.sleep(wait_seconds)
|
||||
|
||||
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||
if not elements and not support_empty_page:
|
||||
LOG.warning("No elements found on the page, wait for 3 seconds and retry")
|
||||
await asyncio.sleep(3)
|
||||
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||
|
||||
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
|
||||
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
|
||||
|
||||
@@ -576,9 +594,9 @@ async def scrape_web_unsafe(
|
||||
elements
|
||||
)
|
||||
|
||||
# if there are no elements, fail the scraping
|
||||
# if there are no elements, fail the scraping unless support_empty_page is True
|
||||
if not elements and not support_empty_page:
|
||||
raise Exception("No elements found on the page")
|
||||
raise NoElementFound()
|
||||
|
||||
text_content = await get_frame_text(page.main_frame)
|
||||
|
||||
|
||||
@@ -161,8 +161,7 @@ async def _scrolling_screenshots_helper(
|
||||
|
||||
if mode == ScreenshotMode.DETAILED:
|
||||
# wait until animation ends, which is triggered by scrolling
|
||||
LOG.debug("Waiting for 2 seconds until animation ends.")
|
||||
await asyncio.sleep(2)
|
||||
await SkyvernFrame.wait_for_animation_end(skyvern_page.frame)
|
||||
else:
|
||||
if draw_boxes:
|
||||
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
|
||||
@@ -215,6 +214,21 @@ def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -
|
||||
|
||||
|
||||
class SkyvernFrame:
|
||||
@staticmethod
async def wait_for_animation_end(page: Page | Frame, timeout: float = 3000) -> None:
    """
    Best-effort wait until the animations in the document have finished.

    Polls inside the browser through ``wait_for_function`` until every
    animation returned by ``document.getAnimations()`` that is able to end
    reports ``playState === 'finished'``. Animations whose computed end time
    is not finite (for example endlessly looping spinners) never reach the
    'finished' state, so they are ignored instead of forcing every call to
    run into the timeout.

    :param page: Page or frame whose document is inspected.
    :param timeout: Forwarded unchanged as the ``timeout`` argument of
        ``wait_for_function`` (presumably Playwright, which interprets it as
        milliseconds -- confirm against the library in use).
    :return: None. Any failure, including a timeout, is logged as a warning
        and swallowed so the caller can continue.
    """
    try:
        await page.wait_for_function(
            """
            () => {
                const animations = document.getAnimations();
                return animations.every(a => {
                    // An animation without an effect has nothing left to play.
                    const endTime = a.effect ? a.effect.getComputedTiming().endTime : 0;
                    // Looping / non time-based animations never finish; do not wait for them.
                    if (!Number.isFinite(Number(endTime))) {
                        return true;
                    }
                    return a.playState === 'finished';
                });
            }
            """,
            timeout=timeout,
        )
    except Exception:
        # Deliberately broad: waiting for animations is an optimisation only,
        # so no failure here may abort the surrounding operation.
        LOG.warning("Failed to wait for animation end, but continue", exc_info=True)
|
||||
|
||||
@staticmethod
|
||||
async def evaluate(
|
||||
frame: Page | Frame,
|
||||
|
||||
Reference in New Issue
Block a user