Remove hard-coded wait time in scraping (#3195)
This commit is contained in:
@@ -749,3 +749,8 @@ class ScriptNotFound(SkyvernHTTPException):
|
|||||||
class NoTOTPSecretFound(SkyvernException):
    """Error reporting that no TOTP secret was found."""

    def __init__(self) -> None:
        # Fixed message; this exception takes no caller-supplied details.
        message = "No TOTP secret found"
        super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
class NoElementFound(SkyvernException):
    """Error reporting that no element was found."""

    def __init__(self) -> None:
        # Fixed message; this exception takes no caller-supplied details.
        message = "No element found."
        super().__init__(message)
|
||||||
|
|||||||
@@ -13,7 +13,13 @@ from pydantic import BaseModel, PrivateAttr
|
|||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
|
from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
|
||||||
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat
|
from skyvern.exceptions import (
|
||||||
|
FailedToTakeScreenshot,
|
||||||
|
NoElementFound,
|
||||||
|
ScrapingFailed,
|
||||||
|
ScrapingFailedBlankPage,
|
||||||
|
UnknownElementTreeFormat,
|
||||||
|
)
|
||||||
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
||||||
from skyvern.forge.sdk.core import skyvern_context
|
from skyvern.forge.sdk.core import skyvern_context
|
||||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||||
@@ -524,7 +530,7 @@ async def scrape_web_unsafe(
|
|||||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||||
scroll: bool = True,
|
scroll: bool = True,
|
||||||
support_empty_page: bool = False,
|
support_empty_page: bool = False,
|
||||||
wait_seconds: float = 3,
|
wait_seconds: float = 0,
|
||||||
) -> ScrapedPage:
|
) -> ScrapedPage:
|
||||||
"""
|
"""
|
||||||
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
||||||
@@ -549,10 +555,22 @@ async def scrape_web_unsafe(
|
|||||||
if url == "about:blank" and not support_empty_page:
|
if url == "about:blank" and not support_empty_page:
|
||||||
raise ScrapingFailedBlankPage()
|
raise ScrapingFailedBlankPage()
|
||||||
|
|
||||||
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
|
try:
|
||||||
await asyncio.sleep(wait_seconds)
|
await page.wait_for_load_state("load", timeout=3000)
|
||||||
|
await SkyvernFrame.wait_for_animation_end(page)
|
||||||
|
except Exception:
|
||||||
|
LOG.warning("Failed to wait for load state, will continue scraping", exc_info=True)
|
||||||
|
|
||||||
|
if wait_seconds > 0:
|
||||||
|
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
|
||||||
|
await asyncio.sleep(wait_seconds)
|
||||||
|
|
||||||
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||||
|
if not elements and not support_empty_page:
|
||||||
|
LOG.warning("No elements found on the page, wait for 3 seconds and retry")
|
||||||
|
await asyncio.sleep(3)
|
||||||
|
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||||
|
|
||||||
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
|
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
|
||||||
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
|
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
|
||||||
|
|
||||||
@@ -576,9 +594,9 @@ async def scrape_web_unsafe(
|
|||||||
elements
|
elements
|
||||||
)
|
)
|
||||||
|
|
||||||
# if there are no elements, fail the scraping
|
# if there are no elements, fail the scraping unless support_empty_page is True
|
||||||
if not elements and not support_empty_page:
|
if not elements and not support_empty_page:
|
||||||
raise Exception("No elements found on the page")
|
raise NoElementFound()
|
||||||
|
|
||||||
text_content = await get_frame_text(page.main_frame)
|
text_content = await get_frame_text(page.main_frame)
|
||||||
|
|
||||||
|
|||||||
@@ -161,8 +161,7 @@ async def _scrolling_screenshots_helper(
|
|||||||
|
|
||||||
if mode == ScreenshotMode.DETAILED:
|
if mode == ScreenshotMode.DETAILED:
|
||||||
# wait until animation ends, which is triggered by scrolling
|
# wait until animation ends, which is triggered by scrolling
|
||||||
LOG.debug("Waiting for 2 seconds until animation ends.")
|
await SkyvernFrame.wait_for_animation_end(skyvern_page.frame)
|
||||||
await asyncio.sleep(2)
|
|
||||||
else:
|
else:
|
||||||
if draw_boxes:
|
if draw_boxes:
|
||||||
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
|
await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index)
|
||||||
@@ -215,6 +214,21 @@ def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -
|
|||||||
|
|
||||||
|
|
||||||
class SkyvernFrame:
|
class SkyvernFrame:
|
||||||
|
@staticmethod
async def wait_for_animation_end(page: Page | Frame, timeout: float = 3000) -> None:
    """
    Best-effort wait until no finite animation is still running in the document.

    The check is evaluated in the browser through ``wait_for_function``. Only
    animations that are currently running *and* have a finite end time block
    the wait: paused, idle and infinitely repeating animations (for example a
    loading spinner) never reach the ``finished`` play state, so requiring
    them to finish would always consume the whole timeout.

    Args:
        page: Page or frame whose document animations are inspected.
        timeout: Maximum wait, forwarded unchanged to ``wait_for_function``
            (Playwright interprets it as milliseconds).

    Returns:
        None. Any failure, including a timeout, is logged as a warning and
        swallowed so the caller can carry on.
    """
    try:
        await page.wait_for_function(
            """
            () => {
                return document.getAnimations().every((a) => {
                    // Anything not running (finished, paused, idle) will not
                    // change the page any further on its own.
                    if (a.playState !== 'running') {
                        return true;
                    }
                    // A running animation only blocks when it will actually end;
                    // an infinite one has endTime === Infinity.
                    const timing = a.effect ? a.effect.getComputedTiming() : null;
                    return timing === null || !Number.isFinite(timing.endTime);
                });
            }
            """,
            timeout=timeout,
        )
    except Exception:
        # Deliberate best-effort: never fail the caller because of animations.
        LOG.warning("Failed to wait for animation end, but continue", exc_info=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def evaluate(
|
async def evaluate(
|
||||||
frame: Page | Frame,
|
frame: Page | Frame,
|
||||||
|
|||||||
Reference in New Issue
Block a user