From 434bbff459f105aa26cc1cedad26c89edc59514c Mon Sep 17 00:00:00 2001
From: Shuchang Zheng
Date: Wed, 13 Aug 2025 19:22:50 -0700
Subject: [PATCH] add support_empty_page and wait_seconds to the
 scrape_website interface (#3181)

---
 skyvern/webeye/scraper/scraper.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py
index 1f10e8e9..08e509c6 100644
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -405,6 +405,8 @@ async def scrape_website(
     draw_boxes: bool = True,
     max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
     scroll: bool = True,
+    support_empty_page: bool = False,
+    wait_seconds: float = 3,
 ) -> ScrapedPage:
     """
     ************************************************************************************************
@@ -439,6 +441,8 @@ async def scrape_website(
             draw_boxes=draw_boxes,
             max_screenshot_number=max_screenshot_number,
             scroll=scroll,
+            support_empty_page=support_empty_page,
+            wait_seconds=wait_seconds,
         )
     except ScrapingFailedBlankPage:
         raise
@@ -517,6 +521,8 @@ async def scrape_web_unsafe(
     draw_boxes: bool = True,
     max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
     scroll: bool = True,
+    support_empty_page: bool = False,
+    wait_seconds: float = 3,
 ) -> ScrapedPage:
     """
     Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -538,11 +544,11 @@ async def scrape_web_unsafe(
     # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
     # clicking start my quote)
     url = page.url
-    if url == "about:blank":
+    if url == "about:blank" and not support_empty_page:
         raise ScrapingFailedBlankPage()
 
-    LOG.info("Waiting for 3 seconds before scraping the website.")
-    await asyncio.sleep(3)
+    LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
+    await asyncio.sleep(wait_seconds)
 
     elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
     element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
@@ -569,7 +575,7 @@ async def scrape_web_unsafe(
     )
 
     # if there are no elements, fail the scraping
-    if not elements and not support_empty_page:
+    if not elements and not support_empty_page:
         raise Exception("No elements found on the page")
 
     text_content = await get_frame_text(page.main_frame)
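
For reviewers, a minimal sketch of how a call site might exercise the two new keyword arguments. Only `support_empty_page` and `wait_seconds` are introduced by this patch; the leading arguments here are assumptions for illustration, since `scrape_website`'s full signature is not shown in the hunks:

```python
# Hypothetical call site (assumed leading arguments, not part of this patch).
scraped_page = await scrape_website(
    browser_state,            # assumed: existing argument providing the Playwright page
    url,                      # assumed: existing target-URL argument
    support_empty_page=True,  # skip raising on about:blank and on an empty element tree
    wait_seconds=0.5,         # replaces the previously hard-coded 3-second pre-scrape sleep
)
```

One flag gates both failure paths (the `about:blank` check and the "no elements found" check), so callers that legitimately scrape empty or not-yet-navigated pages opt in once rather than catching two different exceptions.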