add support_empty_page and wait_seconds to the scrape_website interface (#3181)

This commit is contained in:
Shuchang Zheng
2025-08-13 19:22:50 -07:00
committed by GitHub
parent 399fd4ea74
commit 434bbff459

View File

@@ -405,6 +405,8 @@ async def scrape_website(
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 3,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -439,6 +441,8 @@ async def scrape_website(
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
support_empty_page=support_empty_page,
wait_seconds=wait_seconds,
)
except ScrapingFailedBlankPage:
raise
@@ -517,6 +521,8 @@ async def scrape_web_unsafe(
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 3,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -538,11 +544,11 @@ async def scrape_web_unsafe(
# This also solves the issue where we can't scroll due to a popup (e.g. the first GEICO popup on the homepage
# after clicking "start my quote").
url = page.url
if url == "about:blank":
if url == "about:blank" and not support_empty_page:
raise ScrapingFailedBlankPage()
LOG.info("Waiting for 3 seconds before scraping the website.")
await asyncio.sleep(3)
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
@@ -569,7 +575,7 @@ async def scrape_web_unsafe(
)
# if there are no elements, fail the scraping unless support_empty_page is set
if not elements:
if not elements and not support_empty_page:
raise Exception("No elements found on the page")
text_content = await get_frame_text(page.main_frame)