add support_empty_page and wait_seconds to the scrape_website interface (#3181)
@@ -405,6 +405,8 @@ async def scrape_website(
     draw_boxes: bool = True,
     max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
     scroll: bool = True,
+    support_empty_page: bool = False,
+    wait_seconds: float = 3,
 ) -> ScrapedPage:
     """
     ************************************************************************************************
@@ -439,6 +441,8 @@ async def scrape_website(
             draw_boxes=draw_boxes,
             max_screenshot_number=max_screenshot_number,
             scroll=scroll,
+            support_empty_page=support_empty_page,
+            wait_seconds=wait_seconds,
         )
     except ScrapingFailedBlankPage:
         raise
@@ -517,6 +521,8 @@ async def scrape_web_unsafe(
     draw_boxes: bool = True,
     max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
     scroll: bool = True,
+    support_empty_page: bool = False,
+    wait_seconds: float = 3,
 ) -> ScrapedPage:
     """
     Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -538,11 +544,11 @@ async def scrape_web_unsafe(
     # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
     # clicking start my quote)
     url = page.url
-    if url == "about:blank":
+    if url == "about:blank" and not support_empty_page:
         raise ScrapingFailedBlankPage()
 
-    LOG.info("Waiting for 3 seconds before scraping the website.")
-    await asyncio.sleep(3)
+    LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.")
+    await asyncio.sleep(wait_seconds)
 
     elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
     element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
@@ -569,7 +575,7 @@ async def scrape_web_unsafe(
         )
 
     # if there are no elements, fail the scraping
-    if not elements:
+    if not elements and not support_empty_page:
         raise Exception("No elements found on the page")
 
     text_content = await get_frame_text(page.main_frame)
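
For context, a minimal sketch of a caller exercising the two new keyword arguments. The positional arguments and any other keyword arguments are elided because they are not visible in this diff; only `support_empty_page` and `wait_seconds` are taken from the change itself:

    # Hypothetical caller sketch; "..." stands in for the existing arguments
    # to scrape_website, which this diff does not show.
    scraped_page = await scrape_website(
        ...,
        support_empty_page=True,  # skip the about:blank and empty-element failures
        wait_seconds=1.0,         # override the former hard-coded 3-second sleep
    )

With the defaults (`support_empty_page=False`, `wait_seconds=3`), existing callers keep the previous behavior: a blank page still raises `ScrapingFailedBlankPage`, an element-free page still raises, and the pre-scrape sleep remains 3 seconds.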