diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 4759f860..acd9a034 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -269,9 +269,9 @@ class ScrapingFailed(SkyvernException): super().__init__("Scraping failed.") -class ScrapingFailedNoUrl(ScrapingFailed): +class ScrapingFailedBlankPage(ScrapingFailed): def __init__(self) -> None: - super().__init__(reason="A URL is missing. Please ensure there is a URL for Skyvern to work with.") + super().__init__(reason="It's a blank page. Please ensure there is a non-blank page for Skyvern to work with.") class WorkflowRunContextNotInitialized(SkyvernException): diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 282e2a75..ff1fe431 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1883,7 +1883,7 @@ class ForgeAgent: step_id=step.step_id, exc_info=True, ) - raise ScrapingFailed() + raise e if scraped_page is None: raise EmptyScrapePage() diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index f6459d5e..33edd0b4 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, PrivateAttr from skyvern.config import settings from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR -from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat +from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat from skyvern.forge.sdk.api.crypto import calculate_sha256 from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.settings_manager import SettingsManager @@ -428,11 +428,6 @@ async def scrape_website( :raises Exception: When scraping fails after maximum retries. """ - # TODO(jdo) why is this a problem? - # ref: https://skyvern.slack.com/archives/C074UNDSRJM/p1752771256298149 - # if not url.strip(): - # raise ScrapingFailedNoUrl() - try: num_retry += 1 return await scrape_web_unsafe( @@ -445,6 +440,8 @@ async def scrape_website( max_screenshot_number=max_screenshot_number, scroll=scroll, ) + except ScrapingFailedBlankPage: + raise except Exception as e: # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production if num_retry > settings.MAX_SCRAPING_RETRIES: @@ -540,6 +537,9 @@ async def scrape_web_unsafe( # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page. # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after # clicking start my quote) + url = page.url + if url == "about:blank": + raise ScrapingFailedBlankPage() LOG.info("Waiting for 3 seconds before scraping the website.") await asyncio.sleep(3) @@ -598,7 +598,7 @@ async def scrape_web_unsafe( element_tree=element_tree, element_tree_trimmed=element_tree_trimmed, screenshots=screenshots, - url=page.url, + url=url, html=html, extracted_text=text_content, window_dimension=window_dimension,