better failure reason for blank page (#3049)

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
Author: LawyZheng
Date: 2025-07-29 14:40:54 +08:00
Committed by: GitHub
Parent: ffce05c6ef
Commit: ecc0e2e17d
3 changed files with 10 additions and 10 deletions


@@ -269,9 +269,9 @@ class ScrapingFailed(SkyvernException):
         super().__init__("Scraping failed.")


-class ScrapingFailedNoUrl(ScrapingFailed):
+class ScrapingFailedBlankPage(ScrapingFailed):
     def __init__(self) -> None:
-        super().__init__(reason="A URL is missing. Please ensure there is a URL for Skyvern to work with.")
+        super().__init__(reason="It's a blank page. Please ensure there is a non-blank page for Skyvern to work with.")


 class WorkflowRunContextNotInitialized(SkyvernException):


@@ -1883,7 +1883,7 @@ class ForgeAgent:
                 step_id=step.step_id,
                 exc_info=True,
             )
-            raise ScrapingFailed()
+            raise e
         if scraped_page is None:
             raise EmptyScrapePage()


@@ -13,7 +13,7 @@ from pydantic import BaseModel, PrivateAttr
 from skyvern.config import settings
 from skyvern.constants import DEFAULT_MAX_TOKENS, SKYVERN_DIR, SKYVERN_ID_ATTR
-from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
+from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, ScrapingFailedBlankPage, UnknownElementTreeFormat
 from skyvern.forge.sdk.api.crypto import calculate_sha256
 from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.settings_manager import SettingsManager
@@ -428,11 +428,6 @@ async def scrape_website(
     :raises Exception: When scraping fails after maximum retries.
     """
-    # TODO(jdo) why is this a problem?
-    # ref: https://skyvern.slack.com/archives/C074UNDSRJM/p1752771256298149
-    # if not url.strip():
-    #     raise ScrapingFailedNoUrl()
-
     try:
         num_retry += 1
         return await scrape_web_unsafe(
@@ -445,6 +440,8 @@ async def scrape_website(
             max_screenshot_number=max_screenshot_number,
             scroll=scroll,
         )
+    except ScrapingFailedBlankPage:
+        raise
     except Exception as e:
         # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
         if num_retry > settings.MAX_SCRAPING_RETRIES:
@@ -540,6 +537,9 @@ async def scrape_web_unsafe(
     # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page.
     # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
     # clicking start my quote)
+    url = page.url
+    if url == "about:blank":
+        raise ScrapingFailedBlankPage()
     LOG.info("Waiting for 3 seconds before scraping the website.")
     await asyncio.sleep(3)
@@ -598,7 +598,7 @@ async def scrape_web_unsafe(
         element_tree=element_tree,
         element_tree_trimmed=element_tree_trimmed,
         screenshots=screenshots,
-        url=page.url,
+        url=url,
         html=html,
         extracted_text=text_content,
         window_dimension=window_dimension,
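
Taken together, the three files change what a user sees when scraping never leaves about:blank: scrape_web_unsafe raises the new ScrapingFailedBlankPage, scrape_website re-raises it before its generic retry handler, and the agent re-raises the original error instead of wrapping it in a bare ScrapingFailed(). Below is a minimal sketch of that control flow, with simplified stand-in functions (plain URL strings instead of a Playwright page; settings and logging omitted); it is not the actual Skyvern implementation.

# Minimal, self-contained sketch of the post-commit control flow, with
# simplified stand-ins for Skyvern's scrape_website/scrape_web_unsafe.

MAX_SCRAPING_RETRIES = 0  # per the NOTE in the diff: no retries in staging/production


class ScrapingFailed(Exception):
    def __init__(self, reason: str = "Scraping failed.") -> None:
        super().__init__(reason)


class ScrapingFailedBlankPage(ScrapingFailed):
    def __init__(self) -> None:
        super().__init__(
            reason="It's a blank page. Please ensure there is a non-blank page for Skyvern to work with."
        )


def scrape_web_unsafe(page_url: str) -> dict:
    # Snapshot the URL once; "about:blank" means nothing ever loaded.
    if page_url == "about:blank":
        raise ScrapingFailedBlankPage()
    return {"url": page_url}


def scrape_website(page_url: str, num_retry: int = 0) -> dict:
    try:
        num_retry += 1
        return scrape_web_unsafe(page_url)
    except ScrapingFailedBlankPage:
        # Not retryable: surface the specific blank-page reason immediately
        # instead of swallowing it in the generic retry handler below.
        raise
    except Exception:
        if num_retry > MAX_SCRAPING_RETRIES:
            raise
        return scrape_website(page_url, num_retry)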