Default Skyvern 2.0 to google.com if generated website fails to load (#2291)

This commit is contained in:
Shuchang Zheng
2025-05-04 21:36:40 -07:00
committed by GitHub
parent df6252f80a
commit 0540e65d06
3 changed files with 106 additions and 28 deletions

View File

@@ -238,6 +238,8 @@ class Settings(BaseSettings):
ENABLE_LOG_ARTIFACTS: bool = False ENABLE_LOG_ARTIFACTS: bool = False
ENABLE_CODE_BLOCK: bool = False ENABLE_CODE_BLOCK: bool = False
TASK_BLOCKED_SITE_FALLBACK_URL: str = "https://www.google.com"
# SkyvernClient Settings # SkyvernClient Settings
SKYVERN_BASE_URL: str = "https://api.skyvern.com" SKYVERN_BASE_URL: str = "https://api.skyvern.com"
SKYVERN_API_KEY: str = "PLACEHOLDER" SKYVERN_API_KEY: str = "PLACEHOLDER"

View File

@@ -10,7 +10,13 @@ from playwright.async_api import Page
from sqlalchemy.exc import OperationalError from sqlalchemy.exc import OperationalError
from skyvern.config import settings from skyvern.config import settings
from skyvern.exceptions import FailedToSendWebhook, TaskTerminationError, TaskV2NotFound, UrlGenerationFailure from skyvern.exceptions import (
FailedToSendWebhook,
MissingBrowserState,
TaskTerminationError,
TaskV2NotFound,
UrlGenerationFailure,
)
from skyvern.forge import app from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.artifact.models import ArtifactType from skyvern.forge.sdk.artifact.models import ArtifactType
@@ -481,31 +487,86 @@ async def run_task_v2_helper(
task_history_record: dict[str, Any] = {} task_history_record: dict[str, Any] = {}
context = skyvern_context.ensure_context() context = skyvern_context.ensure_context()
# Always ensure browser_state is available at the start of the loop
current_url: str | None = None current_url: str | None = None
page: Page | None = None page: Page | None = None
browser_state = app.BROWSER_MANAGER.get_for_workflow_run(workflow_run_id, workflow_run.parent_workflow_run_id) fallback_url = settings.TASK_BLOCKED_SITE_FALLBACK_URL
if browser_state: browser_state = app.BROWSER_MANAGER.get_for_workflow_run(
workflow_run_id=workflow_run_id, parent_workflow_run_id=workflow_run.parent_workflow_run_id
)
if browser_state is None:
fallback_occurred = False
try:
browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
workflow_run=workflow_run,
url=url,
browser_session_id=browser_session_id,
)
except Exception:
LOG.warning("Failed to get or create browser state, fallback to Google", exc_info=True, url=url)
browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
workflow_run=workflow_run,
url=fallback_url,
browser_session_id=browser_session_id,
)
fallback_occurred = True
if browser_state is None:
LOG.error("Failed to create browser state even after fallback", workflow_run_id=workflow_run_id)
raise MissingBrowserState(workflow_run_id=workflow_run_id)
page = await browser_state.get_working_page()
page_loaded = False
if page:
try:
# Verify the page loaded usable content: validation fails when the content cannot be read or contains a known gateway/proxy error marker
# page will always be None if browser state failed to load
page_loaded = await browser_state.validate_browser_context(page)
except Exception:
page_loaded = False
LOG.warning(
"Page failed to load properly, fallback to Google",
exc_info=True,
url=url,
current_url=current_url,
)
if not page_loaded:
# Page failed to load properly, fallback to Google
if page:
try:
await page.goto(fallback_url, timeout=15000)
fallback_occurred = True
except Exception:
LOG.exception("Failed to load Google fallback", exc_info=True, url=url, current_url=current_url)
else:
page = await browser_state.get_working_page() page = await browser_state.get_working_page()
if page: if page:
current_url = await SkyvernFrame.get_url(page) current_url = await SkyvernFrame.get_url(page)
if i == 0 and current_url != url: if i == 0 and current_url != url:
# The first iteration is always a GOTO_URL task if fallback_occurred:
task_type = "goto_url" plan = f"Go to Google because the intended website ({url}) failed to load properly."
plan = f"Go to this website: {url}" task_type = "goto_url"
task_history_record = {"type": task_type, "task": plan} task_history_record = {"type": task_type, "task": plan}
block, block_yaml_list, parameter_yaml_list = await _generate_goto_url_task( block, block_yaml_list, parameter_yaml_list = await _generate_goto_url_task(
workflow_id=workflow_id, workflow_id=workflow_id,
url=url, url=fallback_url,
) )
else:
# Page loaded successfully, proceed with original URL
plan = f"Go to this website: {url}"
task_type = "goto_url"
task_history_record = {"type": task_type, "task": plan}
block, block_yaml_list, parameter_yaml_list = await _generate_goto_url_task(
workflow_id=workflow_id,
url=url,
)
else: else:
try: try:
if browser_state is None:
browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
workflow_run=workflow_run,
url=url,
browser_session_id=browser_session_id,
)
scraped_page = await scrape_website( scraped_page = await scrape_website(
browser_state, browser_state,
url, url,

View File

@@ -274,16 +274,6 @@ class BrowserContextFactory:
raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e
@classmethod
def set_validate_browser_context(cls, validator: Callable[[Page], Awaitable[bool]]) -> None:
    """Register the async page validator used by ``validate_browser_context``.

    Args:
        validator: Awaitable callable that receives a page and returns a bool;
            it is stored on the class as ``_validator``.
    """
    cls._validator = validator
@classmethod
async def validate_browser_context(cls, page: Page) -> bool:
    """Run the registered validator against ``page``.

    Args:
        page: The page handed to the validator stored in ``cls._validator``.

    Returns:
        ``True`` when no validator is registered (``cls._validator`` is
        ``None``); otherwise whatever the awaited validator returns.
    """
    registered = cls._validator
    return True if registered is None else await registered(page)
class VideoArtifact(BaseModel): class VideoArtifact(BaseModel):
video_path: str | None = None video_path: str | None = None
@@ -603,6 +593,7 @@ class BrowserState:
loading_time=end_time - start_time, loading_time=end_time - start_time,
url=url, url=url,
) )
# TODO: confirm whether this fixed 5-second wait after navigation is still needed
await asyncio.sleep(5) await asyncio.sleep(5)
LOG.info(f"Successfully went to {url}", url=url, retry_time=retry_time) LOG.info(f"Successfully went to {url}", url=url, retry_time=retry_time)
return return
@@ -639,6 +630,30 @@ class BrowserState:
await self.set_working_page(last_page, len(self.browser_context.pages) - 1) await self.set_working_page(last_page, len(self.browser_context.pages) - 1)
return last_page return last_page
async def validate_browser_context(self, page: Page) -> bool:
    """Report whether ``page`` holds usable content instead of a known error page.

    The page content is read through a ``SkyvernFrame`` created for ``page``.

    Args:
        page: The page whose content is inspected.

    Returns:
        ``False`` when the content cannot be read, or when it contains the
        text "Bad gateway error" or "client_connect_forbidden_host";
        ``True`` otherwise.
    """
    try:
        frame = await SkyvernFrame.create_instance(frame=page)
        content = await frame.get_content()
    except Exception:
        # Any failure while reading the content counts as an invalid page.
        LOG.error("Error happened while getting the first page content", exc_info=True)
        return False

    # (marker searched for in the content, warning logged when it is found);
    # checked in this order, first match wins.
    error_markers = (
        (
            "Bad gateway error",
            "Bad gateway error on the page, recreate a new browser context with another proxy node",
        ),
        (
            "client_connect_forbidden_host",
            "capture the client_connect_forbidden_host error on the page, recreate a new browser context with another proxy node",
        ),
    )
    for marker, warning in error_markers:
        if marker in content:
            LOG.warning(warning)
            return False

    return True
async def must_get_working_page(self) -> Page: async def must_get_working_page(self) -> Page:
page = await self.get_working_page() page = await self.get_working_page()
assert page is not None assert page is not None
@@ -710,7 +725,7 @@ class BrowserState:
) )
page = await self.__assert_page() page = await self.__assert_page()
if not await BrowserContextFactory.validate_browser_context(await self.get_working_page()): if not await self.validate_browser_context(await self.get_working_page()):
if not await self.close_current_open_page(): if not await self.close_current_open_page():
LOG.warning("Failed to close the current open page, going to skip the browser context validation") LOG.warning("Failed to close the current open page, going to skip the browser context validation")
return page return page