try to fix screenshot timeout (#502)
@@ -1,3 +1,4 @@
+from enum import StrEnum
 from pathlib import Path
 
 # This is the attribute name used to tag interactable elements
@@ -6,3 +7,12 @@ SKYVERN_DIR = Path(__file__).parent
 REPO_ROOT_DIR = SKYVERN_DIR.parent
 
 INPUT_TEXT_TIMEOUT = 120000  # 2 minutes
+
+
+class ScrapeType(StrEnum):
+    NORMAL = "normal"
+    STOPLOADING = "stoploading"
+    RELOAD = "reload"
+
+
+SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.STOPLOADING, ScrapeType.RELOAD]
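For context, `StrEnum` (Python 3.11+) members are themselves instances of `str`, so scrape types compare and serialize as plain strings, and `SCRAPE_TYPE_ORDER` fixes the escalation order with the most disruptive recovery (a full reload) last. A minimal standalone sketch of those semantics:

```python
from enum import StrEnum


class ScrapeType(StrEnum):
    NORMAL = "normal"
    STOPLOADING = "stoploading"
    RELOAD = "reload"


SCRAPE_TYPE_ORDER = [ScrapeType.NORMAL, ScrapeType.STOPLOADING, ScrapeType.RELOAD]

# StrEnum members compare equal to their string values,
# so they can be logged and matched without .value.
assert ScrapeType.NORMAL == "normal"
assert SCRAPE_TYPE_ORDER[-1] is ScrapeType.RELOAD  # reload is the last resort
```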
@@ -165,6 +165,20 @@ class FailedToNavigateToUrl(SkyvernException):
         super().__init__(f"Failed to navigate to url {url}. Error message: {error_message}")
 
 
+class FailedToReloadPage(SkyvernException):
+    def __init__(self, url: str, error_message: str) -> None:
+        self.url = url
+        self.error_message = error_message
+        super().__init__(f"Failed to reload page url {url}. Error message: {error_message}")
+
+
+class FailedToStopLoadingPage(SkyvernException):
+    def __init__(self, url: str, error_message: str) -> None:
+        self.url = url
+        self.error_message = error_message
+        super().__init__(f"Failed to stop loading page url {url}. Error message: {error_message}")
+
+
 class UnexpectedTaskStatus(SkyvernException):
     def __init__(self, task_id: str, status: str) -> None:
         super().__init__(f"Unexpected task status {status} for task {task_id}")
@@ -218,6 +232,11 @@ class FailedToTakeScreenshot(SkyvernException):
         super().__init__(f"Failed to take screenshot. Error message: {error_message}")
 
 
+class EmptyScrapePage(SkyvernException):
+    def __init__(self) -> None:
+        super().__init__("Failed to scrape the page, returned a None result")
+
+
 class WorkflowRunContextNotInitialized(SkyvernException):
     def __init__(self, workflow_run_id: str) -> None:
         super().__init__(f"WorkflowRunContext not initialized for workflow run {workflow_run_id}")
@@ -11,10 +11,13 @@ from playwright._impl._errors import TargetClosedError
 from playwright.async_api import Page
 
 from skyvern import analytics
+from skyvern.constants import SCRAPE_TYPE_ORDER, ScrapeType
 from skyvern.exceptions import (
     BrowserStateMissingPage,
+    EmptyScrapePage,
     FailedToNavigateToUrl,
     FailedToSendWebhook,
+    FailedToTakeScreenshot,
     InvalidWorkflowTaskURLState,
     MissingBrowserStatePage,
     SkyvernException,
@@ -778,6 +781,36 @@ class ForgeAgent:
         )
         return step, browser_state, detailed_output
 
+    async def _scrape_with_type(
+        self,
+        task: Task,
+        step: Step,
+        browser_state: BrowserState,
+        scrape_type: ScrapeType,
+    ) -> ScrapedPage | None:
+        if scrape_type == ScrapeType.NORMAL:
+            pass
+
+        elif scrape_type == ScrapeType.STOPLOADING:
+            LOG.info(
+                "Try to stop loading the page before scraping",
+                task_id=task.task_id,
+                step_id=step.step_id,
+            )
+            await browser_state.stop_page_loading()
+        elif scrape_type == ScrapeType.RELOAD:
+            LOG.info(
+                "Try to reload the page before scraping",
+                task_id=task.task_id,
+                step_id=step.step_id,
+            )
+            await browser_state.reload_page()
+
+        return await scrape_website(
+            browser_state,
+            task.url,
+        )
+
     async def _build_and_record_step_prompt(
         self,
         task: Task,
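`_scrape_with_type` maps each `ScrapeType` to an optional pre-scrape recovery step before delegating to `scrape_website`; `NORMAL` deliberately does nothing. An equivalent table-driven dispatch is possible; a hypothetical sketch (not part of the commit), assuming the `BrowserState` methods introduced below:

```python
from typing import Awaitable, Callable

# Hypothetical alternative: map each scrape type to its recovery
# coroutine; None means "scrape as-is" (ScrapeType.NORMAL).
PRE_SCRAPE_ACTIONS: dict[ScrapeType, Callable[[BrowserState], Awaitable[None]] | None] = {
    ScrapeType.NORMAL: None,
    ScrapeType.STOPLOADING: BrowserState.stop_page_loading,
    ScrapeType.RELOAD: BrowserState.reload_page,
}


async def run_pre_scrape_action(browser_state: BrowserState, scrape_type: ScrapeType) -> None:
    action = PRE_SCRAPE_ACTIONS[scrape_type]
    if action is not None:
        await action(browser_state)
```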
@@ -788,10 +821,30 @@ class ForgeAgent:
         self.async_operation_pool.run_operation(task.task_id, AgentPhase.scrape)
 
         # Scrape the web page and get the screenshot and the elements
-        scraped_page = await scrape_website(
-            browser_state,
-            task.url,
-        )
+        # HACK: try scrape_website three times to handle screenshot timeout
+        # first time: normal scrape to take screenshot
+        # second time: stop window loading before scraping
+        # third time: reload the page before scraping
+        scraped_page: ScrapedPage | None = None
+        for scrape_type in SCRAPE_TYPE_ORDER:
+            try:
+                scraped_page = await self._scrape_with_type(
+                    task=task, step=step, browser_state=browser_state, scrape_type=scrape_type
+                )
+                break
+            except FailedToTakeScreenshot as e:
+                if scrape_type == ScrapeType.RELOAD:
+                    LOG.error(
+                        "Failed to take screenshot after stop-loading and reload-page retries",
+                        task_id=task.task_id,
+                        step_id=step.step_id,
+                    )
+                    raise e
+                continue
+
+        if scraped_page is None:
+            raise EmptyScrapePage
+
         await app.ARTIFACT_MANAGER.create_artifact(
             step=step,
             artifact_type=ArtifactType.HTML_SCRAPE,
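Stripped of Skyvern specifics, the new loop is an escalation ladder: try each strategy in order, stop at the first success, and only let `FailedToTakeScreenshot` propagate once the final strategy has failed. A sketch of the bare pattern (`strategies` and `attempt` are hypothetical stand-ins):

```python
async def scrape_with_escalation(strategies, attempt):
    result = None
    for strategy in strategies:
        try:
            result = await attempt(strategy)
            break  # first success wins
        except FailedToTakeScreenshot:
            if strategy is strategies[-1]:
                raise  # out of fallbacks: surface the failure
    if result is None:
        # attempt() itself can return None, so a for/else clause alone
        # would not be enough; this mirrors the EmptyScrapePage guard.
        raise EmptyScrapePage
    return result
```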
@@ -15,6 +15,8 @@ from pydantic import BaseModel
 from skyvern.config import settings
 from skyvern.exceptions import (
     FailedToNavigateToUrl,
+    FailedToReloadPage,
+    FailedToStopLoadingPage,
     FailedToTakeScreenshot,
     MissingBrowserStatePage,
     UnknownBrowserType,
@@ -159,6 +161,12 @@ class BrowserState:
         self.page = page
         self.browser_artifacts = browser_artifacts
 
+    def __assert_page(self) -> Page:
+        if self.page is not None:
+            return self.page
+        LOG.error("BrowserState has no page")
+        raise MissingBrowserStatePage()
+
     async def _close_all_other_pages(self) -> None:
         if not self.browser_context or not self.page:
             return
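The double underscore makes `__assert_page` name-mangled (effectively private to `BrowserState`), and its main payoff is type narrowing: `self.page` is `Page | None`, but every caller that goes through the guard gets a plain `Page`, with the logging and `MissingBrowserStatePage` raise kept in one place. The same pattern in miniature, with hypothetical names:

```python
class Engine:
    def start(self) -> str:
        return "running"


class Car:
    def __init__(self, engine: Engine | None = None) -> None:
        self.engine = engine

    def __assert_engine(self) -> Engine:
        # Narrows Engine | None to Engine for every caller and keeps
        # the error path in a single spot.
        if self.engine is not None:
            return self.engine
        raise RuntimeError("Car has no engine")

    def drive(self) -> str:
        return self.__assert_engine().start()
```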
@@ -261,6 +269,31 @@ class BrowserState:
 
         return self.page
 
+    async def stop_page_loading(self) -> None:
+        page = self.__assert_page()
+        try:
+            await page.evaluate("window.stop()")
+        except Exception as e:
+            LOG.exception(f"Error while stopping page loading: {repr(e)}")
+            raise FailedToStopLoadingPage(url=page.url, error_message=repr(e))
+
+    async def reload_page(self) -> None:
+        page = self.__assert_page()
+
+        LOG.info(f"Reloading page {page.url} and waiting for 5 seconds")
+        try:
+            start_time = time.time()
+            await page.reload(timeout=settings.BROWSER_LOADING_TIMEOUT_MS)
+            end_time = time.time()
+            LOG.info(
+                "Page loading time",
+                loading_time=end_time - start_time,
+            )
+            await asyncio.sleep(5)
+        except Exception as e:
+            LOG.exception(f"Error while reloading url: {repr(e)}")
+            raise FailedToReloadPage(url=page.url, error_message=repr(e))
+
     async def close(self, close_browser_on_completion: bool = True) -> None:
         LOG.info("Closing browser state")
         if self.browser_context and close_browser_on_completion:
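Both recovery helpers build on standard Playwright APIs: `page.evaluate("window.stop()")` runs the JavaScript equivalent of the browser's stop button to cancel in-flight loading, and `page.reload(timeout=...)` re-navigates in place. A standalone sketch of those calls (the URL, timeout, and settle delay are illustrative, not Skyvern's settings):

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com", timeout=30_000)

        # Equivalent of the browser's stop button: cancel pending loads.
        await page.evaluate("window.stop()")

        # Re-navigate in place, then let scripts settle for a moment,
        # mirroring reload_page()'s 5-second sleep.
        await page.reload(timeout=30_000)
        await asyncio.sleep(5)

        await browser.close()


asyncio.run(main())
```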
@@ -307,8 +340,5 @@ class BrowserState:
             raise FailedToTakeScreenshot(error_message=str(e)) from e
 
     async def take_screenshot(self, full_page: bool = False, file_path: str | None = None) -> bytes:
-        if not self.page:
-            LOG.error("BrowserState has no page")
-            raise MissingBrowserStatePage()
-
-        return await self.take_screenshot_from_page(self.page, full_page, file_path)
+        page = self.__assert_page()
+        return await self.take_screenshot_from_page(page, full_page, file_path)
@@ -10,7 +10,7 @@ from playwright.async_api import Frame, Page
 from pydantic import BaseModel
 
 from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
-from skyvern.exceptions import UnknownElementTreeFormat
+from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
 from skyvern.forge.sdk.settings_manager import SettingsManager
 from skyvern.webeye.browser_factory import BrowserState
 
@@ -169,7 +169,7 @@ async def scrape_website(
     try:
         num_retry += 1
         return await scrape_web_unsafe(browser_state, url)
-    except Exception:
+    except Exception as e:
         # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
         if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
             LOG.error(
@@ -178,7 +178,10 @@ async def scrape_website(
                 url=url,
                 exc_info=True,
             )
-            raise Exception("Scraping failed.")
+            if isinstance(e, FailedToTakeScreenshot):
+                raise e
+            else:
+                raise Exception("Scraping failed.")
         LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
         return await scrape_website(
             browser_state,
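The final two hunks close the loop: by binding the exception (`except Exception as e:`) and re-raising `FailedToTakeScreenshot` unchanged once retries are exhausted, `scrape_website` lets the typed screenshot failure reach the agent-level loop above, which is what triggers the stop-loading and reload fallbacks. The bare pattern, sketched with hypothetical names around the real `FailedToTakeScreenshot` exception:

```python
from typing import Any, Awaitable, Callable

from skyvern.exceptions import FailedToTakeScreenshot


async def with_retries(op: Callable[[], Awaitable[Any]], max_retries: int) -> Any:
    num_retry = 0
    while True:
        try:
            num_retry += 1
            return await op()
        except Exception as e:
            if num_retry > max_retries:
                if isinstance(e, FailedToTakeScreenshot):
                    raise  # keep the typed failure for callers that escalate
                raise Exception("Scraping failed.")
            # otherwise fall through and retry
```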