diff --git a/skyvern/constants.py b/skyvern/constants.py index accb9317..125d4fca 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -43,3 +43,6 @@ TEXT_PRESS_MAX_LENGTH = 20 # Script generation constants DEFAULT_SCRIPT_RUN_ID = "default" + +# SkyvernPage constants +SKYVERN_PAGE_MAX_SCRAPING_RETRIES = 2 diff --git a/skyvern/core/script_generations/real_skyvern_page_ai.py b/skyvern/core/script_generations/real_skyvern_page_ai.py index 5b302d5c..87e8c5fb 100644 --- a/skyvern/core/script_generations/real_skyvern_page_ai.py +++ b/skyvern/core/script_generations/real_skyvern_page_ai.py @@ -9,7 +9,7 @@ from jinja2.sandbox import SandboxedEnvironment from playwright.async_api import Page from skyvern.config import settings -from skyvern.constants import SPECIAL_FIELD_VERIFICATION_CODE +from skyvern.constants import SKYVERN_PAGE_MAX_SCRAPING_RETRIES, SPECIAL_FIELD_VERIFICATION_CODE from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi from skyvern.forge import app from skyvern.forge.prompts import prompt_engine @@ -145,7 +145,9 @@ class RealSkyvernPageAi(SkyvernPageAi): # Build the element tree of the current page for the prompt context = skyvern_context.ensure_context() payload_str = _get_context_data(data) - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots( + max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES + ) element_tree = refreshed_page.build_element_tree() organization_id = context.organization_id if context else None @@ -244,7 +246,9 @@ class RealSkyvernPageAi(SkyvernPageAi): else: data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code} - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots( + max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES + ) self.scraped_page = refreshed_page # Try to get element_id from selector if selector is provided @@ -348,7 +352,9 @@ class RealSkyvernPageAi(SkyvernPageAi): if files and isinstance(data, dict) and "files" not in data: data["files"] = files - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots( + max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES + ) self.scraped_page = refreshed_page # Try to get element_id from selector if selector is provided @@ -445,7 +451,9 @@ class RealSkyvernPageAi(SkyvernPageAi): if value and isinstance(data, dict) and "value" not in data: data["value"] = value - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots( + max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES + ) self.scraped_page = refreshed_page element_tree = refreshed_page.build_element_tree() merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt) @@ -501,7 +509,7 @@ class RealSkyvernPageAi(SkyvernPageAi): ) -> dict[str, Any] | list | str | None: """Extract information from the page using AI.""" - scraped_page_refreshed = await self.scraped_page.refresh() + scraped_page_refreshed = await self.scraped_page.refresh(max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES) context = skyvern_context.current() tz_info = datetime.now(tz=timezone.utc).tzinfo if context and context.tz_info: @@ -598,7 +606,9 @@ class RealSkyvernPageAi(SkyvernPageAi): reasoning=action_info.get("reasoning"), ) - refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() + refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots( + max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES + ) self.scraped_page = refreshed_page element_tree = refreshed_page.build_element_tree() diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index e59904ac..3429cf70 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -366,11 +366,12 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): element["children"] = new_children return element - async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self: + async def refresh(self, draw_boxes: bool = True, scroll: bool = True, max_retries: int = 0) -> Self: refreshed_page = await scrape_website( browser_state=self._browser_state, url=self.url, cleanup_element_tree=self._clean_up_func, + max_retries=max_retries, scrape_exclude=self._scrape_exclude, draw_boxes=draw_boxes, scroll=scroll, @@ -390,20 +391,25 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): return self async def generate_scraped_page( - self, draw_boxes: bool = True, scroll: bool = True, take_screenshots: bool = True + self, + draw_boxes: bool = True, + scroll: bool = True, + take_screenshots: bool = True, + max_retries: int = 0, ) -> Self: return await scrape_website( browser_state=self._browser_state, url=self.url, cleanup_element_tree=self._clean_up_func, + max_retries=max_retries, scrape_exclude=self._scrape_exclude, take_screenshots=take_screenshots, draw_boxes=draw_boxes, scroll=scroll, ) - async def generate_scraped_page_without_screenshots(self) -> Self: - return await self.generate_scraped_page(take_screenshots=False) + async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self: + return await self.generate_scraped_page(take_screenshots=False, max_retries=max_retries) @TraceManager.traced_async(ignore_input=True) @@ -412,6 +418,7 @@ async def scrape_website( url: str, cleanup_element_tree: CleanupElementTreeFunc, num_retry: int = 0, + max_retries: int = settings.MAX_SCRAPING_RETRIES, scrape_exclude: ScrapeExcludeFunc | None = None, take_screenshots: bool = True, draw_boxes: bool = True, @@ -460,10 +467,11 @@ async def scrape_website( raise except Exception as e: # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production - if num_retry > settings.MAX_SCRAPING_RETRIES: + if num_retry > max_retries: LOG.error( "Scraping failed after max retries, aborting.", - max_retries=settings.MAX_SCRAPING_RETRIES, + max_retries=max_retries, + num_retry=num_retry, url=url, exc_info=True, ) @@ -471,12 +479,14 @@ async def scrape_website( raise e else: raise ScrapingFailed() from e - LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url) + LOG.info("Scraping failed, will retry", max_retries=max_retries, num_retry=num_retry, url=url, wait_seconds=0.5) + await asyncio.sleep(0.5) return await scrape_website( browser_state, url, cleanup_element_tree, num_retry=num_retry, + max_retries=max_retries, scrape_exclude=scrape_exclude, take_screenshots=take_screenshots, draw_boxes=draw_boxes,