improve caching reliability by adding retries for scrape_page (#4058)

This commit is contained in:
Shuchang Zheng
2025-11-21 00:08:13 -08:00
committed by GitHub
parent 6358b8b1d7
commit 2b1b28e4d7
3 changed files with 37 additions and 14 deletions

View File

@@ -43,3 +43,6 @@ TEXT_PRESS_MAX_LENGTH = 20
# Script generation constants # Script generation constants
DEFAULT_SCRIPT_RUN_ID = "default" DEFAULT_SCRIPT_RUN_ID = "default"
# SkyvernPage constants
SKYVERN_PAGE_MAX_SCRAPING_RETRIES = 2

View File

@@ -9,7 +9,7 @@ from jinja2.sandbox import SandboxedEnvironment
from playwright.async_api import Page from playwright.async_api import Page
from skyvern.config import settings from skyvern.config import settings
from skyvern.constants import SPECIAL_FIELD_VERIFICATION_CODE from skyvern.constants import SKYVERN_PAGE_MAX_SCRAPING_RETRIES, SPECIAL_FIELD_VERIFICATION_CODE
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
from skyvern.forge import app from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine from skyvern.forge.prompts import prompt_engine
@@ -145,7 +145,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
# Build the element tree of the current page for the prompt # Build the element tree of the current page for the prompt
context = skyvern_context.ensure_context() context = skyvern_context.ensure_context()
payload_str = _get_context_data(data) payload_str = _get_context_data(data)
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
)
element_tree = refreshed_page.build_element_tree() element_tree = refreshed_page.build_element_tree()
organization_id = context.organization_id if context else None organization_id = context.organization_id if context else None
@@ -244,7 +246,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
else: else:
data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code} data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
)
self.scraped_page = refreshed_page self.scraped_page = refreshed_page
# Try to get element_id from selector if selector is provided # Try to get element_id from selector if selector is provided
@@ -348,7 +352,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
if files and isinstance(data, dict) and "files" not in data: if files and isinstance(data, dict) and "files" not in data:
data["files"] = files data["files"] = files
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
)
self.scraped_page = refreshed_page self.scraped_page = refreshed_page
# Try to get element_id from selector if selector is provided # Try to get element_id from selector if selector is provided
@@ -445,7 +451,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
if value and isinstance(data, dict) and "value" not in data: if value and isinstance(data, dict) and "value" not in data:
data["value"] = value data["value"] = value
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
)
self.scraped_page = refreshed_page self.scraped_page = refreshed_page
element_tree = refreshed_page.build_element_tree() element_tree = refreshed_page.build_element_tree()
merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt) merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt)
@@ -501,7 +509,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
) -> dict[str, Any] | list | str | None: ) -> dict[str, Any] | list | str | None:
"""Extract information from the page using AI.""" """Extract information from the page using AI."""
scraped_page_refreshed = await self.scraped_page.refresh() scraped_page_refreshed = await self.scraped_page.refresh(max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES)
context = skyvern_context.current() context = skyvern_context.current()
tz_info = datetime.now(tz=timezone.utc).tzinfo tz_info = datetime.now(tz=timezone.utc).tzinfo
if context and context.tz_info: if context and context.tz_info:
@@ -598,7 +606,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
reasoning=action_info.get("reasoning"), reasoning=action_info.get("reasoning"),
) )
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
)
self.scraped_page = refreshed_page self.scraped_page = refreshed_page
element_tree = refreshed_page.build_element_tree() element_tree = refreshed_page.build_element_tree()

View File

@@ -366,11 +366,12 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
element["children"] = new_children element["children"] = new_children
return element return element
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self: async def refresh(self, draw_boxes: bool = True, scroll: bool = True, max_retries: int = 0) -> Self:
refreshed_page = await scrape_website( refreshed_page = await scrape_website(
browser_state=self._browser_state, browser_state=self._browser_state,
url=self.url, url=self.url,
cleanup_element_tree=self._clean_up_func, cleanup_element_tree=self._clean_up_func,
max_retries=max_retries,
scrape_exclude=self._scrape_exclude, scrape_exclude=self._scrape_exclude,
draw_boxes=draw_boxes, draw_boxes=draw_boxes,
scroll=scroll, scroll=scroll,
@@ -390,20 +391,25 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
return self return self
async def generate_scraped_page( async def generate_scraped_page(
self, draw_boxes: bool = True, scroll: bool = True, take_screenshots: bool = True self,
draw_boxes: bool = True,
scroll: bool = True,
take_screenshots: bool = True,
max_retries: int = 0,
) -> Self: ) -> Self:
return await scrape_website( return await scrape_website(
browser_state=self._browser_state, browser_state=self._browser_state,
url=self.url, url=self.url,
cleanup_element_tree=self._clean_up_func, cleanup_element_tree=self._clean_up_func,
max_retries=max_retries,
scrape_exclude=self._scrape_exclude, scrape_exclude=self._scrape_exclude,
take_screenshots=take_screenshots, take_screenshots=take_screenshots,
draw_boxes=draw_boxes, draw_boxes=draw_boxes,
scroll=scroll, scroll=scroll,
) )
async def generate_scraped_page_without_screenshots(self) -> Self: async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self:
return await self.generate_scraped_page(take_screenshots=False) return await self.generate_scraped_page(take_screenshots=False, max_retries=max_retries)
@TraceManager.traced_async(ignore_input=True) @TraceManager.traced_async(ignore_input=True)
@@ -412,6 +418,7 @@ async def scrape_website(
url: str, url: str,
cleanup_element_tree: CleanupElementTreeFunc, cleanup_element_tree: CleanupElementTreeFunc,
num_retry: int = 0, num_retry: int = 0,
max_retries: int = settings.MAX_SCRAPING_RETRIES,
scrape_exclude: ScrapeExcludeFunc | None = None, scrape_exclude: ScrapeExcludeFunc | None = None,
take_screenshots: bool = True, take_screenshots: bool = True,
draw_boxes: bool = True, draw_boxes: bool = True,
@@ -460,10 +467,11 @@ async def scrape_website(
raise raise
except Exception as e: except Exception as e:
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
if num_retry > settings.MAX_SCRAPING_RETRIES: if num_retry > max_retries:
LOG.error( LOG.error(
"Scraping failed after max retries, aborting.", "Scraping failed after max retries, aborting.",
max_retries=settings.MAX_SCRAPING_RETRIES, max_retries=max_retries,
num_retry=num_retry,
url=url, url=url,
exc_info=True, exc_info=True,
) )
@@ -471,12 +479,14 @@ async def scrape_website(
raise e raise e
else: else:
raise ScrapingFailed() from e raise ScrapingFailed() from e
LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url) LOG.info("Scraping failed, will retry", max_retries=max_retries, num_retry=num_retry, url=url, wait_seconds=0.5)
await asyncio.sleep(0.5)
return await scrape_website( return await scrape_website(
browser_state, browser_state,
url, url,
cleanup_element_tree, cleanup_element_tree,
num_retry=num_retry, num_retry=num_retry,
max_retries=max_retries,
scrape_exclude=scrape_exclude, scrape_exclude=scrape_exclude,
take_screenshots=take_screenshots, take_screenshots=take_screenshots,
draw_boxes=draw_boxes, draw_boxes=draw_boxes,