Improve caching reliability by adding retries for scrape_page (#4058)
This commit is contained in:
@@ -43,3 +43,6 @@ TEXT_PRESS_MAX_LENGTH = 20
|
|||||||
|
|
||||||
# Script generation constants
|
# Script generation constants
|
||||||
DEFAULT_SCRIPT_RUN_ID = "default"
|
DEFAULT_SCRIPT_RUN_ID = "default"
|
||||||
|
|
||||||
|
# SkyvernPage constants
|
||||||
|
SKYVERN_PAGE_MAX_SCRAPING_RETRIES = 2
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from jinja2.sandbox import SandboxedEnvironment
|
|||||||
from playwright.async_api import Page
|
from playwright.async_api import Page
|
||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.constants import SPECIAL_FIELD_VERIFICATION_CODE
|
from skyvern.constants import SKYVERN_PAGE_MAX_SCRAPING_RETRIES, SPECIAL_FIELD_VERIFICATION_CODE
|
||||||
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
|
from skyvern.core.script_generations.skyvern_page_ai import SkyvernPageAi
|
||||||
from skyvern.forge import app
|
from skyvern.forge import app
|
||||||
from skyvern.forge.prompts import prompt_engine
|
from skyvern.forge.prompts import prompt_engine
|
||||||
@@ -145,7 +145,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
# Build the element tree of the current page for the prompt
|
# Build the element tree of the current page for the prompt
|
||||||
context = skyvern_context.ensure_context()
|
context = skyvern_context.ensure_context()
|
||||||
payload_str = _get_context_data(data)
|
payload_str = _get_context_data(data)
|
||||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
|
||||||
|
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
|
||||||
|
)
|
||||||
element_tree = refreshed_page.build_element_tree()
|
element_tree = refreshed_page.build_element_tree()
|
||||||
|
|
||||||
organization_id = context.organization_id if context else None
|
organization_id = context.organization_id if context else None
|
||||||
@@ -244,7 +246,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
else:
|
else:
|
||||||
data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
|
data = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
|
||||||
|
|
||||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
|
||||||
|
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
|
||||||
|
)
|
||||||
self.scraped_page = refreshed_page
|
self.scraped_page = refreshed_page
|
||||||
|
|
||||||
# Try to get element_id from selector if selector is provided
|
# Try to get element_id from selector if selector is provided
|
||||||
@@ -348,7 +352,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
if files and isinstance(data, dict) and "files" not in data:
|
if files and isinstance(data, dict) and "files" not in data:
|
||||||
data["files"] = files
|
data["files"] = files
|
||||||
|
|
||||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
|
||||||
|
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
|
||||||
|
)
|
||||||
self.scraped_page = refreshed_page
|
self.scraped_page = refreshed_page
|
||||||
|
|
||||||
# Try to get element_id from selector if selector is provided
|
# Try to get element_id from selector if selector is provided
|
||||||
@@ -445,7 +451,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
if value and isinstance(data, dict) and "value" not in data:
|
if value and isinstance(data, dict) and "value" not in data:
|
||||||
data["value"] = value
|
data["value"] = value
|
||||||
|
|
||||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
|
||||||
|
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
|
||||||
|
)
|
||||||
self.scraped_page = refreshed_page
|
self.scraped_page = refreshed_page
|
||||||
element_tree = refreshed_page.build_element_tree()
|
element_tree = refreshed_page.build_element_tree()
|
||||||
merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt)
|
merged_goal = SELECT_OPTION_GOAL.format(intention=intention, prompt=prompt)
|
||||||
@@ -501,7 +509,7 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
) -> dict[str, Any] | list | str | None:
|
) -> dict[str, Any] | list | str | None:
|
||||||
"""Extract information from the page using AI."""
|
"""Extract information from the page using AI."""
|
||||||
|
|
||||||
scraped_page_refreshed = await self.scraped_page.refresh()
|
scraped_page_refreshed = await self.scraped_page.refresh(max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES)
|
||||||
context = skyvern_context.current()
|
context = skyvern_context.current()
|
||||||
tz_info = datetime.now(tz=timezone.utc).tzinfo
|
tz_info = datetime.now(tz=timezone.utc).tzinfo
|
||||||
if context and context.tz_info:
|
if context and context.tz_info:
|
||||||
@@ -598,7 +606,9 @@ class RealSkyvernPageAi(SkyvernPageAi):
|
|||||||
reasoning=action_info.get("reasoning"),
|
reasoning=action_info.get("reasoning"),
|
||||||
)
|
)
|
||||||
|
|
||||||
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
|
refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots(
|
||||||
|
max_retries=SKYVERN_PAGE_MAX_SCRAPING_RETRIES
|
||||||
|
)
|
||||||
self.scraped_page = refreshed_page
|
self.scraped_page = refreshed_page
|
||||||
element_tree = refreshed_page.build_element_tree()
|
element_tree = refreshed_page.build_element_tree()
|
||||||
|
|
||||||
|
|||||||
@@ -366,11 +366,12 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
|
|||||||
element["children"] = new_children
|
element["children"] = new_children
|
||||||
return element
|
return element
|
||||||
|
|
||||||
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
|
async def refresh(self, draw_boxes: bool = True, scroll: bool = True, max_retries: int = 0) -> Self:
|
||||||
refreshed_page = await scrape_website(
|
refreshed_page = await scrape_website(
|
||||||
browser_state=self._browser_state,
|
browser_state=self._browser_state,
|
||||||
url=self.url,
|
url=self.url,
|
||||||
cleanup_element_tree=self._clean_up_func,
|
cleanup_element_tree=self._clean_up_func,
|
||||||
|
max_retries=max_retries,
|
||||||
scrape_exclude=self._scrape_exclude,
|
scrape_exclude=self._scrape_exclude,
|
||||||
draw_boxes=draw_boxes,
|
draw_boxes=draw_boxes,
|
||||||
scroll=scroll,
|
scroll=scroll,
|
||||||
@@ -390,20 +391,25 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
async def generate_scraped_page(
|
async def generate_scraped_page(
|
||||||
self, draw_boxes: bool = True, scroll: bool = True, take_screenshots: bool = True
|
self,
|
||||||
|
draw_boxes: bool = True,
|
||||||
|
scroll: bool = True,
|
||||||
|
take_screenshots: bool = True,
|
||||||
|
max_retries: int = 0,
|
||||||
) -> Self:
|
) -> Self:
|
||||||
return await scrape_website(
|
return await scrape_website(
|
||||||
browser_state=self._browser_state,
|
browser_state=self._browser_state,
|
||||||
url=self.url,
|
url=self.url,
|
||||||
cleanup_element_tree=self._clean_up_func,
|
cleanup_element_tree=self._clean_up_func,
|
||||||
|
max_retries=max_retries,
|
||||||
scrape_exclude=self._scrape_exclude,
|
scrape_exclude=self._scrape_exclude,
|
||||||
take_screenshots=take_screenshots,
|
take_screenshots=take_screenshots,
|
||||||
draw_boxes=draw_boxes,
|
draw_boxes=draw_boxes,
|
||||||
scroll=scroll,
|
scroll=scroll,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def generate_scraped_page_without_screenshots(self) -> Self:
|
async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self:
|
||||||
return await self.generate_scraped_page(take_screenshots=False)
|
return await self.generate_scraped_page(take_screenshots=False, max_retries=max_retries)
|
||||||
|
|
||||||
|
|
||||||
@TraceManager.traced_async(ignore_input=True)
|
@TraceManager.traced_async(ignore_input=True)
|
||||||
@@ -412,6 +418,7 @@ async def scrape_website(
|
|||||||
url: str,
|
url: str,
|
||||||
cleanup_element_tree: CleanupElementTreeFunc,
|
cleanup_element_tree: CleanupElementTreeFunc,
|
||||||
num_retry: int = 0,
|
num_retry: int = 0,
|
||||||
|
max_retries: int = settings.MAX_SCRAPING_RETRIES,
|
||||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||||
take_screenshots: bool = True,
|
take_screenshots: bool = True,
|
||||||
draw_boxes: bool = True,
|
draw_boxes: bool = True,
|
||||||
@@ -460,10 +467,11 @@ async def scrape_website(
|
|||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
||||||
if num_retry > settings.MAX_SCRAPING_RETRIES:
|
if num_retry > max_retries:
|
||||||
LOG.error(
|
LOG.error(
|
||||||
"Scraping failed after max retries, aborting.",
|
"Scraping failed after max retries, aborting.",
|
||||||
max_retries=settings.MAX_SCRAPING_RETRIES,
|
max_retries=max_retries,
|
||||||
|
num_retry=num_retry,
|
||||||
url=url,
|
url=url,
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
@@ -471,12 +479,14 @@ async def scrape_website(
|
|||||||
raise e
|
raise e
|
||||||
else:
|
else:
|
||||||
raise ScrapingFailed() from e
|
raise ScrapingFailed() from e
|
||||||
LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
|
LOG.info("Scraping failed, will retry", max_retries=max_retries, num_retry=num_retry, url=url, wait_seconds=0.5)
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
return await scrape_website(
|
return await scrape_website(
|
||||||
browser_state,
|
browser_state,
|
||||||
url,
|
url,
|
||||||
cleanup_element_tree,
|
cleanup_element_tree,
|
||||||
num_retry=num_retry,
|
num_retry=num_retry,
|
||||||
|
max_retries=max_retries,
|
||||||
scrape_exclude=scrape_exclude,
|
scrape_exclude=scrape_exclude,
|
||||||
take_screenshots=take_screenshots,
|
take_screenshots=take_screenshots,
|
||||||
draw_boxes=draw_boxes,
|
draw_boxes=draw_boxes,
|
||||||
|
|||||||
Reference in New Issue
Block a user