add timeout for page.content() (#541)
This commit is contained in:
@@ -7,6 +7,7 @@ SKYVERN_DIR = Path(__file__).parent
|
||||
REPO_ROOT_DIR = SKYVERN_DIR.parent
|
||||
|
||||
INPUT_TEXT_TIMEOUT = 120000 # 2 minutes (value is in milliseconds)
|
||||
PAGE_CONTENT_TIMEOUT = 300 # 5 mins (value is in seconds, unlike INPUT_TEXT_TIMEOUT)
|
||||
|
||||
|
||||
class ScrapeType(StrEnum):
|
||||
|
||||
@@ -52,7 +52,7 @@ from skyvern.webeye.actions.handler import ActionHandler
|
||||
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
|
||||
from skyvern.webeye.actions.responses import ActionResult
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
|
||||
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, get_page_content, scrape_website
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
@@ -786,7 +786,7 @@ class ForgeAgent:
|
||||
)
|
||||
|
||||
try:
|
||||
html = await browser_state.page.content()
|
||||
html = await get_page_content(browser_state.page)
|
||||
await app.ARTIFACT_MANAGER.create_artifact(
|
||||
step=step,
|
||||
artifact_type=ArtifactType.HTML_ACTION,
|
||||
|
||||
@@ -9,7 +9,7 @@ import structlog
|
||||
from playwright.async_api import Frame, Page
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
|
||||
from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR, SKYVERN_ID_ATTR
|
||||
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
@@ -289,6 +289,16 @@ async def scrape_web_unsafe(
|
||||
|
||||
text_content = await get_frame_text(page.main_frame)
|
||||
|
||||
html = ""
|
||||
try:
|
||||
html = await get_page_content(page)
|
||||
except Exception:
|
||||
LOG.error(
|
||||
"Failed out to get HTML content",
|
||||
url=url,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
return ScrapedPage(
|
||||
elements=elements,
|
||||
id_to_xpath_dict=id_to_xpath_dict,
|
||||
@@ -298,11 +308,16 @@ async def scrape_web_unsafe(
|
||||
element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
|
||||
screenshots=screenshots,
|
||||
url=page.url,
|
||||
html=await page.content(),
|
||||
html=html,
|
||||
extracted_text=text_content,
|
||||
)
|
||||
|
||||
|
||||
async def get_page_content(page: Page, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
    """Fetch the HTML of ``page``, bounded by a deadline.

    Args:
        page: Page whose ``content()`` coroutine is awaited.
        timeout: Deadline handed to ``asyncio.timeout`` (seconds).

    Returns:
        Whatever ``page.content()`` resolves to.

    Raises:
        TimeoutError: If ``page.content()`` has not finished before the
            deadline expires.
    """
    # The deadline only wraps the content() call; once it has resolved we
    # leave the timeout scope and hand the result back.
    async with asyncio.timeout(timeout):
        page_html = await page.content()
    return page_html
|
||||
|
||||
|
||||
async def get_select2_options(page: Page) -> list[dict[str, Any]]:
|
||||
await page.evaluate(JS_FUNCTION_DEFS)
|
||||
js_script = "async () => await getSelect2Options()"
|
||||
|
||||
Reference in New Issue
Block a user