re-enable iframe (#507)

This commit is contained in:
LawyZheng
2024-06-25 11:54:06 +08:00
committed by GitHub
parent 76255c01b1
commit 6a472da51e
3 changed files with 31 additions and 11 deletions

View File

@@ -809,6 +809,7 @@ class ForgeAgent:
return await scrape_website( return await scrape_website(
browser_state, browser_state,
task.url, task.url,
scrape_exclude=app.scrape_exclude,
) )
async def _build_and_record_step_prompt( async def _build_and_record_step_prompt(

View File

@@ -3,6 +3,7 @@ from typing import Awaitable, Callable
from ddtrace import tracer from ddtrace import tracer
from ddtrace.filters import FilterRequestsOnUrl from ddtrace.filters import FilterRequestsOnUrl
from fastapi import FastAPI from fastapi import FastAPI
from playwright.async_api import Frame, Page
from skyvern.forge.agent import ForgeAgent from skyvern.forge.agent import ForgeAgent
from skyvern.forge.agent_functions import AgentFunction from skyvern.forge.agent_functions import AgentFunction
@@ -40,6 +41,7 @@ LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_s
WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager() WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager()
WORKFLOW_SERVICE = WorkflowService() WORKFLOW_SERVICE = WorkflowService()
AGENT_FUNCTION = AgentFunction() AGENT_FUNCTION = AgentFunction()
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None
authentication_function: Callable[[str], Awaitable[Organization]] | None = None authentication_function: Callable[[str], Awaitable[Organization]] | None = None
setup_api_app: Callable[[FastAPI], None] | None = None setup_api_app: Callable[[FastAPI], None] | None = None

View File

@@ -3,7 +3,7 @@ import copy
import json import json
from collections import defaultdict from collections import defaultdict
from enum import StrEnum from enum import StrEnum
from typing import Any from typing import Any, Awaitable, Callable
import structlog import structlog
from playwright.async_api import Frame, Page from playwright.async_api import Frame, Page
@@ -144,6 +144,7 @@ async def scrape_website(
browser_state: BrowserState, browser_state: BrowserState,
url: str, url: str,
num_retry: int = 0, num_retry: int = 0,
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
) -> ScrapedPage: ) -> ScrapedPage:
""" """
************************************************************************************************ ************************************************************************************************
@@ -168,7 +169,7 @@ async def scrape_website(
""" """
try: try:
num_retry += 1 num_retry += 1
return await scrape_web_unsafe(browser_state, url) return await scrape_web_unsafe(browser_state, url, scrape_exclude)
except Exception as e: except Exception as e:
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES: if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
@@ -187,6 +188,7 @@ async def scrape_website(
browser_state, browser_state,
url, url,
num_retry=num_retry, num_retry=num_retry,
scrape_exclude=scrape_exclude,
) )
@@ -219,6 +221,7 @@ async def get_frame_text(iframe: Frame) -> str:
async def scrape_web_unsafe( async def scrape_web_unsafe(
browser_state: BrowserState, browser_state: BrowserState,
url: str, url: str,
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
) -> ScrapedPage: ) -> ScrapedPage:
""" """
Asynchronous function that performs web scraping without any built-in error handling. This function is intended Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -268,7 +271,7 @@ async def scrape_web_unsafe(
await remove_bounding_boxes(page) await remove_bounding_boxes(page)
await scroll_to_top(page, drow_boxes=False) await scroll_to_top(page, drow_boxes=False)
elements, element_tree = await get_interactable_element_tree(page) elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = cleanup_elements(copy.deepcopy(element_tree)) element_tree = cleanup_elements(copy.deepcopy(element_tree))
_build_element_links(elements) _build_element_links(elements)
@@ -307,12 +310,18 @@ async def get_select2_options(page: Page) -> list[dict[str, Any]]:
async def get_interactable_element_tree_in_frame( async def get_interactable_element_tree_in_frame(
frames: list[Frame], elements: list[dict], element_tree: list[dict] frames: list[Frame],
elements: list[dict],
element_tree: list[dict],
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
) -> tuple[list[dict], list[dict]]: ) -> tuple[list[dict], list[dict]]:
for frame in frames: for frame in frames:
if frame.is_detached(): if frame.is_detached():
continue continue
if scrape_exclude is not None and await scrape_exclude(frame.page, frame):
continue
try: try:
frame_element = await frame.frame_element() frame_element = await frame.frame_element()
except Exception: except Exception:
@@ -331,7 +340,10 @@ async def get_interactable_element_tree_in_frame(
if len(frame.child_frames) > 0: if len(frame.child_frames) > 0:
frame_elements, frame_element_tree = await get_interactable_element_tree_in_frame( frame_elements, frame_element_tree = await get_interactable_element_tree_in_frame(
frame.child_frames, frame_elements, frame_element_tree frame.child_frames,
frame_elements,
frame_element_tree,
scrape_exclude=scrape_exclude,
) )
for element in elements: for element in elements:
@@ -347,7 +359,10 @@ async def get_interactable_element_tree_in_frame(
return elements, element_tree return elements, element_tree
async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]: async def get_interactable_element_tree(
page: Page,
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
) -> tuple[list[dict], list[dict]]:
""" """
Get the element tree of the page, including all the elements that are interactable. Get the element tree of the page, including all the elements that are interactable.
:param page: Page instance to get the element tree from. :param page: Page instance to get the element tree from.
@@ -357,11 +372,13 @@ async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[di
main_frame_js_script = "async () => await buildTreeFromBody('main.frame', true)" main_frame_js_script = "async () => await buildTreeFromBody('main.frame', true)"
elements, element_tree = await page.evaluate(main_frame_js_script) elements, element_tree = await page.evaluate(main_frame_js_script)
# FIXME: some unexpected exception in iframe. turn off temporarily if len(page.main_frame.child_frames) > 0:
# if len(page.main_frame.child_frames) > 0: elements, element_tree = await get_interactable_element_tree_in_frame(
# elements, element_tree = await get_interactable_element_tree_in_frame( page.main_frame.child_frames,
# page.main_frame.child_frames, elements, element_tree elements,
# ) element_tree,
scrape_exclude=scrape_exclude,
)
return elements, element_tree return elements, element_tree