From 5e49685c76e0338bac2223d2ab8e40a81b1c60c8 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Tue, 18 Feb 2025 08:58:23 +0800 Subject: [PATCH] refactor unique id generation (#1781) --- skyvern/forge/sdk/core/skyvern_context.py | 3 + skyvern/webeye/actions/handler.py | 4 +- skyvern/webeye/scraper/domUtils.js | 174 +++++++++++++++------- skyvern/webeye/scraper/scraper.py | 104 ++++++++----- skyvern/webeye/utils/dom.py | 3 + skyvern/webeye/utils/page.py | 45 ++++-- 6 files changed, 226 insertions(+), 107 deletions(-) diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index bcbc305f..02ca947e 100644 --- a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -2,6 +2,8 @@ from contextvars import ContextVar from dataclasses import dataclass, field from zoneinfo import ZoneInfo +from playwright.async_api import Frame + @dataclass class SkyvernContext: @@ -17,6 +19,7 @@ class SkyvernContext: log: list[dict] = field(default_factory=list) hashed_href_map: dict[str, str] = field(default_factory=dict) refresh_working_page: bool = False + frame_index_map: dict[Frame, int] = field(default_factory=dict) def __repr__(self) -> str: return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, max_steps_override={self.max_steps_override})" diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index e87c5008..d28a4654 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -1630,7 +1630,9 @@ async def choose_auto_completion_dropdown( continue current_element = await skyvern_frame.parse_element_from_html( - skyvern_element.get_frame_id(), element_handler, skyvern_element.is_interactable() + skyvern_element.get_frame_id(), + element_handler, + skyvern_element.is_interactable(), ) 
confirmed_preserved_list.append(current_element) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 7a2dde06..79134c00 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -1,6 +1,27 @@ // we only use chromium browser for now let browserNameForWorkarounds = "chromium"; +class SafeCounter { + constructor() { + this.value = 0; + this.lock = Promise.resolve(); + } + + async add() { + await this.lock; + this.lock = new Promise((resolve) => { + this.value += 1; + resolve(); + }); + return this.value; + } + + async get() { + await this.lock; + return this.value; + } +} + // Commands for manipulating rects. // Want to debug this? Run chromium, go to sources, and create a new snippet with the code in domUtils.js class Rect { @@ -1141,19 +1162,52 @@ function getDOMElementBySkyvenElement(elementObj) { return document.querySelector(`[unique_id="${elementObj.id}"]`); } -function uniqueId() { +if (window.elementIdCounter === undefined) { + window.elementIdCounter = new SafeCounter(); +} + +// generate a unique id for the element +// length is 4, the first character is from the frame index, the last 3 characters are from the counter, +async function uniqueId() { const characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + const base = characters.length; + + const extraCharacters = "~!@#$%^&*()-_+="; + const extraBase = extraCharacters.length; + let result = ""; - for (let i = 0; i < 4; i++) { - const randomIndex = Math.floor(Math.random() * characters.length); - result += characters[randomIndex]; + + if ( + window.GlobalSkyvernFrameIndex === undefined || + window.GlobalSkyvernFrameIndex < 0 + ) { + const randomIndex = Math.floor(Math.random() * extraBase); + result += extraCharacters[randomIndex]; + } else { + const c1 = window.GlobalSkyvernFrameIndex % base; + result += characters[c1]; } + + const countPart = + (await window.elementIdCounter.add()) % (base * base * base); + 
const c2 = Math.floor(countPart / (base * base)); + result += characters[c2]; + const c3 = Math.floor(countPart / base) % base; + result += characters[c3]; + const c4 = countPart % base; + result += characters[c4]; + return result; } -function buildElementObject(frame, element, interactable, purgeable = false) { - var element_id = element.getAttribute("unique_id") ?? uniqueId(); +async function buildElementObject( + frame, + element, + interactable, + purgeable = false, +) { + var element_id = element.getAttribute("unique_id") ?? (await uniqueId()); var elementTagNameLower = element.tagName.toLowerCase(); element.setAttribute("unique_id", element_id); @@ -1224,6 +1278,7 @@ function buildElementObject(frame, element, interactable, purgeable = false) { let elementObj = { id: element_id, frame: frame, + frame_index: window.GlobalSkyvernFrameIndex, interactable: interactable, tagName: elementTagNameLower, attributes: attrs, @@ -1253,7 +1308,7 @@ function buildElementObject(frame, element, interactable, purgeable = false) { let shadowHostId = shadowHostEle.getAttribute("unique_id"); // assign shadowHostId to the shadowHost element if it doesn't have unique_id if (!shadowHostId) { - shadowHostId = uniqueId(); + shadowHostId = await uniqueId(); shadowHostEle.setAttribute("unique_id", shadowHostId); } elementObj.shadowHost = shadowHostId; @@ -1276,11 +1331,25 @@ function buildElementObject(frame, element, interactable, purgeable = false) { return elementObj; } -function buildTreeFromBody(frame = "main.frame") { - return buildElementTree(document.body, frame); +// build the element tree for the body +async function buildTreeFromBody( + frame = "main.frame", + frame_index = undefined, +) { + if ( + window.GlobalSkyvernFrameIndex === undefined && + frame_index !== undefined + ) { + window.GlobalSkyvernFrameIndex = frame_index; + } + return await buildElementTree(document.body, frame); } -function buildElementTree(starter = document.body, frame, full_tree = false) { +async 
function buildElementTree( + starter = document.body, + frame, + full_tree = false, +) { // Generate hover styles map at the start const hoverStylesMap = getHoverStylesMap(); @@ -1294,7 +1363,7 @@ function buildElementTree(starter = document.body, frame, full_tree = false) { return []; } } - function processElement(element, parentId) { + async function processElement(element, parentId) { if (element === null) { console.log("get a null element"); return; @@ -1322,39 +1391,44 @@ function buildElementTree(starter = document.body, frame, full_tree = false) { let elementObj = null; let isParentSVG = null; if (interactable) { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if ( tagName === "frameset" || tagName === "iframe" || tagName === "frame" ) { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if (element.shadowRoot) { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); children = getChildElements(element.shadowRoot); } else if (isTableRelatedElement(element)) { // build all table related elements into skyvern element // we need these elements to preserve the DOM structure - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if (hasBeforeOrAfterPseudoContent(element)) { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if (tagName === "svg") { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if ( (isParentSVG = element.closest("svg")) && isParentSVG.getAttribute("unique_id") ) { // if elemnet is the children of the with an 
unique_id - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if ( getElementText(element).length > 0 && getElementText(element).length <= 5000 ) { - elementObj = buildElementObject(frame, element, interactable); + elementObj = await buildElementObject(frame, element, interactable); } else if (full_tree) { // when building full tree, we only get text from element itself // elements without text are purgeable - elementObj = buildElementObject(frame, element, interactable, true); + elementObj = await buildElementObject( + frame, + element, + interactable, + true, + ); if (elementObj.text.length > 0) { elementObj.purgeable = false; } @@ -1384,7 +1458,7 @@ function buildElementTree(starter = document.body, frame, full_tree = false) { children = children.concat(getChildElements(element)); for (let i = 0; i < children.length; i++) { const childElement = children[i]; - processElement(childElement, parentId); + await processElement(childElement, parentId); } return; } @@ -1594,7 +1668,7 @@ function buildElementTree(starter = document.body, frame, full_tree = false) { }; // setup before parsing the dom - processElement(starter, null); + await processElement(starter, null); for (var element of elements) { if ( @@ -1660,8 +1734,11 @@ function drawBoundingBoxes(elements) { addHintMarkersToPage(hintMarkers); } -function buildElementsAndDrawBoundingBoxes() { - var elementsAndResultArray = buildTreeFromBody(); +async function buildElementsAndDrawBoundingBoxes( + frame = "main.frame", + frame_index = undefined, +) { + var elementsAndResultArray = await buildTreeFromBody(frame, frame_index); drawBoundingBoxes(elementsAndResultArray[0]); } @@ -1845,11 +1922,15 @@ function removeBoundingBoxes() { } } -function scrollToTop(draw_boxes) { +async function scrollToTop( + draw_boxes, + frame = "main.frame", + frame_index = undefined, +) { removeBoundingBoxes(); window.scroll({ left: 0, top: 0, 
behavior: "instant" }); if (draw_boxes) { - buildElementsAndDrawBoundingBoxes(); + await buildElementsAndDrawBoundingBoxes(frame, frame_index); } return window.scrollY; } @@ -1862,7 +1943,11 @@ function scrollToXY(x, y) { window.scroll({ left: x, top: y, behavior: "instant" }); } -function scrollToNextPage(draw_boxes) { +async function scrollToNextPage( + draw_boxes, + frame = "main.frame", + frame_index = undefined, +) { // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again // return true if there is a next page, false otherwise removeBoundingBoxes(); @@ -1872,7 +1957,7 @@ function scrollToNextPage(draw_boxes) { behavior: "instant", }); if (draw_boxes) { - buildElementsAndDrawBoundingBoxes(); + await buildElementsAndDrawBoundingBoxes(frame, frame_index); } return window.scrollY; } @@ -2054,26 +2139,6 @@ function sleep(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } -class SafeCounter { - constructor() { - this.value = 0; - this.lock = Promise.resolve(); - } - - async add() { - await this.lock; - this.lock = new Promise((resolve) => { - this.value += 1; - resolve(); - }); - } - - async get() { - await this.lock; - return this.value; - } -} - async function addIncrementalNodeToMap(parentNode, childrenNode) { // make the dom parser async await waitForNextFrame(); @@ -2087,7 +2152,8 @@ async function addIncrementalNodeToMap(parentNode, childrenNode) { try { for (const child of childrenNode) { - const [_, newNodeTree] = buildElementTree(child, "", true); + // No frame index is passed here; uniqueId() reads window.GlobalSkyvernFrameIndex (or picks a random extra character when it is unset) + const [_, newNodeTree] = await buildElementTree(child, "", true); if (newNodeTree.length > 0) { newNodesTreeList.push(...newNodeTree); } @@ -2242,14 +2308,14 @@ async function getIncrementElements() { const depth = sortedDepth[idx]; const treeList = window.globalDomDepthMap.get(depth); - const removeDupAndConcatChildren 
= async (element) => { let children = element.children; for (let i = 0; i < children.length; i++) { const child = children[i]; const domElement = document.querySelector(`[unique_id="${child.id}"]`); // if the element is still on the page, we rebuild the element to update the information if (domElement) { - let newChild = buildElementObject( + let newChild = await buildElementObject( "", domElement, child.interactable, @@ -2272,7 +2338,7 @@ async function getIncrementElements() { idToElement.set(element.id, element); for (let i = 0; i < children.length; i++) { const child = children[i]; - removeDupAndConcatChildren(child); + await removeDupAndConcatChildren(child); } }; @@ -2282,7 +2348,7 @@ async function getIncrementElements() { ); // if the element is still on the page, we rebuild the element to update the information if (domElement) { - let newHead = buildElementObject( + let newHead = await buildElementObject( "", domElement, treeHeadElement.interactable, @@ -2296,7 +2362,7 @@ async function getIncrementElements() { if (!idToElement.has(treeHeadElement.id)) { cleanedTreeList.push(treeHeadElement); } - removeDupAndConcatChildren(treeHeadElement); + await removeDupAndConcatChildren(treeHeadElement); } } diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 0b81d23c..cff5d015 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -469,12 +469,20 @@ async def scrape_web_unsafe( ) -async def get_interactable_element_tree_in_frame( - frames: list[Frame], - elements: list[dict], - element_tree: list[dict], - scrape_exclude: ScrapeExcludeFunc | None = None, -) -> tuple[list[dict], list[dict]]: +async def get_all_children_frames(page: Page) -> list[Frame]: + start_index = 0 + frames = page.main_frame.child_frames + + while start_index < len(frames): + frame = frames[start_index] + start_index += 1 + frames.extend(frame.child_frames) + + return frames + + +async def filter_frames(frames: list[Frame], 
scrape_exclude: ScrapeExcludeFunc | None = None) -> list[Frame]: + filtered_frames = [] for frame in frames: if frame.is_detached(): continue @@ -482,39 +490,44 @@ async def get_interactable_element_tree_in_frame( if scrape_exclude is not None and await scrape_exclude(frame.page, frame): continue - try: - frame_element = await frame.frame_element() - # it will get stuck when we `frame.evaluate()` on an invisible iframe - if not await frame_element.is_visible(): - continue - unique_id = await frame_element.get_attribute("unique_id") - except Exception: - LOG.warning( - "Unable to get unique_id from frame_element", - exc_info=True, - ) - continue + filtered_frames.append(frame) + return filtered_frames - frame_js_script = f"() => buildTreeFromBody('{unique_id}')" - await SkyvernFrame.evaluate(frame=frame, expression=JS_FUNCTION_DEFS) - frame_elements, frame_element_tree = await SkyvernFrame.evaluate( - frame=frame, expression=frame_js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS +async def add_frame_interactable_elements( + frame: Frame, + frame_index: int, + elements: list[dict], + element_tree: list[dict], +) -> tuple[list[dict], list[dict]]: + """ + Append the frame's interactable elements to elements and attach the frame's element tree as the children of its iframe element. 
+ """ + try: + frame_element = await frame.frame_element() + # it will get stuck when we `frame.evaluate()` on an invisible iframe + if not await frame_element.is_visible(): + return elements, element_tree + unique_id = await frame_element.get_attribute("unique_id") + except Exception: + LOG.warning( + "Unable to get unique_id from frame_element", + exc_info=True, ) + return elements, element_tree - if len(frame.child_frames) > 0: - frame_elements, frame_element_tree = await get_interactable_element_tree_in_frame( - frame.child_frames, - frame_elements, - frame_element_tree, - scrape_exclude=scrape_exclude, - ) + frame_js_script = f"async () => await buildTreeFromBody('{unique_id}', {frame_index})" - for element in elements: - if element["id"] == unique_id: - element["children"] = frame_element_tree + await SkyvernFrame.evaluate(frame=frame, expression=JS_FUNCTION_DEFS) + frame_elements, frame_element_tree = await SkyvernFrame.evaluate( + frame=frame, expression=frame_js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS + ) - elements = elements + frame_elements + for element in elements: + if element["id"] == unique_id: + element["children"] = frame_element_tree + + elements = elements + frame_elements return elements, element_tree @@ -529,17 +542,29 @@ async def get_interactable_element_tree( :return: Tuple containing the element tree and a map of element IDs to elements. 
""" await SkyvernFrame.evaluate(frame=page, expression=JS_FUNCTION_DEFS) - main_frame_js_script = "() => buildTreeFromBody()" + # main page index is 0 + main_frame_js_script = "async () => await buildTreeFromBody('main.frame', 0)" elements, element_tree = await SkyvernFrame.evaluate( frame=page, expression=main_frame_js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS ) - if len(page.main_frame.child_frames) > 0: - elements, element_tree = await get_interactable_element_tree_in_frame( - page.main_frame.child_frames, + context = skyvern_context.ensure_context() + frames = await get_all_children_frames(page) + frames = await filter_frames(frames, scrape_exclude) + + for frame in frames: + frame_index = context.frame_index_map.get(frame, None) + if frame_index is None: + frame_index = len(context.frame_index_map) + 1 + context.frame_index_map[frame] = frame_index + + for frame in frames: + frame_index = context.frame_index_map[frame] + elements, element_tree = await add_frame_interactable_elements( + frame, + frame_index, elements, element_tree, - scrape_exclude=scrape_exclude, ) return elements, element_tree @@ -679,6 +704,9 @@ def trim_element(element: dict) -> dict: if "frame" in queue_ele: del queue_ele["frame"] + if "frame_index" in queue_ele: + del queue_ele["frame_index"] + if "id" in queue_ele and not _should_keep_unique_id(queue_ele): del queue_ele["id"] diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 7d90010b..68c91737 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -312,6 +312,9 @@ class SkyvernElement: def get_frame(self) -> Page | Frame: return self.__frame + def get_frame_index(self) -> int: + return self.__static_element.get("frame_index", -1) + def get_locator(self) -> Locator: return self.locator diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 6512aa6a..05de4215 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -95,12 +95,16 @@ 
class SkyvernFrame: max_number: int = settings.MAX_NUM_SCREENSHOTS, ) -> List[bytes]: skyvern_page = await SkyvernFrame.create_instance(frame=page) + + # page is the main frame and the index must be 0 assert isinstance(skyvern_page.frame, Page) + frame = "main.frame" + frame_index = 0 screenshots: List[bytes] = [] if await skyvern_page.is_window_scrollable(): scroll_y_px_old = -30.0 - scroll_y_px = await skyvern_page.scroll_to_top(draw_boxes=draw_boxes) + scroll_y_px = await skyvern_page.scroll_to_top(draw_boxes=draw_boxes, frame=frame, frame_index=frame_index) # Checking max number of screenshots to prevent infinite loop # We are checking the difference between the old and new scroll_y_px to determine if we have reached the end of the # page. If the difference is less than 25, we assume we have reached the end of the page. @@ -109,7 +113,9 @@ class SkyvernFrame: screenshots.append(screenshot) scroll_y_px_old = scroll_y_px LOG.debug("Scrolling to next page", url=url, num_screenshots=len(screenshots)) - scroll_y_px = await skyvern_page.scroll_to_next_page(draw_boxes=draw_boxes) + scroll_y_px = await skyvern_page.scroll_to_next_page( + draw_boxes=draw_boxes, frame=frame, frame_index=frame_index + ) LOG.debug( "Scrolled to next page", scroll_y_px=scroll_y_px, @@ -117,13 +123,13 @@ class SkyvernFrame: ) if draw_boxes: await skyvern_page.remove_bounding_boxes() - await skyvern_page.scroll_to_top(draw_boxes=False) + await skyvern_page.scroll_to_top(draw_boxes=False, frame=frame, frame_index=frame_index) # wait until animation ends, which is triggered by scrolling LOG.debug("Waiting for 2 seconds until animation ends.") await asyncio.sleep(2) else: if draw_boxes: - await skyvern_page.build_elements_and_draw_bounding_boxes() + await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index) LOG.debug("Page is not scrollable", url=url, num_screenshots=len(screenshots)) screenshot = await SkyvernFrame.take_screenshot(page=skyvern_page.frame, 
full_page=False) @@ -167,7 +173,7 @@ class SkyvernFrame: return await self.evaluate(frame=self.frame, expression=js_script, arg=element) async def parse_element_from_html(self, frame: str, element: ElementHandle, interactable: bool) -> Dict: - js_script = "([frame, element, interactable]) => buildElementObject(frame, element, interactable)" + js_script = "async ([frame, element, interactable]) => await buildElementObject(frame, element, interactable)" return await self.evaluate(frame=self.frame, expression=js_script, arg=[frame, element, interactable]) async def get_element_scrollable(self, element: ElementHandle) -> bool: @@ -186,29 +192,35 @@ class SkyvernFrame: js_script = "(element) => getBlockElementUniqueID(element)" return await self.evaluate(frame=self.frame, expression=js_script, arg=element) - async def scroll_to_top(self, draw_boxes: bool) -> float: + async def scroll_to_top(self, draw_boxes: bool, frame: str, frame_index: int) -> float: """ Scroll to the top of the page and take a screenshot. :param drow_boxes: If True, draw bounding boxes around the elements. :param page: Page instance to take the screenshot from. :return: Screenshot of the page. """ - js_script = f"() => scrollToTop({str(draw_boxes).lower()})" + js_script = "async ([draw_boxes, frame, frame_index]) => await scrollToTop(draw_boxes, frame, frame_index)" scroll_y_px = await self.evaluate( - frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS + frame=self.frame, + expression=js_script, + timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS, + arg=[draw_boxes, frame, frame_index], ) return scroll_y_px - async def scroll_to_next_page(self, draw_boxes: bool) -> float: + async def scroll_to_next_page(self, draw_boxes: bool, frame: str, frame_index: int) -> float: """ Scroll to the next page and take a screenshot. :param drow_boxes: If True, draw bounding boxes around the elements. :param page: Page instance to take the screenshot from. :return: Screenshot of the page. 
""" - js_script = f"() => scrollToNextPage({str(draw_boxes).lower()})" + js_script = "async ([draw_boxes, frame, frame_index]) => await scrollToNextPage(draw_boxes, frame, frame_index)" scroll_y_px = await self.evaluate( - frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS + frame=self.frame, + expression=js_script, + timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS, + arg=[draw_boxes, frame, frame_index], ) return scroll_y_px @@ -220,9 +232,14 @@ class SkyvernFrame: js_script = "() => removeBoundingBoxes()" await self.evaluate(frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS) - async def build_elements_and_draw_bounding_boxes(self) -> None: - js_script = "() => buildElementsAndDrawBoundingBoxes()" - await self.evaluate(frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS) + async def build_elements_and_draw_bounding_boxes(self, frame: str, frame_index: int) -> None: + js_script = "async ([frame, frame_index]) => await buildElementsAndDrawBoundingBoxes(frame, frame_index)" + await self.evaluate( + frame=self.frame, + expression=js_script, + timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS, + arg=[frame, frame_index], + ) async def is_window_scrollable(self) -> bool: js_script = "() => isWindowScrollable()"