diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 2bec132b..db1c086a 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -360,6 +360,7 @@ async def handle_input_text_action( result = await select_from_dropdown( action=select_action, page=page, + dom=dom, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, element_trees=incremental_element, @@ -378,10 +379,11 @@ async def handle_input_text_action( ) except Exception as e: await skyvern_element.scroll_into_view() - await skyvern_element.get_locator().press("Escape", timeout=timeout) LOG.exception("Failed to do custom selection transformed from input action") return [ActionFailure(exception=e)] finally: + await skyvern_element.press_key("Escape") + await skyvern_element.blur() await incremental_scraped.stop_listen_dom_increment() # force to move focus back to the element @@ -614,12 +616,19 @@ async def handle_select_option_action( try: await incremental_scraped.start_listen_dom_increment() - await skyvern_element.get_locator().focus(timeout=timeout) + await skyvern_element.focus() - if tag_name == InteractiveElement.INPUT: - await skyvern_element.get_locator().press("ArrowDown", timeout=timeout) - else: + try: await skyvern_element.get_locator().click(timeout=timeout) + except Exception: + LOG.info( + "fail to open dropdown by clicking, try to press ArrowDown to open", + element_id=skyvern_element.get_id(), + task_id=task.task_id, + step_id=step.step_id, + ) + await skyvern_element.focus() + await skyvern_element.press_key("ArrowDown") # wait 5s for options to load await asyncio.sleep(5) @@ -634,6 +643,7 @@ async def handle_select_option_action( result = await select_from_dropdown( action=action, page=page, + dom=dom, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, element_trees=incremental_element, @@ -1160,6 +1170,7 @@ async def input_or_auto_complete_input( async def select_from_dropdown( action: SelectOptionAction, page: Page, + dom: DomUtil, skyvern_frame: SkyvernFrame, incremental_scraped: IncrementalScrapePage, element_trees: list[dict], @@ -1185,7 +1196,9 @@ async def select_from_dropdown( if not force_select and dropdown_menu_element is None: return None - if dropdown_menu_element and dropdown_menu_element.get_scrollable(): + if dropdown_menu_element and await skyvern_frame.get_element_scrollable( + await dropdown_menu_element.get_element_handler() + ): await scroll_down_to_load_all_options( dropdown_menu_element=dropdown_menu_element, skyvern_frame=skyvern_frame, @@ -1198,13 +1211,7 @@ async def select_from_dropdown( trimmed_element_tree = await incremental_scraped.get_incremental_element_tree( app.AGENT_FUNCTION.cleanup_element_tree_factory(step=step, task=task) ) - if dropdown_menu_element: - # if there's a dropdown menu detected, only elements in the dropdown should be sent to LLM - dropdown_id = dropdown_menu_element.get_id() - for head_node in trimmed_element_tree: - if head_node.get("id") == dropdown_id: - trimmed_element_tree = [head_node] - break + trimmed_element_tree = remove_exist_elements(dom=dom, element_tree=trimmed_element_tree) html = incremental_scraped.build_html_tree(element_tree=trimmed_element_tree) @@ -1319,10 +1326,12 @@ async def scroll_down_to_load_all_options( scroll_pace += scroll_interval else: await skyvern_frame.scroll_to_element_bottom(dropdown_menu_element_handle) + # wait for the options to be fully loaded + await asyncio.sleep(2) # scoll a little back and scoll down to trigger the loading - await page.mouse.wheel(0, -20) - await page.mouse.wheel(0, 20) + await page.mouse.wheel(0, -1e-5) + await page.mouse.wheel(0, 1e-5) # wait for while to load new options await asyncio.sleep(10) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index d8eca858..fa3bf28e 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -936,7 +936,6 @@ function buildElementObject(frame, element, interactable) { elementTagNameLower === "select" || isSelect2Dropdown(element) || isSelect2MultiChoice(element), - isScrollable: isScrollable(element), }; let isInShadowRoot = element.getRootNode() instanceof ShadowRoot; @@ -1571,7 +1570,7 @@ function scrollToElementBottom(element) { element.scroll({ top: element.scrollHeight, left: 0, - behavior: "instant", + behavior: "smooth", }); } @@ -1619,11 +1618,33 @@ if (window.globalOneTimeIncrementElements === undefined) { window.globalOneTimeIncrementElements = []; } +if (window.globalDomDepthMap === undefined) { + window.globalDomDepthMap = new Map(); +} + +function addIncrementalNodeToMap(parentNode, childrenNode) { + // calculate the depth of targetNode element for sorting + const depth = getElementDomDepth(parentNode); + let newNodesTreeList = []; + if (window.globalDomDepthMap.has(depth)) { + newNodesTreeList = window.globalDomDepthMap.get(depth); + } + + for (const child of childrenNode) { + const [_, newNodeTree] = buildElementTree(child, "", false); + if (newNodeTree.length > 0) { + newNodesTreeList.push(...newNodeTree); + } + } + window.globalDomDepthMap.set(depth, newNodesTreeList); +} + if (window.globalObserverForDOMIncrement === undefined) { window.globalObserverForDOMIncrement = new MutationObserver(function ( mutationsList, observer, ) { + // TODO: how to detect duplicated recreate element? for (const mutation of mutationsList) { if (mutation.type === "attributes") { if (mutation.attributeName === "style") { @@ -1637,6 +1658,7 @@ if (window.globalObserverForDOMIncrement === undefined) { targetNode: node, newNodes: [node], }); + addIncrementalNodeToMap(node, [node]); } } @@ -1660,6 +1682,7 @@ if (window.globalObserverForDOMIncrement === undefined) { if (newNodes.length > 0) { changedNode.newNodes = newNodes; window.globalOneTimeIncrementElements.push(changedNode); + addIncrementalNodeToMap(changedNode.targetNode, changedNode.newNodes); } } } @@ -1667,6 +1690,7 @@ if (window.globalObserverForDOMIncrement === undefined) { } function startGlobalIncrementalObserver() { + window.globalDomDepthMap = new Map(); window.globalOneTimeIncrementElements = []; window.globalObserverForDOMIncrement.takeRecords(); // cleanup the older data window.globalObserverForDOMIncrement.observe(document.body, { @@ -1679,63 +1703,50 @@ function startGlobalIncrementalObserver() { } function stopGlobalIncrementalObserver() { + window.globalDomDepthMap = new Map(); window.globalObserverForDOMIncrement.disconnect(); window.globalObserverForDOMIncrement.takeRecords(); // cleanup the older data window.globalOneTimeIncrementElements = []; } -function getIncrementElements(frame) { - const domDepthMap = new Map(); - - for (const element of window.globalOneTimeIncrementElements) { - // calculate the depth of targetNode element for sorting - const depth = getElementDomDepth(element.targetNode); - let newNodesTreeList = []; - if (domDepthMap.has(depth)) { - newNodesTreeList = domDepthMap.get(depth); - } - - for (const child of element.newNodes) { - const [_, newNodeTree] = buildElementTree(child, frame, false); - if (newNodeTree.length > 0) { - newNodesTreeList.push(...newNodeTree); - } - } - domDepthMap.set(depth, newNodesTreeList); - } - +function getIncrementElements() { // cleanup the chidren tree, remove the duplicated element // search starting from the shallowest node: // 1. if deeper, the node could only be the children of the shallower one or no related one. // 2. if depth is same, the node could only be duplicated one or no related one. const idToElement = new Map(); const cleanedTreeList = []; - const sortedDepth = Array.from(domDepthMap.keys()).sort(); + const sortedDepth = Array.from(window.globalDomDepthMap.keys()).sort( + (a, b) => a - b, + ); for (let idx = 0; idx < sortedDepth.length; idx++) { const depth = sortedDepth[idx]; - const treeList = domDepthMap.get(depth); + const treeList = window.globalDomDepthMap.get(depth); + + const removeDupAndConcatChildren = (element) => { + const children = element.children; + if (idToElement.has(element.id)) { + element = idToElement.get(element.id); + for (let i = 0; i < children.length; i++) { + const child = children[i]; + if (!idToElement.get(child.id)) { + element.children.push(child); + } + } + } + idToElement.set(element.id, element); + for (let i = 0; i < children.length; i++) { + const child = children[i]; + removeDupAndConcatChildren(child); + } + }; for (const treeHeadElement of treeList) { // check if the element is existed - if (idToElement.has(treeHeadElement.id)) { - continue; - } - cleanedTreeList.push(treeHeadElement); - - // flatten the tree - let pendingElements = [treeHeadElement]; - let curIndex = 0; - while (curIndex < pendingElements.length) { - const curElement = pendingElements[curIndex]; - if (idToElement.has(curElement.id)) { - curIndex++; - continue; - } - - idToElement.set(curElement.id, curElement); - pendingElements.push(...curElement.children); - curIndex++; + if (!idToElement.has(treeHeadElement.id)) { + cleanedTreeList.push(treeHeadElement); } + removeDupAndConcatChildren(treeHeadElement); } } diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index cbdeb6f5..761556da 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -6,7 +6,7 @@ from enum import StrEnum from typing import Any, Awaitable, Callable import structlog -from playwright.async_api import Frame, Page +from playwright.async_api import Frame, Locator, Page from pydantic import BaseModel from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR @@ -388,13 +388,12 @@ async def get_interactable_element_tree( class IncrementalScrapePage: - id_to_element_dict: dict[str, dict] = {} - id_to_css_dict: dict[str, str] - elements: list[dict] - element_tree: list[dict] - element_tree_trimmed: list[dict] - def __init__(self, skyvern_frame: SkyvernFrame) -> None: + self.id_to_element_dict: dict[str, dict] = dict() + self.id_to_css_dict: dict[str, str] = dict() + self.elements: list[dict] = list() + self.element_tree: list[dict] = list() + self.element_tree_trimmed: list[dict] = list() self.skyvern_frame = skyvern_frame async def get_incremental_element_tree( @@ -403,19 +402,7 @@ class IncrementalScrapePage: ) -> list[dict]: frame = self.skyvern_frame.get_frame() - frame_id = "main.frame" - if isinstance(frame, Frame): - try: - frame_element = await frame.frame_element() - frame_id = await frame_element.get_attribute("unique_id") - except Exception: - # TODO: do we really care about the frame_id ? - LOG.warning( - "Unable to get frame_element", - exc_info=True, - ) - - js_script = f"() => getIncrementElements('{frame_id}')" + js_script = "() => getIncrementElements()" incremental_elements, incremental_tree = await frame.evaluate(js_script) # we listen the incremental elements seperated by frames, so all elements will be in the same SkyvernFrame self.id_to_css_dict, self.id_to_element_dict, _ = build_element_dict(incremental_elements) @@ -442,6 +429,25 @@ class IncrementalScrapePage: js_script = "() => window.globalOneTimeIncrementElements.length" return await self.skyvern_frame.get_frame().evaluate(js_script) + async def select_one_element_by_value(self, value: str) -> Locator | None: + for element in self.elements: + element_id = element.get("id", "") + if not element_id: + continue + + if not element.get("interactable", False): + continue + + text = element.get("text", "") + if text != value: + continue + + locator = self.skyvern_frame.get_frame().locator(f'[{SKYVERN_ID_ATTR}="{element_id}"]') + if await locator.count() > 0: + return locator + + return None + def build_html_tree(self, element_tree: list[dict] | None = None) -> str: return "".join([json_to_html(element) for element in (element_tree or self.element_tree_trimmed)]) diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index c7409b37..29241143 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -7,7 +7,7 @@ from enum import StrEnum from random import uniform import structlog -from playwright.async_api import Frame, FrameLocator, Locator, Page, TimeoutError +from playwright.async_api import ElementHandle, Frame, FrameLocator, Locator, Page, TimeoutError from skyvern.constants import SKYVERN_ID_ATTR from skyvern.exceptions import ( @@ -199,9 +199,6 @@ class SkyvernElement: def get_element_dict(self) -> dict: return self.__static_element - def get_scrollable(self) -> bool: - return self.__static_element.get("isScrollable", False) - def get_selectable(self) -> bool: return self.__static_element.get("isSelectable", False) @@ -230,6 +227,13 @@ class SkyvernElement: def get_locator(self) -> Locator: return self.locator + async def get_element_handler( + self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS + ) -> ElementHandle: + handler = await self.locator.element_handle(timeout=timeout) + assert handler is not None + return handler + async def get_select2_dropdown(self) -> Select2Dropdown: if not await self.is_select2_dropdown(): raise ElementIsNotSelect2Dropdown(self.get_id(), self.__static_element) @@ -328,6 +332,9 @@ class SkyvernElement: return await self.locator.get_attribute(attr_name, timeout=timeout) + async def focus(self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS) -> None: + await self.get_locator().focus(timeout=timeout) + async def input_sequentially( self, text: str, default_timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS ) -> None: @@ -340,6 +347,11 @@ class SkyvernElement: await self.press_fill(text, timeout=default_timeout) + async def press_key( + self, key: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS + ) -> None: + await self.get_locator().press(key=key, timeout=timeout) + async def press_fill( self, text: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS ) -> None: @@ -375,11 +387,11 @@ class SkyvernElement: click_x, click_y = await self.move_mouse_to(page=page, timeout=timeout) await page.mouse.click(click_x, click_y) + async def blur(self) -> None: + await self.get_frame().evaluate("(element) => element.blur()", await self.get_element_handler()) + async def scroll_into_view(self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS) -> None: - element_handler = await self.get_locator().element_handle() - if element_handler is None: - LOG.warning("element handler is None. ", element_id=self.get_id()) - return + element_handler = await self.get_element_handler(timeout=timeout) try: await element_handler.scroll_into_view_if_needed(timeout=timeout) except TimeoutError: @@ -387,8 +399,8 @@ class SkyvernElement: "Timeout to execute scrolling into view, try to re-focus to locate the element", element_id=self.get_id(), ) - await self.get_frame().evaluate("(element) => element.blur()", element_handler) - await self.get_locator().focus(timeout=timeout) + await self.blur() + await self.focus(timeout=timeout) await asyncio.sleep(2) # wait for scrolling into the target diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 07c2dc0f..72f2b28b 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -172,6 +172,10 @@ class SkyvernFrame: js_script = "([frame, element, interactable]) => buildElementObject(frame, element, interactable)" return await self.frame.evaluate(js_script, [frame, element, interactable]) + async def get_element_scrollable(self, element: ElementHandle) -> bool: + js_script = "(element) => isScrollable(element)" + return await self.frame.evaluate(js_script, element) + async def scroll_to_top(self, draw_boxes: bool) -> float: """ Scroll to the top of the page and take a screenshot.