diff --git a/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 b/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 new file mode 100644 index 00000000..89d9d21f --- /dev/null +++ b/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 @@ -0,0 +1,34 @@ +Confirm whether the user has finished the multi-level selection, based on the screenshot, the user details, the HTML elements and the select history provided below. + +Reply in JSON format with the following keys: +{ + "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your judgment should be based on the current page information. + "think": str, // Think step by step. Explain whether you think the user has finished the multi-level selection, and why. + "confidence_float": float, // Your confidence in this judgment. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. + "is_finished": bool, // True if the user has finished the multi-level selection, False otherwise.
+} + +User goal: +``` +{{ navigation_goal }} +``` + +User details: +``` +{{ navigation_payload_str }} +``` + +HTML elements: +``` +{{ elements }} +``` + +Select History: +``` +{{ select_history }} +``` + +Current datetime, ISO format: +``` +{{ local_datetime }} +``` \ No newline at end of file diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 3465cc0e..c6356241 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -161,7 +161,7 @@ def check_disappeared_element_id_in_incremental_factory( incre_page=incremental_scraped, element_id=element_id ) except Exception: - LOG.info( + LOG.debug( "Failed to create skyvern element, going to drop the element from incremental tree", exc_info=True, element_id=element_id, @@ -681,23 +681,25 @@ async def handle_input_text_action( ) await skyvern_element.scroll_into_view() finally: - blocking_element, exist = await skyvern_element.find_blocking_element( - dom=dom, incremental_page=incremental_scraped - ) - if blocking_element and exist: - LOG.info( - "Find a blocking element to the current element, going to blur the blocking element first", - task_id=task.task_id, - step_id=step.step_id, - blocking_element=blocking_element.get_locator(), + if await skyvern_element.is_visible(): + blocking_element, exist = await skyvern_element.find_blocking_element( + dom=dom, incremental_page=incremental_scraped ) - if await blocking_element.get_locator().count(): - await blocking_element.press_key("Escape") - if await blocking_element.get_locator().count(): - await blocking_element.blur() + if blocking_element and exist: + LOG.info( + "Find a blocking element to the current element, going to blur the blocking element first", + task_id=task.task_id, + step_id=step.step_id, + blocking_element=blocking_element.get_locator(), + ) + if await blocking_element.get_locator().count(): + await blocking_element.press_key("Escape") + if await blocking_element.get_locator().count(): + 
await blocking_element.blur() - await skyvern_element.press_key("Escape") - await skyvern_element.blur() + if await skyvern_element.is_visible(): + await skyvern_element.press_key("Escape") + await skyvern_element.blur() await incremental_scraped.stop_listen_dom_increment() # force to move focus back to the element @@ -1098,6 +1100,7 @@ async def handle_select_option_action( except Exception: LOG.info( "fail to open dropdown by clicking, try to press ArrowDown to open", + exc_info=True, element_id=skyvern_element.get_id(), task_id=task.task_id, step_id=step.step_id, @@ -1154,7 +1157,12 @@ async def handle_select_option_action( results.append(ActionFailure(exception=e)) return results finally: - if is_open and len(results) > 0 and not isinstance(results[-1], ActionSuccess): + if ( + await skyvern_element.is_visible() + and is_open + and len(results) > 0 + and not isinstance(results[-1], ActionSuccess) + ): await skyvern_element.scroll_into_view() await skyvern_element.coordinate_click(page=page) await skyvern_element.press_key("Escape") @@ -1207,11 +1215,16 @@ async def handle_select_option_action( return results finally: - if is_open and len(results) > 0 and not isinstance(results[-1], ActionSuccess): + if ( + await skyvern_element.is_visible() + and is_open + and len(results) > 0 + and not isinstance(results[-1], ActionSuccess) + ): await skyvern_element.scroll_into_view() await skyvern_element.coordinate_click(page=page) await skyvern_element.press_key("Escape") - + is_open = False await skyvern_element.blur() await incremental_scraped.stop_listen_dom_increment() @@ -2013,6 +2026,23 @@ async def sequentially_select_from_dropdown( ) return single_select_result.action_result, values[-1] if len(values) > 0 else None + # it's for typing. 
it's been verified in `single_select_result.is_done()` + assert single_select_result.dropdown_menu is not None + screenshot = await single_select_result.dropdown_menu.get_locator().screenshot( + timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS + ) + prompt = prompt_engine.load_prompt( + "confirm-multi-selection-finish", + navigation_goal=task.navigation_goal, + navigation_payload_str=json.dumps(task.navigation_payload), + elements="".join(json_to_html(element) for element in secondary_increment_element), + select_history=json.dumps(build_sequential_select_history(select_history)), + local_datetime=datetime.now(ensure_context().tz_info).isoformat(), + ) + json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, screenshots=[screenshot], step=step) + if json_response.get("is_finished", False): + return single_select_result.action_result, values[-1] if len(values) > 0 else None + return select_history[-1].action_result if len(select_history) > 0 else None, values[-1] if len( values ) > 0 else None @@ -2292,6 +2322,9 @@ async def locate_dropdown_menu( step: Step, task: Task, ) -> SkyvernElement | None: + if not await current_anchor_element.is_visible(): + return None + skyvern_frame = incremental_scraped.skyvern_frame for idx, element_dict in enumerate(incremental_scraped.element_tree): diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 18d483d8..784d2373 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -2044,25 +2044,60 @@ function isClassNameIncludesHidden(className) { return className.toLowerCase().includes("hide"); } -function addIncrementalNodeToMap(parentNode, childrenNode) { - // calculate the depth of targetNode element for sorting - const depth = getElementDomDepth(parentNode); - let newNodesTreeList = []; - if (window.globalDomDepthMap.has(depth)) { - newNodesTreeList = window.globalDomDepthMap.get(depth); +function waitForNextFrame() { + return new Promise((resolve) => 
{ + requestAnimationFrame(() => resolve()); + }); +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +class SafeCounter { + constructor() { + this.value = 0; + this.lock = Promise.resolve(); } - for (const child of childrenNode) { - const [_, newNodeTree] = buildElementTree(child, "", true); - if (newNodeTree.length > 0) { - newNodesTreeList.push(...newNodeTree); - } + async add() { + await this.lock; + this.lock = new Promise((resolve) => { + this.value += 1; + resolve(); + }); } - window.globalDomDepthMap.set(depth, newNodesTreeList); + + async get() { + await this.lock; + return this.value; + } +} + +async function addIncrementalNodeToMap(parentNode, childrenNode) { + // make the dom parser async + await waitForNextFrame(); + if (window.globalListnerFlag) { + // calculate the depth of targetNode element for sorting + const depth = getElementDomDepth(parentNode); + let newNodesTreeList = []; + if (window.globalDomDepthMap.has(depth)) { + newNodesTreeList = window.globalDomDepthMap.get(depth); + } + + for (const child of childrenNode) { + const [_, newNodeTree] = buildElementTree(child, "", true); + if (newNodeTree.length > 0) { + newNodesTreeList.push(...newNodeTree); + } + } + window.globalDomDepthMap.set(depth, newNodesTreeList); + } + await window.globalParsedElementCounter.add(); } if (window.globalObserverForDOMIncrement === undefined) { - window.globalObserverForDOMIncrement = new MutationObserver(function ( + window.globalObserverForDOMIncrement = new MutationObserver(async function ( mutationsList, observer, ) { @@ -2076,13 +2111,14 @@ if (window.globalObserverForDOMIncrement === undefined) { targetNode: node, newNodes: [node], }); - addIncrementalNodeToMap(node, [node]); + await addIncrementalNodeToMap(node, [node]); } } if (mutation.attributeName === "style") { // TODO: need to confirm that elemnent is hidden previously const node = mutation.target; if (node.nodeType === Node.TEXT_NODE) continue; + if 
(node.tagName.toLowerCase() === "body") continue; const newStyle = getElementComputedStyle(node); const newDisplay = newStyle?.display; if (newDisplay !== "none") { @@ -2090,7 +2126,7 @@ if (window.globalObserverForDOMIncrement === undefined) { targetNode: node, newNodes: [node], }); - addIncrementalNodeToMap(node, [node]); + await addIncrementalNodeToMap(node, [node]); } } if (mutation.attributeName === "class") { @@ -2110,7 +2146,7 @@ if (window.globalObserverForDOMIncrement === undefined) { targetNode: node, newNodes: [node], }); - addIncrementalNodeToMap(node, [node]); + await addIncrementalNodeToMap(node, [node]); } } } @@ -2122,26 +2158,30 @@ if (window.globalObserverForDOMIncrement === undefined) { targetNode: node, // TODO: for future usage, when we want to parse new elements into a tree }; let newNodes = []; - if ( - node.tagName.toLowerCase() === "ul" || - (node.tagName.toLowerCase() === "div" && - node.hasAttribute("role") && - node.getAttribute("role").toLowerCase() === "listbox") - ) { - newNodes.push(node); - } else { - if (mutation.addedNodes && mutation.addedNodes.length > 0) { - for (const node of mutation.addedNodes) { - // skip the text nodes, they won't be interactable - if (node.nodeType === Node.TEXT_NODE) continue; - newNodes.push(node); - } + if (mutation.addedNodes && mutation.addedNodes.length > 0) { + for (const node of mutation.addedNodes) { + // skip the text nodes, they won't be interactable + if (node.nodeType === Node.TEXT_NODE) continue; + newNodes.push(node); } } + if ( + newNodes.length == 0 && + (node.tagName.toLowerCase() === "ul" || + (node.tagName.toLowerCase() === "div" && + node.hasAttribute("role") && + node.getAttribute("role").toLowerCase() === "listbox")) + ) { + newNodes.push(node); + } + if (newNodes.length > 0) { changedNode.newNodes = newNodes; window.globalOneTimeIncrementElements.push(changedNode); - addIncrementalNodeToMap(changedNode.targetNode, changedNode.newNodes); + await addIncrementalNodeToMap( + 
changedNode.targetNode, + changedNode.newNodes, + ); } } } @@ -2149,8 +2189,10 @@ if (window.globalObserverForDOMIncrement === undefined) { } function startGlobalIncrementalObserver() { + window.globalListnerFlag = true; window.globalDomDepthMap = new Map(); window.globalOneTimeIncrementElements = []; + window.globalParsedElementCounter = new SafeCounter(); window.globalObserverForDOMIncrement.takeRecords(); // cleanup the older data window.globalObserverForDOMIncrement.observe(document.body, { attributes: true, @@ -2161,14 +2203,28 @@ function startGlobalIncrementalObserver() { }); } -function stopGlobalIncrementalObserver() { - window.globalDomDepthMap = new Map(); +async function stopGlobalIncrementalObserver() { + window.globalListnerFlag = false; window.globalObserverForDOMIncrement.disconnect(); window.globalObserverForDOMIncrement.takeRecords(); // cleanup the older data + while ( + (await window.globalParsedElementCounter.get()) < + window.globalOneTimeIncrementElements.length + ) { + await sleep(100); + } window.globalOneTimeIncrementElements = []; + window.globalDomDepthMap = new Map(); } -function getIncrementElements() { +async function getIncrementElements() { + while ( + (await window.globalParsedElementCounter.get()) < + window.globalOneTimeIncrementElements.length + ) { + await sleep(100); + } + // cleanup the chidren tree, remove the duplicated element // search starting from the shallowest node: // 1. if deeper, the node could only be the children of the shallower one or no related one. 
diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index fce2d06d..84976cc3 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -554,7 +554,7 @@ class IncrementalScrapePage: ) -> list[dict]: frame = self.skyvern_frame.get_frame() - js_script = "() => getIncrementElements()" + js_script = "async () => await getIncrementElements()" incremental_elements, incremental_tree = await SkyvernFrame.evaluate( frame=frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS ) @@ -580,8 +580,10 @@ class IncrementalScrapePage: js_script = "() => window.globalObserverForDOMIncrement === undefined" if await SkyvernFrame.evaluate(frame=self.skyvern_frame.get_frame(), expression=js_script): return - js_script = "() => stopGlobalIncrementalObserver()" - await SkyvernFrame.evaluate(frame=self.skyvern_frame.get_frame(), expression=js_script) + js_script = "async () => await stopGlobalIncrementalObserver()" + await SkyvernFrame.evaluate( + frame=self.skyvern_frame.get_frame(), expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS + ) async def get_incremental_elements_num(self) -> int: js_script = "() => window.globalOneTimeIncrementElements.length" diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 2dc0838c..7d90010b 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -107,11 +107,11 @@ class SkyvernElement: num_elements = await locator.count() if num_elements < 1: - LOG.warning("No elements found with css. Validation failed.", css=css_selector, element_id=element_id) + LOG.debug("No elements found with css. Validation failed.", css=css_selector, element_id=element_id) raise MissingElement(selector=css_selector, element_id=element_id) elif num_elements > 1: - LOG.warning( + LOG.debug( "Multiple elements found with css. Expected 1. 
Validation failed.", num_elements=num_elements, selector=css_selector, @@ -584,13 +584,17 @@ class SkyvernElement: await page.mouse.click(click_x, click_y) async def blur(self) -> None: + if not await self.is_visible(): + return await SkyvernFrame.evaluate( frame=self.get_frame(), expression="(element) => element.blur()", arg=await self.get_element_handler() ) async def scroll_into_view(self, timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS) -> None: - element_handler = await self.get_element_handler(timeout=timeout) + if not await self.is_visible(): + return try: + element_handler = await self.get_element_handler(timeout=timeout) await element_handler.scroll_into_view_if_needed(timeout=timeout) except TimeoutError: LOG.info(