From d3ea8ef85bbebaa914b5067f2d6033908127bdc0 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Tue, 13 May 2025 11:11:16 -0700 Subject: [PATCH] generate element xpath (#2335) --- skyvern/webeye/scraper/domUtils.js | 62 +++++++++++++++++++++++++++--- skyvern/webeye/utils/dom.py | 17 +++++++- 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 0ae577bd..3aa87a48 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -1421,7 +1421,12 @@ async function buildElementTree( return []; } } - async function processElement(element, parentId) { + async function processElement( + element, + parentId, + parent_xpath, + current_node_index, + ) { if (element === null) { _jsConsoleLog("get a null element"); return; @@ -1438,6 +1443,20 @@ async function buildElementTree( return; } + let current_xpath = null; + if (parent_xpath) { + // ignore the namespace, otherwise the xpath sometimes won't find anything, specially for SVG elements + current_xpath = + parent_xpath + + "/" + + '*[name()="' + + tagName + + '"]' + + "[" + + current_node_index + + "]"; + } + // if element is an "a" tag and has a target="_blank" attribute, remove the target attribute // We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab if (tagName === "a") { @@ -1446,10 +1465,10 @@ async function buildElementTree( } } - let children = []; + let shadowDOMchildren = []; // sometimes the shadowRoot is not visible, but the elemnets in the shadowRoot are visible if (element.shadowRoot) { - children = getChildElements(element.shadowRoot); + shadowDOMchildren = getChildElements(element.shadowRoot); } const isVisible = isElementVisible(element); if (isVisible && !isHidden(element) && !isScriptOrStyle(element)) { @@ -1500,6 +1519,7 @@ async function buildElementTree( } if (elementObj) { + elementObj.xpath = current_xpath; elements.push(elementObj); // If the element is interactable but has no interactable parent, // then it starts a new tree, so add it to the result array @@ -1520,10 +1540,35 @@ async function buildElementTree( } } - children = children.concat(getChildElements(element)); + const children = getChildElements(element); + const xpathMap = new Map(); + for (let i = 0; i < children.length; i++) { const childElement = children[i]; - await processElement(childElement, parentId); + const tagName = childElement?.tagName?.toLowerCase(); + if (!tagName) { + _jsConsoleLog("get a null tagName"); + continue; + } + let current_node_index = xpathMap.get(tagName); + if (current_node_index == undefined) { + current_node_index = 1; + } else { + current_node_index = current_node_index + 1; + } + xpathMap.set(tagName, current_node_index); + await processElement( + childElement, + parentId, + current_xpath, + current_node_index, + ); + } + + // FIXME: xpath won't work when the element is in shadow DOM + for (let i = 0; i < shadowDOMchildren.length; i++) { + const childElement = shadowDOMchildren[i]; + await processElement(childElement, parentId, null, 0); } return; } @@ -1732,8 +1777,13 @@ async function buildElementTree( return trimmedResults; }; + let current_xpath = null; + if (starter === document.body) { + current_xpath = "/html[1]"; + } + // setup before parsing the dom - await processElement(starter, null); + await processElement(starter, null, current_xpath, 1); for (var element of elements) { if ( diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index b2adf6b0..d6fed140 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -93,6 +93,7 @@ class SkyvernElement: When you try to interact with these elements by python, you are supposed to use this class as an interface. """ + # TODO: support to create SkyvernElement from incremental page by xpath @classmethod async def create_from_incremental(cls, incre_page: IncrementalScrapePage, element_id: str) -> SkyvernElement: element_dict = incre_page.id_to_element_dict.get(element_id) @@ -773,8 +774,20 @@ class DomUtil: num_elements = await locator.count() if num_elements < 1: - LOG.warning("No elements found with css. Validation failed.", css=css, element_id=element_id) - raise MissingElement(selector=css, element_id=element_id) + xpath: str | None = element.get("xpath") + if not xpath: + LOG.warning("No elements found with css. Validation failed.", css=css, element_id=element_id) + raise MissingElement(selector=css, element_id=element_id) + else: + # WARNING: current xpath is based on the tag name. + # It can only represent the element possition in the DOM tree with tag name, it's not 100% reliable. + # As long as the current possition has the same element with the tag name, the locator can be found. + # (maybe) we should validate the element hash to make sure the element is the same? + LOG.warning("Fallback to locator element by xpath.", xpath=xpath, element_id=element_id) + locator = frame_content.locator(f"xpath={xpath}") + num_elements = await locator.count() + if num_elements < 1: + raise MissingElement(selector=xpath, element_id=element_id) elif num_elements > 1: LOG.warning(