From d107c3d4db53035fbc5043c8ca6226e426c4c4a0 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Thu, 14 Nov 2024 02:33:44 +0800 Subject: [PATCH] refactor chain click (#1186) --- skyvern/exceptions.py | 4 +- skyvern/webeye/actions/handler.py | 118 ++++++++++---------- skyvern/webeye/scraper/domUtils.js | 171 ++++++++++++++++++++++++++++- skyvern/webeye/utils/dom.py | 43 ++++++++ skyvern/webeye/utils/page.py | 12 ++ 5 files changed, 280 insertions(+), 68 deletions(-) diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 61b52720..a599559f 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -357,8 +357,8 @@ class InputActionOnSelect2Dropdown(SkyvernException): class FailToClick(SkyvernException): - def __init__(self, element_id: str, anchor: str = "self"): - super().__init__(f"Failed to click({anchor}). element_id={element_id}") + def __init__(self, element_id: str, msg: str, anchor: str = "self"): + super().__init__(f"Failed to click({anchor}). element_id={element_id}, error_msg={msg}") class FailToSelectByLabel(SkyvernException): diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 613896ab..376185f7 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -1138,14 +1138,15 @@ async def chain_click( LOG.info("Chain click: main element click succeeded", action=action, locator=locator) return [ActionSuccess()] - except Exception: - action_results: list[ActionResult] = [ActionFailure(FailToClick(action.element_id))] + except Exception as e: + action_results: list[ActionResult] = [ActionFailure(FailToClick(action.element_id, msg=str(e)))] if skyvern_element.get_tag_name() == "label": LOG.info( "Chain click: it's a label element. going to try for-click", task_id=task.task_id, action=action, + element=str(skyvern_element), locator=locator, ) try: @@ -1155,45 +1156,71 @@ async def chain_click( await bound_element.get_locator().click(timeout=timeout) action_results.append(ActionSuccess()) return action_results - except Exception: - action_results.append(ActionFailure(FailToClick(action.element_id, anchor="for"))) - - if skyvern_element.get_tag_name() == InteractiveElement.INPUT: + except Exception as e: + action_results.append(ActionFailure(FailToClick(action.element_id, anchor="for", msg=str(e)))) + else: LOG.info( - "Chain click: it's an input element. going to try sibling click", + "Chain click: it's a non-label element. going to find the bound label element by attribute id and click", task_id=task.task_id, action=action, + element=str(skyvern_element), locator=locator, ) - sibling_action_result = await click_sibling_of_input(locator, timeout=timeout) - action_results.append(sibling_action_result) - if isinstance(sibling_action_result, ActionSuccess): - return action_results + try: + if bound_locator := await skyvern_element.find_bound_label_by_attr_id(): + await bound_locator.click(timeout=timeout) + action_results.append(ActionSuccess()) + return action_results + except Exception as e: + action_results.append(ActionFailure(FailToClick(action.element_id, anchor="attr_id", msg=str(e)))) + + if not await skyvern_element.is_visible(): + LOG.info( + "Chain click: exit since the element is not visible on the page anymore", + taks_id=task.task_id, + action=action, + element=str(skyvern_element), + locator=locator, + ) + return action_results + + blocking_element = await skyvern_element.find_blocking_element( + dom=DomUtil(scraped_page=scraped_page, page=page) + ) + if blocking_element is None: + LOG.info( + "Chain click: exit since the element is not blocking by any skyvern element", + taks_id=task.task_id, + action=action, + element=str(skyvern_element), + locator=locator, + ) + return action_results try: - parent_locator = locator.locator("..") - await parent_locator.click(timeout=timeout) - - LOG.info( - "Chain click: successfully clicked parent element", + LOG.debug( + "Chain click: verifying the blocking element is parent or sibling of the target element", + taks_id=task.task_id, action=action, - parent_locator=parent_locator, + element=str(blocking_element), + locator=locator, ) - action_results.append(ActionSuccess(interacted_with_parent=True)) - except Exception: - LOG.warning( - "Failed to click parent element", - action=action, - parent_locator=parent_locator, - exc_info=True, - ) - action_results.append( - ActionFailure( - FailToClick(action.element_id, anchor="parent"), - interacted_with_parent=True, + if await blocking_element.is_parent_of( + await skyvern_element.get_element_handler() + ) or await blocking_element.is_sibling_of(await skyvern_element.get_element_handler()): + LOG.info( + "Chain click: element is blocked by other elements, going to click on the blocking element", + taks_id=task.task_id, + action=action, + element=str(blocking_element), + locator=locator, ) - ) - # We don't raise exception here because we do log the exception, and return ActionFailure as the last action + + await blocking_element.get_locator().click(timeout=timeout) + action_results.append(ActionSuccess()) + return action_results + except Exception as e: + action_results.append(ActionFailure(FailToClick(action.element_id, anchor="blocking_element", msg=str(e)))) return action_results finally: @@ -2235,35 +2262,6 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str return None -async def click_sibling_of_input( - locator: Locator, - timeout: int, -) -> ActionResult: - try: - input_element = locator.first - parent_locator = locator.locator("..") - if input_element: - input_id = await input_element.get_attribute("id") - sibling_label_css = f'label[for="{input_id}"]' - label_locator = parent_locator.locator(sibling_label_css) - await label_locator.click(timeout=timeout) - LOG.info( - "Successfully clicked sibling label of input element", - sibling_label_css=sibling_label_css, - ) - return ActionSuccess(interacted_with_sibling=True) - # Should never get here - return ActionFailure( - exception=Exception("Failed while trying to click sibling of input element"), - interacted_with_sibling=True, - ) - except Exception: - LOG.warning("Failed to click sibling label of input element", exc_info=True) - return ActionFailure( - exception=Exception("Failed while trying to click sibling of input element"), - ) - - async def extract_information_for_navigation_goal( task: Task, step: Step, diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index d555ef11..9948880d 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -1,3 +1,6 @@ +// we only use chromium browser for now +let browserNameForWorkarounds = "chromium"; + // Commands for manipulating rects. // Want to debug this? Run chromium, go to sources, and create a new snippet with the code in domUtils.js class Rect { @@ -197,14 +200,31 @@ function getElementComputedStyle(element, pseudo) { : undefined; } -// from playwright +// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L76-L98 function isElementStyleVisibilityVisible(element, style) { style = style ?? getElementComputedStyle(element); if (!style) return true; + // Element.checkVisibility checks for content-visibility and also looks at + // styles up the flat tree including user-agent ShadowRoots, such as the + // details element for example. + // All the browser implement it, but WebKit has a bug which prevents us from using it: + // https://bugs.webkit.org/show_bug.cgi?id=264733 + // @ts-ignore if ( - !element.checkVisibility({ checkOpacity: false, checkVisibilityCSS: false }) - ) - return false; + Element.prototype.checkVisibility && + browserNameForWorkarounds !== "webkit" + ) { + if (!element.checkVisibility()) return false; + } else { + // Manual workaround for WebKit that does not have checkVisibility. + const detailsOrSummary = element.closest("details,summary"); + if ( + detailsOrSummary !== element && + detailsOrSummary?.nodeName === "DETAILS" && + !detailsOrSummary.open + ) + return false; + } if (style.visibility !== "visible") return false; // TODO: support style.clipPath and style.clipRule? @@ -220,7 +240,7 @@ function hasASPClientControl() { return typeof ASPxClientControl !== "undefined"; } -// from playwright +// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L100-L119 function isElementVisible(element) { // TODO: This is a hack to not check visibility for option elements // because they are not visible by default. We check their parent instead for visibility. @@ -249,7 +269,8 @@ function isElementVisible(element) { isElementVisible(child) ) return true; - // skipping other nodes including text + if (child.nodeType === 3 /* Node.TEXT_NODE */ && isVisibleTextNode(child)) + return true; } return false; } @@ -273,6 +294,144 @@ function isElementVisible(element) { return true; } +// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L121-L127 +function isVisibleTextNode(node) { + // https://stackoverflow.com/questions/1461059/is-there-an-equivalent-to-getboundingclientrect-for-text-nodes + const range = node.ownerDocument.createRange(); + range.selectNode(node); + const rect = range.getBoundingClientRect(); + if (rect.width <= 0 || rect.height <= 0) { + return false; + } + + // if the center point of the element is not in the page, we tag it as an non-interactable element + // FIXME: sometimes there could be an overflow element blocking the default scrolling, making Y coordinate be wrong. So we currently only check for X + const center_x = (rect.left + rect.width) / 2 + window.scrollX; + if (center_x < 0) { + return false; + } + // const center_y = (rect.top + rect.height) / 2 + window.scrollY; + // if (center_x < 0 || center_y < 0) { + // return false; + // } + return true; +} + +// from playwright: https://github.com/microsoft/playwright/blob/d685763c491e06be38d05675ef529f5c230388bb/packages/playwright-core/src/server/injected/domUtils.ts#L37-L44 +function parentElementOrShadowHost(element) { + if (element.parentElement) return element.parentElement; + if (!element.parentNode) return; + if ( + element.parentNode.nodeType === 11 /* Node.DOCUMENT_FRAGMENT_NODE */ && + element.parentNode.host + ) + return element.parentNode.host; +} + +// from playwright: https://github.com/microsoft/playwright/blob/d685763c491e06be38d05675ef529f5c230388bb/packages/playwright-core/src/server/injected/domUtils.ts#L46-L52 +function enclosingShadowRootOrDocument(element) { + let node = element; + while (node.parentNode) node = node.parentNode; + if ( + node.nodeType === 11 /* Node.DOCUMENT_FRAGMENT_NODE */ || + node.nodeType === 9 /* Node.DOCUMENT_NODE */ + ) + return node; +} + +// from playwright: https://github.com/microsoft/playwright/blob/d685763c491e06be38d05675ef529f5c230388bb/packages/playwright-core/src/server/injected/injectedScript.ts#L799-L859 +function expectHitTarget(hitPoint, targetElement) { + const roots = []; + + // Get all component roots leading to the target element. + // Go from the bottom to the top to make it work with closed shadow roots. + let parentElement = targetElement; + while (parentElement) { + const root = enclosingShadowRootOrDocument(parentElement); + if (!root) break; + roots.push(root); + if (root.nodeType === 9 /* Node.DOCUMENT_NODE */) break; + parentElement = root.host; + } + + // Hit target in each component root should point to the next component root. + // Hit target in the last component root should point to the target or its descendant. + let hitElement; + for (let index = roots.length - 1; index >= 0; index--) { + const root = roots[index]; + // All browsers have different behavior around elementFromPoint and elementsFromPoint. + // https://github.com/w3c/csswg-drafts/issues/556 + // http://crbug.com/1188919 + const elements = root.elementsFromPoint(hitPoint.x, hitPoint.y); + const singleElement = root.elementFromPoint(hitPoint.x, hitPoint.y); + if ( + singleElement && + elements[0] && + parentElementOrShadowHost(singleElement) === elements[0] + ) { + const style = window.getComputedStyle(singleElement); + if (style?.display === "contents") { + // Workaround a case where elementsFromPoint misses the inner-most element with display:contents. + // https://bugs.chromium.org/p/chromium/issues/detail?id=1342092 + elements.unshift(singleElement); + } + } + if ( + elements[0] && + elements[0].shadowRoot === root && + elements[1] === singleElement + ) { + // Workaround webkit but where first two elements are swapped: + // + // #shadow root + // + // elementsFromPoint produces [, ], while it should be [, ] + // In this case, just ignore . + elements.shift(); + } + const innerElement = elements[0]; + if (!innerElement) break; + hitElement = innerElement; + if (index && innerElement !== roots[index - 1].host) break; + } + + // Check whether hit target is the target or its descendant. + const hitParents = []; + while (hitElement && hitElement !== targetElement) { + hitParents.push(hitElement); + hitElement = parentElementOrShadowHost(hitElement); + } + if (hitElement === targetElement) return null; + + return hitParents[0] || document.documentElement; +} + +function isParent(parent, child) { + return parent.contains(child); +} + +function isSibling(el1, el2) { + return el1.parentElement === el2.parentElement; +} + +function getBlockElementUniqueID(element) { + const rect = element.getBoundingClientRect(); + + const hitElement = expectHitTarget( + { + x: rect.left + rect.width / 2, + y: rect.top + rect.height / 2, + }, + element, + ); + + if (!hitElement) { + return ""; + } + + return hitElement.getAttribute("unique_id") ?? ""; +} + function isHidden(element) { const style = getElementComputedStyle(element); if (style?.display === "none") { diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 467bbd48..a28ce39e 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -127,6 +127,9 @@ class SkyvernElement: self.locator = locator self.hash_value = hash_value + def __repr__(self) -> str: + return f"SkyvernElement({str(self.__static_element)})" + def build_HTML(self, need_trim_element: bool = True, need_skyvern_attrs: bool = True) -> str: element_dict = self.get_element_dict() if need_trim_element: @@ -263,6 +266,22 @@ class SkyvernElement: async def is_selectable(self) -> bool: return self.get_selectable() or self.get_tag_name() in SELECTABLE_ELEMENT + async def is_visible(self) -> bool: + skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) + return await skyvern_frame.get_element_visible(await self.get_element_handler()) + + async def is_parent_of(self, target: ElementHandle) -> bool: + skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) + return await skyvern_frame.is_parent(await self.get_element_handler(), target) + + async def is_child_of(self, target: ElementHandle) -> bool: + skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) + return await skyvern_frame.is_parent(target, await self.get_element_handler()) + + async def is_sibling_of(self, target: ElementHandle) -> bool: + skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) + return await skyvern_frame.is_sibling(await self.get_element_handler(), target) + def get_element_dict(self) -> dict: return self.__static_element @@ -301,6 +320,13 @@ class SkyvernElement: assert handler is not None return handler + async def find_blocking_element(self, dom: DomUtil) -> SkyvernElement | None: + skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) + blocking_element_id = await skyvern_frame.get_blocking_element_id(await self.get_element_handler()) + if not blocking_element_id: + return None + return await dom.get_skyvern_element_by_id(blocking_element_id) + def find_element_id_in_label_children(self, element_type: InteractiveElement) -> str | None: tag_name = self.get_tag_name() if tag_name != "label": @@ -354,6 +380,23 @@ class SkyvernElement: return await dom.get_skyvern_element_by_id(unique_id) + async def find_bound_label_by_attr_id( + self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS + ) -> Locator | None: + if self.get_tag_name() == "label": + return None + + element_id: str = await self.get_attr("id", timeout=timeout) + if not element_id: + return None + + locator = self.get_frame().locator(f"label[for='{element_id}']") + cnt = await locator.count() + if cnt == 1: + return locator + + return None + async def find_selectable_child(self, dom: DomUtil) -> SkyvernElement | None: # BFS to find the first selectable child index = 0 diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 44924f52..85dbf803 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -182,6 +182,10 @@ class SkyvernFrame: js_script = "(element) => checkDisabledFromStyle(element)" return await self.evaluate(frame=self.frame, expression=js_script, arg=element) + async def get_blocking_element_id(self, element: ElementHandle) -> str: + js_script = "(element) => getBlockElementUniqueID(element)" + return await self.evaluate(frame=self.frame, expression=js_script, arg=element) + async def scroll_to_top(self, draw_boxes: bool) -> float: """ Scroll to the top of the page and take a screenshot. @@ -224,6 +228,14 @@ class SkyvernFrame: js_script = "() => isWindowScrollable()" return await self.evaluate(frame=self.frame, expression=js_script) + async def is_parent(self, parent: ElementHandle, child: ElementHandle) -> bool: + js_script = "([parent, child]) => isParent(parent, child)" + return await self.evaluate(frame=self.frame, expression=js_script, arg=[parent, child]) + + async def is_sibling(self, el1: ElementHandle, el2: ElementHandle) -> bool: + js_script = "([el1, el2]) => isSibling(el1, el2)" + return await self.evaluate(frame=self.frame, expression=js_script, arg=[el1, el2]) + async def has_ASP_client_control(self) -> bool: js_script = "() => hasASPClientControl()" return await self.evaluate(frame=self.frame, expression=js_script)