From 257ba1601e8badfc6216485d246b06b78eb5a57b Mon Sep 17 00:00:00 2001 From: Kerem Yilmaz Date: Mon, 1 Jul 2024 21:24:52 -0700 Subject: [PATCH] Decorate bounding boxes with element_ids to improve Skyvern accuracy (+ a few more changes) (#536) --- skyvern/exceptions.py | 7 ++- skyvern/forge/agent.py | 2 +- .../skyvern/extract-action-claude3-sonnet.j2 | 1 + .../forge/prompts/skyvern/extract-action.j2 | 1 + skyvern/webeye/actions/handler.py | 17 ++++--- skyvern/webeye/scraper/domUtils.js | 51 ++++++++++++++----- skyvern/webeye/scraper/scraper.py | 2 + 7 files changed, 59 insertions(+), 22 deletions(-) diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 90ff7ba1..034c8a21 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -310,7 +310,7 @@ class ElementIsNotLabel(SkyvernException): class MissingElementDict(SkyvernException): def __init__(self, element_id: str) -> None: - super().__init__(f"Found no element in the dict. element_id={element_id}") + super().__init__(f"Invalid element id. element_id={element_id}") class MissingElementInIframe(SkyvernException): @@ -325,6 +325,11 @@ class InputActionOnSelect2Dropdown(SkyvernException): ) +class FailToClick(SkyvernException): + def __init__(self, element_id: str): + super().__init__(f"Failed to click. element_id={element_id}") + + class FailToSelectByLabel(SkyvernException): def __init__(self, element_id: str): super().__init__(f"Failed to select by label. element_id={element_id}") diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index efdae702..00ab956f 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -930,7 +930,7 @@ class ForgeAgent: LOG.info("Using Claude3 Sonnet prompt template for action extraction") prompt_template = "extract-action-claude3-sonnet" - # TODO: we only use HTML element for now, introduce a way to swith in the future + # TODO: we only use HTML element for now, introduce a way to switch in the future element_tree_format = ElementTreeFormat.HTML LOG.info( "Building element tree", diff --git a/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 b/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 index 02fdf395..dc12126e 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 @@ -9,6 +9,7 @@ If you see a popup in the page screenshot, prioritize actions on the popup. Reply in JSON format with the following keys: { + "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in "actions": array // An array of actions. Here's the format of each action: [{ "reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point. diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index a9351b06..f06a2780 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -9,6 +9,7 @@ If you see a popup in the page screenshot, prioritize actions on the popup. Reply in JSON format with the following keys: { + "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in "actions": array // An array of actions. Here's the format of each action: [{ "reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point. diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 86f28b91..6f01b08d 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -11,6 +11,7 @@ from playwright.async_api import Locator, Page, TimeoutError from skyvern.constants import INPUT_TEXT_TIMEOUT, REPO_ROOT_DIR from skyvern.exceptions import ( EmptySelect, + FailToClick, FailToSelectByIndex, FailToSelectByLabel, FailToSelectByValue, @@ -789,10 +790,11 @@ async def chain_click( javascript_triggered=javascript_triggered, ) ] - except Exception as e: + + except Exception: action_results: list[ActionResult] = [ ActionFailure( - e, + FailToClick(action.element_id), javascript_triggered=javascript_triggered, ) ] @@ -826,7 +828,7 @@ async def chain_click( interacted_with_parent=True, ) ) - except Exception as pe: + except Exception: LOG.warning( "Failed to click parent element", action=action, @@ -835,7 +837,7 @@ async def chain_click( ) action_results.append( ActionFailure( - pe, + FailToClick(action.element_id), javascript_triggered=javascript_triggered, interacted_with_parent=True, ) @@ -1073,9 +1075,12 @@ async def click_sibling_of_input( javascript_triggered=javascript_triggered, interacted_with_sibling=True, ) - except Exception as e: + except Exception: LOG.warning("Failed to click sibling label of input element", exc_info=True) - return ActionFailure(exception=e, javascript_triggered=javascript_triggered) + return ActionFailure( + exception=Exception("Failed while trying to click sibling of input element"), + javascript_triggered=javascript_triggered, + ) async def extract_information_for_navigation_goal( diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 74469c76..e6676733 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -213,7 +213,10 @@ function isElementStyleVisibilityVisible(element, style) { function isElementVisible(element) { // TODO: This is a hack to not check visibility for option elements // because they are not visible by default. We check their parent instead for visibility. - if (element.tagName.toLowerCase() === "option") + if ( + element.tagName.toLowerCase() === "option" || + (element.tagName.toLowerCase() === "input" && element.type === "radio") + ) return element.parentElement && isElementVisible(element.parentElement); if (element.className.toString().includes("select2-offscreen")) { @@ -1088,7 +1091,11 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) { const labelElement = document.querySelector( element.tagName + '[unique_id="' + element.id + '"]', ); - if (labelElement && labelElement.childElementCount === 0) { + if ( + labelElement && + labelElement.childElementCount === 0 && + !labelElement.getAttribute("for") + ) { continue; } } @@ -1234,15 +1241,30 @@ function createHintMarkersForGroups(groups) { return []; } - const hintMarkers = groups.map((group) => createHintMarkerForGroup(group)); - + const hintMarkers = groups + .filter((group) => group.elements.some((element) => element.interactable)) + .map((group) => createHintMarkerForGroup(group)); // fill in marker text - const hintStrings = generateHintStrings(hintMarkers.length); + // const hintStrings = generateHintStrings(hintMarkers.length); for (let i = 0; i < hintMarkers.length; i++) { const hintMarker = hintMarkers[i]; - hintMarker.hintString = hintStrings[i]; + + let interactableElementFound = false; + + for (let i = 0; i < hintMarker.group.elements.length; i++) { + if (hintMarker.group.elements[i].interactable) { + hintMarker.hintString = hintMarker.group.elements[i].id; + interactableElementFound = true; + break; + } + } + + if (!interactableElementFound) { + hintMarker.hintString = ""; + } + try { - hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase(); + hintMarker.element.innerHTML = hintMarker.hintString; } catch (e) { // Ensure trustedTypes is available if (typeof trustedTypes !== "undefined") { @@ -1262,11 +1284,16 @@ function createHintMarkersForGroups(groups) { } function createHintMarkerForGroup(group) { + // Calculate the position of the element relative to the document + var scrollTop = window.pageYOffset || document.documentElement.scrollTop; + var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft; + const marker = {}; // yellow annotation box with string const el = document.createElement("div"); - el.style.left = group.rect.left + "px"; - el.style.top = group.rect.top + "px"; + el.style.position = "absolute"; + el.style.left = group.rect.left + scrollLeft + "px"; + el.style.top = group.rect.top + scrollTop + "px"; // Each group is assigned a different incremental z-index, we use the same z-index for the // bounding box and the hint marker el.style.zIndex = this.currentZIndex; @@ -1274,10 +1301,6 @@ function createHintMarkerForGroup(group) { // The bounding box around the group of hints. const boundingBox = document.createElement("div"); - // Calculate the position of the element relative to the document - var scrollTop = window.pageYOffset || document.documentElement.scrollTop; - var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft; - // Set styles for the bounding box boundingBox.style.position = "absolute"; boundingBox.style.display = "display"; @@ -1302,7 +1325,7 @@ function addHintMarkersToPage(hintMarkers) { const parent = document.createElement("div"); parent.id = "boundingBoxContainer"; for (const hintMarker of hintMarkers) { - // parent.appendChild(hintMarker.element); + parent.appendChild(hintMarker.element); parent.appendChild(hintMarker.boundingBox); } document.documentElement.appendChild(parent); diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 82421dc3..7d7fac13 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -476,6 +476,7 @@ def trim_element_tree(elements: list[dict]) -> list[dict]: def _trimmed_attributes(tag_name: str, attributes: dict) -> dict: new_attributes: dict = {} + for key in attributes: if key == "id" and tag_name in ["input", "textarea", "select"]: # We don't want to remove the id attribute any of these elements in case there's a label for it @@ -484,6 +485,7 @@ def _trimmed_attributes(tag_name: str, attributes: dict) -> dict: new_attributes[key] = attributes[key] if key in RESERVED_ATTRIBUTES and attributes[key]: new_attributes[key] = attributes[key] + return new_attributes