Decorate bounding boxes with element_ids to improve Skyvern accuracy (+ a few more changes) (#536)

This commit is contained in:
Kerem Yilmaz
2024-07-01 21:24:52 -07:00
committed by GitHub
parent eb7478a58a
commit 257ba1601e
7 changed files with 59 additions and 22 deletions

View File

@@ -213,7 +213,10 @@ function isElementStyleVisibilityVisible(element, style) {
function isElementVisible(element) {
// TODO: This is a hack to skip the direct visibility check for option elements (which are not
// visible by default) and for radio inputs. We check their parent's visibility instead.
if (element.tagName.toLowerCase() === "option")
if (
element.tagName.toLowerCase() === "option" ||
(element.tagName.toLowerCase() === "input" && element.type === "radio")
)
return element.parentElement && isElementVisible(element.parentElement);
if (element.className.toString().includes("select2-offscreen")) {
@@ -1088,7 +1091,11 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) {
const labelElement = document.querySelector(
element.tagName + '[unique_id="' + element.id + '"]',
);
if (labelElement && labelElement.childElementCount === 0) {
if (
labelElement &&
labelElement.childElementCount === 0 &&
!labelElement.getAttribute("for")
) {
continue;
}
}
@@ -1234,15 +1241,30 @@ function createHintMarkersForGroups(groups) {
return [];
}
const hintMarkers = groups.map((group) => createHintMarkerForGroup(group));
const hintMarkers = groups
.filter((group) => group.elements.some((element) => element.interactable))
.map((group) => createHintMarkerForGroup(group));
// fill in marker text
const hintStrings = generateHintStrings(hintMarkers.length);
// const hintStrings = generateHintStrings(hintMarkers.length);
for (let i = 0; i < hintMarkers.length; i++) {
const hintMarker = hintMarkers[i];
hintMarker.hintString = hintStrings[i];
let interactableElementFound = false;
for (let i = 0; i < hintMarker.group.elements.length; i++) {
if (hintMarker.group.elements[i].interactable) {
hintMarker.hintString = hintMarker.group.elements[i].id;
interactableElementFound = true;
break;
}
}
if (!interactableElementFound) {
hintMarker.hintString = "";
}
try {
hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase();
hintMarker.element.innerHTML = hintMarker.hintString;
} catch (e) {
// Ensure trustedTypes is available
if (typeof trustedTypes !== "undefined") {
@@ -1262,11 +1284,16 @@ function createHintMarkersForGroups(groups) {
}
function createHintMarkerForGroup(group) {
// Calculate the position of the element relative to the document
var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
const marker = {};
// yellow annotation box with string
const el = document.createElement("div");
el.style.left = group.rect.left + "px";
el.style.top = group.rect.top + "px";
el.style.position = "absolute";
el.style.left = group.rect.left + scrollLeft + "px";
el.style.top = group.rect.top + scrollTop + "px";
// Each group is assigned a different, incremental z-index; the same z-index is used for both
// the bounding box and the hint marker.
el.style.zIndex = this.currentZIndex;
@@ -1274,10 +1301,6 @@ function createHintMarkerForGroup(group) {
// The bounding box around the group of hints.
const boundingBox = document.createElement("div");
// Calculate the position of the element relative to the document
var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
// Set styles for the bounding box
boundingBox.style.position = "absolute";
boundingBox.style.display = "display";
@@ -1302,7 +1325,7 @@ function addHintMarkersToPage(hintMarkers) {
const parent = document.createElement("div");
parent.id = "boundingBoxContainer";
for (const hintMarker of hintMarkers) {
// parent.appendChild(hintMarker.element);
parent.appendChild(hintMarker.element);
parent.appendChild(hintMarker.boundingBox);
}
document.documentElement.appendChild(parent);

View File

@@ -476,6 +476,7 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
new_attributes: dict = {}
for key in attributes:
if key == "id" and tag_name in ["input", "textarea", "select"]:
# We don't want to remove the id attribute from any of these elements in case there's a label for it
@@ -484,6 +485,7 @@ def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
new_attributes[key] = attributes[key]
if key in RESERVED_ATTRIBUTES and attributes[key]:
new_attributes[key] = attributes[key]
return new_attributes