Use 4-char element ids instead of sequential integers (#361)
Co-authored-by: LawyZheng <lawyzheng1106@gmail.com>
This commit is contained in:
@@ -37,7 +37,7 @@ class Action(BaseModel):
|
||||
|
||||
|
||||
class WebAction(Action, abc.ABC):
|
||||
element_id: int
|
||||
element_id: str
|
||||
|
||||
|
||||
class UserDefinedError(BaseModel):
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import Any, Awaitable, Callable, List
|
||||
|
||||
@@ -516,40 +515,20 @@ async def handle_select_option_action(
|
||||
return [ActionFailure(e)]
|
||||
|
||||
try:
|
||||
option_xpath = scraped_page.id_to_xpath_dict[action.option.index]
|
||||
match = re.search(r"option\[(\d+)]$", option_xpath)
|
||||
if match:
|
||||
# This means we were trying to select an option xpath, click the option
|
||||
option_index = int(match.group(1))
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.select_option(
|
||||
xpath,
|
||||
index=option_index,
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
return [ActionSuccess()]
|
||||
else:
|
||||
# This means the supplied index was for the select element, not a reference to the xpath dict
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.select_option(
|
||||
xpath,
|
||||
index=action.option.index,
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
# This means the supplied index was for the select element, not a reference to the xpath dict
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.select_option(
|
||||
xpath,
|
||||
index=action.option.index,
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
await page.click(
|
||||
f"xpath={xpath}",
|
||||
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
|
||||
)
|
||||
return [ActionSuccess()]
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to click on the option by index", action=action, exc_info=True)
|
||||
@@ -782,12 +761,11 @@ async def chain_click(
|
||||
page.remove_listener("filechooser", fc_func)
|
||||
|
||||
|
||||
def get_anchor_to_click(scraped_page: ScrapedPage, element_id: int) -> str | None:
|
||||
def get_anchor_to_click(scraped_page: ScrapedPage, element_id: str) -> str | None:
|
||||
"""
|
||||
Get the anchor tag under the label to click
|
||||
"""
|
||||
LOG.info("Getting anchor tag to click", element_id=element_id)
|
||||
element_id = int(element_id)
|
||||
for ele in scraped_page.elements:
|
||||
if "id" in ele and ele["id"] == element_id:
|
||||
for child in ele["children"]:
|
||||
@@ -796,7 +774,7 @@ def get_anchor_to_click(scraped_page: ScrapedPage, element_id: int) -> str | Non
|
||||
return None
|
||||
|
||||
|
||||
def get_select_id_in_label_children(scraped_page: ScrapedPage, element_id: int) -> int | None:
|
||||
def get_select_id_in_label_children(scraped_page: ScrapedPage, element_id: str) -> str | None:
|
||||
"""
|
||||
search <select> in the children of <label>
|
||||
"""
|
||||
@@ -812,7 +790,7 @@ def get_select_id_in_label_children(scraped_page: ScrapedPage, element_id: int)
|
||||
return None
|
||||
|
||||
|
||||
def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: int) -> int | None:
|
||||
def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str) -> str | None:
|
||||
"""
|
||||
search checkbox/radio in the children of <label>
|
||||
"""
|
||||
@@ -933,7 +911,7 @@ async def click_listbox_option(
|
||||
scraped_page: ScrapedPage,
|
||||
page: Page,
|
||||
action: actions.SelectOptionAction,
|
||||
listbox_element_id: int,
|
||||
listbox_element_id: str,
|
||||
) -> bool:
|
||||
listbox_element = scraped_page.id_to_element_dict[listbox_element_id]
|
||||
# this is a listbox element, get all the children
|
||||
|
||||
@@ -540,6 +540,7 @@ function getElementContent(element, skipped_element = null) {
|
||||
function getSelectOptions(element) {
|
||||
const options = Array.from(element.options);
|
||||
const selectOptions = [];
|
||||
|
||||
for (const option of options) {
|
||||
selectOptions.push({
|
||||
optionIndex: option.index,
|
||||
@@ -554,7 +555,8 @@ function getListboxOptions(element) {
|
||||
var optionElements = element.querySelectorAll('[role="option"]');
|
||||
let selectOptions = [];
|
||||
for (var i = 0; i < optionElements.length; i++) {
|
||||
var ele = optionElements[i];
|
||||
let ele = optionElements[i];
|
||||
|
||||
selectOptions.push({
|
||||
optionIndex: i,
|
||||
text: removeMultipleSpaces(ele.textContent),
|
||||
@@ -563,6 +565,17 @@ function getListboxOptions(element) {
|
||||
return selectOptions;
|
||||
}
|
||||
|
||||
/**
 * Generate a short random identifier for a DOM element.
 *
 * The id is 4 characters drawn from [A-Za-z0-9]. Because the value is random,
 * two calls could produce the same string; since ids are used as lookup keys,
 * a duplicate would make a lookup resolve to the wrong element. To prevent
 * that, every id handed out is remembered and a colliding draw is repeated.
 *
 * The set of known ids is seeded lazily, on the first call, with any
 * `unique_id` attributes already present in the document, so ids stamped on
 * the page before this function was (re)defined are also avoided.
 * NOTE(review): the seed uses document.querySelectorAll, which does not reach
 * into shadow roots or other frames - confirm whether those need covering.
 *
 * @returns {string} a 4-character alphanumeric id not returned before
 */
function uniqueId() {
  const characters =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  const idLength = 4;

  // Keep the registry on the function object so no extra global is needed.
  if (!uniqueId.issuedIds) {
    uniqueId.issuedIds = new Set();
    // Outside a browser there is no page to seed from.
    if (typeof document !== "undefined") {
      document.querySelectorAll("[unique_id]").forEach((el) => {
        uniqueId.issuedIds.add(el.getAttribute("unique_id"));
      });
    }
  }

  let result;
  do {
    result = "";
    for (let i = 0; i < idLength; i++) {
      const randomIndex = Math.floor(Math.random() * characters.length);
      result += characters[randomIndex];
    }
    // Redraw on collision; with 62^4 possible ids a retry is rare.
  } while (uniqueId.issuedIds.has(result));

  uniqueId.issuedIds.add(result);
  return result;
}
|
||||
|
||||
function buildTreeFromBody() {
|
||||
var elements = [];
|
||||
var resultArray = [];
|
||||
@@ -620,7 +633,7 @@ function buildTreeFromBody() {
|
||||
};
|
||||
|
||||
function buildElementObject(element, interactable) {
|
||||
var element_id = elements.length;
|
||||
var element_id = element.getAttribute("unique_id") ?? uniqueId();
|
||||
var elementTagNameLower = element.tagName.toLowerCase();
|
||||
element.setAttribute("unique_id", element_id);
|
||||
// if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
|
||||
@@ -733,7 +746,10 @@ function buildTreeFromBody() {
|
||||
// If the element is interactable and has an interactable parent,
|
||||
// then add it to the children of the parent
|
||||
else {
|
||||
elements[parentId].children.push(elementObj);
|
||||
// TODO: use dict/object so that we access these in O(1) instead
|
||||
elements
|
||||
.find((element) => element.id === parentId)
|
||||
.children.push(elementObj);
|
||||
}
|
||||
// options already added to the select.options, no need to add options anymore
|
||||
if (elementObj.options && elementObj.options.length > 0) {
|
||||
@@ -772,13 +788,16 @@ function buildTreeFromBody() {
|
||||
if (parentId === null) {
|
||||
resultArray.push(elementObj);
|
||||
} else {
|
||||
elements[parentId].children.push(elementObj);
|
||||
// TODO: use dict/object so that we access these in O(1) instead
|
||||
elements
|
||||
.find((element) => element.id === parentId)
|
||||
.children.push(elementObj);
|
||||
}
|
||||
parentId = elementObj.id;
|
||||
}
|
||||
}
|
||||
getChildElements(element).forEach((child) => {
|
||||
let children = processElement(child, parentId);
|
||||
processElement(child, parentId);
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -975,8 +994,6 @@ function buildTreeFromBody() {
|
||||
// TODO: Handle iframes
|
||||
// setup before parsing the dom
|
||||
checkSelect2();
|
||||
// Clear all the unique_id attributes so that there are no conflicts
|
||||
removeAllUniqueIdAttributes();
|
||||
processElement(document.body, null);
|
||||
|
||||
for (var element of elements) {
|
||||
@@ -1029,14 +1046,6 @@ function drawBoundingBoxes(elements) {
|
||||
addHintMarkersToPage(hintMarkers);
|
||||
}
|
||||
|
||||
function removeAllUniqueIdAttributes() {
|
||||
var elementsWithUniqueId = document.querySelectorAll("[unique_id]");
|
||||
|
||||
elementsWithUniqueId.forEach(function (element) {
|
||||
element.removeAttribute("unique_id");
|
||||
});
|
||||
}
|
||||
|
||||
function captchaSolvedCallback() {
|
||||
console.log("captcha solved");
|
||||
if (!window["captchaSolvedCounter"]) {
|
||||
|
||||
@@ -121,8 +121,8 @@ class ScrapedPage(BaseModel):
|
||||
"""
|
||||
|
||||
elements: list[dict]
|
||||
id_to_element_dict: dict[int, dict] = {}
|
||||
id_to_xpath_dict: dict[int, str]
|
||||
id_to_element_dict: dict[str, dict] = {}
|
||||
id_to_xpath_dict: dict[str, str]
|
||||
element_tree: list[dict]
|
||||
element_tree_trimmed: list[dict]
|
||||
screenshots: list[bytes]
|
||||
|
||||
Reference in New Issue
Block a user