better detect dropdown menu (#778)
This commit is contained in:
@@ -14,6 +14,7 @@ Reply in JSON format with the following keys:
|
||||
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
|
||||
"id": str, // The id of the element to take action on. The id has to be one from the elements list
|
||||
"value": str, // The value to select.
|
||||
"relevant": bool, // True if the value you select is relevant to the target value, otherwise False.
|
||||
}
|
||||
|
||||
Context:
|
||||
|
||||
@@ -388,7 +388,6 @@ async def handle_input_text_action(
|
||||
dom=dom,
|
||||
skyvern_frame=skyvern_frame,
|
||||
incremental_scraped=incremental_scraped,
|
||||
element_trees=incremental_element,
|
||||
llm_handler=app.SECONDARY_LLM_API_HANDLER,
|
||||
step=step,
|
||||
task=task,
|
||||
@@ -402,10 +401,14 @@ async def handle_input_text_action(
|
||||
element_id=skyvern_element.get_id(),
|
||||
action=action,
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
await skyvern_element.scroll_into_view()
|
||||
LOG.exception("Failed to do custom selection transformed from input action")
|
||||
return [ActionFailure(exception=e)]
|
||||
LOG.warning(
|
||||
"Failed to do custom selection transformed from input action, continue to input text",
|
||||
exc_info=True,
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
)
|
||||
finally:
|
||||
await skyvern_element.press_key("Escape")
|
||||
await skyvern_element.blur()
|
||||
@@ -682,7 +685,6 @@ async def handle_select_option_action(
|
||||
dom=dom,
|
||||
skyvern_frame=skyvern_frame,
|
||||
incremental_scraped=incremental_scraped,
|
||||
element_trees=incremental_element,
|
||||
llm_handler=app.SECONDARY_LLM_API_HANDLER,
|
||||
step=step,
|
||||
task=task,
|
||||
@@ -1251,21 +1253,23 @@ async def select_from_dropdown(
|
||||
dom: DomUtil,
|
||||
skyvern_frame: SkyvernFrame,
|
||||
incremental_scraped: IncrementalScrapePage,
|
||||
element_trees: list[dict],
|
||||
llm_handler: LLMAPIHandler,
|
||||
step: Step,
|
||||
task: Task,
|
||||
force_select: bool = False,
|
||||
should_relevant: bool = True,
|
||||
) -> tuple[ActionResult | None, str | None]:
|
||||
"""
|
||||
force_select is used to choose an element to click even there's no dropdown menu
|
||||
None will be only returned when force_select is false and no dropdown menu popped
|
||||
force_select: is used to choose an element to click even if there's no dropdown menu;
|
||||
should_relevant: only valid when force_select is "False". When "True", the chosen value must be relevant to the target value;
|
||||
None will be only returned when:
|
||||
1. force_select is false and no dropdown menu popped
|
||||
2. force_select is false and match value is not relevant to the target value
|
||||
"""
|
||||
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
|
||||
|
||||
dropdown_menu_element = await locate_dropdown_menu(
|
||||
incremental_scraped=incremental_scraped,
|
||||
element_trees=element_trees,
|
||||
llm_handler=llm_handler,
|
||||
step=step,
|
||||
task=task,
|
||||
@@ -1297,7 +1301,10 @@ async def select_from_dropdown(
|
||||
raise NoLabelOrValueForCustomSelection(element_id=action.element_id)
|
||||
|
||||
prompt = prompt_engine.load_prompt(
|
||||
"custom-select", context_reasoning=action.reasoning, target_value=target_value, elements=html
|
||||
"custom-select",
|
||||
context_reasoning=action.reasoning,
|
||||
target_value=target_value,
|
||||
elements=html,
|
||||
)
|
||||
|
||||
LOG.info(
|
||||
@@ -1320,6 +1327,16 @@ async def select_from_dropdown(
|
||||
if not element_id:
|
||||
raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning"))
|
||||
|
||||
if not force_select and should_relevant:
|
||||
if not json_response.get("relevant", False):
|
||||
LOG.debug(
|
||||
"The selected option is not relevant to the target value",
|
||||
element_id=element_id,
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
)
|
||||
return None, None
|
||||
|
||||
try:
|
||||
selected_element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id)
|
||||
await selected_element.scroll_into_view()
|
||||
@@ -1362,7 +1379,7 @@ async def select_from_dropdown_by_value(
|
||||
step: Step,
|
||||
) -> ActionResult:
|
||||
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
|
||||
element_trees = await incremental_scraped.get_incremental_element_tree(
|
||||
await incremental_scraped.get_incremental_element_tree(
|
||||
clean_and_remove_element_tree_factory(task=task, step=step, dom=dom),
|
||||
)
|
||||
|
||||
@@ -1373,7 +1390,6 @@ async def select_from_dropdown_by_value(
|
||||
|
||||
dropdown_menu_element = await locate_dropdown_menu(
|
||||
incremental_scraped=incremental_scraped,
|
||||
element_trees=element_trees,
|
||||
llm_handler=llm_handler,
|
||||
step=step,
|
||||
task=task,
|
||||
@@ -1419,12 +1435,35 @@ async def select_from_dropdown_by_value(
|
||||
|
||||
async def locate_dropdown_menu(
|
||||
incremental_scraped: IncrementalScrapePage,
|
||||
element_trees: list[dict],
|
||||
llm_handler: LLMAPIHandler,
|
||||
step: Step,
|
||||
task: Task,
|
||||
) -> SkyvernElement | None:
|
||||
for idx, element_dict in enumerate(element_trees):
|
||||
skyvern_frame = incremental_scraped.skyvern_frame
|
||||
|
||||
async def is_ul_or_listbox_element(element_dict: dict) -> bool:
    """Report whether a scraped node is a ``<ul>`` or carries ``role="listbox"``.

    The node's ``id`` is resolved to a ``SkyvernElement`` on the incremental
    page. If that lookup raises, the failure is logged at debug level and the
    node is treated as not matching.
    """
    node_id: str = element_dict.get("id", "")
    try:
        candidate = await SkyvernElement.create_from_incremental(incremental_scraped, node_id)
    except Exception:
        LOG.debug(
            "Failed to element in the incremental page",
            element_id=node_id,
            step_id=step.step_id,
            task_id=task.task_id,
            exc_info=True,
        )
        return False

    # The role attribute is only queried when the tag itself is not <ul>.
    return candidate.get_tag_name() == "ul" or await candidate.get_attr("role") == "listbox"
|
||||
|
||||
for idx, element_dict in enumerate(incremental_scraped.element_tree):
|
||||
# FIXME: confirm at most 10 nodes for now, to avoid sending too many requests to the LLM
|
||||
if idx >= 10:
|
||||
break
|
||||
@@ -1432,7 +1471,7 @@ async def locate_dropdown_menu(
|
||||
element_id = element_dict.get("id")
|
||||
if not element_id:
|
||||
LOG.debug(
|
||||
"Skip the non-interactable element for the dropdown menu confirm",
|
||||
"Skip the element without id for the dropdown menu confirm",
|
||||
step_id=step.step_id,
|
||||
task_id=task.task_id,
|
||||
element=element_dict,
|
||||
@@ -1451,6 +1490,38 @@ async def locate_dropdown_menu(
|
||||
)
|
||||
continue
|
||||
|
||||
found_element_id = await head_element.find_children_element_id_by_callback(
|
||||
cb=is_ul_or_listbox_element,
|
||||
)
|
||||
if found_element_id and found_element_id != element_id:
|
||||
LOG.debug(
|
||||
"Found 'ul or listbox' element in children list",
|
||||
element_id=found_element_id,
|
||||
step_id=step.step_id,
|
||||
task_id=task.task_id,
|
||||
)
|
||||
|
||||
try:
|
||||
head_element = await SkyvernElement.create_from_incremental(incremental_scraped, found_element_id)
|
||||
element_id = found_element_id
|
||||
except Exception:
|
||||
LOG.debug(
|
||||
"Failed to get head element by found element id, use the orignal element id",
|
||||
element_id=found_element_id,
|
||||
step_id=step.step_id,
|
||||
task_id=task.task_id,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if not await skyvern_frame.get_element_visible(await head_element.get_element_handler()):
|
||||
LOG.debug(
|
||||
"Skip the element since it's invisible",
|
||||
step_id=step.step_id,
|
||||
task_id=task.task_id,
|
||||
element_id=element_id,
|
||||
)
|
||||
continue
|
||||
|
||||
screenshot = await head_element.get_locator().screenshot(
|
||||
timeout=SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS
|
||||
)
|
||||
|
||||
@@ -888,7 +888,7 @@ function uniqueId() {
|
||||
return result;
|
||||
}
|
||||
|
||||
function buildElementObject(frame, element, interactable) {
|
||||
function buildElementObject(frame, element, interactable, purgeable = false) {
|
||||
var element_id = element.getAttribute("unique_id") ?? uniqueId();
|
||||
var elementTagNameLower = element.tagName.toLowerCase();
|
||||
element.setAttribute("unique_id", element_id);
|
||||
@@ -940,6 +940,8 @@ function buildElementObject(frame, element, interactable) {
|
||||
text: getElementContent(element),
|
||||
children: [],
|
||||
rect: DomUtils.getVisibleClientRect(element, true),
|
||||
// if purgeable is true, this element is only used for building the tree relationship
|
||||
purgeable: purgeable,
|
||||
// don't trim any attr of this element if keepAllAttr=True
|
||||
keepAllAttr:
|
||||
elementTagNameLower === "svg" || element.closest("svg") !== null,
|
||||
@@ -979,11 +981,11 @@ function buildElementObject(frame, element, interactable) {
|
||||
return elementObj;
|
||||
}
|
||||
|
||||
/**
 * Build the element tree for a frame, starting from the document body.
 *
 * @param {string} frame - identifier of the frame being scraped; defaults to "main.frame".
 * @returns whatever buildElementTree returns for document.body and the given frame.
 */
function buildTreeFromBody(frame = "main.frame") {
  const root = document.body;
  return buildElementTree(root, frame);
}
|
||||
|
||||
function buildElementTree(starter = document.body, frame = "main.frame") {
|
||||
function buildElementTree(starter = document.body, frame, full_tree = false) {
|
||||
var elements = [];
|
||||
var resultArray = [];
|
||||
|
||||
@@ -1078,6 +1080,23 @@ function buildElementTree(starter = document.body, frame = "main.frame") {
|
||||
// build all table related elements into skyvern element
|
||||
// we need these elements to preserve the DOM structure
|
||||
elementObj = buildElementObject(frame, element, false);
|
||||
} else if (full_tree) {
|
||||
// when building full tree, we only get text from element itself
|
||||
// elements without text are purgeable
|
||||
elementObj = buildElementObject(frame, element, false, true);
|
||||
let textContent = "";
|
||||
if (isElementVisible(element)) {
|
||||
for (let i = 0; i < element.childNodes.length; i++) {
|
||||
var node = element.childNodes[i];
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
textContent += node.data.trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
elementObj.text = textContent;
|
||||
if (textContent.length > 0) {
|
||||
elementObj.purgeable = false;
|
||||
}
|
||||
} else {
|
||||
// character length limit for non-interactable elements should be 5000
|
||||
// we don't use element context in HTML format,
|
||||
@@ -1673,7 +1692,7 @@ function addIncrementalNodeToMap(parentNode, childrenNode) {
|
||||
}
|
||||
|
||||
for (const child of childrenNode) {
|
||||
const [_, newNodeTree] = buildElementTree(child, "", false);
|
||||
const [_, newNodeTree] = buildElementTree(child, "", true);
|
||||
if (newNodeTree.length > 0) {
|
||||
newNodesTreeList.push(...newNodeTree);
|
||||
}
|
||||
|
||||
@@ -109,6 +109,9 @@ def json_to_html(element: dict) -> str:
|
||||
for option in element.get("options", [])
|
||||
)
|
||||
|
||||
if element.get("purgeable", False):
|
||||
return children_html + option_html
|
||||
|
||||
# Check if the element is self-closing
|
||||
if tag in ["img", "input", "br", "hr", "meta", "link"] and not option_html and not children_html:
|
||||
return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>'
|
||||
@@ -338,7 +341,7 @@ async def get_interactable_element_tree_in_frame(
|
||||
|
||||
unique_id = await frame_element.get_attribute("unique_id")
|
||||
|
||||
frame_js_script = f"() => buildTreeFromBody('{unique_id}', true)"
|
||||
frame_js_script = f"() => buildTreeFromBody('{unique_id}')"
|
||||
|
||||
await frame.evaluate(JS_FUNCTION_DEFS)
|
||||
frame_elements, frame_element_tree = await frame.evaluate(frame_js_script)
|
||||
@@ -374,7 +377,7 @@ async def get_interactable_element_tree(
|
||||
:return: Tuple containing the element tree and a map of element IDs to elements.
|
||||
"""
|
||||
await page.evaluate(JS_FUNCTION_DEFS)
|
||||
main_frame_js_script = "() => buildTreeFromBody('main.frame', true)"
|
||||
main_frame_js_script = "() => buildTreeFromBody()"
|
||||
elements, element_tree = await page.evaluate(main_frame_js_script)
|
||||
|
||||
if len(page.main_frame.child_frames) > 0:
|
||||
@@ -504,8 +507,7 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
|
||||
del queue_ele["attributes"]
|
||||
|
||||
if "attributes" in queue_ele and not queue_ele.get("keepAllAttr", False):
|
||||
tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
|
||||
new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])
|
||||
new_attributes = _trimmed_attributes(queue_ele["attributes"])
|
||||
if new_attributes:
|
||||
queue_ele["attributes"] = new_attributes
|
||||
else:
|
||||
@@ -536,13 +538,10 @@ def _trimmed_base64_data(attributes: dict) -> dict:
|
||||
return new_attributes
|
||||
|
||||
|
||||
def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
|
||||
def _trimmed_attributes(attributes: dict) -> dict:
|
||||
new_attributes: dict = {}
|
||||
|
||||
for key in attributes:
|
||||
if key == "id" and tag_name in ["input", "textarea", "select"]:
|
||||
# We don't want to remove the id attribute any of these elements in case there's a label for it
|
||||
new_attributes[key] = attributes[key]
|
||||
if key == "role" and attributes[key] in ["listbox", "option"]:
|
||||
new_attributes[key] = attributes[key]
|
||||
if key in RESERVED_ATTRIBUTES and attributes[key]:
|
||||
|
||||
@@ -269,6 +269,23 @@ class SkyvernElement:
|
||||
|
||||
return None
|
||||
|
||||
async def find_children_element_id_by_callback(
|
||||
self, cb: typing.Callable[[dict], typing.Awaitable[bool]]
|
||||
) -> str | None:
|
||||
index = 0
|
||||
queue = [self.get_element_dict()]
|
||||
while index < len(queue):
|
||||
item = queue[index]
|
||||
if await cb(item):
|
||||
return item.get("id", "")
|
||||
|
||||
children: list[dict] = item.get("children", [])
|
||||
for child in children:
|
||||
queue.append(child)
|
||||
|
||||
index += 1
|
||||
return None
|
||||
|
||||
async def find_label_for(
|
||||
self, dom: DomUtil, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
|
||||
) -> SkyvernElement | None:
|
||||
|
||||
@@ -188,6 +188,10 @@ class SkyvernFrame:
|
||||
js_script = "(element) => isScrollable(element)"
|
||||
return await self.frame.evaluate(js_script, element)
|
||||
|
||||
async def get_element_visible(self, element: ElementHandle) -> bool:
    """Evaluate the in-page visibility check for ``element`` in this frame.

    The check runs in the browser and is the conjunction of the page-side
    helpers ``isElementVisible(element)`` and ``!isHidden(element)``.
    """
    visibility_check = "(element) => isElementVisible(element) && !isHidden(element)"
    is_visible = await self.frame.evaluate(visibility_check, element)
    return is_visible
|
||||
|
||||
async def scroll_to_top(self, draw_boxes: bool) -> float:
|
||||
"""
|
||||
Scroll to the top of the page and take a screenshot.
|
||||
|
||||
@@ -16,18 +16,20 @@ class SkyvernClient:
|
||||
self.base_url = base_url
|
||||
self.credentials = credentials
|
||||
|
||||
def generate_curl_params(
    self, task_request_body: TaskRequest, max_steps: int | None = None
) -> tuple[str, dict, dict[str, str]]:
    """Assemble the URL, JSON payload and headers for a create-task request.

    Args:
        task_request_body: the task request; its ``model_dump()`` output
            becomes the payload.
        max_steps: when not ``None``, sent as the ``x-max-steps-override``
            header (stringified).

    Returns:
        A ``(url, payload, headers)`` tuple. The annotation previously said
        ``PreparedRequest``, but the function has always returned this tuple
        and callers unpack it as such.
    """
    url = f"{self.base_url}/tasks"
    payload = task_request_body.model_dump()
    headers = {
        "Content-Type": "application/json",
        "x-api-key": self.credentials,
    }
    # Compare against None explicitly so that 0 is still forwarded.
    if max_steps is not None:
        headers["x-max-steps-override"] = str(max_steps)

    return url, payload, headers
|
||||
|
||||
def create_task(self, task_request_body: TaskRequest) -> str | None:
|
||||
url, payload, headers = self.generate_curl_params(task_request_body)
|
||||
def create_task(self, task_request_body: TaskRequest, max_steps: int | None = None) -> str | None:
|
||||
url, payload, headers = self.generate_curl_params(task_request_body, max_steps=max_steps)
|
||||
|
||||
response = requests.post(url, headers=headers, data=json.dumps(payload))
|
||||
if "task_id" not in response.json():
|
||||
|
||||
Reference in New Issue
Block a user