speed up extraction (#1617)

This commit is contained in:
Shuchang Zheng
2025-01-22 22:43:50 +08:00
committed by GitHub
parent b8e9ab85d7
commit ed4d0c59e7
3 changed files with 20 additions and 5 deletions

View File

@@ -2684,7 +2684,7 @@ async def extract_information_for_navigation_goal(
# TODO: we only use HTML element for now, introduce a way to switch in the future
element_tree_format = ElementTreeFormat.HTML
element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format)
element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format, html_need_skyvern_attrs=False)
scraped_page_refreshed = await scraped_page.refresh()

View File

@@ -703,9 +703,19 @@ function isInteractable(element, hoverStylesMap) {
// Check if element has hover styles that change cursor to pointer
// This is to handle the case where an element's cursor is "auto", but resolves to "pointer" on hover
if (elementCursor === "auto") {
// TODO: we need a better algorithm to match the selector with better performance
for (const [selector, styles] of hoverStylesMap) {
if (element.matches(selector) && styles.cursor === "pointer") {
return true;
let shouldMatch = false;
for (const className of element.classList) {
if (selector.includes(className)) {
shouldMatch = true;
break;
}
}
if (shouldMatch || selector.includes(tagName)) {
if (element.matches(selector) && styles.cursor === "pointer") {
return true;
}
}
}
}

View File

@@ -247,12 +247,17 @@ class ScrapedPage(BaseModel):
self._clean_up_func = clean_up_func
self._scrape_exclude = scrape_exclude
def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.HTML) -> str:
def build_element_tree(
self, fmt: ElementTreeFormat = ElementTreeFormat.HTML, html_need_skyvern_attrs: bool = True
) -> str:
if fmt == ElementTreeFormat.JSON:
return json.dumps(self.element_tree_trimmed)
if fmt == ElementTreeFormat.HTML:
return "".join(json_to_html(element) for element in self.element_tree_trimmed)
return "".join(
json_to_html(element, need_skyvern_attrs=html_need_skyvern_attrs)
for element in self.element_tree_trimmed
)
raise UnknownElementTreeFormat(fmt=fmt)