optimize scraping part 2 (#3185)

This commit is contained in:
LawyZheng
2025-08-14 14:51:43 +08:00
committed by GitHub
parent 30606645ea
commit 65e9cb10e9

View File

@@ -92,6 +92,7 @@ class Rect {
} }
class DomUtils { class DomUtils {
static visibleClientRectCache = new WeakMap();
// //
// Bounds the rect by the current viewport dimensions. If the rect is offscreen or has a height or // Bounds the rect by the current viewport dimensions. If the rect is offscreen or has a height or
// width < 3 then null is returned instead of a rect. // width < 3 then null is returned instead of a rect.
@@ -113,7 +114,18 @@ class DomUtils {
} }
} }
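// Results are cached per element and per `testChildren` value in visibleClientRectCache, so repeated calls for the same element return the stored rect instead of recomputing it.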
static getVisibleClientRect(element, testChildren) { static getVisibleClientRect(element, testChildren) {
// check cache
const cacheKey = `${testChildren}`;
if (DomUtils.visibleClientRectCache.has(element)) {
const elementCache = DomUtils.visibleClientRectCache.get(element);
if (elementCache.has(cacheKey)) {
_jsConsoleLog("hit cache to get the rect of element");
return elementCache.get(cacheKey);
}
}
// Note: this call will be expensive if we modify the DOM in between calls. // Note: this call will be expensive if we modify the DOM in between calls.
let clientRect; let clientRect;
if (testChildren == null) testChildren = false; if (testChildren == null) testChildren = false;
@@ -138,6 +150,8 @@ class DomUtils {
return isInlineZeroFontSize; return isInlineZeroFontSize;
}; };
let result = null;
for (clientRect of clientRects) { for (clientRect of clientRects) {
// If the link has zero dimensions, it may be wrapping visible but floated elements. Check for // If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
// this. // this.
@@ -172,8 +186,10 @@ class DomUtils {
childClientRect.height < 3 childClientRect.height < 3
) )
continue; continue;
return childClientRect; result = childClientRect;
break;
} }
if (result) break;
} else { } else {
clientRect = this.cropRectToVisible(clientRect); clientRect = this.cropRectToVisible(clientRect);
@@ -192,11 +208,23 @@ class DomUtils {
if (computedStyle.getPropertyValue("visibility") !== "visible") if (computedStyle.getPropertyValue("visibility") !== "visible")
continue; continue;
return clientRect; result = clientRect;
break;
} }
} }
return null; // cache result
if (!DomUtils.visibleClientRectCache.has(element)) {
DomUtils.visibleClientRectCache.set(element, new Map());
}
DomUtils.visibleClientRectCache.get(element).set(cacheKey, result);
return result;
}
// Discards every cached visible-rect result by replacing the static cache with
// a new, empty WeakMap.
static clearVisibleClientRectCache() {
DomUtils.visibleClientRectCache = new WeakMap();
} }
static getViewportTopLeft() { static getViewportTopLeft() {
@@ -1453,7 +1481,6 @@ async function buildElementObject(
text: getElementText(element), text: getElementText(element),
afterPseudoText: getPseudoContent(element, "::after"), afterPseudoText: getPseudoContent(element, "::after"),
children: [], children: [],
rect: DomUtils.getVisibleClientRect(element, true),
// if purgeable is True, which means this element is only used for building the tree relationship // if purgeable is True, which means this element is only used for building the tree relationship
purgeable: purgeable, purgeable: purgeable,
// don't trim any attr of this element if keepAllAttr=True // don't trim any attr of this element if keepAllAttr=True
@@ -1763,9 +1790,15 @@ async function buildElementTree(
/**
 * Recomputes the visible rect of every element, groups the elements visually,
 * and adds the resulting hint markers to the page.
 *
 * The visible-rect cache is cleared before the rects are computed and is
 * always cleared again on the way out, including when a step throws.
 *
 * @param {Array<Object>} elements - Element objects; each one has its `rect`
 *   property overwritten with the result of `DomUtils.getVisibleClientRect`.
 * @returns {void}
 */
function drawBoundingBoxes(elements) {
  // draw a red border around the elements
  // Start from an empty cache so rects stored before this call are not reused.
  DomUtils.clearVisibleClientRectCache();
  try {
    elements.forEach((element) => {
      const ele = getDOMElementBySkyvenElement(element);
      element.rect = DomUtils.getVisibleClientRect(ele, true);
    });
    const groups = groupElementsVisually(elements);
    const hintMarkers = createHintMarkersForGroups(groups);
    addHintMarkersToPage(hintMarkers);
  } finally {
    // The cache has no other invalidation here, so it must be emptied even if
    // rect computation, grouping, or marker insertion fails part-way through.
    DomUtils.clearVisibleClientRectCache();
  }
}
async function buildElementsAndDrawBoundingBoxes( async function buildElementsAndDrawBoundingBoxes(