optimize scraping part 1 (#3184)

This commit is contained in:
LawyZheng
2025-08-14 14:24:21 +08:00
committed by GitHub
parent b88cf18590
commit 30606645ea

View File

@@ -225,6 +225,113 @@ class DomUtils {
} }
} }
class QuadTreeNode {
constructor(bounds, maxElements = 10, maxDepth = 4) {
this.bounds = bounds; // {x, y, width, height}
this.maxElements = maxElements;
this.maxDepth = maxDepth;
this.elements = [];
this.children = null;
this.depth = 0;
}
insert(element) {
if (!this.contains(element.rect)) {
return false;
}
if (this.children === null && this.elements.length < this.maxElements) {
this.elements.push(element);
return true;
}
if (this.children === null) {
this.subdivide();
}
for (const child of this.children) {
if (child.insert(element)) {
return true;
}
}
this.elements.push(element);
return true;
}
subdivide() {
const x = this.bounds.x;
const y = this.bounds.y;
const w = this.bounds.width / 2;
const h = this.bounds.height / 2;
this.children = [
new QuadTreeNode(
{ x, y, width: w, height: h },
this.maxElements,
this.maxDepth,
),
new QuadTreeNode(
{ x: x + w, y, width: w, height: h },
this.maxElements,
this.maxDepth,
),
new QuadTreeNode(
{ x, y: y + h, width: w, height: h },
this.maxElements,
this.maxDepth,
),
new QuadTreeNode(
{ x: x + w, y: y + h, width: w, height: h },
this.maxElements,
this.maxDepth,
),
];
for (const child of this.children) {
child.depth = this.depth + 1;
}
}
contains(rect) {
return (
rect.left >= this.bounds.x &&
rect.right <= this.bounds.x + this.bounds.width &&
rect.top >= this.bounds.y &&
rect.bottom <= this.bounds.y + this.bounds.height
);
}
query(rect) {
const result = [];
this.queryRecursive(rect, result);
return result;
}
queryRecursive(rect, result) {
if (!this.intersects(rect)) {
return;
}
result.push(...this.elements);
if (this.children) {
for (const child of this.children) {
child.queryRecursive(rect, result);
}
}
}
intersects(rect) {
return (
rect.left < this.bounds.x + this.bounds.width &&
rect.right > this.bounds.x &&
rect.top < this.bounds.y + this.bounds.height &&
rect.bottom > this.bounds.y
);
}
}
// from playwright // from playwright
function getElementComputedStyle(element, pseudo) { function getElementComputedStyle(element, pseudo) {
return element.ownerDocument && element.ownerDocument.defaultView return element.ownerDocument && element.ownerDocument.defaultView
@@ -1686,39 +1793,88 @@ function getCaptchaSolves() {
} }
function groupElementsVisually(elements) { function groupElementsVisually(elements) {
const groups = []; // Quadtree O(n log n)
// o n^2 const validElements = elements.filter((element) => element.rect);
// go through each hint and see if it overlaps with any other hints, if it does, add it to the group of the other hint
// *** if we start from the bigger elements (top -> bottom) we can avoid merging groups if (validElements.length === 0) return [];
for (const element of elements) {
if (!element.rect) { // Calculate bounds
continue; const bounds = calculateBounds(validElements);
}
const group = groups.find((group) => { // Create quadtree
for (const groupElement of group.elements) { const quadTree = new QuadTreeNode(bounds);
if (Rect.intersects(groupElement.rect, element.rect)) { validElements.forEach((element) => quadTree.insert(element));
return true;
} const groups = [];
} const processed = new Set();
return false;
}); for (const element of validElements) {
if (group) { if (processed.has(element)) continue;
group.elements.push(element);
} else { const group = { elements: [element], rect: null };
groups.push({ processed.add(element);
elements: [element],
}); // Find all elements overlapping with current element
} const overlapping = findOverlappingElements(
} element,
validElements,
quadTree,
processed,
);
for (const overlappingElement of overlapping) {
group.elements.push(overlappingElement);
processed.add(overlappingElement);
}
// go through each group and create a rectangle that encompasses all the hints in the group
for (const group of groups) {
group.rect = createRectangleForGroup(group); group.rect = createRectangleForGroup(group);
groups.push(group);
} }
return groups; return groups;
} }
// Helper functions
function calculateBounds(elements) {
const rects = elements.map((el) => el.rect);
const left = Math.min(...rects.map((r) => r.left));
const top = Math.min(...rects.map((r) => r.top));
const right = Math.max(...rects.map((r) => r.right));
const bottom = Math.max(...rects.map((r) => r.bottom));
return {
x: left,
y: top,
width: right - left,
height: bottom - top,
};
}
function findOverlappingElements(element, allElements, quadTree, processed) {
const result = [];
const queue = [element];
while (queue.length > 0) {
const current = queue.shift();
// Use quadtree to query nearby elements
const nearby = quadTree.query(current.rect);
for (const nearbyElement of nearby) {
if (
!processed.has(nearbyElement) &&
nearbyElement !== current &&
Rect.intersects(current.rect, nearbyElement.rect)
) {
result.push(nearbyElement);
processed.add(nearbyElement);
queue.push(nearbyElement);
}
}
}
return result;
}
function createRectangleForGroup(group) { function createRectangleForGroup(group) {
const rects = group.elements.map((element) => element.rect); const rects = group.elements.map((element) => element.rect);
const top = Math.min(...rects.map((rect) => rect.top)); const top = Math.min(...rects.map((rect) => rect.top));