optimize scraping part 1 (#3184)
This commit is contained in:
@@ -225,6 +225,113 @@ class DomUtils {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class QuadTreeNode {
|
||||||
|
constructor(bounds, maxElements = 10, maxDepth = 4) {
|
||||||
|
this.bounds = bounds; // {x, y, width, height}
|
||||||
|
this.maxElements = maxElements;
|
||||||
|
this.maxDepth = maxDepth;
|
||||||
|
this.elements = [];
|
||||||
|
this.children = null;
|
||||||
|
this.depth = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
insert(element) {
|
||||||
|
if (!this.contains(element.rect)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.children === null && this.elements.length < this.maxElements) {
|
||||||
|
this.elements.push(element);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.children === null) {
|
||||||
|
this.subdivide();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const child of this.children) {
|
||||||
|
if (child.insert(element)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.elements.push(element);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
subdivide() {
|
||||||
|
const x = this.bounds.x;
|
||||||
|
const y = this.bounds.y;
|
||||||
|
const w = this.bounds.width / 2;
|
||||||
|
const h = this.bounds.height / 2;
|
||||||
|
|
||||||
|
this.children = [
|
||||||
|
new QuadTreeNode(
|
||||||
|
{ x, y, width: w, height: h },
|
||||||
|
this.maxElements,
|
||||||
|
this.maxDepth,
|
||||||
|
),
|
||||||
|
new QuadTreeNode(
|
||||||
|
{ x: x + w, y, width: w, height: h },
|
||||||
|
this.maxElements,
|
||||||
|
this.maxDepth,
|
||||||
|
),
|
||||||
|
new QuadTreeNode(
|
||||||
|
{ x, y: y + h, width: w, height: h },
|
||||||
|
this.maxElements,
|
||||||
|
this.maxDepth,
|
||||||
|
),
|
||||||
|
new QuadTreeNode(
|
||||||
|
{ x: x + w, y: y + h, width: w, height: h },
|
||||||
|
this.maxElements,
|
||||||
|
this.maxDepth,
|
||||||
|
),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const child of this.children) {
|
||||||
|
child.depth = this.depth + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
contains(rect) {
|
||||||
|
return (
|
||||||
|
rect.left >= this.bounds.x &&
|
||||||
|
rect.right <= this.bounds.x + this.bounds.width &&
|
||||||
|
rect.top >= this.bounds.y &&
|
||||||
|
rect.bottom <= this.bounds.y + this.bounds.height
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
query(rect) {
|
||||||
|
const result = [];
|
||||||
|
this.queryRecursive(rect, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
queryRecursive(rect, result) {
|
||||||
|
if (!this.intersects(rect)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.push(...this.elements);
|
||||||
|
|
||||||
|
if (this.children) {
|
||||||
|
for (const child of this.children) {
|
||||||
|
child.queryRecursive(rect, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
intersects(rect) {
|
||||||
|
return (
|
||||||
|
rect.left < this.bounds.x + this.bounds.width &&
|
||||||
|
rect.right > this.bounds.x &&
|
||||||
|
rect.top < this.bounds.y + this.bounds.height &&
|
||||||
|
rect.bottom > this.bounds.y
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// from playwright
|
// from playwright
|
||||||
function getElementComputedStyle(element, pseudo) {
|
function getElementComputedStyle(element, pseudo) {
|
||||||
return element.ownerDocument && element.ownerDocument.defaultView
|
return element.ownerDocument && element.ownerDocument.defaultView
|
||||||
@@ -1686,39 +1793,88 @@ function getCaptchaSolves() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function groupElementsVisually(elements) {
|
function groupElementsVisually(elements) {
|
||||||
const groups = [];
|
// Quadtree O(n log n)
|
||||||
// o n^2
|
const validElements = elements.filter((element) => element.rect);
|
||||||
// go through each hint and see if it overlaps with any other hints, if it does, add it to the group of the other hint
|
|
||||||
// *** if we start from the bigger elements (top -> bottom) we can avoid merging groups
|
if (validElements.length === 0) return [];
|
||||||
for (const element of elements) {
|
|
||||||
if (!element.rect) {
|
// Calculate bounds
|
||||||
continue;
|
const bounds = calculateBounds(validElements);
|
||||||
}
|
|
||||||
const group = groups.find((group) => {
|
// Create quadtree
|
||||||
for (const groupElement of group.elements) {
|
const quadTree = new QuadTreeNode(bounds);
|
||||||
if (Rect.intersects(groupElement.rect, element.rect)) {
|
validElements.forEach((element) => quadTree.insert(element));
|
||||||
return true;
|
|
||||||
}
|
const groups = [];
|
||||||
}
|
const processed = new Set();
|
||||||
return false;
|
|
||||||
});
|
for (const element of validElements) {
|
||||||
if (group) {
|
if (processed.has(element)) continue;
|
||||||
group.elements.push(element);
|
|
||||||
} else {
|
const group = { elements: [element], rect: null };
|
||||||
groups.push({
|
processed.add(element);
|
||||||
elements: [element],
|
|
||||||
});
|
// Find all elements overlapping with current element
|
||||||
}
|
const overlapping = findOverlappingElements(
|
||||||
}
|
element,
|
||||||
|
validElements,
|
||||||
|
quadTree,
|
||||||
|
processed,
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const overlappingElement of overlapping) {
|
||||||
|
group.elements.push(overlappingElement);
|
||||||
|
processed.add(overlappingElement);
|
||||||
|
}
|
||||||
|
|
||||||
// go through each group and create a rectangle that encompasses all the hints in the group
|
|
||||||
for (const group of groups) {
|
|
||||||
group.rect = createRectangleForGroup(group);
|
group.rect = createRectangleForGroup(group);
|
||||||
|
groups.push(group);
|
||||||
}
|
}
|
||||||
|
|
||||||
return groups;
|
return groups;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper functions
|
||||||
|
function calculateBounds(elements) {
|
||||||
|
const rects = elements.map((el) => el.rect);
|
||||||
|
const left = Math.min(...rects.map((r) => r.left));
|
||||||
|
const top = Math.min(...rects.map((r) => r.top));
|
||||||
|
const right = Math.max(...rects.map((r) => r.right));
|
||||||
|
const bottom = Math.max(...rects.map((r) => r.bottom));
|
||||||
|
|
||||||
|
return {
|
||||||
|
x: left,
|
||||||
|
y: top,
|
||||||
|
width: right - left,
|
||||||
|
height: bottom - top,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function findOverlappingElements(element, allElements, quadTree, processed) {
|
||||||
|
const result = [];
|
||||||
|
const queue = [element];
|
||||||
|
|
||||||
|
while (queue.length > 0) {
|
||||||
|
const current = queue.shift();
|
||||||
|
|
||||||
|
// Use quadtree to query nearby elements
|
||||||
|
const nearby = quadTree.query(current.rect);
|
||||||
|
|
||||||
|
for (const nearbyElement of nearby) {
|
||||||
|
if (
|
||||||
|
!processed.has(nearbyElement) &&
|
||||||
|
nearbyElement !== current &&
|
||||||
|
Rect.intersects(current.rect, nearbyElement.rect)
|
||||||
|
) {
|
||||||
|
result.push(nearbyElement);
|
||||||
|
processed.add(nearbyElement);
|
||||||
|
queue.push(nearbyElement);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
function createRectangleForGroup(group) {
|
function createRectangleForGroup(group) {
|
||||||
const rects = group.elements.map((element) => element.rect);
|
const rects = group.elements.map((element) => element.rect);
|
||||||
const top = Math.min(...rects.map((rect) => rect.top));
|
const top = Math.min(...rects.map((rect) => rect.top));
|
||||||
|
|||||||
Reference in New Issue
Block a user