feat: scrapable heuristics
This commit is contained in:
@@ -11,3 +11,43 @@ function getBiggestElement(selector) {
|
|||||||
);
|
);
|
||||||
return biggest;
|
return biggest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Generates a structural selector (describing an element by its DOM tree location).
 *
 * **The generated selector is not guaranteed to be unique!** (In fact, this is
 * the desired behaviour here.)
 *
 * The result is a chain of tag names joined with the child combinator (`>`).
 * It starts at `BODY`, or at the topmost ancestor when the element has no
 * `BODY` ancestor, and ends at the element itself.
 *
 * @param {HTMLElement} element Element being described.
 * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
 */
function GetSelectorStructural(element) {
  // Tag names may contain characters that are special in CSS (e.g. the colon
  // in `O:P`); escape them so the result stays a valid selector.
  const escapeTagName = (tagName) => {
    if (typeof CSS !== 'undefined' && typeof CSS.escape === 'function') {
      return CSS.escape(tagName);
    }
    // Fallback for environments without CSS.escape: backslash-escape
    // everything that is not a word character or a hyphen.
    return tagName.replace(/[^\w-]/g, '\\$&');
  };

  // Walk from the element up through its ancestors, prepending each tag name,
  // and stop at BODY or at a node that has no parent element.
  const segments = [];
  let node = element;
  while (true) {
    segments.unshift(escapeTagName(node.tagName));
    if (node.tagName === 'BODY' || !node.parentElement) {
      break;
    }
    node = node.parentElement;
  }

  return segments.join(' > ');
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Heuristic method to find collections of "interesting" items on the page.
|
||||||
|
* @returns {Array<HTMLElement>} A collection of interesting DOM nodes
|
||||||
|
* (online store products, plane tickets, list items... and many more?)
|
||||||
|
*/
|
||||||
|
function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
|
||||||
|
const restoreScroll = (() => {
|
||||||
|
const { scrollX, scrollY } = window;
|
||||||
|
return () => {
|
||||||
|
window.scrollTo(scrollX, scrollY);
|
||||||
|
};
|
||||||
|
})();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @typedef {Array<{x: number, y: number}>} Grid
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user