From c04fafd726d2189d19db84f4f26fafb8458f1d1c Mon Sep 17 00:00:00 2001
From: karishmas6
Date: Tue, 16 Jul 2024 00:25:47 +0530
Subject: [PATCH] feat: scrapable heuristics

---
 mx-interpreter/browserSide/scraper.js | 40 +++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/mx-interpreter/browserSide/scraper.js b/mx-interpreter/browserSide/scraper.js
index c64c4808..d9e5a0d9 100644
--- a/mx-interpreter/browserSide/scraper.js
+++ b/mx-interpreter/browserSide/scraper.js
@@ -11,3 +11,43 @@ function getBiggestElement(selector) {
   );
   return biggest;
 }
+
+/**
+ * Generates structural selector (describing element by its DOM tree location).
+ *
+ * **The generated selector is not guaranteed to be unique!** (In fact, this is
+ * the desired behaviour in here.)
+ * @param {HTMLElement} element Element being described.
+ * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
+ */
+function GetSelectorStructural(element) {
+  // Base conditions for the recursive approach.
+  if (element.tagName === 'BODY') {
+    return 'BODY';
+  }
+  const selector = element.tagName;
+  if (element.parentElement) {
+    return `${GetSelectorStructural(element.parentElement)} > ${selector}`;
+  }
+
+  return selector;
+}
+
+/**
+ * Heuristic method to find collections of "interesting" items on the page.
+ * @returns {Array} A collection of interesting DOM nodes
+ * (online store products, plane tickets, list items... and many more?)
+ */
+function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
+  const restoreScroll = (() => {
+    const { scrollX, scrollY } = window;
+    return () => {
+      window.scrollTo(scrollX, scrollY);
+    };
+  })();
+
+  /**
+* @typedef {Array<{x: number, y: number}>} Grid
+*/
+
+  
\ No newline at end of file