diff --git a/mx-interpreter/browserSide/scraper.js b/mx-interpreter/browserSide/scraper.js
index 719d8f59..9cede7aa 100644
--- a/mx-interpreter/browserSide/scraper.js
+++ b/mx-interpreter/browserSide/scraper.js
@@ -69,4 +69,90 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
 
         return out;
     }
-    
\ No newline at end of file
+    // Best candidate so far; 'body' with metric 0 is the fallback used when no
+    // probed point produces a better-scoring selector.
+    let maxSelector = { selector: 'body', metric: 0 };
+
+    /**
+     * Scores the structural selector of the element under `point` and records it
+     * in `maxSelector` when it beats the current best.
+     *
+     * @param {{x: number, y: number}} point Viewport coordinates to probe.
+     */
+    const updateMaximumWithPoint = (point) => {
+        const currentElement = document.elementFromPoint(point.x, point.y);
+
+        // elementFromPoint() returns null for coordinates outside the viewport;
+        // there is no element to derive a selector from in that case.
+        if (!currentElement) {
+            return;
+        }
+
+        const selector = GetSelectorStructural(currentElement);
+
+        const elements = Array.from(document.querySelectorAll(selector))
+            .filter((element) => area(element) > minArea);
+
+        // If the current selector targets less than three elements,
+        // we consider it not interesting (would be a very underwhelming scraper)
+        if (elements.length < 3) {
+            return;
+        }
+
+        let metric = null;
+
+        if (metricType === 'total_area') {
+            metric = elements
+                .reduce((p, x) => p + area(x), 0);
+        } else if (metricType === 'size_deviation') {
+            // This could use a proper "statistics" approach... but meh, so far so good!
+            const sizes = elements
+                .map((element) => area(element));
+            const largest = Math.max(...sizes);
+
+            // 1 when every element has the same area; shrinks as the spread grows.
+            metric = 1 - (largest - Math.min(...sizes)) / largest;
+        }
+
+        // console.debug(`Total ${metricType} is ${metric}.`)
+
+        // Any other metricType leaves `metric` null, which never exceeds the
+        // current maximum, so nothing is recorded.
+        if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
+            maxSelector = { selector, metric };
+        }
+    };
+
+    // Probe one grid of points per viewport-height scroll step.
+    for (let scroll = 0; scroll < scrolls; scroll += 1) {
+        window.scrollTo(0, scroll * window.innerHeight);
+
+        const grid = getGrid();
+
+        grid.forEach(updateMaximumWithPoint);
+    }
+
+    restoreScroll();
+
+    let out = Array.from(document.querySelectorAll(maxSelector.selector));
+
+    // True when every element has a parent and no two elements share one.
+    // The empty list is rejected explicitly: `every` is vacuously true on it,
+    // which would keep the loop below spinning forever.
+    const canAscend = (elements) => {
+        const parents = elements.map((x) => x.parentElement);
+
+        return parents.length > 0
+            && parents.every((parent) => parent !== null)
+            && new Set(parents).size === parents.length;
+    };
+
+    // as long as we don't merge any two elements by substituting them for their parents,
+    // we substitute.
+    while (canAscend(out)) {
+        out = out.map((x) => x.parentElement);
+    }
+
+    return out;
+}
+