diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 9c9182d5..83d196fe 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -132,12 +132,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, */ // Wrap the entire function in an IIFE (Immediately Invoked Function Expression) // and attach it to the window object -(function(window) { +(function (window) { /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) */ - window.scrape = function(selector = null) { + window.scrape = function (selector = null) { /** * **crudeRecords** contains uncurated rundowns of "scrapable" elements * @type {Array} @@ -182,59 +182,59 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return crudeRecords; }; -/** - * Given an object with named lists of elements, - * groups the elements by their distance in the DOM tree. - * @param {Object.} lists The named lists of HTML elements. - * @returns {Array.>} - */ -window.scrapeSchema = function (lists) { - function omap(object, f, kf = (x) => x) { - return Object.fromEntries( - Object.entries(object) - .map(([k, v]) => [kf(k), f(v)]), - ); + /** + * Given an object with named lists of elements, + * groups the elements by their distance in the DOM tree. + * @param {Object.} lists The named lists of HTML elements. + * @returns {Array.>} + */ + window.scrapeSchema = function (lists) { + function omap(object, f, kf = (x) => x) { + return Object.fromEntries( + Object.entries(object) + .map(([k, v]) => [kf(k), f(v)]), + ); + } + + function ofilter(object, f) { + return Object.fromEntries( + Object.entries(object) + .filter(([k, v]) => f(k, v)), + ); + } + + function getSeedKey(listObj) { + const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length))); + return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0]; + } + + function getMBEs(elements) { + return elements.map((element) => { + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => e.parentNode?.contains(elem)) + .length === 1; + + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } + + return candidate; + }); + } + + const seedName = getSeedKey(lists); + const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector)); + const MBEs = getMBEs(seedElements); + + return MBEs.map((mbe) => omap( + lists, + ({ selector }, key) => { + const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); + return elem ? elem.innerText : undefined; + }, + (key) => key // Use the original key in the output + )); } - function ofilter(object, f) { - return Object.fromEntries( - Object.entries(object) - .filter(([k, v]) => f(k, v)), - ); - } - - function getSeedKey(listObj) { - const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length))); - return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0]; - } - - function getMBEs(elements) { - return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; - - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; - } - - return candidate; - }); - } - - const seedName = getSeedKey(lists); - const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector)); - const MBEs = getMBEs(seedElements); - - return MBEs.map((mbe) => omap( - lists, - ({ selector }, key) => { - const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); - return elem ? elem.innerText : undefined; - }, - (key) => key // Use the original key in the output - )); -} - })(window); \ No newline at end of file