diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index d67c425a..6bb5e61a 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -132,12 +132,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, */ // Wrap the entire function in an IIFE (Immediately Invoked Function Expression) // and attach it to the window object -(function(window) { +(function (window) { /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) */ - window.scrape = function(selector = null) { + window.scrape = function (selector = null) { /** * **crudeRecords** contains uncurated rundowns of "scrapable" elements * @type {Array} @@ -188,49 +188,48 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @param {Object.} lists The named lists of HTML elements. * @returns {Array.>} */ -window.scrapeSchema = function (lists) { - function omap(object, f, kf = (x) => x) { - return Object.fromEntries( - Object.entries(object) - .map(([k, v]) => [kf(k), f(v)]), - ); + window.scrapeSchema = function (lists) { + function omap(object, f, kf = (x) => x) { + return Object.fromEntries( + Object.entries(object) + .map(([k, v]) => [kf(k), f(v)]), + ); + } + + function ofilter(object, f) { + return Object.fromEntries( + Object.entries(object) + .filter(([k, v]) => f(k, v)), + ); + } + + function getSeedKey(listObj) { + const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length))); + return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0]; + } + + function getMBEs(elements) { + return elements.map((element) => { + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => e.parentNode?.contains(elem)) + .length === 1; + + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } + + return candidate; + }); + } + + const seedName = getSeedKey(lists); + const MBEs = getMBEs(lists[seedName]); + + return MBEs.map((mbe) => omap( + lists, + (listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText, + )); } - function ofilter(object, f) { - return Object.fromEntries( - Object.entries(object) - .filter(([k, v]) => f(k, v)), - ); - } - - function getSeedKey(listObj) { - const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length))); - return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0]; - } - - function getMBEs(elements) { - return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; - - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; - } - - return candidate; - }); - } - - const seedName = getSeedKey(lists); - const MBEs = getMBEs(lists[seedName]); - - return MBEs.map((mbe) => omap( - lists, - (listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText, - )); -} - - })(window); \ No newline at end of file