/**
 * Given an object with named lists of elements,
 * groups the elements by their distance in the DOM tree.
 *
 * The longest list acts as the "seed" (the first one wins on ties). For every
 * seed element the function climbs the DOM while the parent's subtree still
 * contains exactly one seed element; the node where the climb stops is that
 * element's bounding element (called "MBE" in the code). One record is then
 * produced per bounding element, mapping every list name to the `innerText`
 * of the first element of that list located inside the bounding element.
 *
 * @param {Object.<string, ArrayLike.<HTMLElement>>} lists The named lists of
 *   HTML elements. Arrays, NodeLists and other array-likes/iterables are
 *   accepted. `null`/`undefined` is treated like an empty object.
 * @returns {Array.<Object.<string, (string|undefined)>>} One record per seed
 *   element; a field is `undefined` when the corresponding list has no element
 *   inside that record's bounding element. Empty when there are no lists or
 *   the longest list is empty.
 */
function scrapeSchema(lists) {
  // Builds a new object by applying `f` to every value (and `kf` to every key).
  function omap(object, f, kf = (x) => x) {
    return Object.fromEntries(
      Object.entries(object)
        .map(([k, v]) => [kf(k), f(v)]),
    );
  }

  // Returns the name of the longest list, or undefined when there are none.
  // Strict `>` keeps the first key on ties, matching insertion order.
  function getSeedKey(listObj) {
    let seedKey;
    let maxLength = -1;
    for (const [key, list] of Object.entries(listObj)) {
      if (list.length > maxLength) {
        maxLength = list.length;
        seedKey = key;
      }
    }
    return seedKey;
  }

  // Returns, for every element, the highest ancestor-or-self whose subtree
  // contains no other element of the same list.
  function getMBEs(elements) {
    // True when the parent of `e` contains exactly one of `elements`.
    // A missing parent (root node) yields a count of 0, which ends the climb.
    const isUniqueChild = (e) => elements
      .filter((elem) => e.parentNode?.contains(elem))
      .length === 1;

    return elements.map((element) => {
      let candidate = element;
      while (candidate && isUniqueChild(candidate)) {
        candidate = candidate.parentNode;
      }
      return candidate;
    });
  }

  // Normalise every list to a real array so that NodeList / HTMLCollection
  // inputs (which lack `.map` and `.find`) behave the same as arrays.
  const namedLists = omap(lists ?? {}, (list) => Array.from(list));

  const seedName = getSeedKey(namedLists);
  if (seedName === undefined) {
    // No lists at all: nothing to group.
    return [];
  }

  const MBEs = getMBEs(namedLists[seedName]);

  return MBEs.map((mbe) => omap(
    namedLists,
    (listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
  ));
}