// Provenance: introduced by commit 2ad663e8693eca49d5479b436455b0e19534bf35
// ("feat: scrape result from current page") as an addition to
// mx-interpreter/browserSide/scraper.js, placed after scrapableHeuristics().

/**
 * Returns a "scrape" result from the current page.
 *
 * Every matched element is flattened into one record object holding:
 *  - `img_<i>`: URL of the i-th descendant `<img>` (`i` is the position among
 *    all descendant images, so skipped images leave gaps in the numbering);
 *  - `record_<NNNN>`: the trimmed NNNN-th line of the element's text
 *    (zero-padded to four digits; blank lines are kept as empty strings).
 *
 * @param {?string} [selector=null] CSS selector of the elements to scrape.
 *   When falsy, the elements are taken from `scrapableHeuristics()` instead.
 * @returns {Array<Object<string, string>>} One record per element. The records
 *   are returned as collected - no sparse rows are removed here.
 */
function scrape(selector = null) {
  /**
   * Picks the URL to report for a single `<img>`.
   *
   * With a `srcset`, the URL of the LAST candidate is used (by convention the
   * largest one); otherwise the `src` URL is used unless it is a data URL.
   *
   * @param {HTMLImageElement} img
   * @returns {string|undefined} The URL, or a falsy value when nothing usable exists.
   */
  const pickImageUrl = (img) => {
    if (img.srcset) {
      // Candidates are separated by a comma followed by whitespace; any kind and
      // amount of whitespace (spaces, tabs, newlines) is legal in `srcset`.
      const candidates = img.srcset
        .split(/,\s+/)
        .map((candidate) => candidate.trim())
        .filter(Boolean);
      if (candidates.length === 0) {
        return undefined;
      }
      // A candidate is "<url> [<descriptor>]" - keep only the URL part.
      const [lastUrl] = candidates[candidates.length - 1].split(/\s+/);
      return lastUrl;
    }

    const src = img.src || '';
    // Only a URL whose scheme is `data:` is a data URL; a `data:` substring
    // elsewhere (e.g. in a query string) must not disqualify the image.
    return src.startsWith('data:') ? undefined : src;
  };

  const elements = selector
    ? Array.from(document.querySelectorAll(selector))
    : scrapableHeuristics();

  /**
   * **crudeRecords** contains uncurated rundowns of "scrapable" elements
   * @type {Array}
   */
  const crudeRecords = elements.map((element) => {
    // Keys are written in place (images first, then text lines) instead of
    // re-spreading an accumulator on every step, which was quadratic.
    const record = {};

    Array.from(element.querySelectorAll('img')).forEach((img, i) => {
      const imgUrl = pickImageUrl(img);
      if (imgUrl) {
        record[`img_${i}`] = imgUrl;
      }
    });

    // Elements without `innerText` (e.g. SVG elements) fall back to `textContent`.
    const text = typeof element.innerText === 'string'
      ? element.innerText
      : (element.textContent || '');

    text.split('\n').forEach((line, i) => {
      record[`record_${String(i).padStart(4, '0')}`] = line.trim();
    });

    return record;
  });

  return crudeRecords;
}