From e943a8c253cf6742f082ee2c148218897bafb0ad Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Sun, 4 Aug 2024 03:53:59 +0530 Subject: [PATCH] feat(core): extraction based on attribute --- maxun-core/src/browserSide/scraper.js | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 83d196fe..009cf9da 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -229,12 +229,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return MBEs.map((mbe) => omap( lists, - ({ selector }, key) => { - const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); - return elem ? elem.innerText : undefined; + ({ selector, attribute }, key) => { + const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); + if (!elem) return undefined; + + switch (attribute) { + case 'href': + return elem.getAttribute('href'); + case 'src': + return elem.getAttribute('src'); + case 'innerText': + return elem.innerText; + case 'textContent': + return elem.textContent; + default: + return elem.innerText; + } }, (key) => key // Use the original key in the output - )); + )); } })(window); \ No newline at end of file