From 871d4fd48a8eb1f2cf500e515724cfc31cd478ba Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Mon, 12 Aug 2024 05:26:15 +0530 Subject: [PATCH] feat: scrapeListAuto --- maxun-core/src/browserSide/scraper.js | 71 +++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 0ee2b4a1..6d42b43a 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -317,15 +317,18 @@ async function scrollDownToLoadMore(selector, limit) { for (const [fieldName, fieldConfig] of Object.entries(fields)) { let element; - if (flexible) { - // try multiple strategies to find the element - element = item.querySelector(fieldConfig.selector) || - item.querySelector(`[class*="${fieldConfig.selector}"]`) || - Array.from(item.querySelectorAll('*')) - .find(el => el.textContent.trim() === fieldConfig.selector); - } else { - element = item.querySelector(fieldConfig.selector); - } + // if (flexible) { + // // try multiple strategies to find the element + // element = item.querySelector(fieldConfig.selector) || + // item.querySelector(`[class*="${fieldConfig.selector}"]`) || + // Array.from(item.querySelectorAll('*')) + // .find(el => el.textContent.trim() === fieldConfig.selector); + // } else { + element = item.querySelector(fieldConfig.selector); + + console.debug('Element:', element); + + // } if (element) { switch (fieldConfig.attribute) { @@ -344,7 +347,8 @@ async function scrollDownToLoadMore(selector, limit) { break; } } else { - scrapedItem[fieldName] = null; + // send a message that says it failed + scrapedItem[fieldName] = `Failed to scrape ${fieldName}`; } } @@ -353,4 +357,51 @@ async function scrollDownToLoadMore(selector, limit) { }); }; + + /** + * Gets all children of the elements matching the listSelector, + * returning their CSS selectors and innerText. 
/**
 * Collects every direct child of each element matching `listSelector` and
 * reports, for each child, a CSS selector path together with its text.
 *
 * The path is built by walking from the child up through its ancestors. The
 * walk stops at the first element that has an `id` (that element becomes the
 * root of the path) or when there are no more parent elements. Each segment is
 * `tag`, `tag#id`, or `tag.class1.class2`, and segments are joined with ` > `.
 * The path is descriptive only: siblings that share a tag and classes yield
 * the same selector.
 *
 * @param {string} listSelector - Selector for the list container(s).
 * @returns {Array<{selector: string, innerText: string}>} One entry per child,
 *   ordered by matched list and then by child position.
 * @throws {DOMException} If `listSelector` is not a valid selector (raised by
 *   `document.querySelectorAll`).
 */
window.scrapeListAuto = function (listSelector) {
  // Escape ids/classes so values such as "md:flex" or "1st" still form a valid
  // selector. Falls back to the raw value where CSS.escape is unavailable.
  const escapeIdent = (ident) => {
    const canEscape = typeof CSS !== 'undefined' && typeof CSS.escape === 'function';
    return canEscape ? CSS.escape(ident) : ident;
  };

  // `className` is a string on HTML elements but an SVGAnimatedString on SVG
  // elements, so read the attribute instead when it is not a plain string.
  const classTokens = (element) => {
    let raw = '';
    if (typeof element.className === 'string') {
      raw = element.className;
    } else if (typeof element.getAttribute === 'function') {
      raw = element.getAttribute('class') || '';
    }
    return raw.split(/\s+/).filter(Boolean);
  };

  // Build the " > "-joined path from the nearest id-bearing ancestor (or the
  // topmost ancestor) down to `start`.
  const buildSelector = (start) => {
    const segments = [];
    let element = start;
    while (element && element !== document) {
      const tag = element.nodeName.toLowerCase();
      if (element.id) {
        // An id anchors the path; nothing above it is recorded.
        segments.push(`${tag}#${escapeIdent(element.id)}`);
        break;
      }
      const classes = classTokens(element)
        .map((token) => `.${escapeIdent(token)}`)
        .join('');
      segments.push(`${tag}${classes}`);
      element = element.parentElement;
    }
    return segments.reverse().join(' > ');
  };

  // Not every element exposes `innerText` (e.g. SVG); fall back to textContent.
  const readText = (element) => {
    const text = element.innerText != null ? element.innerText : element.textContent;
    return text == null ? '' : String(text).trim();
  };

  const results = [];
  for (const list of Array.from(document.querySelectorAll(listSelector))) {
    for (const child of Array.from(list.children)) {
      results.push({
        selector: buildSelector(child),
        innerText: readText(child),
      });
    }
  }
  return results;
};