/**
 * Scrapes up to `limit` records from the elements matching `listSelector`.
 *
 * Every matching parent element yields one record. For each entry in `fields`
 * the first descendant matching the entry's selector is read; fields whose
 * selector matches nothing are left out of the record.
 *
 * When `pagination` is given and fewer than `limit` records were collected,
 * the function waits for more content and queries the document again. Parent
 * elements that were already scraped are skipped, and the loop stops as soon
 * as a pass finds no new element, so records are never duplicated and the
 * function always terminates.
 *
 * @param {Object} config
 * @param {string} config.listSelector - CSS selector for the parent (row) elements.
 * @param {Object.<string, {selector: string, attribute: string}>} config.fields -
 *   Maps a record key to the selector (relative to the parent) and the
 *   attribute to read: 'innerText', 'innerHTML', 'src', 'href', or any name
 *   passed to `getAttribute`.
 * @param {number} [config.limit=10] - Maximum number of records to return.
 * @param {?{type: string, selector: (string|undefined)}} [config.pagination=null] -
 *   Pagination settings; `type` is one of 'scrollDown', 'scrollUp',
 *   'clickNext', 'clickLoadMore' or 'none'.
 * @returns {Promise<Array<Object>>} At most `limit` scraped records.
 */
window.scrapeList = async function ({ listSelector, fields, limit = 10, pagination = null }) {
  // Time given to the page to render additional items between two passes.
  const CONTENT_LOAD_DELAY_MS = 2000;

  // Reads one value from a matched field element.
  const readAttribute = (element, attribute) => {
    switch (attribute) {
      case 'innerText':
        return element.innerText.trim();
      case 'innerHTML':
        return element.innerHTML.trim();
      case 'src':
        return element.src;
      case 'href':
        return element.href;
      default:
        // Default to attribute retrieval
        return element.getAttribute(attribute);
    }
  };

  const scrapedData = [];
  // Parents already converted to records. The document is queried again on
  // every pass, so without this the same elements would be scraped repeatedly.
  const scrapedParents = new Set();

  while (scrapedData.length < limit) {
    const newParents = Array.from(document.querySelectorAll(listSelector))
      .filter((parent) => !scrapedParents.has(parent));

    // Nothing new showed up since the last pass: stop instead of spinning.
    if (newParents.length === 0) {
      break;
    }

    for (const parent of newParents) {
      if (scrapedData.length >= limit) {
        break;
      }
      scrapedParents.add(parent);

      const record = {};
      for (const [label, { selector, attribute }] of Object.entries(fields)) {
        const fieldElement = parent.querySelector(selector);
        if (fieldElement) {
          record[label] = readAttribute(fieldElement, attribute);
        }
      }
      scrapedData.push(record);
    }

    if (!pagination || scrapedData.length >= limit) {
      break;
    }

    let moreMayLoad = true;
    switch (pagination.type) {
      case 'scrollDown':
      case 'scrollUp':
      case 'clickNext':
      case 'clickLoadMore':
        // TODO: trigger the matching pagination action here. Until then only
        // content that appears on its own during the delay is picked up.
        break;
      case 'none':
        // No more items to load
        moreMayLoad = false;
        break;
      default:
        console.warn("Unknown pagination type");
        moreMayLoad = false;
        break;
    }

    // A `break` inside the switch only leaves the switch, so the loop exit
    // has to happen here.
    if (!moreMayLoad) {
      break;
    }

    // Wait for content to load
    await new Promise((resolve) => setTimeout(resolve, CONTENT_LOAD_DELAY_MS));
  }

  return scrapedData;
};