feat: handle pagination types

This commit is contained in:
karishmas6
2024-08-14 05:58:57 +05:30
parent 375590fc0d
commit 2407c79dd2

View File

@@ -278,40 +278,74 @@ async function scrollDownToLoadMore(selector, limit) {
* @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors * @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/ */
// Collects up to `limit` records from the elements matching `listSelector`.
// Async: resolves to a flat array of record objects keyed by field label.
// When `pagination` is provided the page is re-queried after every pagination
// step. Parents that were already scraped are skipped, and the loop stops as
// soon as a pass finds nothing new, so an unchanged page can neither produce
// duplicate records nor keep the function waiting forever.
window.scrapeList = async function ({ listSelector, fields, limit = 10, pagination = null }) {
  // Delay after a pagination step so newly requested content can render.
  const PAGINATION_WAIT_MS = 2000;

  // Reads one value from a field element according to the requested attribute.
  const readField = (element, attribute) => {
    switch (attribute) {
      case 'innerText':
        return element.innerText.trim();
      case 'innerHTML':
        return element.innerHTML.trim();
      case 'src':
        return element.src;
      case 'href':
        return element.href;
      default:
        // Any other name is looked up as a plain DOM attribute.
        return element.getAttribute(attribute);
    }
  };

  // Builds the record for a single list item from the configured fields.
  const buildRecord = (parent) => {
    const record = {};
    for (const [label, { selector, attribute }] of Object.entries(fields)) {
      const fieldElement = parent.querySelector(selector);
      // A missing field element leaves its label unset on the record.
      if (fieldElement) {
        record[label] = readField(fieldElement, attribute);
      }
    }
    return record;
  };

  const scrapedData = [];
  // Parents already converted to records; re-querying returns them again.
  const scrapedParents = new WeakSet();

  while (scrapedData.length < limit) {
    let newRecords = 0;

    for (const parent of document.querySelectorAll(listSelector)) {
      if (scrapedData.length >= limit) break;
      if (scrapedParents.has(parent)) continue;
      scrapedParents.add(parent);
      scrapedData.push(buildRecord(parent));
      newRecords += 1;
    }

    // Done when the limit is reached, pagination is off, or the last pass
    // produced nothing new (the page has no further items to offer).
    if (scrapedData.length >= limit || !pagination || newRecords === 0) {
      break;
    }

    let canLoadMore;
    switch (pagination.type) {
      case 'scrollDown':
      case 'scrollUp':
      case 'clickNext':
      case 'clickLoadMore':
        // TODO: trigger the matching page action here (the scroll/click
        // helpers are not wired in yet); until then this only waits.
        canLoadMore = true;
        break;
      case 'none':
        // Explicitly no further pages.
        canLoadMore = false;
        break;
      default:
        console.warn("Unknown pagination type");
        canLoadMore = false;
        break;
    }
    if (!canLoadMore) {
      break;
    }

    // Give the page time to load more items before querying again.
    await new Promise((resolve) => setTimeout(resolve, PAGINATION_WAIT_MS));
  }

  return scrapedData;
};
/** /**
* Gets all children of the elements matching the listSelector, * Gets all children of the elements matching the listSelector,