feat: scrape multiple pages

This commit is contained in:
karishmas6
2024-08-19 19:23:31 +05:30
parent a715c56afe
commit e4b7ca5235

View File

@@ -374,7 +374,7 @@ export default class Interpreter extends EventEmitter {
let previousHeight = 0; let previousHeight = 0;
let currentPage = 1; let currentPage = 1;
// track unique items per page to avoid re-scraping // track unique items per page to avoid re-scraping
let scrapedItemsPerPage: Set<string>[] = []; let scrapedItems: Set<string> = new Set<string>(); // Track unique items across all pages
while (true) { while (true) {
switch (config.pagination.type) { switch (config.pagination.type) {
@@ -393,39 +393,41 @@ export default class Interpreter extends EventEmitter {
break; break;
case 'scrollUp': case 'scrollUp':
break; break;
case 'clickNext': case 'clickNext':
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); while (true) {
// Scrape the current page
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// Initialize a new Set for the current page if it doesn't exist // Filter out already scraped items
if (!scrapedItemsPerPage[currentPage - 1]) { const newResults = pageResults.filter(item => {
scrapedItemsPerPage[currentPage - 1] = new Set<string>(); const uniqueKey = JSON.stringify(item);
} if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped
scrapedItems.add(uniqueKey); // Mark as scraped
return true;
});
const newResults = pageResults.filter(item => { allResults = allResults.concat(newResults);
const uniqueKey = JSON.stringify(item);
if (scrapedItemsPerPage[currentPage - 1].has(uniqueKey)) return false;
scrapedItemsPerPage[currentPage - 1].add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults); // Stop if limit is reached
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}
if (config.limit && allResults.length >= config.limit) { // Move to the next page
return allResults.slice(0, config.limit); const nextButton = await page.$(config.pagination.selector);
} if (!nextButton) {
return allResults; // No more pages to scrape
}
const nextButton = await page.$(config.pagination.selector); // Click the "Next" button and wait for the next page to load
if (!nextButton) { await Promise.all([
return allResults; nextButton.click(),
} page.waitForNavigation({ waitUntil: 'networkidle' })
]);
await Promise.all([ currentPage++; // Increment page count and proceed
nextButton.click(), break;
page.waitForNavigation({ waitUntil: 'networkidle' }) }
]);
currentPage++;
break;
case 'clickLoadMore': case 'clickLoadMore':
const loadMoreButton = await page.$(config.pagination.selector); const loadMoreButton = await page.$(config.pagination.selector);
if (!loadMoreButton) { if (!loadMoreButton) {