From b6bcc1d516ecbf27fbf4e16b749180e8635179b0 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Mon, 19 Aug 2024 01:38:41 +0530 Subject: [PATCH] feat: handle clickNext pagination by tracking already scraped items --- maxun-core/src/interpret.ts | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 2c84f64f..09076a4d 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -371,7 +371,8 @@ export default class Interpreter extends EventEmitter { private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) { let allResults: Record[] = []; - let previousHeight = 0 + let previousHeight = 0; + let scrapedItems: Set = new Set(); // Track unique items to avoid re-scraping while (true) { switch (config.pagination.type) { @@ -391,16 +392,35 @@ export default class Interpreter extends EventEmitter { case 'scrollUp': break; case 'clickNext': - const nextButton = await page.$(config.pagination.selector); - if (!nextButton) { - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); - return allResults; + const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + + // Filter out items that have already been scraped + const newResults = pageResults.filter(item => { + const uniqueKey = JSON.stringify(item); + if (scrapedItems.has(uniqueKey)) return false; + scrapedItems.add(uniqueKey); + return true; + }); + + allResults = allResults.concat(newResults); + + // If the limit is reached, return the required number of items + if (config.limit && allResults.length >= config.limit) { + return allResults.slice(0, config.limit); } + + // Check if there's a next page button + const nextButton = await page.$(config.pagination.selector); + if (!nextButton) { + return allResults; // No more pages to navigate + } + + // Click the next button and wait for the navigation to complete await Promise.all([ - nextButton.click(), - page.waitForNavigation({ waitUntil: 'networkidle' }) + nextButton.click(), + page.waitForNavigation({ waitUntil: 'networkidle' }) ]); + break; case 'clickLoadMore': const loadMoreButton = await page.$(config.pagination.selector);