From a715c56afee941fc78e559a6fecd328f2937b1db Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Mon, 19 Aug 2024 18:18:30 +0530 Subject: [PATCH] feat: initialize new set for unique items per page --- maxun-core/src/interpret.ts | 58 +++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 326e394e..03acaefe 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -372,9 +372,9 @@ export default class Interpreter extends EventEmitter { private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) { let allResults: Record[] = []; let previousHeight = 0; - // track unique items to avoid re-scraping - let scrapedItems: Set = new Set(); - let currentPage = 1 + let currentPage = 1; + // track unique items per page to avoid re-scraping + let scrapedItemsPerPage: Set[] = []; while (true) { switch (config.pagination.type) { @@ -395,29 +395,37 @@ export default class Interpreter extends EventEmitter { break; case 'clickNext': const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - // filter out items that have already been scraped - const newResults = pageResults.filter(item => { - const uniqueKey = JSON.stringify(item); - if (scrapedItems.has(uniqueKey)) return false; - scrapedItems.add(uniqueKey); - return true; - }); - allResults = allResults.concat(newResults); - // if the limit is reached, return the required number of items - if (config.limit && allResults.length >= config.limit) { - return allResults.slice(0, config.limit); - } - const nextButton = await page.$(config.pagination.selector); - if (!nextButton) { - return allResults; - } - await Promise.all([ - nextButton.click(), - page.waitForNavigation({ waitUntil: 'networkidle' }) - ]); + + // Initialize a new Set for the current page if it doesn't exist + if (!scrapedItemsPerPage[currentPage - 1]) { + scrapedItemsPerPage[currentPage - 1] = new Set(); + } - currentPage += 1; - break; + const newResults = pageResults.filter(item => { + const uniqueKey = JSON.stringify(item); + if (scrapedItemsPerPage[currentPage - 1].has(uniqueKey)) return false; + scrapedItemsPerPage[currentPage - 1].add(uniqueKey); + return true; + }); + + allResults = allResults.concat(newResults); + + if (config.limit && allResults.length >= config.limit) { + return allResults.slice(0, config.limit); + } + + const nextButton = await page.$(config.pagination.selector); + if (!nextButton) { + return allResults; + } + + await Promise.all([ + nextButton.click(), + page.waitForNavigation({ waitUntil: 'networkidle' }) + ]); + + currentPage++; + break; case 'clickLoadMore': const loadMoreButton = await page.$(config.pagination.selector); if (!loadMoreButton) {