From 7761b2af35761c7f00c7c622e1b09bd5d911e5f3 Mon Sep 17 00:00:00 2001 From: Rohit Date: Tue, 8 Apr 2025 15:09:38 +0530 Subject: [PATCH] feat: track results and return scroll pagination --- maxun-core/src/interpret.ts | 44 +++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 53b999fa..e662683c 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -572,6 +572,7 @@ export default class Interpreter extends EventEmitter { let visitedUrls: Set = new Set(); const MAX_RETRIES = 3; const RETRY_DELAY = 1000; // 1 second delay between retries + const MAX_UNCHANGED_RESULTS = 5; const debugLog = (message: string, ...args: any[]) => { console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args); @@ -661,18 +662,36 @@ export default class Interpreter extends EventEmitter { }; let availableSelectors = config.pagination.selector.split(','); + let unchangedResultCounter = 0; try { while (true) { switch (config.pagination.type) { case 'scrollDown': { + let previousResultCount = allResults.length; + + await scrapeCurrentPage(); + + if (checkLimit()) { + return allResults; + } + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.waitForTimeout(2000); const currentHeight = await page.evaluate(() => document.body.scrollHeight); + const currentResultCount = allResults.length; + + if (currentResultCount === previousResultCount) { + unchangedResultCounter++; + if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) { + return allResults; + } + } else { + unchangedResultCounter = 0; + } + if (currentHeight === previousHeight) { - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); return allResults; } @@ -681,13 +700,30 @@ export default class Interpreter extends EventEmitter { } case 'scrollUp': { + let previousResultCount = allResults.length; + + await scrapeCurrentPage(); + + if (checkLimit()) { + return allResults; + } + await page.evaluate(() => window.scrollTo(0, 0)); await page.waitForTimeout(2000); const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop); + const currentResultCount = allResults.length; + + if (currentResultCount === previousResultCount) { + unchangedResultCounter++; + if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) { + return allResults; + } + } else { + unchangedResultCounter = 0; + } + if (currentTopHeight === 0) { - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); return allResults; }