From 95631541b9d74f7fb97756387acb87b0b2d5790a Mon Sep 17 00:00:00 2001
From: karishmas6
Date: Sat, 17 Aug 2024 23:54:00 +0530
Subject: [PATCH] feat: handle pagination on server side

---
Review note: the hunks below were revised during review, so the commit id
above and the blob ids on the index line predate this revision.

 maxun-core/src/interpret.ts | 94 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 2 deletions(-)

diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts
index 44dddf8d..4edc2131 100644
--- a/maxun-core/src/interpret.ts
+++ b/maxun-core/src/interpret.ts
@@ -291,9 +291,9 @@ export default class Interpreter extends EventEmitter {
         await this.options.serializableCallback(scrapeResult);
       },
 
       scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
         await this.ensureScriptsLoaded(page);
-        const scrapeResults: Record[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
+        const scrapeResults = await this.handlePagination(page, config);
         await this.options.serializableCallback(scrapeResults);
       },
 
@@ -357,6 +357,96 @@ export default class Interpreter extends EventEmitter {
     }
   }
 
+
+  /**
+   * Scrapes a list that spans several pages or lazily loaded batches.
+   *
+   * Runs `window.scrapeList` in the page, then advances the list using the
+   * strategy named by `config.pagination.type` and repeats:
+   * - `clickNext` replaces the list with the next page, so every page's rows
+   *   are appended to the rows collected so far.
+   * - `scrollDown`, `scrollUp` and `clickLoadMore` grow the list in place, so
+   *   the latest scrape supersedes the previous one.
+   *   NOTE(review): assumes `window.scrapeList` re-reads every element that
+   *   matches `listSelector` on each call - confirm in browserSide/scraper.js.
+   *
+   * Stops once `config.limit` rows are collected, the pagination control is
+   * missing, the list stops changing, or the pagination type is absent or
+   * unknown.
+   *
+   * @param page Page to scrape.
+   * @param config List selector, field mapping, optional row cap and
+   *   pagination settings.
+   * @returns The collected rows, cut down to `config.limit` when it is set.
+   */
+  private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }): Promise<Record<string, unknown>[]> {
+    // Without pagination settings only the current page is scraped.
+    const paginationType: unknown = config.pagination?.type;
+    // 'clickNext' swaps the list out; every other mode extends it in place.
+    const replacesList = paginationType === 'clickNext';
+    const listScope = { listSelector: config.listSelector, limit: config.limit };
+    const countListItems = () =>
+      page.evaluate((selector) => document.querySelectorAll(selector).length, config.listSelector);
+
+    let allResults: Record<string, unknown>[] = [];
+    let previousPage: string | null = null;
+
+    while (true) {
+      // Scrape whatever the list currently shows
+      const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
+
+      if (replacesList) {
+        // An empty or repeated page means "next" no longer leads anywhere new.
+        const snapshot = JSON.stringify(pageResults);
+        if (pageResults.length === 0 || snapshot === previousPage) {
+          break;
+        }
+        previousPage = snapshot;
+        allResults = allResults.concat(pageResults);
+      } else {
+        allResults = pageResults;
+      }
+
+      if (config.limit && allResults.length >= config.limit) {
+        allResults = allResults.slice(0, config.limit);
+        break;
+      }
+
+      const itemsBefore = await countListItems();
+
+      switch (paginationType) {
+        case 'scrollDown':
+          // Page functions run in the browser and cannot see Node-side
+          // variables, so the values travel as the evaluate argument.
+          await page.evaluate(({ listSelector, limit }) => window.scrollDown(listSelector, limit), listScope);
+          break;
+        case 'scrollUp':
+          await page.evaluate(({ listSelector, limit }) => window.scrollUp(listSelector, limit), listScope);
+          break;
+        case 'clickNext':
+        case 'clickLoadMore': {
+          const control = await page.$(config.pagination.selector);
+          if (!control) {
+            return allResults; // No more pages or items to load
+          }
+          await control.click();
+          break;
+        }
+        default:
+          return allResults; // No pagination or unknown type
+      }
+
+      await page.waitForTimeout(1000); // Give the new content time to render
+
+      // For in-place lists, an unchanged item count means nothing more loaded.
+      if (!replacesList && (await countListItems()) <= itemsBefore) {
+        return allResults;
+      }
+    }
+
+    return allResults;
+  }
+
   private async runLoop(p: Page, workflow: Workflow) {
     const usedActions: string[] = [];
     let lastAction = null;
@@ -429,7 +519,7 @@ export default class Interpreter extends EventEmitter {
   }
 
   private async ensureScriptsLoaded(page: Page) {
-    const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function');
+    const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function' && typeof window.scrollDown === 'function' && typeof window.scrollUp === 'function');
     if (!isScriptLoaded) {
       await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
     }