diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index e09ac5d5..70d425a1 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -545,6 +545,9 @@ export default class Interpreter extends EventEmitter { let previousHeight = 0; // track unique items per page to avoid re-scraping let scrapedItems: Set = new Set(); + let visitedUrls: string[] = []; + + let availableSelectors = config.pagination.selector.split(','); while (true) { switch (config.pagination.type) { @@ -575,8 +578,9 @@ export default class Interpreter extends EventEmitter { previousHeight = currentTopHeight; break; case 'clickNext': + console.log("Page URL:", page.url()); const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - + // console.log("Page results:", pageResults); // Filter out already scraped items @@ -588,37 +592,149 @@ export default class Interpreter extends EventEmitter { }); allResults = allResults.concat(newResults); + console.log("Results so far:", allResults.length); if (config.limit && allResults.length >= config.limit) { return allResults.slice(0, config.limit); } - const nextButton = await page.$(config.pagination.selector); + let checkButton = null; + let workingSelector = null; + + for (let i = 0; i < availableSelectors.length; i++) { + const selector = availableSelectors[i]; + try { + // Wait for selector with a short timeout + checkButton = await page.waitForSelector(selector, { state: 'attached' }); + if (checkButton) { + workingSelector = selector; + break; + } + } catch (error) { + console.log(`Selector failed: ${selector}`); + } + } + + if (!workingSelector) { + return allResults; + } + + // const nextButton = await page.$(config.pagination.selector); + const nextButton = await page.$(workingSelector); if (!nextButton) { return allResults; // No more pages to scrape } - await Promise.all([ - nextButton.dispatchEvent('click'), - page.waitForNavigation({ waitUntil: 'networkidle' }) - ]); + const selectorIndex = availableSelectors.indexOf(workingSelector!); + availableSelectors = availableSelectors.slice(selectorIndex); + + // await Promise.all([ + // nextButton.dispatchEvent('click'), + // page.waitForNavigation({ waitUntil: 'networkidle' }) + // ]); + + const previousUrl = page.url(); + visitedUrls.push(previousUrl); + + try { + // Try both click methods simultaneously + await Promise.race([ + Promise.all([ + page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }), + nextButton.click() + ]), + Promise.all([ + page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }), + nextButton.dispatchEvent('click') + ]) + ]); + } catch (error) { + // Verify if navigation actually succeeded + const currentUrl = page.url(); + if (currentUrl === previousUrl) { + console.log("Previous URL same as current URL. Navigation failed."); + } + } + + const currentUrl = page.url(); + if (visitedUrls.includes(currentUrl)) { + console.log(`Detected navigation to a previously visited URL: ${currentUrl}`); + + // Extract the current page number from the URL + const match = currentUrl.match(/\d+/); + if (match) { + const currentNumber = match[0]; + // Use visitedUrls.length + 1 as the next page number + const nextNumber = visitedUrls.length + 1; + + // Create new URL by replacing the current number with the next number + const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString()); + + console.log(`Navigating to constructed URL: ${nextUrl}`); + + // Navigate to the next page + await Promise.all([ + page.waitForNavigation({ waitUntil: 'networkidle' }), + page.goto(nextUrl) + ]); + } + } + + // Give the page a moment to stabilize after navigation await page.waitForTimeout(1000); break; case 'clickLoadMore': while (true) { - const loadMoreButton = await page.$(config.pagination.selector); + let checkButton = null; + let workingSelector = null; + + for (let i = 0; i < availableSelectors.length; i++) { + const selector = availableSelectors[i]; + try { + // Wait for selector with a short timeout + checkButton = await page.waitForSelector(selector, { state: 'attached' }); + if (checkButton) { + workingSelector = selector; + break; + } + } catch (error) { + console.log(`Selector failed: ${selector}`); + } + } + + if (!workingSelector) { + // No more working selectors available, so scrape the remaining items + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + + const loadMoreButton = await page.$(workingSelector); if (!loadMoreButton) { // No more "Load More" button, so scrape the remaining items const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); allResults = allResults.concat(finalResults); return allResults; } + + const selectorIndex = availableSelectors.indexOf(workingSelector!); + availableSelectors = availableSelectors.slice(selectorIndex); + // Click the 'Load More' button to load additional items - await loadMoreButton.dispatchEvent('click'); + // await loadMoreButton.dispatchEvent('click'); + try { + await Promise.race([ + loadMoreButton.click(), + loadMoreButton.dispatchEvent('click') + ]); + } catch (error) { + console.log('Both click attempts failed'); + } await page.waitForTimeout(2000); // Wait for new items to load // After clicking 'Load More', scroll down to load more items await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.waitForTimeout(2000); + // Check if more items are available const currentHeight = await page.evaluate(() => document.body.scrollHeight); if (currentHeight === previousHeight) { @@ -628,6 +744,7 @@ export default class Interpreter extends EventEmitter { return allResults; } previousHeight = currentHeight; + if (config.limit && allResults.length >= config.limit) { // If limit is set and reached, return the limited results allResults = allResults.slice(0, config.limit); diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index d1bccbe4..7368f0cb 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -64,6 +64,8 @@ export class WorkflowGenerator { private listSelector: string = ''; + private paginationMode: boolean = false; + /** * The public constructor of the WorkflowGenerator. * Takes socket for communication as a parameter and registers some important events on it. @@ -120,6 +122,9 @@ export class WorkflowGenerator { this.socket.on('listSelector', (data: { selector: string }) => { this.listSelector = data.selector; }) + this.socket.on('setPaginationMode', (data: { pagination: boolean }) => { + this.paginationMode = data.pagination; + }) } /** @@ -702,6 +707,25 @@ export class WorkflowGenerator { const selectorBasedOnCustomAction = (this.getList === true) ? await getNonUniqueSelectors(page, coordinates, this.listSelector) : await getSelectors(page, coordinates); + + if (this.paginationMode && selectorBasedOnCustomAction) { + // Chain selectors in specific priority order + const selectors = selectorBasedOnCustomAction; + const selectorChain = [ + selectors?.iframeSelector?.full, + selectors?.shadowSelector?.full, + selectors?.testIdSelector, + selectors?.id, + selectors?.hrefSelector, + selectors?.accessibilitySelector, + selectors?.attrSelector, + selectors?.generalSelector + ] + .filter(selector => selector !== null && selector !== undefined) + .join(','); + + return selectorChain; + } const bestSelector = getBestSelectorForAction( { diff --git a/src/components/browser/BrowserWindow.tsx b/src/components/browser/BrowserWindow.tsx index cde65f96..5eb72aa4 100644 --- a/src/components/browser/BrowserWindow.tsx +++ b/src/components/browser/BrowserWindow.tsx @@ -260,6 +260,7 @@ export const BrowserWindow = () => { setPaginationSelector(highlighterData.selector); notify(`info`, t('browser_window.attribute_modal.notifications.pagination_select_success')); addListStep(listSelector!, fields, currentListId || 0, { type: paginationType, selector: highlighterData.selector }); + socket?.emit('setPaginationMode', { pagination: false }); } return; } diff --git a/src/context/browserActions.tsx b/src/context/browserActions.tsx index c8e6b238..bc377bbb 100644 --- a/src/context/browserActions.tsx +++ b/src/context/browserActions.tsx @@ -65,6 +65,7 @@ export const ActionProvider = ({ children }: { children: ReactNode }) => { setPaginationMode(true); setCaptureStage('pagination'); socket?.emit('setGetList', { getList: false }); + socket?.emit('setPaginationMode', { pagination: true }); }; const stopPaginationMode = () => setPaginationMode(false);