From b172636eb1b9c7c785e88b63205e545ce3ff7d29 Mon Sep 17 00:00:00 2001 From: Rohit Date: Mon, 17 Mar 2025 16:03:14 +0530 Subject: [PATCH 1/2] feat: add pagination support for SPA and Ajax pagination --- maxun-core/src/interpret.ts | 116 +++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 2887b634..94ec4f1d 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -704,13 +704,13 @@ export default class Interpreter extends EventEmitter { await scrapeCurrentPage(); if (checkLimit()) return allResults; - + const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors); availableSelectors = updatedSelectors; - + if (!button || !workingSelector) { - // Final retry for navigation when no selectors work + // Final retry for navigation when no selectors work const success = await retryOperation(async () => { try { await page.evaluate(() => window.history.forward()); @@ -724,70 +724,102 @@ export default class Interpreter extends EventEmitter { if (!success) return allResults; break; } - + let retryCount = 0; - let navigationSuccess = false; - - while (retryCount < MAX_RETRIES && !navigationSuccess) { + let paginationSuccess = false; + + // Capture basic content signature before click + const captureContentSignature = async () => { + return await page.evaluate((selector) => { + const items = document.querySelectorAll(selector); + return { + url: window.location.href, + itemCount: items.length, + firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|') + }; + }, config.listSelector); + }; + + const beforeSignature = await captureContentSignature(); + debugLog(`Before click: ${beforeSignature.itemCount} items`); + + while (retryCount < MAX_RETRIES && !paginationSuccess) { try { try { await Promise.all([ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 + }).catch(e => { + throw e; }), button.click() ]); - navigationSuccess = true; - } catch (error) { - debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`); - - // If regular click fails, try dispatchEvent - if (page.url() === currentUrl) { + debugLog("Navigation successful after regular click"); + paginationSuccess = true; + } catch (navError) { + debugLog("Regular click with navigation failed, trying dispatch event with navigation"); + try { + await Promise.all([ + page.waitForNavigation({ + waitUntil: 'networkidle', + timeout: 15000 + }).catch(e => { + throw e; + }), + button.dispatchEvent('click') + ]); + debugLog("Navigation successful after dispatch event"); + paginationSuccess = true; + } catch (dispatchNavError) { try { - await Promise.all([ - page.waitForNavigation({ - waitUntil: 'networkidle', - timeout: 15000 - }), - button.dispatchEvent('click') - ]); - navigationSuccess = true; - } catch (dispatchError) { - debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`); + await button.click(); + await page.waitForTimeout(2000); + } catch (clickError) { + await button.dispatchEvent('click'); + await page.waitForTimeout(2000); } - } else { - navigationSuccess = true; } } - - const newUrl = page.url(); - if (visitedUrls.has(newUrl)) { - debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`); - navigationSuccess = false; - } - - if (navigationSuccess) { - await page.waitForTimeout(1000); + + await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => {}); + + if (!paginationSuccess) { + const newUrl = page.url(); + const afterSignature = await captureContentSignature(); + + if (newUrl !== currentUrl) { + debugLog(`URL changed to ${newUrl}`); + visitedUrls.add(newUrl); + paginationSuccess = true; + } + else if (afterSignature.firstItems !== beforeSignature.firstItems) { + debugLog("Content changed without URL change"); + paginationSuccess = true; + } + else if (afterSignature.itemCount !== beforeSignature.itemCount) { + debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`); + paginationSuccess = true; + } } } catch (error) { - debugLog(`Navigation attempt ${retryCount + 1} failed completely.`); - navigationSuccess = false; + debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`); } - - if (!navigationSuccess) { + + if (!paginationSuccess) { retryCount++; if (retryCount < MAX_RETRIES) { - debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`); + debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`); await page.waitForTimeout(RETRY_DELAY); } } } - - if (!navigationSuccess) { - debugLog(`Navigation failed after ${MAX_RETRIES} attempts`); + + if (!paginationSuccess) { + debugLog(`Pagination failed after ${MAX_RETRIES} attempts`); return allResults; } + break; } From 0f0b53472e7bbfabba3ecceaba16f71fab7745d7 Mon Sep 17 00:00:00 2001 From: Rohit Date: Tue, 18 Mar 2025 12:46:25 +0530 Subject: [PATCH 2/2] feat: improve url extraction for images --- maxun-core/src/browserSide/scraper.js | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index c04a9993..6e4bf029 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -524,7 +524,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Enhanced value extraction with context awareness function extractValue(element, attribute) { if (!element) return null; - + // Get context-aware base URL const baseURL = element.ownerDocument?.location?.href || window.location.origin; @@ -535,14 +535,34 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return shadowContent.trim(); } } - + if (attribute === 'innerText') { return element.innerText.trim(); } else if (attribute === 'innerHTML') { return element.innerHTML.trim(); } else if (attribute === 'src' || attribute === 'href') { const attrValue = element.getAttribute(attribute); - return attrValue ? new URL(attrValue, baseURL).href : null; + + const dataAttr = attrValue || element.getAttribute('data-' + attribute); + + if (!dataAttr || dataAttr.trim() === '') { + if (attribute === 'src') { + const style = window.getComputedStyle(element); + const bgImage = style.backgroundImage; + if (bgImage && bgImage !== 'none') { + const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); + return matches ? new URL(matches[1], baseURL).href : null; + } + } + return null; + } + + try { + return new URL(dataAttr, baseURL).href; + } catch (e) { + console.warn('Error creating URL from', dataAttr, e); + return dataAttr; // Return the original value if URL construction fails + } } return element.getAttribute(attribute); }