feat: add retry mechanism, modularization

This commit is contained in:
Rohit
2025-01-27 15:00:19 +05:30
parent dde130106c
commit 54f7deb3ed

View File

@@ -1,5 +1,5 @@
/* eslint-disable no-await-in-loop, no-restricted-syntax */ /* eslint-disable no-await-in-loop, no-restricted-syntax */
import { Page, PageScreenshotOptions } from 'playwright'; import { ElementHandle, Page, PageScreenshotOptions } from 'playwright';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright'; import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch'; import fetch from 'cross-fetch';
import path from 'path'; import path from 'path';
@@ -548,232 +548,274 @@ export default class Interpreter extends EventEmitter {
} }
} }
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) { private async handlePagination(page: Page, config: {
listSelector: string,
fields: any,
limit?: number,
pagination: any
}) {
let allResults: Record<string, any>[] = []; let allResults: Record<string, any>[] = [];
let previousHeight = 0; let previousHeight = 0;
// track unique items per page to avoid re-scraping
let scrapedItems: Set<string> = new Set<string>(); let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: string[] = []; let visitedUrls: Set<string> = new Set<string>();
const MAX_RETRIES = 3;
const RETRY_DELAY = 1000; // 1 second delay between retries
// Debug logging helper
const debugLog = (message: string, ...args: any[]) => { const debugLog = (message: string, ...args: any[]) => {
console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args); console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
};
const scrapeCurrentPage = async () => {
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
const newResults = results.filter(item => {
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false;
scrapedItems.add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults);
debugLog("Results collected:", allResults.length);
};
const checkLimit = () => {
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
return true;
}
return false;
};
// Enhanced button finder with retry mechanism
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
button: ElementHandle | null,
workingSelector: string | null
}> => {
for (const selector of selectors) {
try {
const button = await page.waitForSelector(selector, {
state: 'attached',
timeout: 10000 // Reduced timeout for faster checks
});
if (button) {
debugLog('Found working selector:', selector);
return { button, workingSelector: selector };
}
} catch (error) {
debugLog(`Selector failed: ${selector}`);
}
}
// Implement retry mechanism when no selectors work
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
return findWorkingButton(selectors, retryCount + 1);
}
return { button: null, workingSelector: null };
};
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
try {
return await operation();
} catch (error) {
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
return retryOperation(operation, retryCount + 1);
}
debugLog(`Operation failed after ${MAX_RETRIES} retries`);
return false;
}
}; };
let availableSelectors = config.pagination.selector.split(','); let availableSelectors = config.pagination.selector.split(',');
while (true) { try {
while (true) {
// Reduced timeout for faster performance
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
switch (config.pagination.type) { switch (config.pagination.type) {
case 'scrollDown': case 'scrollDown':
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight); const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) { if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
previousHeight = currentHeight; previousHeight = currentHeight;
break; break;
case 'scrollUp': case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0)); await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop); const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
if (currentTopHeight === 0) { if (currentTopHeight === 0) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
previousHeight = currentTopHeight; previousHeight = currentTopHeight;
break; break;
case 'clickNext': case 'clickNext': {
debugLog("Current URL:", page.url()); const currentUrl = page.url();
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); visitedUrls.add(currentUrl);
// Filter out already scraped items await scrapeCurrentPage();
const newResults = pageResults.filter(item => { if (checkLimit()) return allResults;
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false;
scrapedItems.add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults);
debugLog("Results collected so far:", allResults.length);
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}
await page.waitForLoadState('networkidle', { timeout: 30000 }); const { button, workingSelector } = await findWorkingButton(availableSelectors);
await page.waitForTimeout(2000); if (!button || !workingSelector) {
// Final retry for navigation when no selectors work
let checkButton = null; const success = await retryOperation(async () => {
let workingSelector = null;
// Try each selector with explicit waiting
for (const selector of availableSelectors) {
try { try {
checkButton = await page.waitForSelector(selector, { await page.evaluate(() => window.history.forward());
state: 'attached', const newUrl = page.url();
timeout: 30000 return !visitedUrls.has(newUrl);
}); } catch {
if (checkButton) { return false;
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Selector failed: ${selector} - ${error.message}`);
} }
} });
if (!success) return allResults;
break;
}
if (!workingSelector) { availableSelectors = availableSelectors.slice(
debugLog('No working selector found after trying all options'); availableSelectors.indexOf(workingSelector)
return allResults; );
}
const nextButton = await page.$(workingSelector); let retryCount = 0;
if (!nextButton) { let navigationSuccess = false;
debugLog('Next button not found');
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex);
while (retryCount < MAX_RETRIES && !navigationSuccess) {
try { try {
// Store current URL to check if navigation succeeded
const previousUrl = page.url();
visitedUrls.push(previousUrl);
// Try both click methods in sequence
try { try {
await Promise.all([ await Promise.all([
page.waitForNavigation({ page.waitForNavigation({
waitUntil: 'networkidle', waitUntil: 'networkidle',
timeout: 15000 timeout: 15000
}), }),
nextButton.click() button.click()
]); ]);
} catch (error) { navigationSuccess = true;
// If we're still on the same URL, try dispatch event
if (page.url() === previousUrl) {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.dispatchEvent('click')
]);
}
}
await page.waitForLoadState('domcontentloaded');
await page.waitForLoadState('networkidle', { timeout: 30000 });
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
return allResults;
}
// Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000);
} catch (error) {
debugLog(`Navigation failed completely: ${error.message}`);
return allResults;
}
break;
case 'clickLoadMore':
while (true) {
let checkButton = null;
let workingSelector = null;
for (const selector of availableSelectors) {
try {
checkButton = await page.waitForSelector(selector, {
state: 'attached',
timeout: 30000
});
if (checkButton) {
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Load More selector failed: ${selector}`);
}
}
if (!workingSelector) {
debugLog('No working Load More selector found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) {
debugLog('Load More button not found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex);
try {
try {
await loadMoreButton.click();
} catch (error) {
await loadMoreButton.dispatchEvent('click');
}
} catch (error) { } catch (error) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
allResults = allResults.concat(finalResults);
return allResults; // If regular click fails, try dispatchEvent
if (page.url() === currentUrl) {
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
button.dispatchEvent('click')
]);
navigationSuccess = true;
} catch (dispatchError) {
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
}
}
} }
await page.waitForTimeout(2000); const newUrl = page.url();
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); if (visitedUrls.has(newUrl)) {
await page.waitForTimeout(2000); debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
navigationSuccess = false;
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
} }
previousHeight = currentHeight;
if (navigationSuccess) {
if (config.limit && allResults.length >= config.limit) { await page.waitForTimeout(1000);
allResults = allResults.slice(0, config.limit); }
break; } catch (error) {
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
navigationSuccess = false;
}
if (!navigationSuccess) {
retryCount++;
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
} }
} }
break; }
default: if (!navigationSuccess) {
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config); debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
allResults = allResults.concat(results);
return allResults; return allResults;
}
break;
}
case 'clickLoadMore': {
while (true) {
const { button, workingSelector } = await findWorkingButton(availableSelectors);
if (!button || !workingSelector) {
// Final retry for load more when no selectors work
const success = await retryOperation(async () => {
await scrapeCurrentPage();
return allResults.length > 0;
});
if (!success) return allResults;
break;
}
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
const loadMoreSuccess = await retryOperation(async () => {
try {
await button.click().catch(() => button.dispatchEvent('click'));
await page.waitForTimeout(1000);
await page.evaluate(() =>
window.scrollTo(0, document.body.scrollHeight)
);
await page.waitForTimeout(1000);
const currentHeight = await page.evaluate(() =>
document.body.scrollHeight
);
if (currentHeight === previousHeight) {
await scrapeCurrentPage();
return false;
}
previousHeight = currentHeight;
return true;
} catch (error) {
await scrapeCurrentPage();
return false;
}
});
if (!loadMoreSuccess || checkLimit()) return allResults;
}
}
default:
await scrapeCurrentPage();
return allResults;
} }
if (config.limit && allResults.length >= config.limit) { if (checkLimit()) break;
allResults = allResults.slice(0, config.limit); }
break; } catch (error) {
} debugLog(`Fatal error: ${error.message}`);
return allResults;
} }
return allResults; return allResults;