feat: add retry mechanism, modularization

This commit is contained in:
Rohit
2025-01-27 15:00:19 +05:30
parent dde130106c
commit 54f7deb3ed

View File

@@ -1,5 +1,5 @@
/* eslint-disable no-await-in-loop, no-restricted-syntax */
import { Page, PageScreenshotOptions } from 'playwright';
import { ElementHandle, Page, PageScreenshotOptions } from 'playwright';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch';
import path from 'path';
@@ -548,232 +548,274 @@ export default class Interpreter extends EventEmitter {
}
}
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
private async handlePagination(page: Page, config: {
listSelector: string,
fields: any,
limit?: number,
pagination: any
}) {
let allResults: Record<string, any>[] = [];
let previousHeight = 0;
// track unique items per page to avoid re-scraping
let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: string[] = [];
let visitedUrls: Set<string> = new Set<string>();
const MAX_RETRIES = 3;
const RETRY_DELAY = 1000; // 1 second delay between retries
// Debug logging helper
const debugLog = (message: string, ...args: any[]) => {
console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
};
/**
 * Scrapes the list on the page as it currently stands and appends the items
 * not seen before to `allResults`.
 *
 * An item counts as "seen" when its JSON serialization is already present in
 * `scrapedItems`; every newly accepted item is recorded there so later calls
 * skip it.
 */
const scrapeCurrentPage = async () => {
  const pageItems = await page.evaluate((cfg) => window.scrapeList(cfg), config);
  const freshItems: Record<string, any>[] = [];
  for (const entry of pageItems) {
    const fingerprint = JSON.stringify(entry);
    if (!scrapedItems.has(fingerprint)) {
      scrapedItems.add(fingerprint);
      freshItems.push(entry);
    }
  }
  allResults = allResults.concat(freshItems);
  debugLog("Results collected:", allResults.length);
};
/**
 * Reports whether the configured result limit has been reached.
 *
 * When it has, `allResults` is truncated to exactly `config.limit` entries
 * before returning `true`. A missing or zero limit means "no limit" and
 * always yields `false`.
 */
const checkLimit = () => {
  const cap = config.limit;
  if (!cap || allResults.length < cap) {
    return false;
  }
  allResults = allResults.slice(0, cap);
  return true;
};
/**
 * Finds the first selector in `selectors` that resolves to an attached element.
 *
 * Each selector is awaited in order with a 10 second timeout. If none of them
 * resolves and the list is non-empty, the whole pass is repeated after
 * `RETRY_DELAY` milliseconds until `MAX_RETRIES` retries have been used.
 *
 * @param selectors  Candidate selectors, tried in the given order.
 * @param retryCount Number of retries already consumed (defaults to 0).
 * @returns The element handle and the selector that matched, or both `null`
 *          when every attempt failed.
 */
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
  button: ElementHandle | null,
  workingSelector: string | null
}> => {
  let attempt = retryCount;
  while (true) {
    for (const candidate of selectors) {
      try {
        const handle = await page.waitForSelector(candidate, {
          state: 'attached',
          timeout: 10000, // short per-selector wait so a dead selector fails fast
        });
        if (handle) {
          debugLog('Found working selector:', candidate);
          return { button: handle, workingSelector: candidate };
        }
      } catch {
        debugLog(`Selector failed: ${candidate}`);
      }
    }

    // Nothing matched on this pass: stop if there is nothing to retry with
    // or the retry budget is spent.
    const retriesExhausted = selectors.length === 0 || attempt >= MAX_RETRIES;
    if (retriesExhausted) {
      return { button: null, workingSelector: null };
    }

    debugLog(`Retry attempt ${attempt + 1} of ${MAX_RETRIES}`);
    await page.waitForTimeout(RETRY_DELAY);
    attempt += 1;
  }
};
/**
 * Runs `operation`, re-running it after `RETRY_DELAY` milliseconds each time
 * it throws, for at most `MAX_RETRIES` retries.
 *
 * Only a thrown error (rejected promise) triggers a retry; a resolved `false`
 * is returned to the caller as-is.
 *
 * @param operation  Async action to run; its boolean result is passed through.
 * @param retryCount Number of retries already consumed (defaults to 0).
 * @returns The operation's result, or `false` once the retries are exhausted.
 */
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
  let attempt = retryCount;
  while (true) {
    try {
      return await operation();
    } catch {
      if (attempt >= MAX_RETRIES) {
        debugLog(`Operation failed after ${MAX_RETRIES} retries`);
        return false;
      }
      debugLog(`Retrying operation. Attempt ${attempt + 1} of ${MAX_RETRIES}`);
      await page.waitForTimeout(RETRY_DELAY);
      attempt += 1;
    }
  }
};
let availableSelectors = config.pagination.selector.split(',');
while (true) {
try {
while (true) {
// Reduced timeout for faster performance
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
switch (config.pagination.type) {
case 'scrollDown':
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);
case 'scrollDown':
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentHeight;
break;
previousHeight = currentHeight;
break;
case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000);
case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000);
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
if (currentTopHeight === 0) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
if (currentTopHeight === 0) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentTopHeight;
break;
previousHeight = currentTopHeight;
break;
case 'clickNext':
debugLog("Current URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// Filter out already scraped items
const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false;
scrapedItems.add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults);
debugLog("Results collected so far:", allResults.length);
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}
case 'clickNext': {
const currentUrl = page.url();
visitedUrls.add(currentUrl);
await scrapeCurrentPage();
if (checkLimit()) return allResults;
await page.waitForLoadState('networkidle', { timeout: 30000 });
await page.waitForTimeout(2000);
let checkButton = null;
let workingSelector = null;
// Try each selector with explicit waiting
for (const selector of availableSelectors) {
const { button, workingSelector } = await findWorkingButton(availableSelectors);
if (!button || !workingSelector) {
// Final retry for navigation when no selectors work
const success = await retryOperation(async () => {
try {
checkButton = await page.waitForSelector(selector, {
state: 'attached',
timeout: 30000
});
if (checkButton) {
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Selector failed: ${selector} - ${error.message}`);
await page.evaluate(() => window.history.forward());
const newUrl = page.url();
return !visitedUrls.has(newUrl);
} catch {
return false;
}
}
});
if (!success) return allResults;
break;
}
if (!workingSelector) {
debugLog('No working selector found after trying all options');
return allResults;
}
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
const nextButton = await page.$(workingSelector);
if (!nextButton) {
debugLog('Next button not found');
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex);
let retryCount = 0;
let navigationSuccess = false;
while (retryCount < MAX_RETRIES && !navigationSuccess) {
try {
// Store current URL to check if navigation succeeded
const previousUrl = page.url();
visitedUrls.push(previousUrl);
// Try both click methods in sequence
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.click()
button.click()
]);
} catch (error) {
// If we're still on the same URL, try dispatch event
if (page.url() === previousUrl) {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.dispatchEvent('click')
]);
}
}
await page.waitForLoadState('domcontentloaded');
await page.waitForLoadState('networkidle', { timeout: 30000 });
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
return allResults;
}
// Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000);
} catch (error) {
debugLog(`Navigation failed completely: ${error.message}`);
return allResults;
}
break;
case 'clickLoadMore':
while (true) {
let checkButton = null;
let workingSelector = null;
for (const selector of availableSelectors) {
try {
checkButton = await page.waitForSelector(selector, {
state: 'attached',
timeout: 30000
});
if (checkButton) {
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Load More selector failed: ${selector}`);
}
}
if (!workingSelector) {
debugLog('No working Load More selector found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) {
debugLog('Load More button not found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex);
try {
try {
await loadMoreButton.click();
} catch (error) {
await loadMoreButton.dispatchEvent('click');
}
navigationSuccess = true;
} catch (error) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
// If regular click fails, try dispatchEvent
if (page.url() === currentUrl) {
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
button.dispatchEvent('click')
]);
navigationSuccess = true;
} catch (dispatchError) {
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
}
}
}
await page.waitForTimeout(2000);
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
const newUrl = page.url();
if (visitedUrls.has(newUrl)) {
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
navigationSuccess = false;
}
previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
break;
if (navigationSuccess) {
await page.waitForTimeout(1000);
}
} catch (error) {
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
navigationSuccess = false;
}
if (!navigationSuccess) {
retryCount++;
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
}
}
break;
}
default:
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(results);
if (!navigationSuccess) {
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
return allResults;
}
break;
}
case 'clickLoadMore': {
while (true) {
const { button, workingSelector } = await findWorkingButton(availableSelectors);
if (!button || !workingSelector) {
// Final retry for load more when no selectors work
const success = await retryOperation(async () => {
await scrapeCurrentPage();
return allResults.length > 0;
});
if (!success) return allResults;
break;
}
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
const loadMoreSuccess = await retryOperation(async () => {
try {
await button.click().catch(() => button.dispatchEvent('click'));
await page.waitForTimeout(1000);
await page.evaluate(() =>
window.scrollTo(0, document.body.scrollHeight)
);
await page.waitForTimeout(1000);
const currentHeight = await page.evaluate(() =>
document.body.scrollHeight
);
if (currentHeight === previousHeight) {
await scrapeCurrentPage();
return false;
}
previousHeight = currentHeight;
return true;
} catch (error) {
await scrapeCurrentPage();
return false;
}
});
if (!loadMoreSuccess || checkLimit()) return allResults;
}
}
default:
await scrapeCurrentPage();
return allResults;
}
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
break;
}
if (checkLimit()) break;
}
} catch (error) {
debugLog(`Fatal error: ${error.message}`);
return allResults;
}
return allResults;