feat: add retry mechanism, modularization
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
/* eslint-disable no-await-in-loop, no-restricted-syntax */
|
/* eslint-disable no-await-in-loop, no-restricted-syntax */
|
||||||
import { Page, PageScreenshotOptions } from 'playwright';
|
import { ElementHandle, Page, PageScreenshotOptions } from 'playwright';
|
||||||
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
|
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
|
||||||
import fetch from 'cross-fetch';
|
import fetch from 'cross-fetch';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
@@ -548,232 +548,274 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
|
private async handlePagination(page: Page, config: {
|
||||||
|
listSelector: string,
|
||||||
|
fields: any,
|
||||||
|
limit?: number,
|
||||||
|
pagination: any
|
||||||
|
}) {
|
||||||
let allResults: Record<string, any>[] = [];
|
let allResults: Record<string, any>[] = [];
|
||||||
let previousHeight = 0;
|
let previousHeight = 0;
|
||||||
// track unique items per page to avoid re-scraping
|
|
||||||
let scrapedItems: Set<string> = new Set<string>();
|
let scrapedItems: Set<string> = new Set<string>();
|
||||||
let visitedUrls: string[] = [];
|
let visitedUrls: Set<string> = new Set<string>();
|
||||||
|
const MAX_RETRIES = 3;
|
||||||
|
const RETRY_DELAY = 1000; // 1 second delay between retries
|
||||||
|
|
||||||
// Debug logging helper
|
|
||||||
const debugLog = (message: string, ...args: any[]) => {
|
const debugLog = (message: string, ...args: any[]) => {
|
||||||
console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Scrape the list on the page as it currently stands and record the new rows.
 *
 * Runs `window.scrapeList(config)` inside the page, keeps only the rows whose
 * JSON serialisation has not been registered in `scrapedItems` yet, appends
 * those rows to `allResults`, and logs the running total.
 */
const scrapeCurrentPage = async () => {
  const scraped = await page.evaluate((cfg) => window.scrapeList(cfg), config);

  // A row counts as "seen" when its exact JSON text was encountered before.
  const fresh = scraped.filter((entry) => {
    const fingerprint = JSON.stringify(entry);
    const unseen = !scrapedItems.has(fingerprint);
    if (unseen) {
      scrapedItems.add(fingerprint);
    }
    return unseen;
  });

  allResults = [...allResults, ...fresh];
  debugLog("Results collected:", allResults.length);
};
|
||||||
|
|
||||||
|
/**
 * Report whether the configured result limit has been reached.
 *
 * When `config.limit` is set and `allResults` holds at least that many rows,
 * `allResults` is truncated to exactly `config.limit` rows and `true` is
 * returned. In every other case nothing is modified and `false` is returned.
 */
const checkLimit = () => {
  // No limit configured (or a limit of 0): never stop because of the limit.
  if (!config.limit) {
    return false;
  }

  // Still below the limit: keep paginating.
  if (!(allResults.length >= config.limit)) {
    return false;
  }

  allResults = allResults.slice(0, config.limit);
  return true;
};
|
||||||
|
|
||||||
|
// Enhanced button finder with retry mechanism
|
||||||
|
/**
 * Find the first selector in `selectors` that resolves to an attached element.
 *
 * Each selector is awaited for up to 10 seconds, in order. If a full pass over
 * the list finds nothing, the pass is repeated after `RETRY_DELAY` ms until
 * `MAX_RETRIES` retries have been used. An empty selector list is never retried.
 *
 * @param selectors  Candidate selectors, tried in the given order.
 * @param retryCount Number of retries already consumed (defaults to 0).
 * @returns The element handle and the selector that matched, or
 *          `{ button: null, workingSelector: null }` when nothing matched.
 */
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
  button: ElementHandle | null,
  workingSelector: string | null
}> => {
  let attempt = retryCount;

  for (;;) {
    for (const candidate of selectors) {
      try {
        const handle = await page.waitForSelector(candidate, {
          state: 'attached',
          timeout: 10000 // Reduced timeout for faster checks
        });
        if (handle) {
          debugLog('Found working selector:', candidate);
          return { button: handle, workingSelector: candidate };
        }
      } catch (error) {
        // A timeout (or any other failure) just means: try the next selector.
        debugLog(`Selector failed: ${candidate}`);
      }
    }

    // Nothing matched in this pass; decide whether another pass is allowed.
    const mayRetry = selectors.length > 0 && attempt < MAX_RETRIES;
    if (!mayRetry) {
      return { button: null, workingSelector: null };
    }

    debugLog(`Retry attempt ${attempt + 1} of ${MAX_RETRIES}`);
    await page.waitForTimeout(RETRY_DELAY);
    attempt += 1;
  }
};
|
||||||
|
|
||||||
|
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
|
||||||
|
try {
|
||||||
|
return await operation();
|
||||||
|
} catch (error) {
|
||||||
|
if (retryCount < MAX_RETRIES) {
|
||||||
|
debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||||
|
await page.waitForTimeout(RETRY_DELAY);
|
||||||
|
return retryOperation(operation, retryCount + 1);
|
||||||
|
}
|
||||||
|
debugLog(`Operation failed after ${MAX_RETRIES} retries`);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let availableSelectors = config.pagination.selector.split(',');
|
let availableSelectors = config.pagination.selector.split(',');
|
||||||
|
|
||||||
while (true) {
|
try {
|
||||||
|
while (true) {
|
||||||
|
// Reduced timeout for faster performance
|
||||||
|
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
|
||||||
|
|
||||||
switch (config.pagination.type) {
|
switch (config.pagination.type) {
|
||||||
case 'scrollDown':
|
case 'scrollDown':
|
||||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
if (currentHeight === previousHeight) {
|
if (currentHeight === previousHeight) {
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
allResults = allResults.concat(finalResults);
|
allResults = allResults.concat(finalResults);
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
previousHeight = currentHeight;
|
previousHeight = currentHeight;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'scrollUp':
|
case 'scrollUp':
|
||||||
await page.evaluate(() => window.scrollTo(0, 0));
|
await page.evaluate(() => window.scrollTo(0, 0));
|
||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
|
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
|
||||||
if (currentTopHeight === 0) {
|
if (currentTopHeight === 0) {
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
allResults = allResults.concat(finalResults);
|
allResults = allResults.concat(finalResults);
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
previousHeight = currentTopHeight;
|
previousHeight = currentTopHeight;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'clickNext':
|
case 'clickNext': {
|
||||||
debugLog("Current URL:", page.url());
|
const currentUrl = page.url();
|
||||||
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
visitedUrls.add(currentUrl);
|
||||||
|
|
||||||
// Filter out already scraped items
|
await scrapeCurrentPage();
|
||||||
const newResults = pageResults.filter(item => {
|
if (checkLimit()) return allResults;
|
||||||
const uniqueKey = JSON.stringify(item);
|
|
||||||
if (scrapedItems.has(uniqueKey)) return false;
|
|
||||||
scrapedItems.add(uniqueKey);
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
|
|
||||||
allResults = allResults.concat(newResults);
|
|
||||||
debugLog("Results collected so far:", allResults.length);
|
|
||||||
|
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
|
||||||
return allResults.slice(0, config.limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
await page.waitForLoadState('networkidle', { timeout: 30000 });
|
const { button, workingSelector } = await findWorkingButton(availableSelectors);
|
||||||
await page.waitForTimeout(2000);
|
if (!button || !workingSelector) {
|
||||||
|
// Final retry for navigation when no selectors work
|
||||||
let checkButton = null;
|
const success = await retryOperation(async () => {
|
||||||
let workingSelector = null;
|
|
||||||
|
|
||||||
// Try each selector with explicit waiting
|
|
||||||
for (const selector of availableSelectors) {
|
|
||||||
try {
|
try {
|
||||||
checkButton = await page.waitForSelector(selector, {
|
await page.evaluate(() => window.history.forward());
|
||||||
state: 'attached',
|
const newUrl = page.url();
|
||||||
timeout: 30000
|
return !visitedUrls.has(newUrl);
|
||||||
});
|
} catch {
|
||||||
if (checkButton) {
|
return false;
|
||||||
workingSelector = selector;
|
|
||||||
debugLog('Found working selector:', selector);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
debugLog(`Selector failed: ${selector} - ${error.message}`);
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
|
if (!success) return allResults;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (!workingSelector) {
|
availableSelectors = availableSelectors.slice(
|
||||||
debugLog('No working selector found after trying all options');
|
availableSelectors.indexOf(workingSelector)
|
||||||
return allResults;
|
);
|
||||||
}
|
|
||||||
|
|
||||||
const nextButton = await page.$(workingSelector);
|
let retryCount = 0;
|
||||||
if (!nextButton) {
|
let navigationSuccess = false;
|
||||||
debugLog('Next button not found');
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
|
|
||||||
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
||||||
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
||||||
|
|
||||||
|
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
||||||
try {
|
try {
|
||||||
// Store current URL to check if navigation succeeded
|
|
||||||
const previousUrl = page.url();
|
|
||||||
visitedUrls.push(previousUrl);
|
|
||||||
|
|
||||||
// Try both click methods in sequence
|
|
||||||
try {
|
try {
|
||||||
await Promise.all([
|
await Promise.all([
|
||||||
page.waitForNavigation({
|
page.waitForNavigation({
|
||||||
waitUntil: 'networkidle',
|
waitUntil: 'networkidle',
|
||||||
timeout: 15000
|
timeout: 15000
|
||||||
}),
|
}),
|
||||||
nextButton.click()
|
button.click()
|
||||||
]);
|
]);
|
||||||
} catch (error) {
|
navigationSuccess = true;
|
||||||
// If we're still on the same URL, try dispatch event
|
|
||||||
if (page.url() === previousUrl) {
|
|
||||||
await Promise.all([
|
|
||||||
page.waitForNavigation({
|
|
||||||
waitUntil: 'networkidle',
|
|
||||||
timeout: 15000
|
|
||||||
}),
|
|
||||||
nextButton.dispatchEvent('click')
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
await page.waitForLoadState('domcontentloaded');
|
|
||||||
await page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
||||||
|
|
||||||
const currentUrl = page.url();
|
|
||||||
if (visitedUrls.includes(currentUrl)) {
|
|
||||||
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Give the page a moment to stabilize after navigation
|
|
||||||
await page.waitForTimeout(1000);
|
|
||||||
|
|
||||||
} catch (error) {
|
|
||||||
debugLog(`Navigation failed completely: ${error.message}`);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 'clickLoadMore':
|
|
||||||
while (true) {
|
|
||||||
let checkButton = null;
|
|
||||||
let workingSelector = null;
|
|
||||||
|
|
||||||
for (const selector of availableSelectors) {
|
|
||||||
try {
|
|
||||||
checkButton = await page.waitForSelector(selector, {
|
|
||||||
state: 'attached',
|
|
||||||
timeout: 30000
|
|
||||||
});
|
|
||||||
if (checkButton) {
|
|
||||||
workingSelector = selector;
|
|
||||||
debugLog('Found working selector:', selector);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
debugLog(`Load More selector failed: ${selector}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!workingSelector) {
|
|
||||||
debugLog('No working Load More selector found');
|
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
|
|
||||||
const loadMoreButton = await page.$(workingSelector);
|
|
||||||
if (!loadMoreButton) {
|
|
||||||
debugLog('Load More button not found');
|
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
|
|
||||||
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
||||||
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
||||||
|
|
||||||
try {
|
|
||||||
try {
|
|
||||||
await loadMoreButton.click();
|
|
||||||
} catch (error) {
|
|
||||||
await loadMoreButton.dispatchEvent('click');
|
|
||||||
}
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
// If regular click fails, try dispatchEvent
|
||||||
|
if (page.url() === currentUrl) {
|
||||||
|
try {
|
||||||
|
await Promise.all([
|
||||||
|
page.waitForNavigation({
|
||||||
|
waitUntil: 'networkidle',
|
||||||
|
timeout: 15000
|
||||||
|
}),
|
||||||
|
button.dispatchEvent('click')
|
||||||
|
]);
|
||||||
|
navigationSuccess = true;
|
||||||
|
} catch (dispatchError) {
|
||||||
|
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.waitForTimeout(2000);
|
const newUrl = page.url();
|
||||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
if (visitedUrls.has(newUrl)) {
|
||||||
await page.waitForTimeout(2000);
|
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
||||||
|
navigationSuccess = false;
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
|
||||||
if (currentHeight === previousHeight) {
|
|
||||||
debugLog('No more items loaded after Load More');
|
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
|
||||||
}
|
}
|
||||||
previousHeight = currentHeight;
|
|
||||||
|
if (navigationSuccess) {
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
await page.waitForTimeout(1000);
|
||||||
allResults = allResults.slice(0, config.limit);
|
}
|
||||||
break;
|
} catch (error) {
|
||||||
|
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
|
||||||
|
navigationSuccess = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!navigationSuccess) {
|
||||||
|
retryCount++;
|
||||||
|
if (retryCount < MAX_RETRIES) {
|
||||||
|
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||||
|
await page.waitForTimeout(RETRY_DELAY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
|
|
||||||
default:
|
if (!navigationSuccess) {
|
||||||
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
|
||||||
allResults = allResults.concat(results);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'clickLoadMore': {
|
||||||
|
while (true) {
|
||||||
|
const { button, workingSelector } = await findWorkingButton(availableSelectors);
|
||||||
|
if (!button || !workingSelector) {
|
||||||
|
// Final retry for load more when no selectors work
|
||||||
|
const success = await retryOperation(async () => {
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
return allResults.length > 0;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!success) return allResults;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
availableSelectors = availableSelectors.slice(
|
||||||
|
availableSelectors.indexOf(workingSelector)
|
||||||
|
);
|
||||||
|
|
||||||
|
const loadMoreSuccess = await retryOperation(async () => {
|
||||||
|
try {
|
||||||
|
await button.click().catch(() => button.dispatchEvent('click'));
|
||||||
|
await page.waitForTimeout(1000);
|
||||||
|
|
||||||
|
await page.evaluate(() =>
|
||||||
|
window.scrollTo(0, document.body.scrollHeight)
|
||||||
|
);
|
||||||
|
await page.waitForTimeout(1000);
|
||||||
|
|
||||||
|
const currentHeight = await page.evaluate(() =>
|
||||||
|
document.body.scrollHeight
|
||||||
|
);
|
||||||
|
|
||||||
|
if (currentHeight === previousHeight) {
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
previousHeight = currentHeight;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
} catch (error) {
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!loadMoreSuccess || checkLimit()) return allResults;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
if (checkLimit()) break;
|
||||||
allResults = allResults.slice(0, config.limit);
|
}
|
||||||
break;
|
} catch (error) {
|
||||||
}
|
debugLog(`Fatal error: ${error.message}`);
|
||||||
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
return allResults;
|
return allResults;
|
||||||
|
|||||||
Reference in New Issue
Block a user