feat: remove constructed-URL fallback navigation and fix the click race condition

This commit is contained in:
Rohit
2025-01-21 14:24:50 +05:30
parent a6e48bb908
commit ec2a695187

View File

@@ -547,6 +547,11 @@ export default class Interpreter extends EventEmitter {
let scrapedItems: Set<string> = new Set<string>(); let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: string[] = []; let visitedUrls: string[] = [];
/**
 * Debug logging helper: writes `message` to the console, prefixed with a
 * 1-based page counter derived from the number of URLs recorded in
 * `visitedUrls` at the time of the call.
 *
 * @param message - Text printed after the `[Page N]` prefix.
 * @param args - Extra values forwarded unchanged to `console.log`.
 */
const debugLog = (message: string, ...args: unknown[]): void => {
  // `unknown[]` rather than `any[]`: the values are only passed through, so
  // there is no reason to switch off type checking for them.
  console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
};
let availableSelectors = config.pagination.selector.split(','); let availableSelectors = config.pagination.selector.split(',');
while (true) { while (true) {
@@ -564,6 +569,7 @@ export default class Interpreter extends EventEmitter {
previousHeight = currentHeight; previousHeight = currentHeight;
break; break;
case 'scrollUp': case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0)); await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
@@ -577,133 +583,131 @@ export default class Interpreter extends EventEmitter {
previousHeight = currentTopHeight; previousHeight = currentTopHeight;
break; break;
case 'clickNext':
console.log("Page URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// console.log("Page results:", pageResults); case 'clickNext':
debugLog("Current URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// Filter out already scraped items // Filter out already scraped items
const newResults = pageResults.filter(item => { const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item); const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped if (scrapedItems.has(uniqueKey)) return false;
scrapedItems.add(uniqueKey); // Mark as scraped scrapedItems.add(uniqueKey);
return true; return true;
}); });
allResults = allResults.concat(newResults); allResults = allResults.concat(newResults);
console.log("Results so far:", allResults.length); debugLog("Results collected so far:", allResults.length);
if (config.limit && allResults.length >= config.limit) { if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit); return allResults.slice(0, config.limit);
} }
await page.waitForLoadState('networkidle', { timeout: 30000 });
await page.waitForTimeout(2000);
let checkButton = null; let checkButton = null;
let workingSelector = null; let workingSelector = null;
for (let i = 0; i < availableSelectors.length; i++) { // Try each selector with explicit waiting
const selector = availableSelectors[i]; for (const selector of availableSelectors) {
try { try {
// Wait for selector with a short timeout checkButton = await page.waitForSelector(selector, {
checkButton = await page.waitForSelector(selector, { state: 'attached' }); state: 'attached',
timeout: 30000
});
if (checkButton) { if (checkButton) {
workingSelector = selector; workingSelector = selector;
debugLog('Found working selector:', selector);
break; break;
} }
} catch (error) { } catch (error) {
console.log(`Selector failed: ${selector}`); debugLog(`Selector failed: ${selector} - ${error.message}`);
} }
} }
if (!workingSelector) { if (!workingSelector) {
debugLog('No working selector found after trying all options');
return allResults; return allResults;
} }
// const nextButton = await page.$(config.pagination.selector);
const nextButton = await page.$(workingSelector); const nextButton = await page.$(workingSelector);
if (!nextButton) { if (!nextButton) {
return allResults; // No more pages to scrape debugLog('Next button not found');
return allResults;
} }
const selectorIndex = availableSelectors.indexOf(workingSelector!); const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex); availableSelectors = availableSelectors.slice(selectorIndex);
// await Promise.all([ try {
// nextButton.dispatchEvent('click'), // Store current URL to check if navigation succeeded
// page.waitForNavigation({ waitUntil: 'networkidle' })
// ]);
const previousUrl = page.url(); const previousUrl = page.url();
visitedUrls.push(previousUrl); visitedUrls.push(previousUrl);
// Try both click methods in sequence
try { try {
// Try both click methods simultaneously await Promise.all([
await Promise.race([ page.waitForNavigation({
Promise.all([ waitUntil: 'networkidle',
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }), timeout: 15000
}),
nextButton.click() nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]); ]);
} catch (error) { } catch (error) {
// Verify if navigation actually succeeded // If we're still on the same URL, try dispatch event
const currentUrl = page.url(); if (page.url() === previousUrl) {
if (currentUrl === previousUrl) { await Promise.all([
console.log("Previous URL same as current URL. Navigation failed."); page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.dispatchEvent('click')
]);
} }
} }
await page.waitForLoadState('domcontentloaded');
await page.waitForLoadState('networkidle', { timeout: 30000 });
const currentUrl = page.url(); const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) { if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`); debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
return allResults;
// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;
// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
console.log(`Navigating to constructed URL: ${nextUrl}`);
// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
} }
// Give the page a moment to stabilize after navigation // Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000); await page.waitForTimeout(1000);
} catch (error) {
debugLog(`Navigation failed completely: ${error.message}`);
return allResults;
}
break; break;
case 'clickLoadMore': case 'clickLoadMore':
while (true) { while (true) {
let checkButton = null; let checkButton = null;
let workingSelector = null; let workingSelector = null;
for (let i = 0; i < availableSelectors.length; i++) { for (const selector of availableSelectors) {
const selector = availableSelectors[i];
try { try {
// Wait for selector with a short timeout checkButton = await page.waitForSelector(selector, {
checkButton = await page.waitForSelector(selector, { state: 'attached' }); state: 'attached',
timeout: 30000
});
if (checkButton) { if (checkButton) {
workingSelector = selector; workingSelector = selector;
debugLog('Found working selector:', selector);
break; break;
} }
} catch (error) { } catch (error) {
console.log(`Selector failed: ${selector}`); debugLog(`Load More selector failed: ${selector}`);
} }
} }
if (!workingSelector) { if (!workingSelector) {
// No more working selectors available, so scrape the remaining items debugLog('No working Load More selector found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
@@ -711,34 +715,34 @@ export default class Interpreter extends EventEmitter {
const loadMoreButton = await page.$(workingSelector); const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) { if (!loadMoreButton) {
// No more "Load More" button, so scrape the remaining items debugLog('Load More button not found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
const selectorIndex = availableSelectors.indexOf(workingSelector!); const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex); availableSelectors = availableSelectors.slice(selectorIndex);
// Click the 'Load More' button to load additional items
// await loadMoreButton.dispatchEvent('click');
try { try {
await Promise.race([ try {
loadMoreButton.click(), await loadMoreButton.click();
loadMoreButton.dispatchEvent('click')
]);
} catch (error) { } catch (error) {
console.log('Both click attempts failed'); await loadMoreButton.dispatchEvent('click');
} }
await page.waitForTimeout(2000); // Wait for new items to load } catch (error) {
// After clicking 'Load More', scroll down to load more items const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
await page.waitForTimeout(2000);
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
// Check if more items are available
const currentHeight = await page.evaluate(() => document.body.scrollHeight); const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) { if (currentHeight === previousHeight) {
// No more items loaded, return the scraped results debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
@@ -746,12 +750,12 @@ export default class Interpreter extends EventEmitter {
previousHeight = currentHeight; previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) { if (config.limit && allResults.length >= config.limit) {
// If limit is set and reached, return the limited results
allResults = allResults.slice(0, config.limit); allResults = allResults.slice(0, config.limit);
break; break;
} }
} }
break; break;
default: default:
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config); const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(results); allResults = allResults.concat(results);