feat: rm url creation and click race condition

This commit is contained in:
Rohit
2025-01-21 14:24:50 +05:30
parent a6e48bb908
commit ec2a695187

View File

@@ -547,221 +547,225 @@ export default class Interpreter extends EventEmitter {
let scrapedItems: Set<string> = new Set<string>(); let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: string[] = []; let visitedUrls: string[] = [];
// Debug logging helper
const debugLog = (message: string, ...args: any[]) => {
console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
};
let availableSelectors = config.pagination.selector.split(','); let availableSelectors = config.pagination.selector.split(',');
while (true) { while (true) {
switch (config.pagination.type) { switch (config.pagination.type) {
case 'scrollDown': case 'scrollDown':
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight); const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) { if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults); allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
previousHeight = currentHeight; previousHeight = currentHeight;
break;
case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000);
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
if (currentTopHeight === 0) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentTopHeight;
break;
case 'clickNext':
console.log("Page URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// console.log("Page results:", pageResults);
// Filter out already scraped items
const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped
scrapedItems.add(uniqueKey); // Mark as scraped
return true;
});
allResults = allResults.concat(newResults);
console.log("Results so far:", allResults.length);
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}
let checkButton = null;
let workingSelector = null;
for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try {
// Wait for selector with a short timeout
checkButton = await page.waitForSelector(selector, { state: 'attached' });
if (checkButton) {
workingSelector = selector;
break; break;
case 'scrollUp':
await page.evaluate(() => window.scrollTo(0, 0));
await page.waitForTimeout(2000);
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
if (currentTopHeight === 0) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentTopHeight;
break;
case 'clickNext':
debugLog("Current URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// Filter out already scraped items
const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false;
scrapedItems.add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults);
debugLog("Results collected so far:", allResults.length);
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
} }
} catch (error) {
console.log(`Selector failed: ${selector}`);
}
}
if (!workingSelector) { await page.waitForLoadState('networkidle', { timeout: 30000 });
return allResults; await page.waitForTimeout(2000);
}
// const nextButton = await page.$(config.pagination.selector); let checkButton = null;
const nextButton = await page.$(workingSelector); let workingSelector = null;
if (!nextButton) {
return allResults; // No more pages to scrape
}
const selectorIndex = availableSelectors.indexOf(workingSelector!); // Try each selector with explicit waiting
availableSelectors = availableSelectors.slice(selectorIndex); for (const selector of availableSelectors) {
try {
checkButton = await page.waitForSelector(selector, {
state: 'attached',
timeout: 30000
});
if (checkButton) {
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Selector failed: ${selector} - ${error.message}`);
}
}
// await Promise.all([ if (!workingSelector) {
// nextButton.dispatchEvent('click'), debugLog('No working selector found after trying all options');
// page.waitForNavigation({ waitUntil: 'networkidle' }) return allResults;
// ]); }
const previousUrl = page.url(); const nextButton = await page.$(workingSelector);
visitedUrls.push(previousUrl); if (!nextButton) {
debugLog('Next button not found');
return allResults;
}
try { const selectorIndex = availableSelectors.indexOf(workingSelector);
// Try both click methods simultaneously availableSelectors = availableSelectors.slice(selectorIndex);
await Promise.race([
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]);
} catch (error) {
// Verify if navigation actually succeeded
const currentUrl = page.url();
if (currentUrl === previousUrl) {
console.log("Previous URL same as current URL. Navigation failed.");
}
}
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;
// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
console.log(`Navigating to constructed URL: ${nextUrl}`);
// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}
// Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000);
break;
case 'clickLoadMore':
while (true) {
let checkButton = null;
let workingSelector = null;
for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try { try {
// Wait for selector with a short timeout // Store current URL to check if navigation succeeded
checkButton = await page.waitForSelector(selector, { state: 'attached' }); const previousUrl = page.url();
if (checkButton) { visitedUrls.push(previousUrl);
workingSelector = selector;
// Try both click methods in sequence
try {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.click()
]);
} catch (error) {
// If we're still on the same URL, try dispatch event
if (page.url() === previousUrl) {
await Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}),
nextButton.dispatchEvent('click')
]);
}
}
await page.waitForLoadState('domcontentloaded');
await page.waitForLoadState('networkidle', { timeout: 30000 });
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
return allResults;
}
// Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000);
} catch (error) {
debugLog(`Navigation failed completely: ${error.message}`);
return allResults;
}
break;
case 'clickLoadMore':
while (true) {
let checkButton = null;
let workingSelector = null;
for (const selector of availableSelectors) {
try {
checkButton = await page.waitForSelector(selector, {
state: 'attached',
timeout: 30000
});
if (checkButton) {
workingSelector = selector;
debugLog('Found working selector:', selector);
break;
}
} catch (error) {
debugLog(`Load More selector failed: ${selector}`);
}
}
if (!workingSelector) {
debugLog('No working Load More selector found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) {
debugLog('Load More button not found');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector);
availableSelectors = availableSelectors.slice(selectorIndex);
try {
try {
await loadMoreButton.click();
} catch (error) {
await loadMoreButton.dispatchEvent('click');
}
} catch (error) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
await page.waitForTimeout(2000);
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
break; break;
} }
} catch (error) {
console.log(`Selector failed: ${selector}`);
} }
}
if (!workingSelector) {
// No more working selectors available, so scrape the remaining items
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) {
// No more "Load More" button, so scrape the remaining items
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);
// Click the 'Load More' button to load additional items
// await loadMoreButton.dispatchEvent('click');
try {
await Promise.race([
loadMoreButton.click(),
loadMoreButton.dispatchEvent('click')
]);
} catch (error) {
console.log('Both click attempts failed');
}
await page.waitForTimeout(2000); // Wait for new items to load
// After clicking 'Load More', scroll down to load more items
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);
// Check if more items are available
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
// No more items loaded, return the scraped results
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) {
// If limit is set and reached, return the limited results
allResults = allResults.slice(0, config.limit);
break; break;
}
}
break;
default:
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(results);
return allResults;
}
if (config.limit && allResults.length >= config.limit) { default:
allResults = allResults.slice(0, config.limit); const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
break; allResults = allResults.concat(results);
} return allResults;
}
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
break;
}
} }
return allResults; return allResults;