feat: revamp click load more scraping
This commit is contained in:
@@ -768,14 +768,20 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
case 'clickLoadMore': {
|
case 'clickLoadMore': {
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
if (checkLimit()) return allResults;
|
||||||
|
|
||||||
|
let loadMoreCounter = 0;
|
||||||
|
let previousResultCount = allResults.length;
|
||||||
|
let noNewItemsCounter = 0;
|
||||||
|
const MAX_NO_NEW_ITEMS = 2;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Find working button with retry mechanism, consistent with clickNext
|
// Find working button with retry mechanism
|
||||||
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors);
|
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors);
|
||||||
|
|
||||||
if (!workingSelector || !loadMoreButton) {
|
if (!workingSelector || !loadMoreButton) {
|
||||||
debugLog('No working Load More selector found after retries');
|
debugLog('No working Load More selector found after retries');
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -808,6 +814,8 @@ export default class Interpreter extends EventEmitter {
|
|||||||
|
|
||||||
if (clickSuccess) {
|
if (clickSuccess) {
|
||||||
await page.waitForTimeout(1000);
|
await page.waitForTimeout(1000);
|
||||||
|
loadMoreCounter++;
|
||||||
|
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
||||||
@@ -822,8 +830,6 @@ export default class Interpreter extends EventEmitter {
|
|||||||
|
|
||||||
if (!clickSuccess) {
|
if (!clickSuccess) {
|
||||||
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -833,20 +839,34 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
if (currentHeight === previousHeight) {
|
const heightChanged = currentHeight !== previousHeight;
|
||||||
debugLog('No more items loaded after Load More');
|
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
previousHeight = currentHeight;
|
previousHeight = currentHeight;
|
||||||
|
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
await scrapeCurrentPage();
|
||||||
allResults = allResults.slice(0, config.limit);
|
|
||||||
break;
|
const currentResultCount = allResults.length;
|
||||||
|
const newItemsAdded = currentResultCount > previousResultCount;
|
||||||
|
|
||||||
|
if (!newItemsAdded) {
|
||||||
|
noNewItemsCounter++;
|
||||||
|
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
||||||
|
|
||||||
|
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
||||||
|
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
noNewItemsCounter = 0;
|
||||||
|
previousResultCount = currentResultCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkLimit()) return allResults;
|
||||||
|
|
||||||
|
if (!heightChanged) {
|
||||||
|
debugLog('No more items loaded after Load More');
|
||||||
|
return allResults;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
default: {
|
default: {
|
||||||
|
|||||||
Reference in New Issue
Block a user