feat: track result counts and return early from scroll pagination when results stop changing
This commit is contained in:
@@ -572,6 +572,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
let visitedUrls: Set<string> = new Set<string>();
|
let visitedUrls: Set<string> = new Set<string>();
|
||||||
const MAX_RETRIES = 3;
|
const MAX_RETRIES = 3;
|
||||||
const RETRY_DELAY = 1000; // 1 second delay between retries
|
const RETRY_DELAY = 1000; // 1 second delay between retries
|
||||||
|
const MAX_UNCHANGED_RESULTS = 5;
|
||||||
|
|
||||||
const debugLog = (message: string, ...args: any[]) => {
|
const debugLog = (message: string, ...args: any[]) => {
|
||||||
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
||||||
@@ -661,18 +662,36 @@ export default class Interpreter extends EventEmitter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let availableSelectors = config.pagination.selector.split(',');
|
let availableSelectors = config.pagination.selector.split(',');
|
||||||
|
let unchangedResultCounter = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
while (true) {
|
while (true) {
|
||||||
switch (config.pagination.type) {
|
switch (config.pagination.type) {
|
||||||
case 'scrollDown': {
|
case 'scrollDown': {
|
||||||
|
let previousResultCount = allResults.length;
|
||||||
|
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
|
||||||
|
if (checkLimit()) {
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
|
||||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
|
const currentResultCount = allResults.length;
|
||||||
|
|
||||||
|
if (currentResultCount === previousResultCount) {
|
||||||
|
unchangedResultCounter++;
|
||||||
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
unchangedResultCounter = 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (currentHeight === previousHeight) {
|
if (currentHeight === previousHeight) {
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -681,13 +700,30 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
case 'scrollUp': {
|
case 'scrollUp': {
|
||||||
|
let previousResultCount = allResults.length;
|
||||||
|
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
|
||||||
|
if (checkLimit()) {
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
|
||||||
await page.evaluate(() => window.scrollTo(0, 0));
|
await page.evaluate(() => window.scrollTo(0, 0));
|
||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
|
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
|
||||||
|
const currentResultCount = allResults.length;
|
||||||
|
|
||||||
|
if (currentResultCount === previousResultCount) {
|
||||||
|
unchangedResultCounter++;
|
||||||
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
unchangedResultCounter = 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (currentTopHeight === 0) {
|
if (currentTopHeight === 0) {
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user