Merge pull request #36 from amhsirak/develop
feat: handle scroll up & load more pagination options
This commit is contained in:
@@ -97,7 +97,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
|
metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
|
||||||
}
|
}
|
||||||
|
|
||||||
// console.debug(`Total ${metricType} is ${metric}.`)
|
|
||||||
if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
|
if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
|
||||||
maxSelector = { selector, metric };
|
maxSelector = { selector, metric };
|
||||||
}
|
}
|
||||||
@@ -126,85 +125,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Scrolls the window down one viewport at a time until at least `limit`
 * elements matching `selector` are present, or until the page stops growing.
 *
 * @param {string} selector - CSS selector identifying the list items to count.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once the limit is reached or no growth is seen.
 */
async function scrollDownToLoadMore(selector, limit) {
  // Pause after each scroll so lazily loaded content has time to render.
  const settleDelayMs = 1000;
  let previousHeight = 0;
  let itemsLoaded = 0;

  while (itemsLoaded < limit) {
    window.scrollBy(0, window.innerHeight);
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // Height did not change: no more items to load.
    }
    previousHeight = currentHeight;

    // querySelectorAll returns every match currently in the document, so the
    // count is assigned rather than accumulated; adding it on each pass would
    // count earlier items again and end the loop before `limit` is reached.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
|
|
||||||
|
|
||||||
/**
 * Scrolls the window up one viewport at a time until at least `limit`
 * elements matching `selector` are present, or until the page stops growing.
 *
 * @param {string} selector - CSS selector identifying the list items to count.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once the limit is reached or no growth is seen.
 */
async function scrollUpToLoadMore(selector, limit) {
  // Pause after each scroll so lazily loaded content has time to render.
  const settleDelayMs = 1000;
  let previousHeight = 0;
  let itemsLoaded = 0;

  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // Height did not change: no more items to load.
    }
    previousHeight = currentHeight;

    // querySelectorAll returns every match currently in the document, so the
    // count is assigned rather than accumulated; adding it on each pass would
    // count earlier items again and end the loop before `limit` is reached.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
|
|
||||||
|
|
||||||
/**
 * Advances a paginated list by clicking the control(s) matched by `selector`.
 *
 * - If `scrapedData` already holds `limit` or more entries, nothing is clicked.
 * - If the selector matches exactly one element, it is treated as a "Next"
 *   button and clicked.
 * - If it matches several elements (numbered pagination), the element that
 *   follows the one carrying the `active` class is clicked. When none of them
 *   is marked `active`, the first match is clicked.
 *
 * @param {string} selector - CSS selector for the pagination control(s).
 * @param {Array} scrapedData - Items collected so far; only its length is read.
 * @param {number} limit - Maximum number of items wanted.
 * @returns {Promise<boolean>} `true` when a control was clicked, `false` when
 *   the limit was already met.
 * @throws {Error} When the selector matches nothing, or the active page
 *   button is the last match (no following page to click).
 */
async function clickNextPagination(selector, scrapedData, limit) {
  // Limit already met: no further pagination is needed.
  if (scrapedData.length >= limit) {
    return false;
  }

  // Query every match once. Looking up a single match first and only then
  // querying all matches makes the numbered-pagination path unreachable,
  // because the second query can only run when the first found nothing.
  const buttons = Array.from(document.querySelectorAll(selector));

  if (buttons.length === 0) {
    throw new Error("No more items to load or pagination has ended.");
  }

  // A single match is a plain "Next" button.
  if (buttons.length === 1) {
    buttons[0].click();
    return true;
  }

  // Several matches: locate the current page and click the one after it.
  const activeIndex = buttons.findIndex((button) => button.classList.contains('active'));
  if (activeIndex === -1) {
    // No button is marked as current; click the first match.
    buttons[0].click();
    return true;
  }

  const followingButton = buttons[activeIndex + 1];
  if (!followingButton) {
    // The active page is the last one listed.
    throw new Error("No more items to load or pagination has ended.");
  }

  followingButton.click();
  return true;
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a "scrape" result from the current page.
|
* Returns a "scrape" result from the current page.
|
||||||
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
||||||
@@ -420,43 +340,5 @@ async function clickNextPagination(selector, scrapedData, limit) {
|
|||||||
|
|
||||||
return results;
|
return results;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Repeatedly jumps to the bottom of the page until at least `limit` elements
 * matching `selector` are present, or until the page stops growing.
 *
 * @param {string} selector - CSS selector identifying the list items to count.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once the limit is reached or no growth is seen.
 */
window.scrollDown = async function (selector, limit) {
  // Pause after each scroll so lazily loaded content has time to render.
  const settleDelayMs = 1000;
  let previousHeight = 0;
  let itemsLoaded = 0;

  while (itemsLoaded < limit) {
    window.scrollTo(0, document.body.scrollHeight);
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // Height did not change: no more items to load.
    }
    previousHeight = currentHeight;

    // querySelectorAll returns every match currently in the document, so the
    // count is assigned rather than accumulated; adding it on each pass would
    // count earlier items again and end the loop before `limit` is reached.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
};
|
|
||||||
|
|
||||||
/**
 * Scrolls the window up one viewport at a time until at least `limit`
 * elements matching `selector` are present, or until the page stops growing.
 *
 * @param {string} selector - CSS selector identifying the list items to count.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once the limit is reached or no growth is seen.
 */
window.scrollUp = async function (selector, limit) {
  // Pause after each scroll so lazily loaded content has time to render.
  const settleDelayMs = 1000;
  let previousHeight = 0;
  let itemsLoaded = 0;

  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // Height did not change: no more items to load.
    }
    previousHeight = currentHeight;

    // querySelectorAll returns every match currently in the document, so the
    // count is assigned rather than accumulated; adding it on each pass would
    // count earlier items again and end the loop before `limit` is reached.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
};
|
|
||||||
|
|
||||||
})(window);
|
})(window);
|
||||||
@@ -406,6 +406,17 @@ export default class Interpreter extends EventEmitter {
|
|||||||
previousHeight = currentHeight;
|
previousHeight = currentHeight;
|
||||||
break;
|
break;
|
||||||
case 'scrollUp':
|
case 'scrollUp':
|
||||||
|
await page.evaluate(() => window.scrollTo(0, 0));
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
|
const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
|
||||||
|
if (currentTopHeight === 0) {
|
||||||
|
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
allResults = allResults.concat(finalResults);
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
previousHeight = currentTopHeight;
|
||||||
break;
|
break;
|
||||||
case 'clickNext':
|
case 'clickNext':
|
||||||
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
@@ -436,11 +447,35 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await page.waitForTimeout(1000);
|
await page.waitForTimeout(1000);
|
||||||
break;
|
break;
|
||||||
case 'clickLoadMore':
|
case 'clickLoadMore':
|
||||||
const loadMoreButton = await page.$(config.pagination.selector);
|
while (true) {
|
||||||
if (!loadMoreButton) {
|
const loadMoreButton = await page.$(config.pagination.selector);
|
||||||
return allResults;
|
if (!loadMoreButton) {
|
||||||
|
// No more "Load More" button, so scrape the remaining items
|
||||||
|
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
allResults = allResults.concat(finalResults);
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
// Click the 'Load More' button to load additional items
|
||||||
|
await loadMoreButton.click();
|
||||||
|
await page.waitForTimeout(2000); // Wait for new items to load
|
||||||
|
// After clicking 'Load More', scroll down to load more items
|
||||||
|
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
|
// Check if more items are available
|
||||||
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
|
if (currentHeight === previousHeight) {
|
||||||
|
// No more items loaded, return the scraped results
|
||||||
|
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
allResults = allResults.concat(finalResults);
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
previousHeight = currentHeight;
|
||||||
|
if (config.limit && allResults.length >= config.limit) {
|
||||||
|
// If limit is set and reached, return the limited results
|
||||||
|
allResults = allResults.slice(0, config.limit);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
await loadMoreButton.click();
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
|||||||
@@ -228,9 +228,8 @@ export const BrowserWindow = () => {
|
|||||||
setFields(prevFields => {
|
setFields(prevFields => {
|
||||||
const updatedFields = {
|
const updatedFields = {
|
||||||
...prevFields,
|
...prevFields,
|
||||||
[newField.id]: newField
|
[newField.label]: newField
|
||||||
};
|
};
|
||||||
console.log(updatedFields)
|
|
||||||
return updatedFields;
|
return updatedFields;
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -288,7 +287,7 @@ export const BrowserWindow = () => {
|
|||||||
setFields(prevFields => {
|
setFields(prevFields => {
|
||||||
const updatedFields = {
|
const updatedFields = {
|
||||||
...prevFields,
|
...prevFields,
|
||||||
[newField.id]: newField
|
[newField.label]: newField
|
||||||
};
|
};
|
||||||
return updatedFields;
|
return updatedFields;
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user