diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 1ea4b3b4..79893568 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -97,7 +97,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes)); } - // console.debug(`Total ${metricType} is ${metric}.`) if (metric > maxSelector.metric && elements.length < maxCountPerPage) { maxSelector = { selector, metric }; } @@ -126,85 +125,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return out; } -async function scrollDownToLoadMore(selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } -} - -async function scrollUpToLoadMore(selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, -window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } -} - -async function clickNextPagination(selector, scrapedData, limit) { - // Check if the limit is already met - if (scrapedData.length >= limit) { - return false; // Return false to indicate no further action is needed - } - - // Check if a single "Next" button exists - let nextButton = document.querySelector(selector); - - if (nextButton) { - nextButton.click(); - return true; // Indicate that pagination occurred - } else { - // Handle pagination with numbers - const paginationButtons = document.querySelectorAll(selector); - let clicked = false; - - // Loop through pagination buttons to find the current active page - for (let i = 0; i < paginationButtons.length - 1; i++) { - const button = paginationButtons[i]; - if (button.classList.contains('active')) { - // Click the next button if available - const nextButtonInPagination = paginationButtons[i + 1]; - if (nextButtonInPagination) { - nextButtonInPagination.click(); - clicked = true; - break; - } - } - } - - // If no next button was clicked, we might be on the last page - if (!clicked) { - throw new Error("No more items to load or pagination has ended."); - } - - return clicked; // Indicate whether pagination occurred - } -} - - /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) @@ -420,43 +340,5 @@ async function clickNextPagination(selector, scrapedData, limit) { return results; }; - - window.scrollDown = async function (selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollTo(0, document.body.scrollHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } - } - - window.scrollUp = async function (selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, -window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } - } - + })(window); \ No newline at end of file diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 4068f7be..0543d71f 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -406,6 +406,17 @@ export default class Interpreter extends EventEmitter { previousHeight = currentHeight; break; case 'scrollUp': + await page.evaluate(() => window.scrollTo(0, 0)); + await page.waitForTimeout(2000); + + const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop); + if (currentTopHeight === 0) { + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + + previousHeight = currentTopHeight; break; case 'clickNext': const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); @@ -436,11 +447,35 @@ export default class Interpreter extends EventEmitter { await page.waitForTimeout(1000); break; case 'clickLoadMore': - const loadMoreButton = await page.$(config.pagination.selector); - if (!loadMoreButton) { - return allResults; + while (true) { + const loadMoreButton = await page.$(config.pagination.selector); + if (!loadMoreButton) { + // No more "Load More" button, so scrape the remaining items + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + // Click the 'Load More' button to load additional items + await loadMoreButton.click(); + await page.waitForTimeout(2000); // Wait for new items to load + // After clicking 'Load More', scroll down to load more items + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await page.waitForTimeout(2000); + // Check if more items are available + const currentHeight = await page.evaluate(() => document.body.scrollHeight); + if (currentHeight === previousHeight) { + // No more items loaded, return the scraped results + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + previousHeight = currentHeight; + if (config.limit && allResults.length >= config.limit) { + // If limit is set and reached, return the limited results + allResults = allResults.slice(0, config.limit); + break; + } } - await loadMoreButton.click(); break; default: const results = await page.evaluate((cfg) => window.scrapeList(cfg), config); diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index b2592f1d..11ae4320 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -228,9 +228,8 @@ export const BrowserWindow = () => { setFields(prevFields => { const updatedFields = { ...prevFields, - [newField.id]: newField + [newField.label]: newField }; - console.log(updatedFields) return updatedFields; }); @@ -288,7 +287,7 @@ export const BrowserWindow = () => { setFields(prevFields => { const updatedFields = { ...prevFields, - [newField.id]: newField + [newField.label]: newField }; return updatedFields; });