From e8e6a2d9b0cb6f16a60e6fc8b365d9f8ada1866f Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:17:45 +0530 Subject: [PATCH 01/11] feat: remove browser based click next pagination --- maxun-core/src/browserSide/scraper.js | 41 --------------------------- 1 file changed, 41 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 1ea4b3b4..706e332e 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -164,47 +164,6 @@ async function scrollUpToLoadMore(selector, limit) { } } -async function clickNextPagination(selector, scrapedData, limit) { - // Check if the limit is already met - if (scrapedData.length >= limit) { - return false; // Return false to indicate no further action is needed - } - - // Check if a single "Next" button exists - let nextButton = document.querySelector(selector); - - if (nextButton) { - nextButton.click(); - return true; // Indicate that pagination occurred - } else { - // Handle pagination with numbers - const paginationButtons = document.querySelectorAll(selector); - let clicked = false; - - // Loop through pagination buttons to find the current active page - for (let i = 0; i < paginationButtons.length - 1; i++) { - const button = paginationButtons[i]; - if (button.classList.contains('active')) { - // Click the next button if available - const nextButtonInPagination = paginationButtons[i + 1]; - if (nextButtonInPagination) { - nextButtonInPagination.click(); - clicked = true; - break; - } - } - } - - // If no next button was clicked, we might be on the last page - if (!clicked) { - throw new Error("No more items to load or pagination has ended."); - } - - return clicked; // Indicate whether pagination occurred - } -} - - /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) From 7ab668633c5fd062afadf74c29d98013dbcf4954 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:19:29 +0530 Subject: [PATCH 02/11] feat: remove browser side scroll up --- maxun-core/src/browserSide/scraper.js | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 706e332e..c5ec54bf 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -145,25 +145,6 @@ async function scrollDownToLoadMore(selector, limit) { } } -async function scrollUpToLoadMore(selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, -window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } -} - /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) From e9f1ff0294c6473338b2a5f830c6e84e1e2ea080 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:19:55 +0530 Subject: [PATCH 03/11] feat: remove browser side scroll down --- maxun-core/src/browserSide/scraper.js | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index c5ec54bf..56b91408 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -126,25 +126,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return out; } -async function scrollDownToLoadMore(selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } -} - /** * Returns a "scrape" result from the current page. * @returns {Array} *Curated* array of scraped information (with sparse rows removed) From b4eab606192bc4b44fd494f0ddfba359bf876865 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:20:44 +0530 Subject: [PATCH 04/11] chore: remove comment --- maxun-core/src/browserSide/scraper.js | 1 - 1 file changed, 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 56b91408..a4b2c19e 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -97,7 +97,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes)); } - // console.debug(`Total ${metricType} is ${metric}.`) if (metric > maxSelector.metric && elements.length < maxCountPerPage) { maxSelector = { selector, metric }; } From f377e9702ec6877585d0a09ddbcf2a42d60782f7 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:22:23 +0530 Subject: [PATCH 05/11] feat: remove window scrollDown & scrollUp --- maxun-core/src/browserSide/scraper.js | 40 +-------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index a4b2c19e..79893568 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -340,43 +340,5 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return results; }; - - window.scrollDown = async function (selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollTo(0, document.body.scrollHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } - } - - window.scrollUp = async function (selector, limit) { - let previousHeight = 0; - let itemsLoaded = 0; - - while (itemsLoaded < limit) { - window.scrollBy(0, -window.innerHeight); - await new Promise(resolve => setTimeout(resolve, 1000)); - - const currentHeight = document.body.scrollHeight; - - if (currentHeight === previousHeight) { - break; // No more items to load - } - - previousHeight = currentHeight; - itemsLoaded += document.querySelectorAll(selector).length; - } - } - + })(window); \ No newline at end of file From b598d265b2c4e93fd919ca307c5d7caba54495e3 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 20:26:51 +0530 Subject: [PATCH 06/11] feat: scroll up --- maxun-core/src/interpret.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 4068f7be..8a5b3788 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -406,6 +406,17 @@ export default class Interpreter extends EventEmitter { previousHeight = currentHeight; break; case 'scrollUp': + await page.evaluate(() => window.scrollTo(0, 0)); + await page.waitForTimeout(2000); + + const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop); + if (currentTopHeight === 0) { + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + + previousHeight = currentTopHeight; break; case 'clickNext': const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); From 9c25818c4f86bbd4e40654a1e21f6ccb06853be6 Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 23:34:19 +0530 Subject: [PATCH 07/11] feat: use newField.label instead of id --- src/components/organisms/BrowserWindow.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index b2592f1d..e5119594 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -228,7 +228,7 @@ export const BrowserWindow = () => { setFields(prevFields => { const updatedFields = { ...prevFields, - [newField.id]: newField + [newField.label]: newField }; console.log(updatedFields) return updatedFields; @@ -288,7 +288,7 @@ export const BrowserWindow = () => { setFields(prevFields => { const updatedFields = { ...prevFields, - [newField.id]: newField + [newField.label]: newField }; return updatedFields; }); From ce770e3c508657ba52baa7cf80050371a5830ccc Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 23:34:49 +0530 Subject: [PATCH 08/11] chore: -rm console.log --- src/components/organisms/BrowserWindow.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index e5119594..11ae4320 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -230,7 +230,6 @@ export const BrowserWindow = () => { ...prevFields, [newField.label]: newField }; - console.log(updatedFields) return updatedFields; }); From fb6ce1069312c95fa42c0327b90de669f73cf81b Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 23:35:49 +0530 Subject: [PATCH 09/11] feat: click load more --- maxun-core/src/interpret.ts | 38 +++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 8a5b3788..38cf9cb0 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -447,11 +447,41 @@ export default class Interpreter extends EventEmitter { await page.waitForTimeout(1000); break; case 'clickLoadMore': - const loadMoreButton = await page.$(config.pagination.selector); - if (!loadMoreButton) { - return allResults; + while (true) { + // Find and click the 'Load More' button + const loadMoreButton = await page.$(config.pagination.selector); + if (!loadMoreButton) { + // No more "Load More" button, so scrape the remaining items + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + + // Click the 'Load More' button to load additional items + await loadMoreButton.click(); + await page.waitForTimeout(2000); // Wait for new items to load + + // After clicking 'Load More', scroll down to load more items + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await page.waitForTimeout(2000); + + // Check if more items are available + const currentHeight = await page.evaluate(() => document.body.scrollHeight); + if (currentHeight === previousHeight) { + // No more items loaded, return the scraped results + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + + previousHeight = currentHeight; + + if (config.limit && allResults.length >= config.limit) { + // If limit is set and reached, return the limited results + allResults = allResults.slice(0, config.limit); + break; + } } - await loadMoreButton.click(); break; default: const results = await page.evaluate((cfg) => window.scrapeList(cfg), config); From a7a1e9b305a4f3ffea77f4ab11434a404c9efa0a Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 23:36:28 +0530 Subject: [PATCH 10/11] fix: spacing --- maxun-core/src/interpret.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 38cf9cb0..5abcbf39 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -448,7 +448,6 @@ export default class Interpreter extends EventEmitter { break; case 'clickLoadMore': while (true) { - // Find and click the 'Load More' button const loadMoreButton = await page.$(config.pagination.selector); if (!loadMoreButton) { // No more "Load More" button, so scrape the remaining items @@ -456,15 +455,12 @@ export default class Interpreter extends EventEmitter { allResults = allResults.concat(finalResults); return allResults; } - // Click the 'Load More' button to load additional items await loadMoreButton.click(); await page.waitForTimeout(2000); // Wait for new items to load - // After clicking 'Load More', scroll down to load more items await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); await page.waitForTimeout(2000); - // Check if more items are available const currentHeight = await page.evaluate(() => document.body.scrollHeight); if (currentHeight === previousHeight) { @@ -473,9 +469,7 @@ export default class Interpreter extends EventEmitter { allResults = allResults.concat(finalResults); return allResults; } - previousHeight = currentHeight; - if (config.limit && allResults.length >= config.limit) { // If limit is set and reached, return the limited results allResults = allResults.slice(0, config.limit); From 061714fcdcb3df73825a7be79f6a435ab13dcdff Mon Sep 17 00:00:00 2001 From: karishmas6 Date: Fri, 20 Sep 2024 23:37:00 +0530 Subject: [PATCH 11/11] chore: lint --- maxun-core/src/interpret.ts | 64 ++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 5abcbf39..0543d71f 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -408,14 +408,14 @@ export default class Interpreter extends EventEmitter { case 'scrollUp': await page.evaluate(() => window.scrollTo(0, 0)); await page.waitForTimeout(2000); - + const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop); if (currentTopHeight === 0) { - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); - return allResults; + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; } - + previousHeight = currentTopHeight; break; case 'clickNext': @@ -448,33 +448,33 @@ export default class Interpreter extends EventEmitter { break; case 'clickLoadMore': while (true) { - const loadMoreButton = await page.$(config.pagination.selector); - if (!loadMoreButton) { - // No more "Load More" button, so scrape the remaining items - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); - return allResults; - } - // Click the 'Load More' button to load additional items - await loadMoreButton.click(); - await page.waitForTimeout(2000); // Wait for new items to load - // After clicking 'Load More', scroll down to load more items - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await page.waitForTimeout(2000); - // Check if more items are available - const currentHeight = await page.evaluate(() => document.body.scrollHeight); - if (currentHeight === previousHeight) { - // No more items loaded, return the scraped results - const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); - allResults = allResults.concat(finalResults); - return allResults; - } - previousHeight = currentHeight; - if (config.limit && allResults.length >= config.limit) { - // If limit is set and reached, return the limited results - allResults = allResults.slice(0, config.limit); - break; - } + const loadMoreButton = await page.$(config.pagination.selector); + if (!loadMoreButton) { + // No more "Load More" button, so scrape the remaining items + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + // Click the 'Load More' button to load additional items + await loadMoreButton.click(); + await page.waitForTimeout(2000); // Wait for new items to load + // After clicking 'Load More', scroll down to load more items + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await page.waitForTimeout(2000); + // Check if more items are available + const currentHeight = await page.evaluate(() => document.body.scrollHeight); + if (currentHeight === previousHeight) { + // No more items loaded, return the scraped results + const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + allResults = allResults.concat(finalResults); + return allResults; + } + previousHeight = currentHeight; + if (config.limit && allResults.length >= config.limit) { + // If limit is set and reached, return the limited results + allResults = allResults.slice(0, config.limit); + break; + } } break; default: