import { PlaywrightCrawler, Configuration } from 'crawlee';
/**
 * Scrape trimmed text content for a set of CSS selectors from a single page.
 *
 * @param {string} url - URL of the page to visit.
 * @param {string[]} selectors - CSS selectors whose matched elements' text is collected.
 * @param {number} [waitForSeconds=2] - Extra settle time after navigation, in seconds,
 *   to let client-side rendering finish before querying the DOM.
 * @returns {Promise<string[]>} Trimmed `textContent` of every matched element,
 *   concatenated in selector order.
 */
async function scrapeData(url, selectors, waitForSeconds = 2) {
  const scrapedData = [];

  const crawler = new PlaywrightCrawler(
    {
      requestHandler: async ({ page }) => {
        // NOTE(review): PlaywrightCrawler navigates to request.url before this
        // handler runs, so the original `await page.goto(url)` here caused a
        // redundant second navigation; it has been removed.

        // Fixed settle delay so dynamically-rendered content has time to appear.
        await page.waitForTimeout(waitForSeconds * 1000);

        console.log('Received selectors:', selectors);

        for (const selector of selectors) {
          // Evaluate in the page context: trimmed text of every matching element.
          const elementData = await page.$$eval(selector, (elements) =>
            elements.map((el) => el.textContent.trim()),
          );
          scrapedData.push(...elementData);
        }

        console.log('Scraped data:', scrapedData);
      },
    },
    // Keep runs stateless: don't persist request queue / datasets to disk,
    // so repeated calls with the same URL are not deduplicated away.
    new Configuration({
      persistStorage: false,
    }),
  );

  await crawler.run([url]);
  return scrapedData;
}
export default scrapeData;