chore: setup scraper engine

This commit is contained in:
karishmas6
2024-04-04 09:12:35 +05:30
parent 2e6a0290f2
commit 792e26e01f
9 changed files with 6236 additions and 0 deletions

15
scraper/src/main.ts Normal file
View File

@@ -0,0 +1,15 @@
// For more information, see https://crawlee.dev/
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee';
import { router } from './routes.js';
// Seed URLs handed to `crawler.run()` below.
const startUrls = ['https://crawlee.dev'];

// Playwright-based crawler; every request is dispatched through the router
// imported from ./routes.js.
const crawler = new PlaywrightCrawler({
    // Comment this option to scrape the full website.
    maxRequestsPerCrawl: 20,
    requestHandler: router,
    // proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
});

// Top-level await: the module does not finish evaluating until the run settles.
await crawler.run(startUrls);

21
scraper/src/routes.ts Normal file
View File

@@ -0,0 +1,21 @@
import { createPlaywrightRouter } from 'crawlee';
/**
 * Router instance shared by the scraper; handlers are registered on it below.
 */
export const router = createPlaywrightRouter();

// Default handler: enqueue links matching the crawlee.dev glob and tag the
// new requests with the 'detail' label.
router.addDefaultHandler(async (context) => {
    const sameSiteGlobs = ['https://crawlee.dev/**'];

    context.log.info('enqueueing new URLs');
    await context.enqueueLinks({ globs: sameSiteGlobs, label: 'detail' });
});
// 'detail' handler: read the page title, log it together with the loaded URL,
// and push a { url, title } record via pushData.
router.addHandler('detail', async ({ request, page, log, pushData }) => {
    const pageTitle = await page.title();
    const loadedUrl = request.loadedUrl;

    log.info(pageTitle, { url: loadedUrl });

    const record = { url: loadedUrl, title: pageTitle };
    await pushData(record);
});