Merge pull request #947 from getmaxun/crawl-fix

fix(core): max depth crawling logic
This commit is contained in:
Karishma Shukla
2026-01-22 18:56:34 +05:30
committed by GitHub

View File

@@ -706,7 +706,6 @@ export default class Interpreter extends EventEmitter {
return;
}
if (this.options.debugChannel?.setActionType) {
this.options.debugChannel.setActionType('crawl');
}
@@ -728,130 +727,141 @@ export default class Interpreter extends EventEmitter {
const parsedBase = new URL(baseUrl);
const baseDomain = parsedBase.hostname;
let discoveredUrls: string[] = [];
/**
 * Rules collected while parsing robots.txt for the user-agent groups the
 * crawler treats as applying to it.
 */
interface RobotRules {
// Values of `Disallow:` directives, stored exactly as written in robots.txt.
disallowedPaths: string[];
// Values of `Allow:` directives, stored exactly as written in robots.txt.
allowedPaths: string[];
// `Crawl-delay` value multiplied by 1000 (used directly as a setTimeout delay);
// null when no positive delay was parsed.
crawlDelay: number | null;
}
if (crawlConfig.useSitemap) {
this.log('Fetching sitemap URLs...', Level.LOG);
let robotRules: RobotRules = {
disallowedPaths: [],
allowedPaths: [],
crawlDelay: null
};
if (crawlConfig.respectRobots) {
this.log('Fetching robots.txt...', Level.LOG);
try {
const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
const sitemapUrls = await page.evaluate((url) => {
return new Promise<string[]>((resolve) => {
const robotsContent = await page.evaluate((url) => {
return new Promise<string>((resolve) => {
const xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onload = function() {
if (xhr.status === 200) {
const text = xhr.responseText;
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
resolve(urls);
resolve(xhr.responseText);
} else {
resolve([]);
resolve('');
}
};
xhr.onerror = function() {
resolve([]);
resolve('');
};
xhr.send();
});
}, sitemapUrl);
}, robotsUrl);
if (sitemapUrls.length > 0) {
const nestedSitemaps = sitemapUrls.filter(url =>
url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
);
const regularUrls = sitemapUrls.filter(url =>
!url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
);
if (robotsContent) {
const lines = robotsContent.split('\n');
let isRelevantUserAgent = false;
let foundSpecificUserAgent = false;
discoveredUrls.push(...regularUrls);
this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
for (const line of lines) {
const trimmedLine = line.trim().toLowerCase();
for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
try {
this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
const nestedUrls = await page.evaluate((url) => {
return new Promise<string[]>((resolve) => {
const xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onload = function() {
if (xhr.status === 200) {
const text = xhr.responseText;
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
resolve(urls);
if (trimmedLine.startsWith('#') || trimmedLine === '') {
continue;
}
const colonIndex = line.indexOf(':');
if (colonIndex === -1) continue;
const directive = line.substring(0, colonIndex).trim().toLowerCase();
const value = line.substring(colonIndex + 1).trim();
if (directive === 'user-agent') {
const agent = value.toLowerCase();
if (agent === '*' && !foundSpecificUserAgent) {
isRelevantUserAgent = true;
} else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
isRelevantUserAgent = true;
foundSpecificUserAgent = true;
} else {
resolve([]);
if (!foundSpecificUserAgent) {
isRelevantUserAgent = false;
}
}
} else if (isRelevantUserAgent) {
if (directive === 'disallow' && value) {
robotRules.disallowedPaths.push(value);
} else if (directive === 'allow' && value) {
robotRules.allowedPaths.push(value);
} else if (directive === 'crawl-delay' && value) {
const delay = parseFloat(value);
if (!isNaN(delay) && delay > 0) {
robotRules.crawlDelay = delay * 1000;
}
}
}
}
this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, Level.LOG);
} else {
this.log('No robots.txt found or not accessible, proceeding without restrictions', Level.WARN);
}
} catch (error) {
this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, Level.WARN);
}
}
/**
 * Decides whether `url` may be crawled under the parsed robots.txt rules.
 *
 * Matching follows RFC 9309:
 *  - rules are matched against the URL path plus query string;
 *  - a rule is a prefix match unless it ends in `$`, which anchors it to the
 *    end of the target;
 *  - `*` matches any run of characters, every other character is literal;
 *  - when both Allow and Disallow rules match, the longest pattern wins and
 *    Allow wins a tie.
 *
 * @param url Absolute URL to check.
 * @returns `true` when robots handling is disabled, when the URL cannot be
 *          parsed, when no Disallow rule matches, or when the best matching
 *          Allow rule is at least as specific as the best Disallow rule.
 */
const isUrlAllowedByRobots = (url: string): boolean => {
  if (!crawlConfig.respectRobots) return true;

  // Tests one robots.txt path pattern against the path+query of a URL.
  const matchesRobotsPattern = (pattern: string, target: string): boolean => {
    if (!pattern) return false;

    const anchored = pattern.endsWith('$');
    const body = anchored ? pattern.slice(0, -1) : pattern;

    // Fast path: no wildcard means a plain prefix (or exact, if anchored) match.
    if (!body.includes('*')) {
      return anchored ? target === body : target.startsWith(body);
    }

    // Escape every literal segment so characters such as `.` or `?` are not
    // interpreted as regex syntax; only `*` becomes a wildcard.
    const source = body
      .split('*')
      .map(part => part.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');

    return new RegExp('^' + source + (anchored ? '$' : '')).test(target);
  };

  try {
    const { pathname, search } = new URL(url);
    const target = pathname + search;

    // Length of the most specific matching pattern, or -1 when none match.
    const longestMatch = (patterns: string[]): number =>
      patterns.reduce(
        (best, pattern) =>
          matchesRobotsPattern(pattern, target) ? Math.max(best, pattern.length) : best,
        -1
      );

    const allowLength = longestMatch(robotRules.allowedPaths);
    const disallowLength = longestMatch(robotRules.disallowedPaths);

    return disallowLength === -1 || allowLength >= disallowLength;
  } catch {
    // An unparseable URL cannot be matched against any rule; do not block it here.
    return true;
  }
};
xhr.onerror = function() {
resolve([]);
};
xhr.send();
});
}, nestedUrl);
if (nestedUrls.length > 0) {
discoveredUrls.push(...nestedUrls);
this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
}
} catch (error) {
this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
}
}
this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
} else {
this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
}
} catch (error) {
this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
}
}
if (crawlConfig.followLinks) {
this.log('Extracting links from current page...', Level.LOG);
try {
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
this.log('Network did not become idle, continuing anyway', Level.WARN);
});
await new Promise(resolve => setTimeout(resolve, 5000));
const anchorCount = await page.evaluate(() => {
return document.querySelectorAll('a').length;
});
this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);
const pageLinks = await page.evaluate(() => {
const links: string[] = [];
const allAnchors = document.querySelectorAll('a');
console.log('Total anchors found:', allAnchors.length);
for (let i = 0; i < allAnchors.length; i++) {
const anchor = allAnchors[i] as HTMLAnchorElement;
const href = anchor.getAttribute('href');
const fullHref = anchor.href;
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
links.push(fullHref);
}
}
console.log('Links extracted:', links.length);
return links;
});
discoveredUrls.push(...pageLinks);
this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
} catch (error) {
this.log(`Link extraction failed: ${error.message}`, Level.WARN);
}
}
const filteredUrls = discoveredUrls.filter(url => {
const isUrlAllowedByConfig = (url: string): boolean => {
try {
const urlObj = new URL(url);
@@ -865,16 +875,24 @@ export default class Interpreter extends EventEmitter {
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
const matches = crawlConfig.includePaths.some(pattern => {
try {
const regex = new RegExp(pattern);
return regex.test(url);
} catch {
return url.includes(pattern);
}
});
if (!matches) return false;
}
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
const matches = crawlConfig.excludePaths.some(pattern => {
try {
const regex = new RegExp(pattern);
return regex.test(url);
} catch {
return url.includes(pattern);
}
});
if (matches) return false;
}
@@ -883,50 +901,42 @@ export default class Interpreter extends EventEmitter {
} catch (error) {
return false;
}
});
};
const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
const normalizeUrl = (url: string): string => {
return url.replace(/#.*$/, '').replace(/\/$/, '');
})));
};
const basePathname = parsedBase.pathname;
const prioritizedUrls = uniqueUrls.sort((a, b) => {
const extractLinksFromPage = async (): Promise<string[]> => {
try {
const aUrl = new URL(a);
const bUrl = new URL(b);
const aMatchesBase = aUrl.pathname.startsWith(basePathname);
const bMatchesBase = bUrl.pathname.startsWith(basePathname);
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
await new Promise(resolve => setTimeout(resolve, 1000));
if (aMatchesBase && !bMatchesBase) return -1;
if (!aMatchesBase && bMatchesBase) return 1;
const pageLinks = await page.evaluate(() => {
const links: string[] = [];
const allAnchors = document.querySelectorAll('a');
return 0;
} catch (error) {
return 0;
for (let i = 0; i < allAnchors.length; i++) {
const anchor = allAnchors[i] as HTMLAnchorElement;
const fullHref = anchor.href;
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
links.push(fullHref);
}
}
return links;
});
const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);
const crawlResults = [];
for (let i = 0; i < finalUrls.length; i++) {
const url = finalUrls[i];
try {
this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 30000
}).catch(() => {
this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN);
});
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
return pageLinks;
} catch (error) {
this.log(`Link extraction failed: ${error.message}`, Level.WARN);
return [];
}
};
const scrapePageContent = async (url: string) => {
const pageData = await page.evaluate(() => {
const getMeta = (name: string) => {
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
@@ -973,32 +983,197 @@ export default class Interpreter extends EventEmitter {
};
});
crawlResults.push({
return {
metadata: {
...pageData.metadata,
url: url,
sourceURL: url
},
} as Record<string, any>,
html: pageData.html,
text: pageData.text,
links: pageData.links,
wordCount: pageData.wordCount,
scrapedAt: new Date().toISOString()
};
};
const visitedUrls = new Set<string>();
const crawlResults: any[] = [];
/** One pending entry in the breadth-first crawl queue. */
interface CrawlQueueItem {
// URL to navigate to and scrape.
url: string;
// Link distance assigned when queued: 0 for the base URL, 1 for URLs taken
// from a sitemap, and the parent's depth + 1 for links followed from a page.
depth: number;
}
const crawlQueue: CrawlQueueItem[] = [];
const normalizedBaseUrl = normalizeUrl(baseUrl);
visitedUrls.add(normalizedBaseUrl);
crawlQueue.push({ url: baseUrl, depth: 0 });
this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, Level.LOG);
// Seed the crawl queue from /sitemap.xml and up to 10 nested sitemaps it
// references. Every accepted URL is queued at depth 1. Failures are logged and
// never abort the crawl.
if (crawlConfig.useSitemap) {
  this.log('Fetching sitemap URLs...', Level.LOG);

  // Fetches one sitemap inside the page context and returns its <loc> values.
  // Resolves to [] on any non-200 status or network error. The callback is
  // serialized into the browser, so it must not reference outer variables.
  const fetchSitemapLocs = (sitemapLocation: string): Promise<string[]> =>
    page.evaluate((url) => {
      return new Promise<string[]>((resolve) => {
        const xhr = new XMLHttpRequest();
        xhr.open('GET', url, true);
        xhr.onload = function() {
          if (xhr.status !== 200) {
            resolve([]);
            return;
          }
          const entities: Record<string, string> = {
            amp: '&', lt: '<', gt: '>', quot: '"', apos: "'"
          };
          const locs: string[] = [];
          // [\s\S] lets a <loc> value span line breaks, which `.` would not.
          const locPattern = /<loc>([\s\S]*?)<\/loc>/g;
          let match: RegExpExecArray | null;
          while ((match = locPattern.exec(xhr.responseText)) !== null) {
            const unwrapped = match[1]
              .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
              .trim();
            // Sitemap URLs are XML-escaped (e.g. `&amp;` in query strings).
            // A single pass avoids double-decoding sequences like `&amp;lt;`.
            const decoded = unwrapped.replace(
              /&(amp|lt|gt|quot|apos);/g,
              (_entity, name) => entities[name]
            );
            if (decoded) locs.push(decoded);
          }
          resolve(locs);
        };
        xhr.onerror = function() {
          resolve([]);
        };
        xhr.send();
      });
    }, sitemapLocation);

  // Heuristic used to tell a nested sitemap reference from a regular page URL.
  const looksLikeSitemap = (candidate: string): boolean =>
    candidate.endsWith('/sitemap') || candidate.endsWith('sitemap.xml') || candidate.includes('/sitemap/');

  // Queues a sitemap-provided URL unless it was already seen or is filtered out.
  const enqueueFromSitemap = (candidate: string): void => {
    const normalized = normalizeUrl(candidate);
    if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(candidate) && isUrlAllowedByRobots(candidate)) {
      visitedUrls.add(normalized);
      crawlQueue.push({ url: candidate, depth: 1 });
    }
  };

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  try {
    const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
    const sitemapUrls = await fetchSitemapLocs(sitemapUrl);

    if (sitemapUrls.length > 0) {
      const nestedSitemaps = sitemapUrls.filter(looksLikeSitemap);
      const regularUrls = sitemapUrls.filter(entry => !looksLikeSitemap(entry));

      regularUrls.forEach(enqueueFromSitemap);
      this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);

      // Cap nested sitemap fetches so a large sitemap index cannot stall the crawl.
      for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
        try {
          this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
          const nestedUrls = await fetchSitemapLocs(nestedUrl);
          nestedUrls.forEach(enqueueFromSitemap);
          this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
        } catch (error) {
          this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${describeError(error)}`, Level.WARN);
        }
      }

      // The queue already holds the base URL, hence the -1.
      this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, Level.LOG);
    } else {
      this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
    }
  } catch (error) {
    this.log(`Sitemap fetch failed: ${describeError(error)}`, Level.WARN);
  }
}
let processedCount = 0;
while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
if (this.isAborted) {
this.log('Workflow aborted during crawl', Level.WARN);
break;
}
const { url, depth } = crawlQueue.shift()!;
processedCount++;
this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, Level.LOG);
try {
if (robotRules.crawlDelay && crawlResults.length > 0) {
this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, Level.LOG);
await new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay!));
}
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 30000
}).catch((err) => {
throw new Error(`Navigation failed: ${err.message}`);
});
this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
const pageResult = await scrapePageContent(url);
pageResult.metadata.depth = depth;
crawlResults.push(pageResult);
this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, Level.LOG);
if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
const newLinks = await extractLinksFromPage();
let addedCount = 0;
for (const link of newLinks) {
const normalized = normalizeUrl(link);
if (!visitedUrls.has(normalized) &&
isUrlAllowedByConfig(link) &&
isUrlAllowedByRobots(link)) {
visitedUrls.add(normalized);
crawlQueue.push({ url: link, depth: depth + 1 });
addedCount++;
}
}
if (addedCount > 0) {
this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, Level.LOG);
}
}
} catch (error) {
this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN);
this.log(`Failed to crawl ${url}: ${error.message}`, Level.WARN);
crawlResults.push({
metadata: {
url: url,
sourceURL: url,
depth: depth
},
error: error.message,
scrapedAt: new Date().toISOString()
});
}
}
this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG);
this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, Level.LOG);
const actionType = "crawl";
const actionName = "Crawl Results";
@@ -1031,8 +1206,6 @@ export default class Interpreter extends EventEmitter {
provider?: 'duckduckgo';
filters?: {
timeRange?: 'day' | 'week' | 'month' | 'year';
location?: string;
lang?: string;
};
mode: 'discover' | 'scrape';
}) => {