fix: respect robots.txt rules and add max-depth crawl config

2026-01-19 00:38:06 +05:30
parent f3cb9273ec
commit 19f2df87fc
1 changed files with 344 additions and 171 deletions
--- a/maxun-core/src/interpret.ts
+++ b/maxun-core/src/interpret.ts
@@ -706,7 +706,6 @@ export default class Interpreter extends EventEmitter {
          return;
        }
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('crawl');
        }
@@ -728,7 +727,291 @@ export default class Interpreter extends EventEmitter {
          const parsedBase = new URL(baseUrl);
          const baseDomain = parsedBase.hostname;
-          let discoveredUrls: string[] = [];
+          interface RobotRules {
            disallowedPaths: string[];
            allowedPaths: string[];
            crawlDelay: number | null;
          }
          let robotRules: RobotRules = {
            disallowedPaths: [],
            allowedPaths: [],
            crawlDelay: null
          };
          // When robots.txt compliance is enabled, download <origin>/robots.txt and
          // collect its Allow / Disallow / Crawl-delay directives into `robotRules`.
          // Any failure is logged and the crawl proceeds without restrictions.
          if (crawlConfig.respectRobots) {
            this.log('Fetching robots.txt...', Level.LOG);

            try {
              const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;

              // The request is issued from inside the page via XMLHttpRequest; every
              // non-200 status or network error resolves to an empty string.
              const robotsContent = await page.evaluate((url) => {
                return new Promise<string>((resolve) => {
                  const xhr = new XMLHttpRequest();
                  xhr.open('GET', url, true);
                  xhr.onload = function() {
                    if (xhr.status === 200) {
                      resolve(xhr.responseText);
                    } else {
                      resolve('');
                    }
                  };
                  xhr.onerror = function() {
                    resolve('');
                  };
                  xhr.send();
                });
              }, robotsUrl);

              if (robotsContent) {
                const lines = robotsContent.split('\n');
                let isRelevantUserAgent = false;
                let foundSpecificUserAgent = false;

                for (const rawLine of lines) {
                  // '#' starts a comment anywhere on a line, not only at the start.
                  // Strip it first so "Disallow: /tmp # note" yields "/tmp".
                  const commentStart = rawLine.indexOf('#');
                  const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
                  if (line === '') {
                    continue;
                  }

                  const colonIndex = line.indexOf(':');
                  if (colonIndex === -1) continue;

                  const directive = line.substring(0, colonIndex).trim().toLowerCase();
                  const value = line.substring(colonIndex + 1).trim();

                  if (directive === 'user-agent') {
                    // NOTE(review): once a bot/crawler/spider group has been seen, later
                    // groups for other agents no longer reset isRelevantUserAgent, so
                    // their rules are collected as well — confirm this is intended.
                    const agent = value.toLowerCase();
                    if (agent === '*' && !foundSpecificUserAgent) {
                      isRelevantUserAgent = true;
                    } else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
                      isRelevantUserAgent = true;
                      foundSpecificUserAgent = true;
                    } else {
                      if (!foundSpecificUserAgent) {
                        isRelevantUserAgent = false;
                      }
                    }
                  } else if (isRelevantUserAgent) {
                    if (directive === 'disallow' && value) {
                      robotRules.disallowedPaths.push(value);
                    } else if (directive === 'allow' && value) {
                      robotRules.allowedPaths.push(value);
                    } else if (directive === 'crawl-delay' && value) {
                      // The parsed number is multiplied by 1000 and stored in milliseconds.
                      const delay = parseFloat(value);
                      if (!isNaN(delay) && delay > 0) {
                        robotRules.crawlDelay = delay * 1000;
                      }
                    }
                  }
                }

                this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, Level.LOG);
              } else {
                this.log('No robots.txt found or not accessible, proceeding without restrictions', Level.WARN);
              }
            } catch (error) {
              this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, Level.WARN);
            }
          }
          // Compiled robots.txt rules, cached so each rule string becomes a RegExp only once.
          const robotsPatternCache = new Map<string, RegExp>();

          /**
           * Decides whether `url` may be crawled under the collected robots.txt rules.
           *
           * Rules are matched against the URL's path plus query string. Within a rule,
           * `*` matches any run of characters and a trailing `$` anchors the rule to the
           * end of the target; every other character is literal. The longest matching
           * rule decides the outcome, and an Allow rule wins a tie with a Disallow rule.
           *
           * @param url Absolute URL to check.
           * @returns `true` when crawling is permitted. Also `true` when robots.txt
           *          compliance is disabled or `url` cannot be parsed.
           */
          const isUrlAllowedByRobots = (url: string): boolean => {
            if (!crawlConfig.respectRobots) return true;

            // Translates one robots.txt rule into an anchored-at-start regular expression.
            const compileRule = (rule: string): RegExp => {
              let compiled = robotsPatternCache.get(rule);
              if (!compiled) {
                const anchoredAtEnd = rule.endsWith('$');
                const body = anchoredAtEnd ? rule.slice(0, -1) : rule;
                // Escape each literal segment so characters like '.' or '?' are not
                // interpreted as regex syntax; only '*' becomes a wildcard.
                const source = body
                  .split('*')
                  .map(literal => literal.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
                  .join('.*');
                compiled = new RegExp('^' + source + (anchoredAtEnd ? '$' : ''));
                robotsPatternCache.set(rule, compiled);
              }
              return compiled;
            };

            // Length of the longest rule that matches `target`, or -1 when none match.
            const longestMatch = (rules: readonly string[], target: string): number => {
              let longest = -1;
              for (const rule of rules) {
                if (rule.length > longest && compileRule(rule).test(target)) {
                  longest = rule.length;
                }
              }
              return longest;
            };

            try {
              const urlObj = new URL(url);
              const target = urlObj.pathname + urlObj.search;

              // With no matching rule both sides are -1, so the URL is allowed.
              return longestMatch(robotRules.allowedPaths, target) >= longestMatch(robotRules.disallowedPaths, target);
            } catch (error) {
              // Unparseable URLs are not blocked here.
              return true;
            }
          };
          /**
           * Checks `url` against the crawl scope (`crawlConfig.mode`) and the optional
           * include / exclude pattern lists.
           *
           * - `domain`: hostname must equal the base hostname.
           * - `subdomain`: hostname must equal the base hostname or be a true subdomain
           *   of it (i.e. end with `.<base hostname>`).
           * - `path`: hostname must equal the base hostname and the path must start
           *   with the base URL's path.
           *
           * Include / exclude patterns are tested against the full URL string, first as
           * regular expressions and, if a pattern is not a valid regex, as substrings.
           *
           * @param url Absolute URL to check.
           * @returns `true` when the URL is in scope; `false` otherwise, including when
           *          `url` cannot be parsed.
           */
          const isUrlAllowedByConfig = (url: string): boolean => {
            const matchesAnyPattern = (patterns: readonly string[]): boolean =>
              patterns.some(pattern => {
                try {
                  return new RegExp(pattern).test(url);
                } catch {
                  // Not a valid regular expression: treat it as a plain substring.
                  return url.includes(pattern);
                }
              });

            try {
              const { hostname, pathname } = new URL(url);

              if (crawlConfig.mode === 'domain') {
                if (hostname !== baseDomain) return false;
              } else if (crawlConfig.mode === 'subdomain') {
                // Require a dot boundary so "evilexample.com" is not accepted for "example.com".
                if (hostname !== baseDomain && !hostname.endsWith('.' + baseDomain)) return false;
              } else if (crawlConfig.mode === 'path') {
                if (hostname !== baseDomain || !pathname.startsWith(parsedBase.pathname)) return false;
              }

              if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
                if (!matchesAnyPattern(crawlConfig.includePaths)) return false;
              }

              if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
                if (matchesAnyPattern(crawlConfig.excludePaths)) return false;
              }

              return true;
            } catch (error) {
              return false;
            }
          };
          /**
           * Produces the key used for de-duplicating URLs: the fragment (`#...`) is
           * dropped and a single trailing slash, if present, is removed.
           */
          const normalizeUrl = (url: string): string => {
            const withoutFragment = url.replace(/#.*$/, '');
            if (withoutFragment.endsWith('/')) {
              return withoutFragment.slice(0, -1);
            }
            return withoutFragment;
          };
          /**
           * Collects the absolute http(s) link targets of the anchors on the current page.
           *
           * Waits (best effort) for the `load` and `networkidle` states plus one extra
           * second before reading the DOM.
           *
           * @returns The resolved `href` of every anchor whose target is http or https.
           *          Returns an empty array (and logs a warning) if extraction fails.
           */
          const extractLinksFromPage = async (): Promise<string[]> => {
            try {
              // Load-state timeouts are deliberately ignored; extraction proceeds regardless.
              await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
              await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
              await new Promise(resolve => setTimeout(resolve, 1000));

              const pageLinks = await page.evaluate(() => {
                const links: string[] = [];
                const allAnchors = document.querySelectorAll('a');

                for (let i = 0; i < allAnchors.length; i++) {
                  // The 'a' selector also matches SVG <a> elements, whose `href` is an
                  // SVGAnimatedString object rather than a string. Skip those instead of
                  // letting `.startsWith` throw and discard every link on the page.
                  const fullHref: unknown = allAnchors[i].href;
                  if (typeof fullHref !== 'string') continue;

                  if (fullHref.startsWith('http://') || fullHref.startsWith('https://')) {
                    links.push(fullHref);
                  }
                }

                return links;
              });

              return pageLinks;
            } catch (error) {
              this.log(`Link extraction failed: ${error.message}`, Level.WARN);
              return [];
            }
          };
          /**
           * Captures the content of the page currently loaded in `page`.
           *
           * Inside the page it reads the title, visible body text, every <meta> tag and
           * all anchor hrefs, strips `data-mx-id` attributes, and then serializes the
           * document HTML. The reported `statusCode` is the constant 200.
           *
           * @param url URL recorded as `url` / `sourceURL` in the result metadata.
           * @returns The page's metadata, HTML, text, links, word count and a timestamp.
           */
          const scrapePageContent = async (url: string) => {
            const pageData = await page.evaluate(() => {
              // Content of the first <meta> whose name or property equals `name`.
              const getMeta = (name: string) => {
                const selector = `meta[name="${name}"], meta[property="${name}"]`;
                return document.querySelector(selector)?.getAttribute('content') || '';
              };

              // All <meta> tags keyed by name (or property); later duplicates overwrite earlier ones.
              const getAllMeta = () => {
                const collected: Record<string, string> = {};
                for (const tag of Array.from(document.querySelectorAll('meta'))) {
                  const key = tag.getAttribute('name') || tag.getAttribute('property');
                  const value = tag.getAttribute('content');
                  if (key && value) {
                    collected[key] = value;
                  }
                }
                return collected;
              };

              const title = document.title || '';
              const bodyText = document.body?.innerText || '';

              // Remove data-mx-id attributes before the HTML is serialized.
              for (const el of Array.from(document.querySelectorAll('[data-mx-id]'))) {
                el.removeAttribute('data-mx-id');
              }

              const html = document.documentElement.outerHTML;
              const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
              const allMetadata = getAllMeta();
              const words = bodyText.split(/\s+/).filter(w => w.length > 0);
              const faviconLink = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement;

              return {
                title,
                description: getMeta('description'),
                text: bodyText,
                html: html,
                links: links,
                wordCount: words.length,
                metadata: {
                  ...allMetadata,
                  title,
                  language: document.documentElement.lang || '',
                  favicon: faviconLink?.href || '',
                  statusCode: 200
                }
              };
            });

            const metadata = {
              ...pageData.metadata,
              url: url,
              sourceURL: url
            } as Record<string, any>;

            return {
              metadata,
              html: pageData.html,
              text: pageData.text,
              links: pageData.links,
              wordCount: pageData.wordCount,
              scrapedAt: new Date().toISOString()
            };
          };
          // Normalized URLs that have already been queued, used to avoid queueing the
          // same page twice.
          const visitedUrls = new Set<string>();
          // One entry per crawled page; the crawl loop appends scraped results and
          // error records here.
          const crawlResults: any[] = [];
          // A pending URL together with the depth at which it was queued
          // (the base URL is queued at depth 0).
          interface CrawlQueueItem {
            url: string;
            depth: number;
          }
          // Pending work list, seeded below with the base URL.
          const crawlQueue: CrawlQueueItem[] = [];
          const normalizedBaseUrl = normalizeUrl(baseUrl);
          // Mark the base URL as seen up front so links back to it are not re-queued.
          visitedUrls.add(normalizedBaseUrl);
          crawlQueue.push({ url: baseUrl, depth: 0 });
          this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, Level.LOG);
          if (crawlConfig.useSitemap) {
            this.log('Fetching sitemap URLs...', Level.LOG);
@@ -764,7 +1047,14 @@ export default class Interpreter extends EventEmitter {
                  !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
                );
-                discoveredUrls.push(...regularUrls);
+                for (const sitemapPageUrl of regularUrls) {
                  const normalized = normalizeUrl(sitemapPageUrl);
                  if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
                    visitedUrls.add(normalized);
                    crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
                  }
                }
                this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
                for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
@@ -791,16 +1081,21 @@ export default class Interpreter extends EventEmitter {
                      });
                    }, nestedUrl);
-                    if (nestedUrls.length > 0) {
+                    for (const nestedPageUrl of nestedUrls) {
-                      discoveredUrls.push(...nestedUrls);
+                      const normalized = normalizeUrl(nestedPageUrl);
-                      this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
+                      if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
                        visitedUrls.add(normalized);
                        crawlQueue.push({ url: nestedPageUrl, depth: 1 });
                      }
                    }
                    this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
                  } catch (error) {
                    this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
                  }
                }
-                this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
+                this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, Level.LOG);
              } else {
                this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
              }
@@ -809,196 +1104,76 @@ export default class Interpreter extends EventEmitter {
            }
          }
-          if (crawlConfig.followLinks) {
+          let processedCount = 0;
            this.log('Extracting links from current page...', Level.LOG);
            try {
              await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
-              await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
+          while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
-                this.log('Network did not become idle, continuing anyway', Level.WARN);
+            if (this.isAborted) {
-              });
+              this.log('Workflow aborted during crawl', Level.WARN);
-
+              break;
              await new Promise(resolve => setTimeout(resolve, 5000));
              const anchorCount = await page.evaluate(() => {
                return document.querySelectorAll('a').length;
              });
              this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);
              const pageLinks = await page.evaluate(() => {
                const links: string[] = [];
                const allAnchors = document.querySelectorAll('a');
                console.log('Total anchors found:', allAnchors.length);
                for (let i = 0; i < allAnchors.length; i++) {
                  const anchor = allAnchors[i] as HTMLAnchorElement;
                  const href = anchor.getAttribute('href');
                  const fullHref = anchor.href;
                  if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
                    links.push(fullHref);
                  }
                }
                console.log('Links extracted:', links.length);
                return links;
              });
              discoveredUrls.push(...pageLinks);
              this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
            } catch (error) {
              this.log(`Link extraction failed: ${error.message}`, Level.WARN);
            }
          }
-          const filteredUrls = discoveredUrls.filter(url => {
+            const { url, depth } = crawlQueue.shift()!;
            processedCount++;
            this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, Level.LOG);
            try {
-              const urlObj = new URL(url);
+              if (robotRules.crawlDelay && crawlResults.length > 0) {
-
+                this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, Level.LOG);
-              if (crawlConfig.mode === 'domain') {
+                await new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay!));
                if (urlObj.hostname !== baseDomain) return false;
              } else if (crawlConfig.mode === 'subdomain') {
                if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain) return false;
              } else if (crawlConfig.mode === 'path') {
                if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname)) return false;
              }
              if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
                const matches = crawlConfig.includePaths.some(pattern => {
                  const regex = new RegExp(pattern);
                  return regex.test(url);
                });
                if (!matches) return false;
              }
              if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
                const matches = crawlConfig.excludePaths.some(pattern => {
                  const regex = new RegExp(pattern);
                  return regex.test(url);
                });
                if (matches) return false;
              }
              return true;
            } catch (error) {
              return false;
            }
          });
          const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
            return url.replace(/#.*$/, '').replace(/\/$/, '');
          })));
          const basePathname = parsedBase.pathname;
          const prioritizedUrls = uniqueUrls.sort((a, b) => {
            try {
              const aUrl = new URL(a);
              const bUrl = new URL(b);
              const aMatchesBase = aUrl.pathname.startsWith(basePathname);
              const bMatchesBase = bUrl.pathname.startsWith(basePathname);
              if (aMatchesBase && !bMatchesBase) return -1;
              if (!aMatchesBase && bMatchesBase) return 1;
              return 0;
            } catch (error) {
              return 0;
            }
          });
          const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
          this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
          this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);
          const crawlResults = [];
          for (let i = 0; i < finalUrls.length; i++) {
            const url = finalUrls[i];
            try {
              this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);
              await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: 30000
-              }).catch(() => {
+              }).catch((err) => {
-                this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN);
+                throw new Error(`Navigation failed: ${err.message}`);
              });
              await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
-              const pageData = await page.evaluate(() => {
+              const pageResult = await scrapePageContent(url);
-                const getMeta = (name: string) => {
+              pageResult.metadata.depth = depth;
-                  const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+              crawlResults.push(pageResult);
                  return meta?.getAttribute('content') || '';
                };
-                const getAllMeta = () => {
+              this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, Level.LOG);
                  const metadata: Record<string, string> = {};
                  const metaTags = document.querySelectorAll('meta');
                  metaTags.forEach(tag => {
                    const name = tag.getAttribute('name') || tag.getAttribute('property');
                    const content = tag.getAttribute('content');
                    if (name && content) {
                      metadata[name] = content;
                    }
                  });
                  return metadata;
                };
-                const title = document.title || '';
+              if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
-                const bodyText = document.body?.innerText || '';
+                const newLinks = await extractLinksFromPage();
                let addedCount = 0;
-                const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
+                for (const link of newLinks) {
-                elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+                  const normalized = normalizeUrl(link);
-                const html = document.documentElement.outerHTML;
+                  if (!visitedUrls.has(normalized) &&
-                const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
+                      isUrlAllowedByConfig(link) &&
-                const allMetadata = getAllMeta();
+                      isUrlAllowedByRobots(link)) {
-
+                    visitedUrls.add(normalized);
-                return {
+                    crawlQueue.push({ url: link, depth: depth + 1 });
-                  title,
+                    addedCount++;
                  description: getMeta('description'),
                  text: bodyText,
                  html: html,
                  links: links,
                  wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
                  metadata: {
                    ...allMetadata,
                    title,
                    language: document.documentElement.lang || '',
                    favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
                    statusCode: 200
                  }
-                };
+                }
              });
-              crawlResults.push({
+                if (addedCount > 0) {
-                metadata: {
+                  this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, Level.LOG);
-                  ...pageData.metadata,
+                }
-                  url: url,
+              }
                  sourceURL: url
                },
                html: pageData.html,
                text: pageData.text,
                links: pageData.links,
                wordCount: pageData.wordCount,
                scrapedAt: new Date().toISOString()
              });
              this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
            } catch (error) {
-              this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN);
+              this.log(`Failed to crawl ${url}: ${error.message}`, Level.WARN);
              crawlResults.push({
-                url: url,
+                metadata: {
                  url: url,
                  sourceURL: url,
                  depth: depth
                },
                error: error.message,
                scrapedAt: new Date().toISOString()
              });
            }
          }
-          this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG);
+          this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, Level.LOG);
          const actionType = "crawl";
          const actionName = "Crawl Results";
@@ -1031,8 +1206,6 @@ export default class Interpreter extends EventEmitter {
        provider?: 'duckduckgo';
        filters?: {
          timeRange?: 'day' | 'week' | 'month' | 'year';
          location?: string;
          lang?: string;
        };
        mode: 'discover' | 'scrape';
      }) => {