Merge pull request #947 from getmaxun/crawl-fix

fix(core): max depth crawling logic
2026-01-22 18:56:34 +05:30
parent 05db2ccfe6 19f2df87fc
commit 60cbceaa3d
1 changed files with 344 additions and 171 deletions
--- a/maxun-core/src/interpret.ts
+++ b/maxun-core/src/interpret.ts
@@ -706,7 +706,6 @@ export default class Interpreter extends EventEmitter {
          return;
        }

-
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('crawl');
        }
@@ -728,130 +727,141 @@ export default class Interpreter extends EventEmitter {
          const parsedBase = new URL(baseUrl);
          const baseDomain = parsedBase.hostname;

-          let discoveredUrls: string[] = [];
+          interface RobotRules {
+            disallowedPaths: string[];
+            allowedPaths: string[];
+            crawlDelay: number | null;
+          }

-          if (crawlConfig.useSitemap) {
-            this.log('Fetching sitemap URLs...', Level.LOG);
+          let robotRules: RobotRules = {
+            disallowedPaths: [],
+            allowedPaths: [],
+            crawlDelay: null
+          };
+
+          if (crawlConfig.respectRobots) {
+            this.log('Fetching robots.txt...', Level.LOG);
            try {
-              const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
+              const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;

-              const sitemapUrls = await page.evaluate((url) => {
-                return new Promise<string[]>((resolve) => {
+              const robotsContent = await page.evaluate((url) => {
+                return new Promise<string>((resolve) => {
                  const xhr = new XMLHttpRequest();
                  xhr.open('GET', url, true);
                  xhr.onload = function() {
                    if (xhr.status === 200) {
-                      const text = xhr.responseText;
-                      const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
-                      const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
-                      resolve(urls);
+                      resolve(xhr.responseText);
                    } else {
-                      resolve([]);
+                      resolve('');
                    }
                  };
                  xhr.onerror = function() {
-                    resolve([]);
+                    resolve('');
                  };
                  xhr.send();
                });
-              }, sitemapUrl);
+              }, robotsUrl);

-              if (sitemapUrls.length > 0) {
-                const nestedSitemaps = sitemapUrls.filter(url =>
-                  url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
-                );
-                const regularUrls = sitemapUrls.filter(url =>
-                  !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
-                );
+              if (robotsContent) {
+                const lines = robotsContent.split('\n');
+                let isRelevantUserAgent = false;
+                let foundSpecificUserAgent = false;

-                discoveredUrls.push(...regularUrls);
-                this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
+                for (const line of lines) {
+                  const trimmedLine = line.trim().toLowerCase();

-                for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
-                  try {
-                    this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
-                    const nestedUrls = await page.evaluate((url) => {
-                      return new Promise<string[]>((resolve) => {
-                        const xhr = new XMLHttpRequest();
-                        xhr.open('GET', url, true);
-                        xhr.onload = function() {
-                          if (xhr.status === 200) {
-                            const text = xhr.responseText;
-                            const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
-                            const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
-                            resolve(urls);
+                  if (trimmedLine.startsWith('#') || trimmedLine === '') {
+                    continue;
+                  }
+
+                  const colonIndex = line.indexOf(':');
+                  if (colonIndex === -1) continue;
+
+                  const directive = line.substring(0, colonIndex).trim().toLowerCase();
+                  const value = line.substring(colonIndex + 1).trim();
+
+                  if (directive === 'user-agent') {
+                    const agent = value.toLowerCase();
+                    if (agent === '*' && !foundSpecificUserAgent) {
+                      isRelevantUserAgent = true;
+                    } else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
+                      isRelevantUserAgent = true;
+                      foundSpecificUserAgent = true;
                    } else {
-                            resolve([]);
+                      if (!foundSpecificUserAgent) {
+                        isRelevantUserAgent = false;
+                      }
+                    }
+                  } else if (isRelevantUserAgent) {
+                    if (directive === 'disallow' && value) {
+                      robotRules.disallowedPaths.push(value);
+                    } else if (directive === 'allow' && value) {
+                      robotRules.allowedPaths.push(value);
+                    } else if (directive === 'crawl-delay' && value) {
+                      const delay = parseFloat(value);
+                      if (!isNaN(delay) && delay > 0) {
+                        robotRules.crawlDelay = delay * 1000;
+                      }
+                    }
+                  }
+                }
+
+                this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, Level.LOG);
+              } else {
+                this.log('No robots.txt found or not accessible, proceeding without restrictions', Level.WARN);
+              }
+            } catch (error) {
+              this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, Level.WARN);
+            }
+          }
+
+          const isUrlAllowedByRobots = (url: string): boolean => {
+            if (!crawlConfig.respectRobots) return true;
+
+            try {
+              const urlObj = new URL(url);
+              const pathname = urlObj.pathname;
+
+              for (const allowedPath of robotRules.allowedPaths) {
+                if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
+                  return true;
+                }
+                if (allowedPath.includes('*')) {
+                  const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                  if (regex.test(pathname)) {
+                    return true;
+                  }
+                }
+              }
+
+              for (const disallowedPath of robotRules.disallowedPaths) {
+                if (disallowedPath === '/') {
+                  return false;
+                }
+                if (pathname.startsWith(disallowedPath)) {
+                  return false;
+                }
+                if (disallowedPath.includes('*')) {
+                  const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                  if (regex.test(pathname)) {
+                    return false;
+                  }
+                }
+                if (disallowedPath.endsWith('$')) {
+                  const pattern = disallowedPath.slice(0, -1);
+                  if (pathname === pattern || pathname.endsWith(pattern)) {
+                    return false;
+                  }
+                }
+              }
+
+              return true;
+            } catch (error) {
+              return true;
            }
          };
-                        xhr.onerror = function() {
-                          resolve([]);
-                        };
-                        xhr.send();
-                      });
-                    }, nestedUrl);

-                    if (nestedUrls.length > 0) {
-                      discoveredUrls.push(...nestedUrls);
-                      this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
-                    }
-                  } catch (error) {
-                    this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
-                  }
-                }
-
-                this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
-              } else {
-                this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
-              }
-            } catch (error) {
-              this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
-            }
-          }
-
-          if (crawlConfig.followLinks) {
-            this.log('Extracting links from current page...', Level.LOG);
-            try {
-              await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
-
-              await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
-                this.log('Network did not become idle, continuing anyway', Level.WARN);
-              });
-
-              await new Promise(resolve => setTimeout(resolve, 5000));
-
-              const anchorCount = await page.evaluate(() => {
-                return document.querySelectorAll('a').length;
-              });
-              this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);
-
-              const pageLinks = await page.evaluate(() => {
-                const links: string[] = [];
-                const allAnchors = document.querySelectorAll('a');
-                console.log('Total anchors found:', allAnchors.length);
-
-                for (let i = 0; i < allAnchors.length; i++) {
-                  const anchor = allAnchors[i] as HTMLAnchorElement;
-                  const href = anchor.getAttribute('href');
-                  const fullHref = anchor.href;
-
-                  if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
-                    links.push(fullHref);
-                  }
-                }
-
-                console.log('Links extracted:', links.length);
-                return links;
-              });
-
-              discoveredUrls.push(...pageLinks);
-              this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
-            } catch (error) {
-              this.log(`Link extraction failed: ${error.message}`, Level.WARN);
-            }
-          }
-
-          const filteredUrls = discoveredUrls.filter(url => {
+          const isUrlAllowedByConfig = (url: string): boolean => {
            try {
              const urlObj = new URL(url);

@@ -865,16 +875,24 @@ export default class Interpreter extends EventEmitter {

              if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
                const matches = crawlConfig.includePaths.some(pattern => {
+                  try {
                    const regex = new RegExp(pattern);
                    return regex.test(url);
+                  } catch {
+                    return url.includes(pattern);
+                  }
                });
                if (!matches) return false;
              }

              if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
                const matches = crawlConfig.excludePaths.some(pattern => {
+                  try {
                    const regex = new RegExp(pattern);
                    return regex.test(url);
+                  } catch {
+                    return url.includes(pattern);
+                  }
                });
                if (matches) return false;
              }
@@ -883,50 +901,42 @@ export default class Interpreter extends EventEmitter {
            } catch (error) {
              return false;
            }
-          });
+          };

-          const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
+          const normalizeUrl = (url: string): string => {
            return url.replace(/#.*$/, '').replace(/\/$/, '');
-          })));
+          };

-          const basePathname = parsedBase.pathname;
-          const prioritizedUrls = uniqueUrls.sort((a, b) => {
+          const extractLinksFromPage = async (): Promise<string[]> => {
            try {
-              const aUrl = new URL(a);
-              const bUrl = new URL(b);
-              const aMatchesBase = aUrl.pathname.startsWith(basePathname);
-              const bMatchesBase = bUrl.pathname.startsWith(basePathname);
+              await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
+              await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
+              await new Promise(resolve => setTimeout(resolve, 1000));

-              if (aMatchesBase && !bMatchesBase) return -1;
-              if (!aMatchesBase && bMatchesBase) return 1;
+              const pageLinks = await page.evaluate(() => {
+                const links: string[] = [];
+                const allAnchors = document.querySelectorAll('a');

-              return 0;
-            } catch (error) {
-              return 0;
+                for (let i = 0; i < allAnchors.length; i++) {
+                  const anchor = allAnchors[i] as HTMLAnchorElement;
+                  const fullHref = anchor.href;
+
+                  if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
+                    links.push(fullHref);
                  }
+                }
+
+                return links;
              });

-          const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
-
-          this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
-
-          this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);
-          const crawlResults = [];
-
-          for (let i = 0; i < finalUrls.length; i++) {
-            const url = finalUrls[i];
-            try {
-              this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);
-
-              await page.goto(url, {
-                waitUntil: 'domcontentloaded',
-                timeout: 30000
-              }).catch(() => {
-                this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN);
-              });
-
-              await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
+              return pageLinks;
+            } catch (error) {
+              this.log(`Link extraction failed: ${error.message}`, Level.WARN);
+              return [];
+            }
+          };

+          const scrapePageContent = async (url: string) => {
            const pageData = await page.evaluate(() => {
              const getMeta = (name: string) => {
                const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
@@ -973,32 +983,197 @@ export default class Interpreter extends EventEmitter {
              };
            });

-              crawlResults.push({
+            return {
              metadata: {
                ...pageData.metadata,
                url: url,
                sourceURL: url
-                },
+              } as Record<string, any>,
              html: pageData.html,
              text: pageData.text,
              links: pageData.links,
              wordCount: pageData.wordCount,
              scrapedAt: new Date().toISOString()
+            };
+          };
+
+          const visitedUrls = new Set<string>();
+          const crawlResults: any[] = [];
+
+          interface CrawlQueueItem {
+            url: string;
+            depth: number;
+          }
+
+          const crawlQueue: CrawlQueueItem[] = [];
+
+          const normalizedBaseUrl = normalizeUrl(baseUrl);
+          visitedUrls.add(normalizedBaseUrl);
+          crawlQueue.push({ url: baseUrl, depth: 0 });
+
+          this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, Level.LOG);
+
+          if (crawlConfig.useSitemap) {
+            this.log('Fetching sitemap URLs...', Level.LOG);
+            try {
+              const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
+
+              const sitemapUrls = await page.evaluate((url) => {
+                return new Promise<string[]>((resolve) => {
+                  const xhr = new XMLHttpRequest();
+                  xhr.open('GET', url, true);
+                  xhr.onload = function() {
+                    if (xhr.status === 200) {
+                      const text = xhr.responseText;
+                      const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
+                      const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
+                      resolve(urls);
+                    } else {
+                      resolve([]);
+                    }
+                  };
+                  xhr.onerror = function() {
+                    resolve([]);
+                  };
+                  xhr.send();
+                });
+              }, sitemapUrl);
+
+              if (sitemapUrls.length > 0) {
+                const nestedSitemaps = sitemapUrls.filter(url =>
+                  url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
+                );
+                const regularUrls = sitemapUrls.filter(url =>
+                  !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
+                );
+
+                for (const sitemapPageUrl of regularUrls) {
+                  const normalized = normalizeUrl(sitemapPageUrl);
+                  if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
+                    visitedUrls.add(normalized);
+                    crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
+                  }
+                }
+
+                this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
+
+                for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
+                  try {
+                    this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
+                    const nestedUrls = await page.evaluate((url) => {
+                      return new Promise<string[]>((resolve) => {
+                        const xhr = new XMLHttpRequest();
+                        xhr.open('GET', url, true);
+                        xhr.onload = function() {
+                          if (xhr.status === 200) {
+                            const text = xhr.responseText;
+                            const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
+                            const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
+                            resolve(urls);
+                          } else {
+                            resolve([]);
+                          }
+                        };
+                        xhr.onerror = function() {
+                          resolve([]);
+                        };
+                        xhr.send();
+                      });
+                    }, nestedUrl);
+
+                    for (const nestedPageUrl of nestedUrls) {
+                      const normalized = normalizeUrl(nestedPageUrl);
+                      if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
+                        visitedUrls.add(normalized);
+                        crawlQueue.push({ url: nestedPageUrl, depth: 1 });
+                      }
+                    }
+
+                    this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
+                  } catch (error) {
+                    this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
+                  }
+                }
+
+                this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, Level.LOG);
+              } else {
+                this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
+              }
+            } catch (error) {
+              this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
+            }
+          }
+
+          let processedCount = 0;
+
+          while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
+            if (this.isAborted) {
+              this.log('Workflow aborted during crawl', Level.WARN);
+              break;
+            }
+
+            const { url, depth } = crawlQueue.shift()!;
+            processedCount++;
+
+            this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, Level.LOG);
+
+            try {
+              if (robotRules.crawlDelay && crawlResults.length > 0) {
+                this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, Level.LOG);
+                await new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay!));
+              }
+
+              await page.goto(url, {
+                waitUntil: 'domcontentloaded',
+                timeout: 30000
+              }).catch((err) => {
+                throw new Error(`Navigation failed: ${err.message}`);
              });

-              this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
+              await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
+
+              const pageResult = await scrapePageContent(url);
+              pageResult.metadata.depth = depth;
+              crawlResults.push(pageResult);
+
+              this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, Level.LOG);
+
+              if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
+                const newLinks = await extractLinksFromPage();
+                let addedCount = 0;
+
+                for (const link of newLinks) {
+                  const normalized = normalizeUrl(link);
+
+                  if (!visitedUrls.has(normalized) &&
+                      isUrlAllowedByConfig(link) &&
+                      isUrlAllowedByRobots(link)) {
+                    visitedUrls.add(normalized);
+                    crawlQueue.push({ url: link, depth: depth + 1 });
+                    addedCount++;
+                  }
+                }
+
+                if (addedCount > 0) {
+                  this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, Level.LOG);
+                }
+              }

            } catch (error) {
-              this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN);
+              this.log(`Failed to crawl ${url}: ${error.message}`, Level.WARN);
              crawlResults.push({
+                metadata: {
                  url: url,
+                  sourceURL: url,
+                  depth: depth
+                },
                error: error.message,
                scrapedAt: new Date().toISOString()
              });
            }
          }

-          this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG);
+          this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, Level.LOG);

          const actionType = "crawl";
          const actionName = "Crawl Results";
@@ -1031,8 +1206,6 @@ export default class Interpreter extends EventEmitter {
        provider?: 'duckduckgo';
        filters?: {
          timeRange?: 'day' | 'week' | 'month' | 'year';
-          location?: string;
-          lang?: string;
        };
        mode: 'discover' | 'scrape';
      }) => {