feat: add server side crawl and search logic

This commit is contained in:
Rohit Rajan
2026-01-02 15:46:10 +05:30
parent 3689eb96bd
commit 9898dc410d
15 changed files with 1577 additions and 124 deletions

View File

@@ -79,7 +79,9 @@ export default class Interpreter extends EventEmitter {
/**
 * Serializable results accumulated during a run, grouped first by action type
 * and then by action name. `crawl` and `search` hold the output of the
 * corresponding custom actions alongside the existing scrape buckets.
 */
private serializableDataByType: Record<string, Record<string, any>> = {
  scrapeList: {},
  scrapeSchema: {},
  crawl: {},
  search: {}
};
private scrapeListCounter: number = 0;
@@ -565,7 +567,9 @@ export default class Interpreter extends EventEmitter {
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
scrapeSchema: this.serializableDataByType.scrapeSchema
scrapeSchema: this.serializableDataByType.scrapeSchema,
crawl: this.serializableDataByType.crawl || {},
search: this.serializableDataByType.search || {}
});
},
@@ -703,6 +707,750 @@ export default class Interpreter extends EventEmitter {
}
},
/**
 * Crawls pages related to the page that is currently loaded.
 *
 * Steps:
 *  1. Collect candidate URLs from `/sitemap.xml` (plus up to 10 nested sitemaps)
 *     when `useSitemap` is set, and from the anchors of the current page when
 *     `followLinks` is set.
 *  2. Keep only URLs inside the scope selected by `mode`, matching at least one
 *     `includePaths` regex (when any are given) and no `excludePaths` regex.
 *  3. Strip fragments / trailing slashes, de-duplicate, list URLs under the
 *     current path first and truncate to `limit`.
 *  4. Visit every remaining URL and record its metadata, HTML, text and links.
 *
 * Results are stored under `crawl["Crawl Results"]` and pushed through
 * `options.serializableCallback`. A URL that cannot be loaded or scraped is
 * recorded as `{ url, error, scrapedAt }` instead of page content.
 *
 * NOTE(review): `maxDepth` and `respectRobots` are accepted but never read by
 * this implementation; only the links of the starting page are followed.
 *
 * @param crawlConfig Scope, limits and discovery switches for the crawl.
 * @throws Error prefixed with "Crawl execution error:" when the crawl as a
 *         whole fails (per-page failures are recorded, not thrown).
 */
crawl: async (crawlConfig: {
  mode: 'domain' | 'subdomain' | 'path';
  limit: number;
  maxDepth: number;
  includePaths: string[];
  excludePaths: string[];
  useSitemap: boolean;
  followLinks: boolean;
  respectRobots: boolean;
}) => {
  if (this.isAborted) {
    this.log('Workflow aborted, stopping crawl', Level.WARN);
    return;
  }

  if (this.options.debugChannel?.setActionType) {
    this.options.debugChannel.setActionType('crawl');
  }

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Executed inside the page via page.evaluate, so it must not close over
  // anything from this scope. Resolves to the <loc> entries of one sitemap,
  // or to an empty list on any HTTP / network failure.
  const fetchSitemapLocs = (url: string): Promise<string[]> =>
    new Promise<string[]>((resolve) => {
      const xhr = new XMLHttpRequest();
      xhr.open('GET', url, true);
      xhr.onload = function () {
        if (xhr.status !== 200) {
          resolve([]);
          return;
        }
        // [\s\S] instead of "." so entries that wrap onto several lines are found.
        const locMatches = xhr.responseText.match(/<loc>([\s\S]*?)<\/loc>/g) || [];
        const urls = locMatches
          // Sitemap URLs are XML-escaped; "&amp;" is the escape that occurs in query strings.
          .map(match => match.replace(/<\/?loc>/g, '').trim().replace(/&amp;/g, '&'))
          .filter(loc => loc.length > 0);
        resolve(urls);
      };
      xhr.onerror = function () {
        resolve([]);
      };
      xhr.send();
    });

  // Executed inside the page via page.evaluate; must stay self-contained.
  const extractPageData = (statusCode: number) => {
    const metadata: Record<string, string> = {};
    document.querySelectorAll('meta').forEach(tag => {
      const name = tag.getAttribute('name') || tag.getAttribute('property');
      const content = tag.getAttribute('content');
      if (name && content) {
        metadata[name] = content;
      }
    });

    const title = document.title || '';
    const bodyText = document.body?.innerText || '';

    // Drop data-mx-id attributes before serialising so they do not end up in the stored HTML.
    document.querySelectorAll('[data-mx-id]').forEach(el => el.removeAttribute('data-mx-id'));

    return {
      text: bodyText,
      html: document.documentElement.outerHTML,
      links: Array.from(document.querySelectorAll('a')).map(a => a.href),
      wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
      metadata: {
        ...metadata,
        title,
        language: document.documentElement.lang || '',
        favicon: document.querySelector<HTMLLinkElement>('link[rel="icon"], link[rel="shortcut icon"]')?.href || '',
        statusCode
      }
    };
  };

  const isNestedSitemap = (url: string): boolean =>
    url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/');

  // Compile each pattern once; an invalid pattern is reported and ignored
  // instead of silently rejecting every URL.
  const compilePatterns = (patterns: string[] | undefined, label: string): RegExp[] => {
    const compiled: RegExp[] = [];
    for (const pattern of patterns || []) {
      try {
        compiled.push(new RegExp(pattern));
      } catch (error) {
        this.log(`Ignoring invalid ${label} pattern "${pattern}": ${describeError(error)}`, Level.WARN);
      }
    }
    return compiled;
  };

  this.log('Starting crawl operation', Level.LOG);

  try {
    const currentUrl = page.url();
    this.log(`Current page URL: ${currentUrl}`, Level.LOG);

    if (!currentUrl || currentUrl === 'about:blank') {
      this.log('Page not yet navigated, waiting for navigation...', Level.WARN);
      await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
    }

    const baseUrl = page.url();
    this.log(`Using base URL for crawl: ${baseUrl}`, Level.LOG);

    const parsedBase = new URL(baseUrl);
    const baseDomain = parsedBase.hostname;
    const basePathname = parsedBase.pathname;

    const discoveredUrls: string[] = [];

    if (crawlConfig.useSitemap) {
      this.log('Fetching sitemap URLs...', Level.LOG);
      try {
        const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
        const sitemapUrls = await page.evaluate(fetchSitemapLocs, sitemapUrl);

        if (sitemapUrls.length > 0) {
          const nestedSitemaps = sitemapUrls.filter(isNestedSitemap);
          const regularUrls = sitemapUrls.filter(url => !isNestedSitemap(url));

          discoveredUrls.push(...regularUrls);
          this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);

          // Only the first 10 nested sitemaps are expanded to bound the work.
          for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
            try {
              this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
              const nestedUrls = await page.evaluate(fetchSitemapLocs, nestedUrl);
              if (nestedUrls.length > 0) {
                discoveredUrls.push(...nestedUrls);
                this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
              }
            } catch (error) {
              this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${describeError(error)}`, Level.WARN);
            }
          }

          this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
        } else {
          this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
        }
      } catch (error) {
        this.log(`Sitemap fetch failed: ${describeError(error)}`, Level.WARN);
      }
    }

    if (crawlConfig.followLinks) {
      this.log('Extracting links from current page...', Level.LOG);
      try {
        await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
        await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
          this.log('Network did not become idle, continuing anyway', Level.WARN);
        });

        // Extra settle time for links that are rendered after load.
        await new Promise(resolve => setTimeout(resolve, 5000));

        const anchorCount = await page.evaluate(() => document.querySelectorAll('a').length);
        this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);

        const pageLinks = await page.evaluate(() => {
          const links: string[] = [];
          const allAnchors = document.querySelectorAll('a');
          console.log('Total anchors found:', allAnchors.length);

          for (let i = 0; i < allAnchors.length; i++) {
            // `.href` is the resolved absolute URL; keep http(s) targets only.
            const fullHref = allAnchors[i].href;
            if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
              links.push(fullHref);
            }
          }

          console.log('Links extracted:', links.length);
          return links;
        });

        discoveredUrls.push(...pageLinks);
        this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
      } catch (error) {
        this.log(`Link extraction failed: ${describeError(error)}`, Level.WARN);
      }
    }

    const hasIncludeFilter = !!crawlConfig.includePaths && crawlConfig.includePaths.length > 0;
    const includeRegexes = compilePatterns(crawlConfig.includePaths, 'include');
    const excludeRegexes = compilePatterns(crawlConfig.excludePaths, 'exclude');

    const isInScope = (urlObj: URL): boolean => {
      switch (crawlConfig.mode) {
        case 'domain':
          return urlObj.hostname === baseDomain;
        case 'subdomain':
          // Require a label boundary so "evilexample.com" is not treated as a
          // subdomain of "example.com".
          return urlObj.hostname === baseDomain || urlObj.hostname.endsWith(`.${baseDomain}`);
        case 'path':
          return urlObj.hostname === baseDomain && urlObj.pathname.startsWith(basePathname);
        default:
          return true;
      }
    };

    const filteredUrls = discoveredUrls.filter(url => {
      let urlObj: URL;
      try {
        urlObj = new URL(url);
      } catch (error) {
        return false;
      }

      if (!isInScope(urlObj)) return false;
      if (hasIncludeFilter && !includeRegexes.some(regex => regex.test(url))) return false;
      if (excludeRegexes.some(regex => regex.test(url))) return false;
      return true;
    });

    // Normalise (drop fragment and trailing slash) before de-duplicating.
    const uniqueUrls = Array.from(new Set(
      filteredUrls.map(url => url.replace(/#.*$/, '').replace(/\/$/, ''))
    ));

    const isUnderBasePath = (url: string): boolean => {
      try {
        return new URL(url).pathname.startsWith(basePathname);
      } catch (error) {
        return false;
      }
    };

    // Stable partition: URLs under the starting path first, discovery order kept within each group.
    const prioritizedUrls = [
      ...uniqueUrls.filter(isUnderBasePath),
      ...uniqueUrls.filter(url => !isUnderBasePath(url))
    ];

    const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);

    this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
    this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);

    const crawlResults: Array<Record<string, unknown>> = [];
    let scrapedCount = 0;

    for (let i = 0; i < finalUrls.length; i++) {
      // Each page may take tens of seconds; honour an abort between pages and
      // keep whatever has been collected so far.
      if (this.isAborted) {
        this.log('Workflow aborted, stopping crawl', Level.WARN);
        break;
      }

      const url = finalUrls[i];

      try {
        this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);

        // A navigation failure must reach the catch below; otherwise the page
        // that is still loaded from the previous iteration would be scraped
        // and stored under this URL.
        const response = await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: 30000
        });

        await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});

        // goto may resolve without a response object; fall back to 200 in that case.
        const statusCode = response ? response.status() : 200;
        const pageData = await page.evaluate(extractPageData, statusCode);

        crawlResults.push({
          metadata: {
            ...pageData.metadata,
            url: url,
            sourceURL: url
          },
          html: pageData.html,
          text: pageData.text,
          links: pageData.links,
          wordCount: pageData.wordCount,
          scrapedAt: new Date().toISOString()
        });
        scrapedCount++;

        this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
      } catch (error) {
        const message = describeError(error);
        this.log(`Failed to scrape ${url}: ${message}`, Level.WARN);
        crawlResults.push({
          url: url,
          error: message,
          scrapedAt: new Date().toISOString()
        });
      }
    }

    this.log(`Successfully scraped ${scrapedCount} pages (${crawlResults.length - scrapedCount} failed)`, Level.LOG);

    const actionType = "crawl";
    const actionName = "Crawl Results";

    if (!this.serializableDataByType[actionType]) {
      this.serializableDataByType[actionType] = {};
    }
    this.serializableDataByType[actionType][actionName] = crawlResults;

    await this.options.serializableCallback({
      scrapeList: this.serializableDataByType.scrapeList || {},
      scrapeSchema: this.serializableDataByType.scrapeSchema || {},
      crawl: this.serializableDataByType.crawl || {},
      search: this.serializableDataByType.search || {}
    });
  } catch (error) {
    const message = describeError(error);
    this.log(`Crawl action failed: ${message}`, Level.ERROR);
    throw new Error(`Crawl execution error: ${message}`);
  }
},
/**
 * Runs a DuckDuckGo web search and stores the outcome under
 * `search["Search Results"]`, then pushes it through
 * `options.serializableCallback`.
 *
 * - `mode: 'discover'` stores only the result list (url, title, description,
 *   position).
 * - Any other mode additionally visits every result URL and stores its
 *   metadata, HTML, text and links; a URL that cannot be loaded or scraped is
 *   recorded with an `error` field instead.
 *
 * The provider is always DuckDuckGo regardless of `searchConfig.provider`.
 * Only `filters.timeRange` is applied to the query.
 *
 * NOTE(review): `filters.location` and `filters.lang` are echoed back in the
 * stored payload but are not used to build the search URL.
 *
 * @param searchConfig Query, result limit, optional filters and mode.
 * @throws Error prefixed with "Search execution error:" when the search as a
 *         whole fails (per-page scrape failures are recorded, not thrown).
 */
search: async (searchConfig: {
  query: string;
  limit: number;
  provider?: 'duckduckgo';
  filters?: {
    timeRange?: 'day' | 'week' | 'month' | 'year';
    location?: string;
    lang?: string;
  };
  mode: 'discover' | 'scrape';
}) => {
  if (this.isAborted) {
    this.log('Workflow aborted, stopping search', Level.WARN);
    return;
  }

  if (this.options.debugChannel?.setActionType) {
    this.options.debugChannel.setActionType('search');
  }

  type SearchHit = { url: string; title: string; description: string; position: number };

  // DuckDuckGo is the only supported provider; keep it local rather than
  // writing it back into the caller's config object.
  const provider = 'duckduckgo' as const;

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  const sleep = (ms: number): Promise<void> =>
    new Promise<void>(resolve => setTimeout(resolve, ms));

  // ---- Functions below run inside the page via page.evaluate and must be
  // ---- self-contained (no references to this scope).

  // Number of result elements currently rendered, using the first selector that matches.
  const countResults = (): number => {
    const selectors = [
      '[data-testid="result"]',
      'article[data-testid="result"]',
      'li[data-layout="organic"]',
      '.result',
      'article[data-testid]'
    ];
    for (let i = 0; i < selectors.length; i++) {
      const found = document.querySelectorAll(selectors[i]).length;
      if (found > 0) {
        return found;
      }
    }
    return 0;
  };

  // Clicks the first visible "more results" control; returns whether one was clicked.
  // Text matching is done by hand because `:has-text()` is not valid CSS and
  // makes document.querySelector throw.
  const clickLoadMore = (): boolean => {
    const isVisible = (el: HTMLElement | null): el is HTMLElement =>
      el !== null && el.offsetParent !== null;

    const firstVisibleButtonWithText = (label: string): HTMLElement | null => {
      const buttons = Array.from(document.querySelectorAll('button'));
      for (let i = 0; i < buttons.length; i++) {
        const text = (buttons[i].textContent || '').toLowerCase();
        if (text.includes(label) && isVisible(buttons[i])) {
          return buttons[i];
        }
      }
      return null;
    };

    const candidates = [
      { label: '#more-results', find: () => document.querySelector<HTMLElement>('#more-results') },
      { label: 'button text "more results"', find: () => firstVisibleButtonWithText('more results') },
      { label: 'button[id*="more"]', find: () => document.querySelector<HTMLElement>('button[id*="more"]') },
      { label: 'button text "load more"', find: () => firstVisibleButtonWithText('load more') }
    ];

    for (let i = 0; i < candidates.length; i++) {
      const button = candidates[i].find();
      if (isVisible(button)) {
        button.click();
        console.log(`Clicked load more button with selector: ${candidates[i].label}`);
        return true;
      }
    }
    return false;
  };

  // Reads up to `limit` result elements and returns the usable hits.
  const extractResults = (limit: number): SearchHit[] => {
    const results: SearchHit[] = [];

    // Strips leading "3 days ago", "Jan 2, 2024 —" or "2024-01-02 —" prefixes.
    const cleanDescription = (text: string): string => {
      if (!text) return '';
      let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
      cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
      cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
      cleaned = cleaned.trim().replace(/\s+/g, ' ');
      return cleaned;
    };

    const isDuckDuckGoHost = (host: string): boolean =>
      host === 'duckduckgo.com' || host.endsWith('.duckduckgo.com');

    const hostnameOf = (url: string): string => {
      try {
        return new URL(url).hostname;
      } catch (e) {
        return '';
      }
    };

    const selectors = [
      '[data-testid="result"]',
      'article[data-testid="result"]',
      'li[data-layout="organic"]',
      '.result',
      'article[data-testid]'
    ];

    let allElements: Element[] = [];
    for (let i = 0; i < selectors.length; i++) {
      const elements = Array.from(document.querySelectorAll(selectors[i]));
      if (elements.length > 0) {
        console.log(`Found ${elements.length} DDG elements with: ${selectors[i]}`);
        allElements = elements;
        break;
      }
    }

    for (let i = 0; i < Math.min(allElements.length, limit); i++) {
      const element = allElements[i];

      const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
      const linkEl =
        titleEl?.querySelector<HTMLAnchorElement>('a[href]') ||
        element.querySelector<HTMLAnchorElement>('a[href]');
      if (!linkEl || !linkEl.href) continue;

      let actualUrl = linkEl.href;

      // Redirect links carry the real target in the `uddg` query parameter.
      // searchParams.get() already percent-decodes the value, so it must not
      // be decoded a second time.
      if (actualUrl.includes('uddg=')) {
        try {
          const wrapper = new URL(actualUrl);
          const target = wrapper.searchParams.get('uddg');
          if (target && isDuckDuckGoHost(wrapper.hostname)) {
            actualUrl = target;
          }
        } catch (e) {
          console.log('Failed to parse uddg parameter:', e);
        }
      }

      // Compare the host, not the whole string, so third-party pages that
      // merely mention the domain in their path or query are kept.
      if (isDuckDuckGoHost(hostnameOf(actualUrl))) {
        console.log(`Skipping DDG internal URL: ${actualUrl}`);
        continue;
      }

      const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');

      if (titleEl && titleEl.textContent && actualUrl) {
        results.push({
          url: actualUrl,
          title: titleEl.textContent.trim(),
          description: cleanDescription((descEl?.textContent || '').trim()),
          position: results.length + 1
        });
      }
    }

    console.log(`Extracted ${results.length} DuckDuckGo search results`);
    return results;
  };

  // Collects metadata, HTML, text and links of the loaded document.
  const extractPageData = (statusCode: number) => {
    const metadata: Record<string, string> = {};
    document.querySelectorAll('meta').forEach(tag => {
      const name = tag.getAttribute('name') || tag.getAttribute('property');
      const content = tag.getAttribute('content');
      if (name && content) {
        metadata[name] = content;
      }
    });

    const title = document.title || '';
    const bodyText = document.body?.innerText || '';

    // Drop data-mx-id attributes before serialising so they do not end up in the stored HTML.
    document.querySelectorAll('[data-mx-id]').forEach(el => el.removeAttribute('data-mx-id'));

    return {
      text: bodyText,
      html: document.documentElement.outerHTML,
      links: Array.from(document.querySelectorAll('a')).map(a => a.href),
      wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
      metadata: {
        ...metadata,
        title,
        language: document.documentElement.lang || '',
        favicon: document.querySelector<HTMLLinkElement>('link[rel="icon"], link[rel="shortcut icon"]')?.href || '',
        statusCode
      }
    };
  };

  // ---- End of in-page functions.

  // Stores the payload and notifies the serializable callback. `includeMode`
  // keeps the two historical payload shapes: the discover payload has no
  // `mode` field, the scrape payload does.
  const publishResults = async (results: unknown[], includeMode: boolean): Promise<void> => {
    const actionType = "search";
    const actionName = "Search Results";

    if (!this.serializableDataByType[actionType]) {
      this.serializableDataByType[actionType] = {};
    }

    this.serializableDataByType[actionType][actionName] = {
      query: searchConfig.query,
      provider,
      filters: searchConfig.filters || {},
      ...(includeMode ? { mode: searchConfig.mode } : {}),
      resultsCount: results.length,
      results,
      searchedAt: new Date().toISOString()
    };

    await this.options.serializableCallback({
      scrapeList: this.serializableDataByType.scrapeList || {},
      scrapeSchema: this.serializableDataByType.scrapeSchema || {},
      crawl: this.serializableDataByType.crawl || {},
      search: this.serializableDataByType.search || {}
    });
  };

  this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, Level.LOG);

  try {
    let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;

    const timeRange = searchConfig.filters?.timeRange;
    if (timeRange) {
      const timeMap: Record<string, string> = {
        'day': 'd',
        'week': 'w',
        'month': 'm',
        'year': 'y'
      };
      const df = timeMap[timeRange];
      if (df) {
        searchUrl += `&df=${df}`;
      } else {
        // Avoid sending a literal "&df=undefined" for values outside the map.
        this.log(`Ignoring unsupported timeRange filter: ${timeRange}`, Level.WARN);
      }
    }

    // Randomised pauses around navigation.
    await sleep(500 + Math.random() * 1000);

    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });

    await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
      this.log('Load state timeout, continuing anyway', Level.WARN);
    });

    await sleep(2000 + Math.random() * 1500);

    let searchResults: SearchHit[] = [];
    let retryCount = 0;
    const maxRetries = 2;

    while (searchResults.length === 0 && retryCount <= maxRetries) {
      if (retryCount > 0) {
        this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, Level.LOG);
        // Exponential back-off with jitter.
        await sleep(1000 * Math.pow(2, retryCount) + Math.random() * 1000);
      }

      this.log('Attempting to extract DuckDuckGo search results...', Level.LOG);

      await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
        this.log('DuckDuckGo results not found on initial wait', Level.WARN);
      });

      let currentResultCount = 0;
      const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
      let loadAttempts = 0;
      let noNewResultsCount = 0;

      // Keep scrolling / clicking "more results" until enough results are
      // rendered, the attempt budget is spent, or three rounds add nothing.
      while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
        const previousCount = currentResultCount;
        currentResultCount = await page.evaluate(countResults);

        if (currentResultCount >= searchConfig.limit) {
          this.log(`Reached desired result count: ${currentResultCount}`, Level.LOG);
          break;
        }

        if (currentResultCount === previousCount) {
          noNewResultsCount++;
          this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, Level.WARN);
          if (noNewResultsCount >= 3) break;
        } else {
          noNewResultsCount = 0;
          this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, Level.LOG);
        }

        await page.evaluate(() => {
          window.scrollTo(0, document.body.scrollHeight);
        });

        await sleep(800);

        const loadMoreClicked = await page.evaluate(clickLoadMore);

        if (loadMoreClicked) {
          this.log('Clicked "More results" button', Level.LOG);
          await sleep(1500 + Math.random() * 1000);
        } else {
          this.log('No "More results" button found, results may be limited', Level.WARN);
          break;
        }

        loadAttempts++;
      }

      this.log(`Finished pagination. Total results available: ${currentResultCount}`, Level.LOG);

      searchResults = await page.evaluate(extractResults, searchConfig.limit);

      if (searchResults.length === 0) {
        this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, Level.WARN);
        retryCount++;
      } else {
        this.log(`Successfully extracted ${searchResults.length} results`, Level.LOG);
        break;
      }
    }

    this.log(`Search found ${searchResults.length} results`, Level.LOG);

    if (searchConfig.mode === 'discover') {
      await publishResults(searchResults, false);
      this.log(`Search completed in discover mode with ${searchResults.length} results`, Level.LOG);
      return;
    }

    this.log(`Starting to scrape content from ${searchResults.length} search results...`, Level.LOG);

    const scrapedResults: Array<Record<string, unknown>> = [];
    let scrapedCount = 0;

    for (let i = 0; i < searchResults.length; i++) {
      // Each page may take tens of seconds; honour an abort between pages and
      // keep whatever has been collected so far.
      if (this.isAborted) {
        this.log('Workflow aborted, stopping search', Level.WARN);
        break;
      }

      const result = searchResults[i];
      const searchResult = {
        query: searchConfig.query,
        position: result.position,
        searchTitle: result.title,
        searchDescription: result.description,
      };

      try {
        this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, Level.LOG);

        // A navigation failure must reach the catch below; otherwise the page
        // that is still loaded from the previous iteration would be scraped
        // and stored under this URL.
        const response = await page.goto(result.url, {
          waitUntil: 'domcontentloaded',
          timeout: 30000
        });

        await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});

        // goto may resolve without a response object; fall back to 200 in that case.
        const statusCode = response ? response.status() : 200;
        const pageData = await page.evaluate(extractPageData, statusCode);

        scrapedResults.push({
          searchResult,
          metadata: {
            ...pageData.metadata,
            url: result.url,
            sourceURL: result.url
          },
          html: pageData.html,
          text: pageData.text,
          links: pageData.links,
          wordCount: pageData.wordCount,
          scrapedAt: new Date().toISOString()
        });
        scrapedCount++;

        this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, Level.LOG);
      } catch (error) {
        const message = describeError(error);
        this.log(`Failed to scrape ${result.url}: ${message}`, Level.WARN);
        scrapedResults.push({
          searchResult,
          url: result.url,
          error: message,
          scrapedAt: new Date().toISOString()
        });
      }
    }

    this.log(`Successfully scraped ${scrapedCount} search results (${scrapedResults.length - scrapedCount} failed)`, Level.LOG);

    await publishResults(scrapedResults, true);
  } catch (error) {
    const message = describeError(error);
    this.log(`Search action failed: ${message}`, Level.ERROR);
    throw new Error(`Search execution error: ${message}`);
  }
},
flag: async () => new Promise((res) => {
if (this.options.debugChannel?.setActionType) {
this.options.debugChannel.setActionType('flag');
@@ -885,7 +1633,9 @@ export default class Interpreter extends EventEmitter {
this.serializableDataByType[actionType][actionName] = [...allResults];
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
scrapeSchema: this.serializableDataByType.scrapeSchema
scrapeSchema: this.serializableDataByType.scrapeSchema,
crawl: this.serializableDataByType.crawl || {},
search: this.serializableDataByType.search || {}
});
};
@@ -1735,7 +2485,7 @@ export default class Interpreter extends EventEmitter {
// Clear accumulated data to free memory
this.cumulativeResults = [];
this.namedResults = {};
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {}, crawl: {}, search: {} };
// Reset state
this.isAborted = false;

View File

@@ -28,7 +28,7 @@ type MethodNames<T> = {
[K in keyof T]: T[K] extends Function ? K : never;
}[keyof T];
/**
 * Names of the interpreter's custom actions; `What.action` accepts these in
 * addition to the method names of `Page`. Includes the `crawl` and `search`
 * actions.
 */
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto' | 'crawl' | 'search';
export type What = {
action: MethodNames<Page> | CustomFunctions,