feat: add server side crawl and search logic

Rohit Rajan
2026-01-02 15:46:10 +05:30
parent 3689eb96bd
commit 9898dc410d
15 changed files with 1577 additions and 124 deletions

View File

@@ -79,7 +79,9 @@ export default class Interpreter extends EventEmitter {
private serializableDataByType: Record<string, Record<string, any>> = {
scrapeList: {},
-    scrapeSchema: {}
+    scrapeSchema: {},
+    crawl: {},
+    search: {}
};
private scrapeListCounter: number = 0;
@@ -565,7 +567,9 @@ export default class Interpreter extends EventEmitter {
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
-    scrapeSchema: this.serializableDataByType.scrapeSchema
+    scrapeSchema: this.serializableDataByType.scrapeSchema,
+    crawl: this.serializableDataByType.crawl || {},
+    search: this.serializableDataByType.search || {}
});
},
@@ -703,6 +707,750 @@ export default class Interpreter extends EventEmitter {
}
},
crawl: async (crawlConfig: {
mode: 'domain' | 'subdomain' | 'path';
limit: number;
maxDepth: number;
includePaths: string[];
excludePaths: string[];
useSitemap: boolean;
followLinks: boolean;
respectRobots: boolean;
}) => {
if (this.isAborted) {
this.log('Workflow aborted, stopping crawl', Level.WARN);
return;
}
if (this.options.debugChannel?.setActionType) {
this.options.debugChannel.setActionType('crawl');
}
this.log('Starting crawl operation', Level.LOG);
try {
const currentUrl = page.url();
this.log(`Current page URL: ${currentUrl}`, Level.LOG);
if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
this.log('Page not yet navigated, waiting for navigation...', Level.WARN);
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
}
const baseUrl = page.url();
this.log(`Using base URL for crawl: ${baseUrl}`, Level.LOG);
const parsedBase = new URL(baseUrl);
const baseDomain = parsedBase.hostname;
let discoveredUrls: string[] = [];
if (crawlConfig.useSitemap) {
this.log('Fetching sitemap URLs...', Level.LOG);
try {
const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
const sitemapUrls = await page.evaluate((url) => {
return new Promise<string[]>((resolve) => {
const xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onload = function() {
if (xhr.status === 200) {
const text = xhr.responseText;
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
resolve(urls);
} else {
resolve([]);
}
};
xhr.onerror = function() {
resolve([]);
};
xhr.send();
});
}, sitemapUrl);
if (sitemapUrls.length > 0) {
const nestedSitemaps = sitemapUrls.filter(url =>
url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
);
const regularUrls = sitemapUrls.filter(url =>
!url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
);
discoveredUrls.push(...regularUrls);
this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);
for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
try {
this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
const nestedUrls = await page.evaluate((url) => {
return new Promise<string[]>((resolve) => {
const xhr = new XMLHttpRequest();
xhr.open('GET', url, true);
xhr.onload = function() {
if (xhr.status === 200) {
const text = xhr.responseText;
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
resolve(urls);
} else {
resolve([]);
}
};
xhr.onerror = function() {
resolve([]);
};
xhr.send();
});
}, nestedUrl);
if (nestedUrls.length > 0) {
discoveredUrls.push(...nestedUrls);
this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
}
} catch (error) {
this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
}
}
this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, Level.LOG);
} else {
this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
}
} catch (error) {
this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
}
}
if (crawlConfig.followLinks) {
this.log('Extracting links from current page...', Level.LOG);
try {
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
this.log('Network did not become idle, continuing anyway', Level.WARN);
});
await new Promise(resolve => setTimeout(resolve, 5000));
const anchorCount = await page.evaluate(() => {
return document.querySelectorAll('a').length;
});
this.log(`Page has ${anchorCount} total anchor tags`, Level.LOG);
const pageLinks = await page.evaluate(() => {
const links: string[] = [];
const allAnchors = document.querySelectorAll('a');
console.log('Total anchors found:', allAnchors.length);
for (let i = 0; i < allAnchors.length; i++) {
const anchor = allAnchors[i] as HTMLAnchorElement;
const href = anchor.getAttribute('href');
const fullHref = anchor.href;
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
links.push(fullHref);
}
}
console.log('Links extracted:', links.length);
return links;
});
discoveredUrls.push(...pageLinks);
this.log(`Found ${pageLinks.length} links from page`, Level.LOG);
} catch (error) {
this.log(`Link extraction failed: ${error.message}`, Level.WARN);
}
}
const filteredUrls = discoveredUrls.filter(url => {
try {
const urlObj = new URL(url);
if (crawlConfig.mode === 'domain') {
if (urlObj.hostname !== baseDomain) return false;
} else if (crawlConfig.mode === 'subdomain') {
if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain) return false;
} else if (crawlConfig.mode === 'path') {
if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname)) return false;
}
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
const matches = crawlConfig.includePaths.some(pattern => {
const regex = new RegExp(pattern);
return regex.test(url);
});
if (!matches) return false;
}
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
const matches = crawlConfig.excludePaths.some(pattern => {
const regex = new RegExp(pattern);
return regex.test(url);
});
if (matches) return false;
}
return true;
} catch (error) {
return false;
}
});
const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
return url.replace(/#.*$/, '').replace(/\/$/, '');
})));
const basePathname = parsedBase.pathname;
const prioritizedUrls = uniqueUrls.sort((a, b) => {
try {
const aUrl = new URL(a);
const bUrl = new URL(b);
const aMatchesBase = aUrl.pathname.startsWith(basePathname);
const bMatchesBase = bUrl.pathname.startsWith(basePathname);
if (aMatchesBase && !bMatchesBase) return -1;
if (!aMatchesBase && bMatchesBase) return 1;
return 0;
} catch (error) {
return 0;
}
});
const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, Level.LOG);
this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, Level.LOG);
const crawlResults = [];
for (let i = 0; i < finalUrls.length; i++) {
const url = finalUrls[i];
try {
this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, Level.LOG);
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 30000
}).catch(() => {
this.log(`Failed to navigate to ${url}, skipping...`, Level.WARN);
});
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
const pageData = await page.evaluate(() => {
const getMeta = (name: string) => {
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
return meta?.getAttribute('content') || '';
};
const getAllMeta = () => {
const metadata: Record<string, string> = {};
const metaTags = document.querySelectorAll('meta');
metaTags.forEach(tag => {
const name = tag.getAttribute('name') || tag.getAttribute('property');
const content = tag.getAttribute('content');
if (name && content) {
metadata[name] = content;
}
});
return metadata;
};
const title = document.title || '';
const bodyText = document.body?.innerText || '';
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
const html = document.documentElement.outerHTML;
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
const allMetadata = getAllMeta();
return {
title,
description: getMeta('description'),
text: bodyText,
html: html,
links: links,
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
metadata: {
...allMetadata,
title,
language: document.documentElement.lang || '',
favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
statusCode: 200
}
};
});
crawlResults.push({
metadata: {
...pageData.metadata,
url: url,
sourceURL: url
},
html: pageData.html,
text: pageData.text,
links: pageData.links,
wordCount: pageData.wordCount,
scrapedAt: new Date().toISOString()
});
this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, Level.LOG);
} catch (error) {
this.log(`Failed to scrape ${url}: ${error.message}`, Level.WARN);
crawlResults.push({
url: url,
error: error.message,
scrapedAt: new Date().toISOString()
});
}
}
this.log(`Successfully scraped ${crawlResults.length} pages`, Level.LOG);
const actionType = "crawl";
const actionName = "Crawl Results";
if (!this.serializableDataByType[actionType]) {
this.serializableDataByType[actionType] = {};
}
if (!this.serializableDataByType[actionType][actionName]) {
this.serializableDataByType[actionType][actionName] = [];
}
this.serializableDataByType[actionType][actionName] = crawlResults;
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList || {},
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
crawl: this.serializableDataByType.crawl || {},
search: this.serializableDataByType.search || {}
});
} catch (error) {
this.log(`Crawl action failed: ${error.message}`, Level.ERROR);
throw new Error(`Crawl execution error: ${error.message}`);
}
},
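// Illustrative sketch of a crawlConfig the crawl action above accepts. The field
// names follow the parameter type; the concrete values are assumptions. Note that
// includePaths/excludePaths entries are applied as regular expressions against the
// full URL by the filter logic above.
//
//   {
//     mode: 'path',              // stay within the starting path
//     limit: 50,                 // cap on URLs to scrape
//     maxDepth: 2,
//     includePaths: ['/docs/'],
//     excludePaths: ['\\.pdf$'],
//     useSitemap: true,          // seed from /sitemap.xml (and nested sitemaps)
//     followLinks: true,         // also harvest <a href> links from the page
//     respectRobots: true
//   }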
search: async (searchConfig: {
query: string;
limit: number;
provider?: 'duckduckgo';
filters?: {
timeRange?: 'day' | 'week' | 'month' | 'year';
location?: string;
lang?: string;
};
mode: 'discover' | 'scrape';
}) => {
if (this.isAborted) {
this.log('Workflow aborted, stopping search', Level.WARN);
return;
}
if (this.options.debugChannel?.setActionType) {
this.options.debugChannel.setActionType('search');
}
searchConfig.provider = 'duckduckgo';
this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, Level.LOG);
try {
let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;
if (searchConfig.filters?.timeRange) {
const timeMap: Record<string, string> = {
'day': 'd',
'week': 'w',
'month': 'm',
'year': 'y'
};
searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`;
}
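// e.g. query "playwright scraping" with timeRange 'week' yields:
// https://duckduckgo.com/?q=playwright%20scraping&df=w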
const initialDelay = 500 + Math.random() * 1000;
await new Promise(resolve => setTimeout(resolve, initialDelay));
await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
this.log('Load state timeout, continuing anyway', Level.WARN);
});
const pageLoadDelay = 2000 + Math.random() * 1500;
await new Promise(resolve => setTimeout(resolve, pageLoadDelay));
let searchResults: any[] = [];
let retryCount = 0;
const maxRetries = 2;
while (searchResults.length === 0 && retryCount <= maxRetries) {
if (retryCount > 0) {
this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, Level.LOG);
const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000;
await new Promise(resolve => setTimeout(resolve, retryDelay));
}
this.log('Attempting to extract DuckDuckGo search results...', Level.LOG);
await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
this.log('DuckDuckGo results not found on initial wait', Level.WARN);
});
let currentResultCount = 0;
const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
let loadAttempts = 0;
let noNewResultsCount = 0;
while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
const previousCount = currentResultCount;
currentResultCount = await page.evaluate(() => {
const selectors = [
'[data-testid="result"]',
'article[data-testid="result"]',
'li[data-layout="organic"]',
'.result',
'article[data-testid]'
];
for (const selector of selectors) {
const elements = document.querySelectorAll(selector);
if (elements.length > 0) {
return elements.length;
}
}
return 0;
});
if (currentResultCount >= searchConfig.limit) {
this.log(`Reached desired result count: ${currentResultCount}`, Level.LOG);
break;
}
if (currentResultCount === previousCount) {
noNewResultsCount++;
this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, Level.WARN);
if (noNewResultsCount >= 3) break;
} else {
noNewResultsCount = 0;
this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, Level.LOG);
}
await page.evaluate(() => {
window.scrollTo(0, document.body.scrollHeight);
});
await new Promise(resolve => setTimeout(resolve, 800));
const loadMoreClicked = await page.evaluate(() => {
const selectors = [
'#more-results',
'button:has-text("More results")',
'button:has-text("more results")',
'button[id*="more"]',
'button:has-text("Load more")'
];
for (const selector of selectors) {
try {
const button = document.querySelector(selector) as HTMLButtonElement;
if (button && button.offsetParent !== null) {
button.click();
console.log(`Clicked load more button with selector: ${selector}`);
return true;
}
} catch (e) {
continue;
}
}
return false;
});
if (loadMoreClicked) {
this.log('Clicked "More results" button', Level.LOG);
await new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000));
} else {
this.log('No "More results" button found, results may be limited', Level.WARN);
break;
}
loadAttempts++;
}
this.log(`Finished pagination. Total results available: ${currentResultCount}`, Level.LOG);
searchResults = await page.evaluate((limit: number) => {
const results: any[] = [];
const cleanDescription = (text: string): string => {
if (!text) return '';
let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
cleaned = cleaned.trim().replace(/\s+/g, ' ');
return cleaned;
};
const selectors = [
'[data-testid="result"]',
'article[data-testid="result"]',
'li[data-layout="organic"]',
'.result',
'article[data-testid]'
];
let allElements: Element[] = [];
for (const selector of selectors) {
const elements = Array.from(document.querySelectorAll(selector));
if (elements.length > 0) {
console.log(`Found ${elements.length} DDG elements with: ${selector}`);
allElements = elements;
break;
}
}
for (let i = 0; i < Math.min(allElements.length, limit); i++) {
const element = allElements[i];
const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
let linkEl = titleEl?.querySelector('a[href]') as HTMLAnchorElement;
if (!linkEl) {
linkEl = element.querySelector('a[href]') as HTMLAnchorElement;
}
if (!linkEl || !linkEl.href) continue;
let actualUrl = linkEl.href;
if (actualUrl.includes('uddg=')) {
try {
const urlParams = new URLSearchParams(actualUrl.split('?')[1]);
const uddgUrl = urlParams.get('uddg');
if (uddgUrl) {
actualUrl = decodeURIComponent(uddgUrl);
}
} catch (e) {
console.log('Failed to parse uddg parameter:', e);
}
}
if (actualUrl.includes('duckduckgo.com')) {
console.log(`Skipping DDG internal URL: ${actualUrl}`);
continue;
}
const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');
if (titleEl && titleEl.textContent && actualUrl) {
const rawDescription = (descEl?.textContent || '').trim();
const cleanedDescription = cleanDescription(rawDescription);
results.push({
url: actualUrl,
title: titleEl.textContent.trim(),
description: cleanedDescription,
position: results.length + 1
});
}
}
console.log(`Extracted ${results.length} DuckDuckGo search results`);
return results;
}, searchConfig.limit);
if (searchResults.length === 0) {
this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, Level.WARN);
retryCount++;
} else {
this.log(`Successfully extracted ${searchResults.length} results`, Level.LOG);
break;
}
}
this.log(`Search found ${searchResults.length} results`, Level.LOG);
if (searchConfig.mode === 'discover') {
const actionType = "search";
const actionName = "Search Results";
if (!this.serializableDataByType[actionType]) {
this.serializableDataByType[actionType] = {};
}
if (!this.serializableDataByType[actionType][actionName]) {
this.serializableDataByType[actionType][actionName] = {};
}
const searchData = {
query: searchConfig.query,
provider: searchConfig.provider,
filters: searchConfig.filters || {},
resultsCount: searchResults.length,
results: searchResults,
searchedAt: new Date().toISOString()
};
this.serializableDataByType[actionType][actionName] = searchData;
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList || {},
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
crawl: this.serializableDataByType.crawl || {},
search: this.serializableDataByType.search || {}
});
this.log(`Search completed in discover mode with ${searchResults.length} results`, Level.LOG);
return;
}
this.log(`Starting to scrape content from ${searchResults.length} search results...`, Level.LOG);
const scrapedResults = [];
for (let i = 0; i < searchResults.length; i++) {
const result = searchResults[i];
try {
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, Level.LOG);
await page.goto(result.url, {
waitUntil: 'domcontentloaded',
timeout: 30000
}).catch(() => {
this.log(`Failed to navigate to ${result.url}, skipping...`, Level.WARN);
});
await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
const pageData = await page.evaluate(() => {
const getMeta = (name: string) => {
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
return meta?.getAttribute('content') || '';
};
const getAllMeta = () => {
const metadata: Record<string, string> = {};
const metaTags = document.querySelectorAll('meta');
metaTags.forEach(tag => {
const name = tag.getAttribute('name') || tag.getAttribute('property');
const content = tag.getAttribute('content');
if (name && content) {
metadata[name] = content;
}
});
return metadata;
};
const title = document.title || '';
const bodyText = document.body?.innerText || '';
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
const html = document.documentElement.outerHTML;
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
const allMetadata = getAllMeta();
return {
title,
description: getMeta('description'),
text: bodyText,
html: html,
links: links,
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
metadata: {
...allMetadata,
title,
language: document.documentElement.lang || '',
favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
statusCode: 200
}
};
});
scrapedResults.push({
searchResult: {
query: searchConfig.query,
position: result.position,
searchTitle: result.title,
searchDescription: result.description,
},
metadata: {
...pageData.metadata,
url: result.url,
sourceURL: result.url
},
html: pageData.html,
text: pageData.text,
links: pageData.links,
wordCount: pageData.wordCount,
scrapedAt: new Date().toISOString()
});
this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, Level.LOG);
} catch (error) {
this.log(`Failed to scrape ${result.url}: ${error.message}`, Level.WARN);
scrapedResults.push({
searchResult: {
query: searchConfig.query,
position: result.position,
searchTitle: result.title,
searchDescription: result.description,
},
url: result.url,
error: error.message,
scrapedAt: new Date().toISOString()
});
}
}
this.log(`Successfully scraped ${scrapedResults.length} search results`, Level.LOG);
const actionType = "search";
const actionName = "Search Results";
if (!this.serializableDataByType[actionType]) {
this.serializableDataByType[actionType] = {};
}
if (!this.serializableDataByType[actionType][actionName]) {
this.serializableDataByType[actionType][actionName] = {};
}
const searchData = {
query: searchConfig.query,
provider: searchConfig.provider,
filters: searchConfig.filters || {},
mode: searchConfig.mode,
resultsCount: scrapedResults.length,
results: scrapedResults,
searchedAt: new Date().toISOString()
};
this.serializableDataByType[actionType][actionName] = searchData;
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList || {},
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
crawl: this.serializableDataByType.crawl || {},
search: this.serializableDataByType.search || {}
});
} catch (error) {
this.log(`Search action failed: ${error.message}`, Level.ERROR);
throw new Error(`Search execution error: ${error.message}`);
}
},
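// Illustrative sketch of a searchConfig for the search action above (values are
// assumptions). In 'discover' mode only the result list is stored; 'scrape' mode
// additionally visits and scrapes each result URL.
//
//   {
//     query: 'site:example.com pricing',
//     limit: 20,
//     mode: 'discover',
//     filters: { timeRange: 'month' }
//   }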
flag: async () => new Promise((res) => {
if (this.options.debugChannel?.setActionType) {
this.options.debugChannel.setActionType('flag');
@@ -885,7 +1633,9 @@ export default class Interpreter extends EventEmitter {
this.serializableDataByType[actionType][actionName] = [...allResults];
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
-    scrapeSchema: this.serializableDataByType.scrapeSchema
+    scrapeSchema: this.serializableDataByType.scrapeSchema,
+    crawl: this.serializableDataByType.crawl || {},
+    search: this.serializableDataByType.search || {}
});
};
@@ -1735,7 +2485,7 @@ export default class Interpreter extends EventEmitter {
// Clear accumulated data to free memory
this.cumulativeResults = [];
this.namedResults = {};
-  this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
+  this.serializableDataByType = { scrapeList: {}, scrapeSchema: {}, crawl: {}, search: {} };
// Reset state
this.isAborted = false;

View File

@@ -28,7 +28,7 @@ type MethodNames<T> = {
[K in keyof T]: T[K] extends Function ? K : never;
}[keyof T];
- export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';
+ export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto' | 'crawl' | 'search';
export type What = {
action: MethodNames<Page> | CustomFunctions,

View File

@@ -13,8 +13,8 @@ import { AuthenticatedRequest } from "../routes/record"
import {capture} from "../utils/analytics";
import { Page } from "playwright-core";
import { WorkflowFile } from "maxun-core";
- import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
- import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
+ import { addGoogleSheetUpdateTask, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
+ import { addAirtableUpdateTask, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown, convertPageToScreenshot } from '../markdownify/scrape';
@@ -309,8 +309,8 @@ router.get("/robots/:id/runs",requireAPIKey, async (req: Request, res: Response)
statusCode: 200,
messageCode: "success",
runs: {
totalCount: formattedRuns.length,
items: formattedRuns,
},
};
@@ -342,6 +342,8 @@ function formatRunResponse(run: any) {
data: {
textData: {},
listData: {},
crawlData: {},
searchData: {},
markdown: '',
html: ''
},
@@ -358,6 +360,14 @@ function formatRunResponse(run: any) {
formattedRun.data.listData = output.scrapeList;
}
if (output.crawl && typeof output.crawl === 'object') {
formattedRun.data.crawlData = output.crawl;
}
if (output.search && typeof output.search === 'object') {
formattedRun.data.searchData = output.search;
}
if (output.markdown && Array.isArray(output.markdown)) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}
@@ -466,7 +476,7 @@ router.get("/robots/:id/runs/:runId", requireAPIKey, async (req: Request, res: R
}
});
- async function createWorkflowAndStoreMetadata(id: string, userId: string) {
+ async function createWorkflowAndStoreMetadata(id: string, userId: string, isSDK: boolean) {
try {
const recording = await Robot.findOne({
where: {
@@ -510,7 +520,9 @@ async function createWorkflowAndStoreMetadata(id: string, userId: string) {
interpreterSettings: { maxConcurrency: 1, maxRepeats: 1, debug: true },
log: '',
runId,
-    runByAPI: true,
+    runByUserId: userId,
+    runByAPI: !isSDK,
+    runBySDK: isSDK,
serializableOutput: {},
binaryOutput: {},
retryCount: 0
@@ -687,7 +699,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
let formats = recording.recording_meta.formats || ['markdown'];
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage' =>
['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'].includes(f)
@@ -714,50 +725,70 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
const SCRAPE_TIMEOUT = 120000;
if (formats.includes('markdown')) {
-    const markdownPromise = convertPageToMarkdown(url, currentPage);
-    const timeoutPromise = new Promise<never>((_, reject) => {
-    setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
-    });
-    markdown = await Promise.race([markdownPromise, timeoutPromise]);
-    serializableOutput.markdown = [{ content: markdown }];
+    try {
+    const markdownPromise = convertPageToMarkdown(url, currentPage);
+    const timeoutPromise = new Promise<never>((_, reject) => {
+    setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+    });
+    markdown = await Promise.race([markdownPromise, timeoutPromise]);
+    if (markdown && markdown.trim().length > 0) {
+    serializableOutput.markdown = [{ content: markdown }];
+    }
+    } catch (error: any) {
+    logger.log('warn', `Markdown conversion failed for API run ${plainRun.runId}: ${error.message}`);
+    }
}
if (formats.includes('html')) {
-    const htmlPromise = convertPageToHTML(url, currentPage);
-    const timeoutPromise = new Promise<never>((_, reject) => {
-    setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
-    });
-    html = await Promise.race([htmlPromise, timeoutPromise]);
-    serializableOutput.html = [{ content: html }];
+    try {
+    const htmlPromise = convertPageToHTML(url, currentPage);
+    const timeoutPromise = new Promise<never>((_, reject) => {
+    setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+    });
+    html = await Promise.race([htmlPromise, timeoutPromise]);
+    if (html && html.trim().length > 0) {
+    serializableOutput.html = [{ content: html }];
+    }
+    } catch (error: any) {
+    logger.log('warn', `HTML conversion failed for API run ${plainRun.runId}: ${error.message}`);
+    }
}
if (formats.includes("screenshot-visible")) {
-    const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
-    const timeoutPromise = new Promise<never>((_, reject) => {
-    setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
-    });
-    const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
+    try {
+    const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
+    const timeoutPromise = new Promise<never>((_, reject) => {
+    setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+    });
+    const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
-    if (!binaryOutput['screenshot-visible']) {
-    binaryOutput['screenshot-visible'] = {
-    data: screenshotBuffer.toString('base64'),
-    mimeType: 'image/png'
-    };
+    if (screenshotBuffer && screenshotBuffer.length > 0) {
+    binaryOutput['screenshot-visible'] = {
+    data: screenshotBuffer.toString('base64'),
+    mimeType: 'image/png'
+    };
+    }
+    } catch (error: any) {
+    logger.log('warn', `Screenshot-visible conversion failed for API run ${plainRun.runId}: ${error.message}`);
+    }
}
if (formats.includes("screenshot-fullpage")) {
-    const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
-    const timeoutPromise = new Promise<never>((_, reject) => {
-    setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
-    });
-    const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
+    try {
+    const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
+    const timeoutPromise = new Promise<never>((_, reject) => {
+    setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+    });
+    const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
-    if (!binaryOutput['screenshot-fullpage']) {
-    binaryOutput['screenshot-fullpage'] = {
-    data: screenshotBuffer.toString('base64'),
-    mimeType: 'image/png'
-    };
+    if (screenshotBuffer && screenshotBuffer.length > 0) {
+    binaryOutput['screenshot-fullpage'] = {
+    data: screenshotBuffer.toString('base64'),
+    mimeType: 'image/png'
+    };
+    }
+    } catch (error: any) {
+    logger.log('warn', `Screenshot-fullpage conversion failed for API run ${plainRun.runId}: ${error.message}`);
+    }
}
@@ -769,7 +800,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
binaryOutput,
});
// Upload binary output (screenshots) to MinIO if present
let uploadedBinaryOutput: Record<string, string> = {};
if (Object.keys(binaryOutput).length > 0) {
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
@@ -779,7 +809,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
logger.log('info', `Markdown robot execution completed for API run ${id}`);
// Push success socket event
try {
const completionData = {
runId: plainRun.runId,
@@ -800,7 +829,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
);
}
// Build webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
@@ -814,8 +842,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
},
};
- if (formats.includes('markdown')) webhookPayload.markdown = markdown;
- if (formats.includes('html')) webhookPayload.html = html;
+ if (serializableOutput.markdown) webhookPayload.markdown = markdown;
+ if (serializableOutput.html) webhookPayload.html = html;
if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible'];
if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage'];
@@ -834,9 +862,12 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
-    user_id: userId,
+    userId: userId,
+    robotId: recording.recording_meta.id,
+    robotType: "scrape",
+    source: "api",
+    status: "success",
-    robot_type: "scrape",
+    createdAt: new Date().toISOString(),
formats
});
@@ -858,14 +889,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
// Send failure socket event
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
-    finishedAt: new Date().toLocaleString()
+    finishedAt: new Date().toLocaleString(),
+    error: error.message
};
serverIo
@@ -895,11 +926,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
logger.log('warn', `Failed to send webhook for failed API scrape run ${plainRun.runId}: ${webhookError.message}`);
}
capture("maxun-oss-run-created-api", {
capture("maxun-oss-run-created", {
runId: plainRun.runId,
-    user_id: userId,
+    userId: userId,
+    robotId: recording.recording_meta.id,
+    robotType: "scrape",
+    source: "api",
+    status: "failed",
-    robot_type: "scrape",
+    createdAt: new Date().toISOString(),
formats
});
@@ -993,15 +1027,18 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
const totalRowsExtracted = totalSchemaItemsExtracted + totalListItemsExtracted;
- capture('maxun-oss-run-created-api',{
+ capture('maxun-oss-run-created',{
runId: id,
-    created_at: new Date().toISOString(),
+    userId: userId,
+    robotId: recording.recording_meta.id,
+    robotType: recording.recording_meta.type || 'extract',
+    source: 'api',
+    createdAt: new Date().toISOString(),
status: 'success',
+    totalRowsExtracted,
-    schemaItemsExtracted: totalSchemaItemsExtracted,
-    listItemsExtracted: totalListItemsExtracted,
+    totalSchemaItemsExtracted,
+    totalListItemsExtracted,
extractedScreenshotsCount,
is_llm: (recording.recording_meta as any).isLLM,
-    totalRowsExtracted
}
)
@@ -1019,6 +1056,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
typeof parsedOutput.scrapeSchema === "string"
? JSON.parse(parsedOutput.scrapeSchema)
: parsedOutput.scrapeSchema || {};
const parsedCrawl =
typeof parsedOutput.crawl === "string"
? JSON.parse(parsedOutput.crawl)
: parsedOutput.crawl || {};
const parsedSearch =
typeof parsedOutput.search === "string"
? JSON.parse(parsedOutput.search)
: parsedOutput.search || {};
const webhookPayload = {
robot_id: plainRun.robotMetaId,
@@ -1030,6 +1077,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
extracted_data: {
captured_texts: parsedSchema || {},
captured_lists: parsedList || {},
crawl_data: parsedCrawl || {},
search_data: parsedSearch || {},
captured_texts_count: totalSchemaItemsExtracted,
captured_lists_count: totalListItemsExtracted,
screenshots_count: extractedScreenshotsCount
@@ -1097,7 +1146,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
const recording = await Robot.findOne({ where: { 'recording_meta.id': run.robotMetaId }, raw: true });
// Trigger webhooks for run failure
const failedWebhookPayload = {
robot_id: run.robotMetaId,
run_id: run.runId,
@@ -1123,10 +1171,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
logger.log('error', `Failed to send failure webhooks for run ${run.runId}: ${webhookError.message}`);
}
capture(
- 'maxun-oss-run-created-api',
+ 'maxun-oss-run-created',
{
runId: id,
-    created_at: new Date().toISOString(),
+    userId: userId,
+    robotId: recording?.recording_meta?.id || run.robotMetaId,
+    robotType: recording?.recording_meta?.type || 'extract',
+    source: 'api',
+    createdAt: new Date().toISOString(),
status: 'failed',
is_llm: (recording?.recording_meta as any)?.isLLM,
}
@@ -1139,11 +1191,11 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
}
}
- export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
+ export async function handleRunRecording(id: string, userId: string, isSDK: boolean = false) {
let socket: Socket | null = null;
try {
-    const result = await createWorkflowAndStoreMetadata(id, userId);
+    const result = await createWorkflowAndStoreMetadata(id, userId, isSDK);
const { browserId, runId: newRunId } = result;
if (!browserId || !newRunId || !userId) {
@@ -1167,6 +1219,10 @@ export async function handleRunRecording(id: string, userId: string, requestedFo
cleanupSocketConnection(socket!, browserId, newRunId);
});
socket.on('error', (error: Error) => {
logger.error(`Socket error for API run ${newRunId}: ${error.message}`);
});
socket.on('disconnect', () => {
cleanupSocketConnection(socket!, browserId, newRunId);
});
@@ -1318,9 +1374,7 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
return res.status(401).json({ ok: false, error: 'Unauthorized' });
}
-    const requestedFormats = req.body.formats;
-    const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
+    const runId = await handleRunRecording(req.params.id, req.user.id);
if (!runId) {
throw new Error('Run ID is undefined');

View File

@@ -450,13 +450,35 @@ router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedR
}
}
let crawlData: any[] = [];
if (run.serializableOutput?.crawl) {
const crawl: any = run.serializableOutput.crawl;
if (Array.isArray(crawl)) {
crawlData = crawl;
}
else if (typeof crawl === 'object') {
const crawlValues = Object.values(crawl);
if (crawlValues.length > 0 && Array.isArray(crawlValues[0])) {
crawlData = crawlValues[0] as any[];
}
}
}
let searchData: any = {};
if (run.serializableOutput?.search) {
searchData = run.serializableOutput.search;
}
return res.status(200).json({
data: {
runId: run.runId,
status: run.status,
data: {
textData: run.serializableOutput?.scrapeSchema || {},
-    listData: listData
+    listData: listData,
+    crawlData: crawlData,
+    searchData: searchData
},
screenshots: Object.values(run.binaryOutput || {})
}
@@ -640,6 +662,202 @@ router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: Auth
}
});
/**
* Create a crawl robot programmatically
* POST /api/sdk/crawl
*/
router.post("/sdk/crawl", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const user = req.user;
const { url, name, crawlConfig } = req.body;
if (!url || !crawlConfig) {
return res.status(400).json({
error: "URL and crawl configuration are required"
});
}
try {
new URL(url);
} catch (err) {
return res.status(400).json({
error: "Invalid URL format"
});
}
if (typeof crawlConfig !== 'object') {
return res.status(400).json({
error: "crawlConfig must be an object"
});
}
const robotName = name || `Crawl Robot - ${new URL(url).hostname}`;
const robotId = uuid();
const metaId = uuid();
const robot = await Robot.create({
id: robotId,
userId: user.id,
recording_meta: {
name: robotName,
id: metaId,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
pairs: 1,
params: [],
type: 'crawl',
url: url,
},
recording: {
workflow: [
{
where: { url },
what: [
{ action: 'flag', args: ['generated'] },
{
action: 'crawl',
args: [crawlConfig],
name: 'Crawl'
}
]
},
{
where: { url: 'about:blank' },
what: [
{
action: 'goto',
args: [url]
},
{
action: 'waitForLoadState',
args: ['networkidle']
}
]
}
]
}
});
logger.info(`[SDK] Crawl robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);
capture("maxun-oss-robot-created", {
userId: user.id.toString(),
robotId: metaId,
robotName: robotName,
url: url,
robotType: 'crawl',
crawlConfig: crawlConfig,
source: 'sdk'
});
return res.status(201).json({
data: robot,
message: "Crawl robot created successfully"
});
} catch (error: any) {
logger.error("[SDK] Error creating crawl robot:", error);
return res.status(500).json({
error: "Failed to create crawl robot",
message: error.message
});
}
});
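// Illustrative request for the endpoint above (payload values are assumptions):
//
//   POST /api/sdk/crawl
//   {
//     "url": "https://example.com/docs",
//     "name": "Docs crawler",
//     "crawlConfig": {
//       "mode": "path", "limit": 25, "maxDepth": 2,
//       "includePaths": [], "excludePaths": [],
//       "useSitemap": true, "followLinks": true, "respectRobots": true
//     }
//   }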
/**
* Create a search robot programmatically
* POST /api/sdk/search
*/
router.post("/sdk/search", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const user = req.user;
const { name, searchConfig } = req.body;
if (!searchConfig) {
return res.status(400).json({
error: "Search configuration is required"
});
}
if (!searchConfig.query) {
return res.status(400).json({
error: "searchConfig must include a query"
});
}
if (typeof searchConfig !== 'object') {
return res.status(400).json({
error: "searchConfig must be an object"
});
}
if (searchConfig.mode && !['discover', 'scrape'].includes(searchConfig.mode)) {
return res.status(400).json({
error: "searchConfig.mode must be either 'discover' or 'scrape'"
});
}
searchConfig.provider = 'duckduckgo';
const robotName = name || `Search Robot - ${searchConfig.query}`;
const robotId = uuid();
const metaId = uuid();
const robot = await Robot.create({
id: robotId,
userId: user.id,
recording_meta: {
name: robotName,
id: metaId,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
pairs: 1,
params: [],
type: 'search',
},
recording: {
workflow: [
{
where: { url: 'about:blank' },
what: [
{
action: 'search',
args: [searchConfig],
name: 'Search'
}
]
}
]
}
});
logger.info(`[SDK] Search robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);
capture("maxun-oss-robot-created", {
userId: user.id.toString(),
robotId: metaId,
robotName: robotName,
robotType: 'search',
searchQuery: searchConfig.query,
searchProvider: searchConfig.provider || 'duckduckgo',
searchLimit: searchConfig.limit || 10,
source: 'sdk'
});
return res.status(201).json({
data: robot,
message: "Search robot created successfully"
});
} catch (error: any) {
logger.error("[SDK] Error creating search robot:", error);
return res.status(500).json({
error: "Failed to create search robot",
message: error.message
});
}
});
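// Illustrative request for the endpoint above (payload values are assumptions):
//
//   POST /api/sdk/search
//   {
//     "name": "Pricing research",
//     "searchConfig": { "query": "saas pricing pages", "limit": 10, "mode": "scrape" }
//   }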
/**
* LLM-based extraction - generate workflow from natural language prompt
* POST /api/sdk/extract/llm

View File

@@ -9,7 +9,7 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
- type?: 'extract' | 'scrape';
+ type?: 'extract' | 'scrape' | 'crawl' | 'search';
url?: string;
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
isLLM?: boolean;

View File

@@ -23,6 +23,7 @@ interface RunAttributes {
runByUserId?: string;
runByScheduleId?: string;
runByAPI?: boolean;
runBySDK?: boolean;
serializableOutput: Record<string, any>;
binaryOutput: Record<string, string>;
retryCount?: number;

View File

@@ -132,7 +132,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('info', `Processing run execution job for runId: ${data.runId}, browserId: ${data.browserId}`);
try {
// Find the run
const run = await Run.findOne({ where: { runId: data.runId } });
if (!run) {
logger.log('error', `Run ${data.runId} not found in database`);
@@ -193,7 +192,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('info', `Browser ${browserId} found and ready for execution`);
try {
// Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
if (!recording) {
@@ -473,11 +471,12 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
interpretationInfo.binaryOutput
);
// Get the already persisted and credit-validated data from the run record
const finalRun = await Run.findByPk(run.id);
const categorizedOutput = {
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
-    scrapeList: finalRun?.serializableOutput?.scrapeList || {}
+    scrapeList: finalRun?.serializableOutput?.scrapeList || {},
+    crawl: finalRun?.serializableOutput?.crawl || {},
+    search: finalRun?.serializableOutput?.search || {}
};
if (await isRunAborted()) {
@@ -489,10 +488,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
status: 'success',
finishedAt: new Date().toLocaleString(),
log: interpretationInfo.log.join('\n'),
-    serializableOutput: JSON.parse(JSON.stringify({
-    scrapeSchema: categorizedOutput.scrapeSchema || {},
-    scrapeList: categorizedOutput.scrapeList || {},
-    })),
binaryOutput: uploadedBinaryOutput,
});
@@ -572,6 +567,8 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
}, {} as Record<string, any[]>)
: {},
captured_lists: categorizedOutput.scrapeList,
crawl_data: categorizedOutput.crawl,
search_data: categorizedOutput.search,
captured_texts_count: totalSchemaItemsExtracted,
captured_lists_count: totalListItemsExtracted,
screenshots_count: extractedScreenshotsCount

View File

@@ -251,21 +251,18 @@ function handleWorkflowActions(workflow: any[], credentials: Credentials) {
router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, res) => {
try {
const { id } = req.params;
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
// Validate input
- if (!name && !limits && !credentials && !targetUrl) {
+ if (!name && !limits && !credentials && !targetUrl && !incomingWorkflow) {
return res.status(400).json({ error: 'Either "name", "limits", "credentials" or "target_url" must be provided.' });
}
// Fetch the robot by ID
const robot = await Robot.findOne({ where: { 'recording_meta.id': id } });
if (!robot) {
return res.status(404).json({ error: 'Robot not found.' });
}
// Update fields if provided
if (name) {
robot.set('recording_meta', { ...robot.recording_meta, name });
}
@@ -274,7 +271,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
const updatedWorkflow = [...robot.recording.workflow];
let foundGoto = false;
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
const step = updatedWorkflow[i];
@@ -289,7 +285,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
robot.changed('recording', true);
foundGoto = true;
i = -1;
break;
}
@@ -299,10 +294,9 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
await robot.save();
// Start with existing workflow or allow client to supply a full workflow replacement
let workflow = incomingWorkflow && Array.isArray(incomingWorkflow)
? JSON.parse(JSON.stringify(incomingWorkflow))
-    : [...robot.recording.workflow]; // Create a copy of the workflow
+    : [...robot.recording.workflow];
if (credentials) {
workflow = handleWorkflowActions(workflow, credentials);
@@ -344,7 +338,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
where: { 'recording_meta.id': id }
});
- const updatedRobot = await Robot.findOne({ where: { 'recording_meta.id': id } });
+ await Robot.findOne({ where: { 'recording_meta.id': id } });
logger.log('info', `Robot with ID ${id} was updated successfully.`);
@@ -1322,4 +1316,198 @@ export async function recoverOrphanedRuns() {
}
}
/**
* POST endpoint for creating a crawl robot
* @route POST /recordings/crawl
* @auth requireSignIn - JWT authentication required
*/
router.post('/recordings/crawl', requireSignIn, async (req: AuthenticatedRequest, res) => {
try {
const { url, name, crawlConfig } = req.body;
if (!url || !crawlConfig) {
return res.status(400).json({ error: 'URL and crawl configuration are required.' });
}
if (!req.user) {
return res.status(401).send({ error: 'Unauthorized' });
}
try {
new URL(url);
} catch (err) {
return res.status(400).json({ error: 'Invalid URL format' });
}
const robotName = name || `Crawl Robot - ${new URL(url).hostname}`;
const currentTimestamp = new Date().toLocaleString('en-US');
const robotId = uuid();
const newRobot = await Robot.create({
id: uuid(),
userId: req.user.id,
recording_meta: {
name: robotName,
id: robotId,
createdAt: currentTimestamp,
updatedAt: currentTimestamp,
pairs: 1,
params: [],
type: 'crawl',
url: url,
},
recording: {
workflow: [
{
where: { url },
what: [
{ action: 'flag', args: ['generated'] },
{
action: 'crawl',
args: [crawlConfig],
name: 'Crawl'
}
]
},
{
where: { url: 'about:blank' },
what: [
{
action: 'goto',
args: [url]
},
{
action: 'waitForLoadState',
args: ['networkidle']
}
]
}
]
},
google_sheet_email: null,
google_sheet_name: null,
google_sheet_id: null,
google_access_token: null,
google_refresh_token: null,
airtable_base_id: null,
airtable_base_name: null,
airtable_table_name: null,
airtable_table_id: null,
airtable_access_token: null,
airtable_refresh_token: null,
schedule: null,
webhooks: null
});
logger.log('info', `Crawl robot created with id: ${newRobot.id}`);
capture('maxun-oss-robot-created', {
userId: req.user.id.toString(),
robotId: robotId,
robotName: robotName,
url: url,
robotType: 'crawl',
crawlConfig: crawlConfig
});
return res.status(201).json({
message: 'Crawl robot created successfully.',
robot: newRobot,
});
} catch (error) {
if (error instanceof Error) {
logger.log('error', `Error creating crawl robot: ${error.message}`);
return res.status(500).json({ error: error.message });
} else {
logger.log('error', 'Unknown error creating crawl robot');
return res.status(500).json({ error: 'An unknown error occurred.' });
}
}
});
/**
* POST endpoint for creating a search robot
* @route POST /recordings/search
* @auth requireSignIn - JWT authentication required
*/
router.post('/recordings/search', requireSignIn, async (req: AuthenticatedRequest, res) => {
try {
const { searchConfig, name } = req.body;
if (!searchConfig || !searchConfig.query) {
return res.status(400).json({ error: 'Search configuration with query is required.' });
}
if (!req.user) {
return res.status(401).send({ error: 'Unauthorized' });
}
const robotName = name || `Search Robot - ${searchConfig.query.substring(0, 50)}`;
const currentTimestamp = new Date().toLocaleString('en-US');
const robotId = uuid();
const newRobot = await Robot.create({
id: uuid(),
userId: req.user.id,
recording_meta: {
name: robotName,
id: robotId,
createdAt: currentTimestamp,
updatedAt: currentTimestamp,
pairs: 1,
params: [],
type: 'search',
},
recording: {
workflow: [
{
where: { url: 'about:blank' },
what: [{
action: 'search',
args: [searchConfig],
name: 'Search'
}]
}
]
},
google_sheet_email: null,
google_sheet_name: null,
google_sheet_id: null,
google_access_token: null,
google_refresh_token: null,
airtable_base_id: null,
airtable_base_name: null,
airtable_table_name: null,
airtable_table_id: null,
airtable_access_token: null,
airtable_refresh_token: null,
schedule: null,
webhooks: null
});
logger.log('info', `Search robot created with id: ${newRobot.id}`);
capture('maxun-oss-robot-created', {
userId: req.user.id.toString(),
robotId: robotId,
robotName: robotName,
robotType: 'search',
searchQuery: searchConfig.query,
searchProvider: searchConfig.provider || 'duckduckgo',
searchLimit: searchConfig.limit || 10
});
return res.status(201).json({
message: 'Search robot created successfully.',
robot: newRobot,
});
} catch (error) {
if (error instanceof Error) {
logger.log('error', `Error creating search robot: ${error.message}`);
return res.status(500).json({ error: error.message });
} else {
logger.log('error', 'Unknown error creating search robot');
return res.status(500).json({ error: 'An unknown error occurred.' });
}
}
});
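// Illustrative request (values are assumptions): a minimal discover-mode search robot.
//
//   POST /recordings/search
//   { "searchConfig": { "query": "web scraping tools", "limit": 10, "mode": "discover" } }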
export { processQueuedRuns };

View File

@@ -16,7 +16,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
processedWorkflow.workflow.forEach((pair) => {
pair.what.forEach((action) => {
// Handle limit validation for scrapeList action
if (action.action === 'scrapeList' && checkLimit && Array.isArray(action.args) && action.args.length > 0) {
const scrapeConfig = action.args[0];
if (scrapeConfig && typeof scrapeConfig === 'object' && 'limit' in scrapeConfig) {
@@ -26,7 +25,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
}
}
// Handle decryption for type and press actions
if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) {
try {
const encryptedValue = action.args[1];
@@ -93,10 +91,14 @@ export class WorkflowInterpreter {
public serializableDataByType: {
scrapeSchema: Record<string, any>;
scrapeList: Record<string, any>;
crawl: Record<string, any>;
search: Record<string, any>;
[key: string]: any;
} = {
scrapeSchema: {},
scrapeList: {},
crawl: {},
search: {},
};
private currentActionName: string | null = null;
@@ -282,7 +284,6 @@ export class WorkflowInterpreter {
}
} else if (this.currentActionType === 'scrapeList') {
if (data && Array.isArray(data) && data.length > 0) {
// Use the current index for persistence
await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex);
}
@@ -293,7 +294,6 @@ export class WorkflowInterpreter {
}
},
binaryCallback: async (data: string, mimetype: string) => {
// For editor mode, we don't have the name yet, so use a timestamp-based name
const binaryItem = {
name: `Screenshot ${Date.now()}`,
mimeType: mimetype,
@@ -301,7 +301,6 @@ export class WorkflowInterpreter {
};
this.binaryData.push(binaryItem);
// Persist binary data to database
await this.persistBinaryDataToDatabase(binaryItem);
this.socket.emit('binaryCallback', {
@@ -340,7 +339,6 @@ export class WorkflowInterpreter {
logger.log('debug', `Interpretation finished`);
// Flush any remaining data in persistence buffer before completing
await this.flushPersistenceBuffer();
this.interpreter = null;
@@ -419,6 +417,8 @@ export class WorkflowInterpreter {
this.serializableDataByType = {
scrapeSchema: {},
scrapeList: {},
crawl: {},
search: {},
};
this.binaryData = [];
this.currentScrapeListIndex = 0;
@@ -591,12 +591,20 @@ export class WorkflowInterpreter {
typeKey = "scrapeList";
} else if (this.currentActionType === "scrapeSchema") {
typeKey = "scrapeSchema";
} else if (this.currentActionType === "crawl") {
typeKey = "crawl";
} else if (this.currentActionType === "search") {
typeKey = "search";
}
if (typeKey === "scrapeList" && data.scrapeList) {
data = data.scrapeList;
} else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
data = data.scrapeSchema;
} else if (typeKey === "crawl" && data.crawl) {
data = data.crawl;
} else if (typeKey === "search" && data.search) {
data = data.search;
}
let actionName = "";
@@ -609,38 +617,65 @@ export class WorkflowInterpreter {
actionName = keys[keys.length - 1];
data = data[actionName];
}
} else if (typeKey === "crawl" && data && typeof data === "object" && !Array.isArray(data)) {
const keys = Object.keys(data);
if (keys.length === 1) {
actionName = keys[0];
data = data[actionName];
} else if (keys.length > 1) {
actionName = keys[keys.length - 1];
data = data[actionName];
}
} else if (typeKey === "search" && data && typeof data === "object" && !Array.isArray(data)) {
const keys = Object.keys(data);
if (keys.length === 1) {
actionName = keys[0];
data = data[actionName];
} else if (keys.length > 1) {
actionName = keys[keys.length - 1];
data = data[actionName];
}
}
if (!actionName) {
actionName = this.currentActionName || "";
if (typeKey === "scrapeList" && !actionName) {
actionName = this.getUniqueActionName(typeKey, "");
} else if (typeKey === "crawl" && !actionName) {
actionName = this.getUniqueActionName(typeKey, "Crawl Results");
} else if (typeKey === "search" && !actionName) {
actionName = this.getUniqueActionName(typeKey, "Search Results");
}
}
- const flattened = Array.isArray(data)
- ? data
- : (
- data?.List ??
- (data && typeof data === "object"
- ? Object.values(data).flat?.() ?? data
- : [])
- );
+ let processedData;
+ if (typeKey === "search") {
+ processedData = data;
+ } else {
+ processedData = Array.isArray(data)
+ ? data
+ : (
+ data?.List ??
+ (data && typeof data === "object"
+ ? Object.values(data).flat?.() ?? data
+ : [])
+ );
+ }
if (!this.serializableDataByType[typeKey]) {
this.serializableDataByType[typeKey] = {};
}
- this.serializableDataByType[typeKey][actionName] = flattened;
+ this.serializableDataByType[typeKey][actionName] = processedData;
await this.persistDataToDatabase(typeKey, {
- [actionName]: flattened,
+ [actionName]: processedData,
});
this.socket.emit("serializableCallback", {
type: typeKey,
name: actionName,
- data: flattened,
+ data: processedData,
});
} catch (err: any) {
logger.log('error', `serializableCallback handler failed: ${err.message}`);
@@ -698,7 +733,6 @@ export class WorkflowInterpreter {
await this.flushPersistenceBuffer();
// Structure the output to maintain separate data for each action type
const result = {
log: this.debugMessages,
result: status,
@@ -794,7 +828,7 @@ export class WorkflowInterpreter {
const currentSerializableOutput = run.serializableOutput ?
JSON.parse(JSON.stringify(run.serializableOutput)) :
- { scrapeSchema: [], scrapeList: [] };
+ { scrapeSchema: {}, scrapeList: {}, crawl: {}, search: {} };
if (Array.isArray(currentSerializableOutput.scrapeList)) {
currentSerializableOutput.scrapeList = {};
@@ -802,6 +836,9 @@ export class WorkflowInterpreter {
if (Array.isArray(currentSerializableOutput.scrapeSchema)) {
currentSerializableOutput.scrapeSchema = {};
}
if (!currentSerializableOutput.search) {
currentSerializableOutput.search = {};
}
let hasUpdates = false;
@@ -827,6 +864,18 @@ export class WorkflowInterpreter {
}
mergeLists(currentSerializableOutput.scrapeList, item.data);
hasUpdates = true;
} else if (item.actionType === 'crawl') {
currentSerializableOutput.crawl = {
...(currentSerializableOutput.crawl || {}),
...item.data
};
hasUpdates = true;
} else if (item.actionType === 'search') {
currentSerializableOutput.search = {
...(currentSerializableOutput.search || {}),
...item.data
};
hasUpdates = true;
}
}
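The merge above is type-specific; a condensed sketch of the crawl/search half (the helper name is hypothetical; scrapeList instead goes through mergeLists for row-wise accumulation):

// Crawl and search entries are shallow-merged by action name, so re-running
// an action overwrites its previous results rather than appending to them.
function mergeCrawlAndSearch(
  output: Record<string, any>,
  item: { actionType: string; data: Record<string, any> }
): void {
  if (item.actionType === 'crawl') {
    output.crawl = { ...(output.crawl || {}), ...item.data };
  } else if (item.actionType === 'search') {
    output.search = { ...(output.search || {}), ...item.data };
  }
}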

View File

@@ -13,7 +13,11 @@ interface AirtableUpdateTask {
interface SerializableOutput {
scrapeSchema?: Record<string, any[]>;
scrapeList?: Record<string, any[]>;
markdown?: Array<{ content: string }>;
html?: Array<{ content: string }>;
crawl?: Record<string, any[]>;
search?: any;
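  // `search` stays `any` because a run may hold { results: [...] }, a bare
  // array, or a single result object; the normalization below handles all three.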
}
const MAX_RETRIES = 3;
@@ -67,6 +71,10 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
const schemaData: Array<{ Group: string; Field: string; Value: any }> = [];
const listData: any[] = [];
const screenshotData: Array<{ key: string; url: string }> = [];
const markdownData: any[] = [];
const htmlData: any[] = [];
const crawlData: any[] = [];
const searchData: any[] = [];
if (serializableOutput.scrapeSchema) {
if (Array.isArray(serializableOutput.scrapeSchema)) {
@@ -122,6 +130,66 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
}
}
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown)) {
serializableOutput.markdown.forEach((item, index) => {
if (item.content) {
markdownData.push({
"Index": index + 1,
"Type": "Markdown",
"Content": item.content
});
}
});
}
if (serializableOutput.html && Array.isArray(serializableOutput.html)) {
serializableOutput.html.forEach((item, index) => {
if (item.content) {
htmlData.push({
"Index": index + 1,
"Type": "HTML",
"Content": item.content
});
}
});
}
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
if (Array.isArray(crawlArray)) {
crawlArray.forEach((crawlItem) => {
const hasContent = Object.values(crawlItem || {}).some(
(value) => value !== null && value !== undefined && value !== ""
);
if (hasContent) {
crawlData.push({ "Crawl Type": crawlName, ...crawlItem });
}
});
}
}
}
if (serializableOutput.search) {
let results: any[] = [];
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
results = serializableOutput.search.results;
} else if (Array.isArray(serializableOutput.search)) {
results = serializableOutput.search;
} else {
results = [serializableOutput.search];
}
results.forEach((result) => {
const hasContent = Object.values(result || {}).some(
(value) => value !== null && value !== undefined && value !== ""
);
if (hasContent) {
searchData.push(result);
}
});
}
// Collect screenshot data (handles both string and object forms safely)
// if (binaryOutput && Object.keys(binaryOutput).length > 0) {
// Object.entries(binaryOutput).forEach(([key, rawValue]: [string, any]) => {
@@ -152,7 +220,15 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
// }
// --- Merge all types into Airtable rows ---
const maxLength = Math.max(schemaData.length, listData.length, screenshotData.length);
const maxLength = Math.max(
schemaData.length,
listData.length,
screenshotData.length,
markdownData.length,
htmlData.length,
crawlData.length,
searchData.length
);
for (let i = 0; i < maxLength; i++) {
const record: Record<string, any> = {};
@@ -176,6 +252,38 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
record.Screenshot = screenshotData[i].url;
}
if (i < markdownData.length) {
Object.entries(markdownData[i] || {}).forEach(([key, value]) => {
if (value !== null && value !== undefined && value !== "") {
record[key] = value;
}
});
}
if (i < htmlData.length) {
Object.entries(htmlData[i] || {}).forEach(([key, value]) => {
if (value !== null && value !== undefined && value !== "") {
record[key] = value;
}
});
}
if (i < crawlData.length) {
Object.entries(crawlData[i] || {}).forEach(([key, value]) => {
if (value !== null && value !== undefined && value !== "") {
record[key] = value;
}
});
}
if (i < searchData.length) {
Object.entries(searchData[i] || {}).forEach(([key, value]) => {
if (value !== null && value !== undefined && value !== "") {
record[key] = value;
}
});
}
if (Object.keys(record).length > 0) {
allRecords.push(record);
}
@@ -194,6 +302,18 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
Screenshot: screenshotData[i].url,
});
}
for (let i = maxLength; i < markdownData.length; i++) {
allRecords.push(markdownData[i]);
}
for (let i = maxLength; i < htmlData.length; i++) {
allRecords.push(htmlData[i]);
}
for (let i = maxLength; i < crawlData.length; i++) {
allRecords.push(crawlData[i]);
}
for (let i = maxLength; i < searchData.length; i++) {
allRecords.push(searchData[i]);
}
return allRecords;
}
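The record assembly above zips the per-type columns index-by-index; a minimal standalone sketch with invented sample data (note the trailing overflow loops in the original are defensive: maxLength already covers every column, so they normally never run):

// Columns are merged position-by-position up to the longest column; empty
// rows are dropped rather than pushed.
const columns: Record<string, Array<Record<string, any>>> = {
  list: [{ Title: 'A' }, { Title: 'B' }],
  crawl: [{ url: 'https://example.com' }],
};
const maxLength = Math.max(...Object.values(columns).map((c) => c.length));
const records: Array<Record<string, any>> = [];
for (let i = 0; i < maxLength; i++) {
  const record: Record<string, any> = {};
  for (const column of Object.values(columns)) {
    if (i < column.length) Object.assign(record, column[i]);
  }
  if (Object.keys(record).length > 0) records.push(record);
}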

View File

@@ -13,6 +13,10 @@ interface GoogleSheetUpdateTask {
interface SerializableOutput {
scrapeSchema?: Record<string, any[]>;
scrapeList?: Record<string, any[]>;
markdown?: Array<{ content: string }>;
html?: Array<{ content: string }>;
crawl?: Record<string, any[]>;
search?: any;
}
@@ -95,6 +99,72 @@ export async function updateGoogleSheet(robotId: string, runId: string) {
}
}
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown) && serializableOutput.markdown.length > 0) {
const markdownData = serializableOutput.markdown.map((item, index) => ({
"Index": index + 1,
"Content": item.content || ""
}));
await processOutputType(
robotId,
spreadsheetId,
'Markdown',
markdownData,
plainRobot
);
}
if (serializableOutput.html && Array.isArray(serializableOutput.html) && serializableOutput.html.length > 0) {
const htmlData = serializableOutput.html.map((item, index) => ({
"Index": index + 1,
"Content": item.content || ""
}));
await processOutputType(
robotId,
spreadsheetId,
'HTML',
htmlData,
plainRobot
);
}
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
if (!Array.isArray(crawlArray) || crawlArray.length === 0) continue;
await processOutputType(
robotId,
spreadsheetId,
`Crawl - ${crawlName}`,
crawlArray,
plainRobot
);
}
}
if (serializableOutput.search) {
let searchData: any[] = [];
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
searchData = serializableOutput.search.results;
} else if (Array.isArray(serializableOutput.search)) {
searchData = serializableOutput.search;
} else {
searchData = [serializableOutput.search];
}
if (searchData.length > 0) {
await processOutputType(
robotId,
spreadsheetId,
'Search Results',
searchData,
plainRobot
);
}
}
}
if (plainRun.binaryOutput && Object.keys(plainRun.binaryOutput).length > 0) {

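Both the Airtable and Google Sheets paths normalize search output with the same three branches; a hypothetical shared helper (not in this commit) would capture the pattern:

// Accepts { results: [...] }, a bare array, or a single result object, and
// always returns a flat array of result rows.
function toSearchRows(search: any): any[] {
  if (search?.results && Array.isArray(search.results)) return search.results;
  if (Array.isArray(search)) return search;
  return search ? [search] : [];
}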
View File

@@ -484,6 +484,8 @@ async function executeRun(id: string, userId: string) {
const categorizedOutput = {
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
scrapeList: finalRun?.serializableOutput?.scrapeList || {},
crawl: finalRun?.serializableOutput?.crawl || {},
search: finalRun?.serializableOutput?.search || {}
};
await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -570,6 +572,8 @@ async function executeRun(id: string, userId: string) {
}, {} as Record<string, any[]>)
: {},
captured_lists: categorizedOutput.scrapeList,
crawl_data: categorizedOutput.crawl,
search_data: categorizedOutput.search,
captured_texts_count: totalSchemaItemsExtracted,
captured_lists_count: totalListItemsExtracted,
screenshots_count: extractedScreenshotsCount

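For reference, the run-completion payload now carries the two new categories; an approximate shape assembled from the fields visible in this diff (other fields elided, exact types may differ):

interface RunCompletionPayload {
  captured_lists: Record<string, any[]>;
  crawl_data: Record<string, any[]>; // new: crawl results keyed by action name
  search_data: any;                  // new: structured search output
  captured_texts_count: number;
  captured_lists_count: number;
  screenshots_count: number;
  // ...remaining fields elided
}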
View File

@@ -353,7 +353,7 @@ export const createCrawlRobot = async (
): Promise<any> => {
try {
const response = await axios.post(
`${apiUrl}/recordings/crawl`,
`${apiUrl}/storage/recordings/crawl`,
{
url,
name,
@@ -392,7 +392,7 @@ export const createSearchRobot = async (
): Promise<any> => {
try {
const response = await axios.post(
`${apiUrl}/recordings/search`,
`${apiUrl}/storage/recordings/search`,
{
name,
searchConfig,

View File

@@ -154,7 +154,7 @@ export const RobotConfigPage: React.FC<RobotConfigPageProps> = ({
)}
<Box sx={{ display: 'flex', gap: 2 }}>
/* {showCancelButton && (
{/* {showCancelButton && (
<Button
variant="outlined"
onClick={handleBack}
@@ -164,7 +164,7 @@ export const RobotConfigPage: React.FC<RobotConfigPageProps> = ({
}} >
{cancelButtonText || t("buttons.cancel")}
</Button>
)} */
)} */}
{showSaveButton && onSave && (
<Button
variant="contained"

View File

@@ -1051,7 +1051,7 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
})),
credentials: credentialsForPayload,
targetUrl: targetUrl,
workflow: robot.recording.workflow,
workflow: updatedWorkflow,
};
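// Note: the payload sends `updatedWorkflow` (instead of the original
// `robot.recording.workflow`) so steps edited on this page are persisted.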
const success = await updateRecording(robot.recording_meta.id, payload);
@@ -1101,13 +1101,15 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
style={{ marginBottom: "20px" }}
/>
<TextField
label={t("robot_duplication.fields.target_url")}
key={t("robot_duplication.fields.target_url")}
value={getTargetUrl() || ""}
onChange={(e) => handleTargetUrlChange(e.target.value)}
style={{ marginBottom: "20px" }}
/>
{robot.recording_meta.type !== 'search' && (
<TextField
label={t("robot_duplication.fields.target_url")}
key={t("robot_duplication.fields.target_url")}
value={getTargetUrl() || ""}
onChange={(e) => handleTargetUrlChange(e.target.value)}
style={{ marginBottom: "20px" }}
/>
)}
{renderCrawlConfigFields()}
{renderSearchConfigFields()}