feat: add server side crawl and search logic
This commit is contained in:
@@ -13,8 +13,8 @@ import { AuthenticatedRequest } from "../routes/record"
|
||||
import {capture} from "../utils/analytics";
|
||||
import { Page } from "playwright-core";
|
||||
import { WorkflowFile } from "maxun-core";
|
||||
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { addGoogleSheetUpdateTask, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { addAirtableUpdateTask, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { sendWebhook } from "../routes/webhook";
|
||||
import { convertPageToHTML, convertPageToMarkdown, convertPageToScreenshot } from '../markdownify/scrape';
|
||||
|
||||
@@ -309,8 +309,8 @@ router.get("/robots/:id/runs",requireAPIKey, async (req: Request, res: Response)
|
||||
statusCode: 200,
|
||||
messageCode: "success",
|
||||
runs: {
|
||||
totalCount: formattedRuns.length,
|
||||
items: formattedRuns,
|
||||
totalCount: formattedRuns.length,
|
||||
items: formattedRuns,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -342,6 +342,8 @@ function formatRunResponse(run: any) {
|
||||
data: {
|
||||
textData: {},
|
||||
listData: {},
|
||||
crawlData: {},
|
||||
searchData: {},
|
||||
markdown: '',
|
||||
html: ''
|
||||
},
|
||||
@@ -358,6 +360,14 @@ function formatRunResponse(run: any) {
|
||||
formattedRun.data.listData = output.scrapeList;
|
||||
}
|
||||
|
||||
if (output.crawl && typeof output.crawl === 'object') {
|
||||
formattedRun.data.crawlData = output.crawl;
|
||||
}
|
||||
|
||||
if (output.search && typeof output.search === 'object') {
|
||||
formattedRun.data.searchData = output.search;
|
||||
}
|
||||
|
||||
if (output.markdown && Array.isArray(output.markdown)) {
|
||||
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
||||
}
|
||||
@@ -466,7 +476,7 @@ router.get("/robots/:id/runs/:runId", requireAPIKey, async (req: Request, res: R
|
||||
}
|
||||
});
|
||||
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string, isSDK: boolean) {
|
||||
try {
|
||||
const recording = await Robot.findOne({
|
||||
where: {
|
||||
@@ -510,7 +520,9 @@ async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
interpreterSettings: { maxConcurrency: 1, maxRepeats: 1, debug: true },
|
||||
log: '',
|
||||
runId,
|
||||
runByAPI: true,
|
||||
runByUserId: userId,
|
||||
runByAPI: !isSDK,
|
||||
runBySDK: isSDK,
|
||||
serializableOutput: {},
|
||||
binaryOutput: {},
|
||||
retryCount: 0
|
||||
@@ -687,7 +699,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
let formats = recording.recording_meta.formats || ['markdown'];
|
||||
|
||||
// Override if API request defines formats
|
||||
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
|
||||
formats = requestedFormats.filter((f): f is 'markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage' =>
|
||||
['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'].includes(f)
|
||||
@@ -714,50 +725,70 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
const SCRAPE_TIMEOUT = 120000;
|
||||
|
||||
if (formats.includes('markdown')) {
|
||||
const markdownPromise = convertPageToMarkdown(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
markdown = await Promise.race([markdownPromise, timeoutPromise]);
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
try {
|
||||
const markdownPromise = convertPageToMarkdown(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
markdown = await Promise.race([markdownPromise, timeoutPromise]);
|
||||
if (markdown && markdown.trim().length > 0) {
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Markdown conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes('html')) {
|
||||
const htmlPromise = convertPageToHTML(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
html = await Promise.race([htmlPromise, timeoutPromise]);
|
||||
serializableOutput.html = [{ content: html }];
|
||||
try {
|
||||
const htmlPromise = convertPageToHTML(url, currentPage);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
html = await Promise.race([htmlPromise, timeoutPromise]);
|
||||
if (html && html.trim().length > 0) {
|
||||
serializableOutput.html = [{ content: html }];
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `HTML conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes("screenshot-visible")) {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
try {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
|
||||
if (!binaryOutput['screenshot-visible']) {
|
||||
binaryOutput['screenshot-visible'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
||||
binaryOutput['screenshot-visible'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Screenshot-visible conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (formats.includes("screenshot-fullpage")) {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
try {
|
||||
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
||||
});
|
||||
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);
|
||||
|
||||
if (!binaryOutput['screenshot-fullpage']) {
|
||||
binaryOutput['screenshot-fullpage'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
||||
binaryOutput['screenshot-fullpage'] = {
|
||||
data: screenshotBuffer.toString('base64'),
|
||||
mimeType: 'image/png'
|
||||
};
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.log('warn', `Screenshot-fullpage conversion failed for API run ${plainRun.runId}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -769,7 +800,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
binaryOutput,
|
||||
});
|
||||
|
||||
// Upload binary output (screenshots) to MinIO if present
|
||||
let uploadedBinaryOutput: Record<string, string> = {};
|
||||
if (Object.keys(binaryOutput).length > 0) {
|
||||
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
|
||||
@@ -779,7 +809,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
||||
|
||||
// Push success socket event
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
@@ -800,7 +829,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
);
|
||||
}
|
||||
|
||||
// Build webhook payload
|
||||
const webhookPayload: any = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
@@ -814,8 +842,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
},
|
||||
};
|
||||
|
||||
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||
if (formats.includes('html')) webhookPayload.html = html;
|
||||
if (serializableOutput.markdown) webhookPayload.markdown = markdown;
|
||||
if (serializableOutput.html) webhookPayload.html = html;
|
||||
if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible'];
|
||||
if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage'];
|
||||
|
||||
@@ -834,9 +862,12 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: "scrape",
|
||||
source: "api",
|
||||
status: "success",
|
||||
robot_type: "scrape",
|
||||
createdAt: new Date().toISOString(),
|
||||
formats
|
||||
});
|
||||
|
||||
@@ -858,14 +889,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
// Send failure socket event
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
error: error.message
|
||||
};
|
||||
|
||||
serverIo
|
||||
@@ -895,11 +926,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
logger.log('warn', `Failed to send webhook for failed API scrape run ${plainRun.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
capture("maxun-oss-run-created", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: "scrape",
|
||||
source: "api",
|
||||
status: "failed",
|
||||
robot_type: "scrape",
|
||||
createdAt: new Date().toISOString(),
|
||||
formats
|
||||
});
|
||||
|
||||
@@ -993,15 +1027,18 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
const totalRowsExtracted = totalSchemaItemsExtracted + totalListItemsExtracted;
|
||||
|
||||
capture('maxun-oss-run-created-api',{
|
||||
capture('maxun-oss-run-created',{
|
||||
runId: id,
|
||||
created_at: new Date().toISOString(),
|
||||
userId: userId,
|
||||
robotId: recording.recording_meta.id,
|
||||
robotType: recording.recording_meta.type || 'extract',
|
||||
source: 'api',
|
||||
createdAt: new Date().toISOString(),
|
||||
status: 'success',
|
||||
totalRowsExtracted,
|
||||
schemaItemsExtracted: totalSchemaItemsExtracted,
|
||||
listItemsExtracted: totalListItemsExtracted,
|
||||
totalSchemaItemsExtracted,
|
||||
totalListItemsExtracted,
|
||||
extractedScreenshotsCount,
|
||||
is_llm: (recording.recording_meta as any).isLLM,
|
||||
totalRowsExtracted
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1019,6 +1056,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
typeof parsedOutput.scrapeSchema === "string"
|
||||
? JSON.parse(parsedOutput.scrapeSchema)
|
||||
: parsedOutput.scrapeSchema || {};
|
||||
|
||||
const parsedCrawl =
|
||||
typeof parsedOutput.crawl === "string"
|
||||
? JSON.parse(parsedOutput.crawl)
|
||||
: parsedOutput.crawl || {};
|
||||
|
||||
const parsedSearch =
|
||||
typeof parsedOutput.search === "string"
|
||||
? JSON.parse(parsedOutput.search)
|
||||
: parsedOutput.search || {};
|
||||
|
||||
const webhookPayload = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
@@ -1030,6 +1077,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
extracted_data: {
|
||||
captured_texts: parsedSchema || {},
|
||||
captured_lists: parsedList || {},
|
||||
crawl_data: parsedCrawl || {},
|
||||
search_data: parsedSearch || {},
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
@@ -1097,7 +1146,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': run.robotMetaId }, raw: true });
|
||||
|
||||
// Trigger webhooks for run failure
|
||||
const failedWebhookPayload = {
|
||||
robot_id: run.robotMetaId,
|
||||
run_id: run.runId,
|
||||
@@ -1123,10 +1171,14 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
logger.log('error', `Failed to send failure webhooks for run ${run.runId}: ${webhookError.message}`);
|
||||
}
|
||||
capture(
|
||||
'maxun-oss-run-created-api',
|
||||
'maxun-oss-run-created',
|
||||
{
|
||||
runId: id,
|
||||
created_at: new Date().toISOString(),
|
||||
userId: userId,
|
||||
robotId: recording?.recording_meta?.id || run.robotMetaId,
|
||||
robotType: recording?.recording_meta?.type || 'extract',
|
||||
source: 'api',
|
||||
createdAt: new Date().toISOString(),
|
||||
status: 'failed',
|
||||
is_llm: (recording?.recording_meta as any)?.isLLM,
|
||||
}
|
||||
@@ -1139,11 +1191,11 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
||||
}
|
||||
}
|
||||
|
||||
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
|
||||
export async function handleRunRecording(id: string, userId: string, isSDK: boolean = false) {
|
||||
let socket: Socket | null = null;
|
||||
|
||||
try {
|
||||
const result = await createWorkflowAndStoreMetadata(id, userId);
|
||||
const result = await createWorkflowAndStoreMetadata(id, userId, isSDK);
|
||||
const { browserId, runId: newRunId } = result;
|
||||
|
||||
if (!browserId || !newRunId || !userId) {
|
||||
@@ -1167,6 +1219,10 @@ export async function handleRunRecording(id: string, userId: string, requestedFo
|
||||
cleanupSocketConnection(socket!, browserId, newRunId);
|
||||
});
|
||||
|
||||
socket.on('error', (error: Error) => {
|
||||
logger.error(`Socket error for API run ${newRunId}: ${error.message}`);
|
||||
});
|
||||
|
||||
socket.on('disconnect', () => {
|
||||
cleanupSocketConnection(socket!, browserId, newRunId);
|
||||
});
|
||||
@@ -1318,9 +1374,7 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
|
||||
return res.status(401).json({ ok: false, error: 'Unauthorized' });
|
||||
}
|
||||
|
||||
const requestedFormats = req.body.formats;
|
||||
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id);
|
||||
|
||||
if (!runId) {
|
||||
throw new Error('Run ID is undefined');
|
||||
|
||||
@@ -450,13 +450,35 @@ router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedR
|
||||
}
|
||||
}
|
||||
|
||||
let crawlData: any[] = [];
|
||||
if (run.serializableOutput?.crawl) {
|
||||
const crawl: any = run.serializableOutput.crawl;
|
||||
|
||||
if (Array.isArray(crawl)) {
|
||||
crawlData = crawl;
|
||||
}
|
||||
else if (typeof crawl === 'object') {
|
||||
const crawlValues = Object.values(crawl);
|
||||
if (crawlValues.length > 0 && Array.isArray(crawlValues[0])) {
|
||||
crawlData = crawlValues[0] as any[];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let searchData: any = {};
|
||||
if (run.serializableOutput?.search) {
|
||||
searchData = run.serializableOutput.search;
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
data: {
|
||||
runId: run.runId,
|
||||
status: run.status,
|
||||
data: {
|
||||
textData: run.serializableOutput?.scrapeSchema || {},
|
||||
listData: listData
|
||||
listData: listData,
|
||||
crawlData: crawlData,
|
||||
searchData: searchData
|
||||
},
|
||||
screenshots: Object.values(run.binaryOutput || {})
|
||||
}
|
||||
@@ -640,6 +662,202 @@ router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: Auth
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Create a crawl robot programmatically
 * POST /api/sdk/crawl
 *
 * Request body:
 *   - url: absolute URL string the crawl starts from (required)
 *   - crawlConfig: plain object passed as the single argument of the
 *     workflow's `crawl` action (required)
 *   - name: optional robot name; defaults to `Crawl Robot - <hostname of url>`
 *
 * Responses:
 *   - 201 with the created robot record
 *   - 400 when url/crawlConfig is missing, the URL cannot be parsed, or
 *     crawlConfig is not a plain object
 *   - 401 when the request carries no user
 *   - 500 on any unexpected failure
 */
router.post("/sdk/crawl", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
    try {
        const user = req.user;
        if (!user) {
            return res.status(401).json({
                error: "Unauthorized"
            });
        }

        const { url, name, crawlConfig } = req.body;

        if (!url || !crawlConfig) {
            return res.status(400).json({
                error: "URL and crawl configuration are required"
            });
        }

        // `new URL` stringifies its argument, so a value such as ["http://a"]
        // would parse successfully and then be stored as a non-string below.
        if (typeof url !== 'string') {
            return res.status(400).json({
                error: "Invalid URL format"
            });
        }

        // Parse once; the result is reused for the default robot name.
        let parsedUrl: URL;
        try {
            parsedUrl = new URL(url);
        } catch {
            return res.status(400).json({
                error: "Invalid URL format"
            });
        }

        // typeof [] === 'object', so arrays have to be rejected explicitly.
        if (typeof crawlConfig !== 'object' || Array.isArray(crawlConfig)) {
            return res.status(400).json({
                error: "crawlConfig must be an object"
            });
        }

        const robotName = name || `Crawl Robot - ${parsedUrl.hostname}`;
        const robotId = uuid();   // primary key of the Robot row
        const metaId = uuid();    // recording_meta.id, the id reported in logs/analytics
        const timestamp = new Date().toISOString(); // single value so createdAt === updatedAt

        const robot = await Robot.create({
            id: robotId,
            userId: user.id,
            recording_meta: {
                name: robotName,
                id: metaId,
                createdAt: timestamp,
                updatedAt: timestamp,
                pairs: 1,
                params: [],
                type: 'crawl',
                url: url,
            },
            recording: {
                workflow: [
                    {
                        where: { url },
                        what: [
                            { action: 'flag', args: ['generated'] },
                            {
                                action: 'crawl',
                                args: [crawlConfig],
                                name: 'Crawl'
                            }
                        ]
                    },
                    {
                        where: { url: 'about:blank' },
                        what: [
                            {
                                action: 'goto',
                                args: [url]
                            },
                            {
                                action: 'waitForLoadState',
                                args: ['networkidle']
                            }
                        ]
                    }
                ]
            }
        });

        logger.info(`[SDK] Crawl robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);

        capture("maxun-oss-robot-created", {
            userId: user.id.toString(),
            robotId: metaId,
            robotName: robotName,
            url: url,
            robotType: 'crawl',
            crawlConfig: crawlConfig,
            source: 'sdk'
        });

        return res.status(201).json({
            data: robot,
            message: "Crawl robot created successfully"
        });
    } catch (error: any) {
        logger.error("[SDK] Error creating crawl robot:", error);
        return res.status(500).json({
            error: "Failed to create crawl robot",
            message: error.message
        });
    }
});
|
||||
|
||||
/**
 * Create a search robot programmatically
 * POST /api/sdk/search
 *
 * Request body:
 *   - searchConfig: plain object with a truthy `query` (required); an optional
 *     `mode` must be 'discover' or 'scrape'. `provider` is always overridden
 *     with 'duckduckgo' before the config is stored.
 *   - name: optional robot name; defaults to `Search Robot - <query>`
 *
 * Responses:
 *   - 201 with the created robot record
 *   - 400 when searchConfig is missing, is not a plain object, has no query,
 *     or has an unsupported mode
 *   - 401 when the request carries no user
 *   - 500 on any unexpected failure
 */
router.post("/sdk/search", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
    try {
        const user = req.user;
        if (!user) {
            return res.status(401).json({
                error: "Unauthorized"
            });
        }

        const { name, searchConfig } = req.body;

        if (!searchConfig) {
            return res.status(400).json({
                error: "Search configuration is required"
            });
        }

        // The shape check must come before any property access: a string or
        // number config has no `.query`, so checking `.query` first reported
        // the wrong error and left this branch unreachable. Arrays are rejected
        // explicitly because typeof [] === 'object'.
        if (typeof searchConfig !== 'object' || Array.isArray(searchConfig)) {
            return res.status(400).json({
                error: "searchConfig must be an object"
            });
        }

        if (!searchConfig.query) {
            return res.status(400).json({
                error: "searchConfig must include a query"
            });
        }

        if (searchConfig.mode && !['discover', 'scrape'].includes(searchConfig.mode)) {
            return res.status(400).json({
                error: "searchConfig.mode must be either 'discover' or 'scrape'"
            });
        }

        // Force the provider on a copy instead of mutating the request body.
        const effectiveConfig = { ...searchConfig, provider: 'duckduckgo' };

        const robotName = name || `Search Robot - ${searchConfig.query}`;
        const robotId = uuid();   // primary key of the Robot row
        const metaId = uuid();    // recording_meta.id, the id reported in logs/analytics
        const timestamp = new Date().toISOString(); // single value so createdAt === updatedAt

        const robot = await Robot.create({
            id: robotId,
            userId: user.id,
            recording_meta: {
                name: robotName,
                id: metaId,
                createdAt: timestamp,
                updatedAt: timestamp,
                pairs: 1,
                params: [],
                type: 'search',
            },
            recording: {
                workflow: [
                    {
                        where: { url: 'about:blank' },
                        what: [
                            {
                                action: 'search',
                                args: [effectiveConfig],
                                name: 'Search'
                            }
                        ]
                    }
                ]
            }
        });

        logger.info(`[SDK] Search robot created: ${metaId} (db: ${robotId}) by user ${user.id}`);

        capture("maxun-oss-robot-created", {
            userId: user.id.toString(),
            robotId: metaId,
            robotName: robotName,
            robotType: 'search',
            searchQuery: searchConfig.query,
            searchProvider: effectiveConfig.provider,
            searchLimit: searchConfig.limit || 10,
            source: 'sdk'
        });

        return res.status(201).json({
            data: robot,
            message: "Search robot created successfully"
        });
    } catch (error: any) {
        logger.error("[SDK] Error creating search robot:", error);
        return res.status(500).json({
            error: "Failed to create search robot",
            message: error.message
        });
    }
});
|
||||
|
||||
/**
|
||||
* LLM-based extraction - generate workflow from natural language prompt
|
||||
* POST /api/sdk/extract/llm
|
||||
|
||||
@@ -9,7 +9,7 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
|
||||
isLLM?: boolean;
|
||||
|
||||
@@ -23,6 +23,7 @@ interface RunAttributes {
|
||||
runByUserId?: string;
|
||||
runByScheduleId?: string;
|
||||
runByAPI?: boolean;
|
||||
runBySDK?: boolean;
|
||||
serializableOutput: Record<string, any>;
|
||||
binaryOutput: Record<string, string>;
|
||||
retryCount?: number;
|
||||
|
||||
@@ -132,7 +132,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
logger.log('info', `Processing run execution job for runId: ${data.runId}, browserId: ${data.browserId}`);
|
||||
|
||||
try {
|
||||
// Find the run
|
||||
const run = await Run.findOne({ where: { runId: data.runId } });
|
||||
if (!run) {
|
||||
logger.log('error', `Run ${data.runId} not found in database`);
|
||||
@@ -193,7 +192,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
logger.log('info', `Browser ${browserId} found and ready for execution`);
|
||||
|
||||
try {
|
||||
// Find the recording
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
||||
|
||||
if (!recording) {
|
||||
@@ -473,11 +471,12 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
interpretationInfo.binaryOutput
|
||||
);
|
||||
|
||||
// Get the already persisted and credit-validated data from the run record
|
||||
const finalRun = await Run.findByPk(run.id);
|
||||
const categorizedOutput = {
|
||||
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {}
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {},
|
||||
crawl: finalRun?.serializableOutput?.crawl || {},
|
||||
search: finalRun?.serializableOutput?.search || {}
|
||||
};
|
||||
|
||||
if (await isRunAborted()) {
|
||||
@@ -489,10 +488,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: interpretationInfo.log.join('\n'),
|
||||
serializableOutput: JSON.parse(JSON.stringify({
|
||||
scrapeSchema: categorizedOutput.scrapeSchema || {},
|
||||
scrapeList: categorizedOutput.scrapeList || {},
|
||||
})),
|
||||
binaryOutput: uploadedBinaryOutput,
|
||||
});
|
||||
|
||||
@@ -572,6 +567,8 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
}, {} as Record<string, any[]>)
|
||||
: {},
|
||||
captured_lists: categorizedOutput.scrapeList,
|
||||
crawl_data: categorizedOutput.crawl,
|
||||
search_data: categorizedOutput.search,
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
|
||||
@@ -251,21 +251,18 @@ function handleWorkflowActions(workflow: any[], credentials: Credentials) {
|
||||
router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const { id } = req.params;
|
||||
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
|
||||
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
|
||||
|
||||
// Validate input
|
||||
if (!name && !limits && !credentials && !targetUrl) {
|
||||
if (!name && !limits && !credentials && !targetUrl && !incomingWorkflow) {
|
||||
return res.status(400).json({ error: 'Either "name", "limits", "credentials" or "target_url" must be provided.' });
|
||||
}
|
||||
|
||||
// Fetch the robot by ID
|
||||
const robot = await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
|
||||
if (!robot) {
|
||||
return res.status(404).json({ error: 'Robot not found.' });
|
||||
}
|
||||
|
||||
// Update fields if provided
|
||||
|
||||
if (name) {
|
||||
robot.set('recording_meta', { ...robot.recording_meta, name });
|
||||
}
|
||||
@@ -274,7 +271,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
|
||||
|
||||
const updatedWorkflow = [...robot.recording.workflow];
|
||||
let foundGoto = false;
|
||||
|
||||
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
|
||||
const step = updatedWorkflow[i];
|
||||
@@ -289,7 +285,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
|
||||
robot.changed('recording', true);
|
||||
foundGoto = true;
|
||||
i = -1;
|
||||
break;
|
||||
}
|
||||
@@ -299,10 +294,9 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
await robot.save();
|
||||
|
||||
// Start with existing workflow or allow client to supply a full workflow replacement
|
||||
let workflow = incomingWorkflow && Array.isArray(incomingWorkflow)
|
||||
? JSON.parse(JSON.stringify(incomingWorkflow))
|
||||
: [...robot.recording.workflow]; // Create a copy of the workflow
|
||||
: [...robot.recording.workflow];
|
||||
|
||||
if (credentials) {
|
||||
workflow = handleWorkflowActions(workflow, credentials);
|
||||
@@ -344,7 +338,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
where: { 'recording_meta.id': id }
|
||||
});
|
||||
|
||||
const updatedRobot = await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
await Robot.findOne({ where: { 'recording_meta.id': id } });
|
||||
|
||||
logger.log('info', `Robot with ID ${id} was updated successfully.`);
|
||||
|
||||
@@ -1322,4 +1316,198 @@ export async function recoverOrphanedRuns() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * POST endpoint for creating a crawl robot
 * @route POST /recordings/crawl
 * @auth requireSignIn - JWT authentication required
 *
 * Request body:
 *   - url: absolute URL string the crawl starts from (required)
 *   - crawlConfig: plain object passed as the single argument of the
 *     workflow's `crawl` action (required)
 *   - name: optional robot name; defaults to `Crawl Robot - <hostname of url>`
 *
 * Responds 201 with the created robot, 400 on invalid input, 401 when the
 * request carries no user, and 500 on any unexpected failure.
 */
router.post('/recordings/crawl', requireSignIn, async (req: AuthenticatedRequest, res) => {
    try {
        // Reject unauthenticated requests before looking at the payload.
        if (!req.user) {
            return res.status(401).send({ error: 'Unauthorized' });
        }

        const { url, name, crawlConfig } = req.body;

        if (!url || !crawlConfig) {
            return res.status(400).json({ error: 'URL and crawl configuration are required.' });
        }

        // `new URL` stringifies its argument, so a value such as ["http://a"]
        // would parse successfully and then be stored as a non-string below.
        if (typeof url !== 'string') {
            return res.status(400).json({ error: 'Invalid URL format' });
        }

        // Parse once; the result is reused for the default robot name.
        let parsedUrl: URL;
        try {
            parsedUrl = new URL(url);
        } catch {
            return res.status(400).json({ error: 'Invalid URL format' });
        }

        // Same shape rule as the SDK crawl endpoint; typeof [] === 'object',
        // so arrays are rejected explicitly.
        if (typeof crawlConfig !== 'object' || Array.isArray(crawlConfig)) {
            return res.status(400).json({ error: 'crawlConfig must be an object.' });
        }

        const robotName = name || `Crawl Robot - ${parsedUrl.hostname}`;
        const currentTimestamp = new Date().toLocaleString('en-US');
        // recording_meta.id; the Robot row gets its own separate uuid below.
        const robotId = uuid();

        const newRobot = await Robot.create({
            id: uuid(),
            userId: req.user.id,
            recording_meta: {
                name: robotName,
                id: robotId,
                createdAt: currentTimestamp,
                updatedAt: currentTimestamp,
                pairs: 1,
                params: [],
                type: 'crawl',
                url: url,
            },
            recording: {
                workflow: [
                    {
                        where: { url },
                        what: [
                            { action: 'flag', args: ['generated'] },
                            {
                                action: 'crawl',
                                args: [crawlConfig],
                                name: 'Crawl'
                            }
                        ]
                    },
                    {
                        where: { url: 'about:blank' },
                        what: [
                            {
                                action: 'goto',
                                args: [url]
                            },
                            {
                                action: 'waitForLoadState',
                                args: ['networkidle']
                            }
                        ]
                    }
                ]
            },
            google_sheet_email: null,
            google_sheet_name: null,
            google_sheet_id: null,
            google_access_token: null,
            google_refresh_token: null,
            airtable_base_id: null,
            airtable_base_name: null,
            airtable_table_name: null,
            airtable_table_id: null,
            airtable_access_token: null,
            airtable_refresh_token: null,
            schedule: null,
            webhooks: null
        });

        logger.log('info', `Crawl robot created with id: ${newRobot.id}`);
        capture('maxun-oss-robot-created', {
            userId: req.user.id.toString(),
            robotId: robotId,
            robotName: robotName,
            url: url,
            robotType: 'crawl',
            crawlConfig: crawlConfig
        });

        return res.status(201).json({
            message: 'Crawl robot created successfully.',
            robot: newRobot,
        });
    } catch (error) {
        if (error instanceof Error) {
            logger.log('error', `Error creating crawl robot: ${error.message}`);
            return res.status(500).json({ error: error.message });
        } else {
            logger.log('error', 'Unknown error creating crawl robot');
            return res.status(500).json({ error: 'An unknown error occurred.' });
        }
    }
});
|
||||
|
||||
/**
 * POST endpoint for creating a search robot.
 *
 * Reads `{ searchConfig, name }` from the request body and stores a new robot
 * whose workflow consists of a single `search` action carrying `searchConfig`.
 *
 * Responses:
 * - 201 with `{ message, robot }` once the robot has been created.
 * - 400 when `searchConfig.query` is missing, not a string, or blank.
 * - 401 when the request carries no authenticated user.
 * - 500 when robot creation throws.
 *
 * @route POST /recordings/search
 * @auth requireSignIn - JWT authentication required
 */
router.post('/recordings/search', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    // Fall back to an empty object so a missing/unparsed body yields a 400
    // below instead of a destructuring TypeError (which would surface as 500).
    const { searchConfig, name } = req.body ?? {};
    const query = searchConfig?.query;

    // The query is sliced with `substring` further down, so it must be a real,
    // non-blank string; anything else is a client error, not a server error.
    if (typeof query !== 'string' || query.trim() === '') {
      return res.status(400).json({ error: 'Search configuration with query is required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // Default name is derived from the first 50 characters of the query.
    const robotName = name || `Search Robot - ${query.substring(0, 50)}`;
    const currentTimestamp = new Date().toLocaleString('en-US');
    const robotId = uuid();

    // NOTE(review): the row `id` and `recording_meta.id` come from two
    // independent uuid() calls; the log line below reports the former while
    // analytics reports the latter — confirm this is intended.
    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 1,
        params: [],
        type: 'search',
      },
      recording: {
        workflow: [
          {
            where: { url: 'about:blank' },
            what: [{
              action: 'search',
              args: [searchConfig],
              name: 'Search'
            }]
          }
        ]
      },
      // Integrations, schedule and webhooks all start out unset.
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      airtable_base_id: null,
      airtable_base_name: null,
      airtable_table_name: null,
      airtable_table_id: null,
      airtable_access_token: null,
      airtable_refresh_token: null,
      schedule: null,
      webhooks: null
    });

    logger.log('info', `Search robot created with id: ${newRobot.id}`);
    capture('maxun-oss-robot-created', {
      userId: req.user.id.toString(),
      robotId: robotId,
      robotName: robotName,
      robotType: 'search',
      searchQuery: query,
      searchProvider: searchConfig.provider || 'duckduckgo',
      searchLimit: searchConfig.limit || 10
    });

    return res.status(201).json({
      message: 'Search robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating search robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating search robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
|
||||
|
||||
export { processQueuedRuns };
|
||||
@@ -16,7 +16,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
|
||||
|
||||
processedWorkflow.workflow.forEach((pair) => {
|
||||
pair.what.forEach((action) => {
|
||||
// Handle limit validation for scrapeList action
|
||||
if (action.action === 'scrapeList' && checkLimit && Array.isArray(action.args) && action.args.length > 0) {
|
||||
const scrapeConfig = action.args[0];
|
||||
if (scrapeConfig && typeof scrapeConfig === 'object' && 'limit' in scrapeConfig) {
|
||||
@@ -26,7 +25,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
|
||||
}
|
||||
}
|
||||
|
||||
// Handle decryption for type and press actions
|
||||
if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) {
|
||||
try {
|
||||
const encryptedValue = action.args[1];
|
||||
@@ -93,10 +91,14 @@ export class WorkflowInterpreter {
|
||||
public serializableDataByType: {
|
||||
scrapeSchema: Record<string, any>;
|
||||
scrapeList: Record<string, any>;
|
||||
crawl: Record<string, any>;
|
||||
search: Record<string, any>;
|
||||
[key: string]: any;
|
||||
} = {
|
||||
scrapeSchema: {},
|
||||
scrapeList: {},
|
||||
crawl: {},
|
||||
search: {},
|
||||
};
|
||||
|
||||
private currentActionName: string | null = null;
|
||||
@@ -282,7 +284,6 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
} else if (this.currentActionType === 'scrapeList') {
|
||||
if (data && Array.isArray(data) && data.length > 0) {
|
||||
// Use the current index for persistence
|
||||
await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex);
|
||||
}
|
||||
|
||||
@@ -293,7 +294,6 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
},
|
||||
binaryCallback: async (data: string, mimetype: string) => {
|
||||
// For editor mode, we don't have the name yet, so use a timestamp-based name
|
||||
const binaryItem = {
|
||||
name: `Screenshot ${Date.now()}`,
|
||||
mimeType: mimetype,
|
||||
@@ -301,7 +301,6 @@ export class WorkflowInterpreter {
|
||||
};
|
||||
this.binaryData.push(binaryItem);
|
||||
|
||||
// Persist binary data to database
|
||||
await this.persistBinaryDataToDatabase(binaryItem);
|
||||
|
||||
this.socket.emit('binaryCallback', {
|
||||
@@ -340,7 +339,6 @@ export class WorkflowInterpreter {
|
||||
|
||||
logger.log('debug', `Interpretation finished`);
|
||||
|
||||
// Flush any remaining data in persistence buffer before completing
|
||||
await this.flushPersistenceBuffer();
|
||||
|
||||
this.interpreter = null;
|
||||
@@ -419,6 +417,8 @@ export class WorkflowInterpreter {
|
||||
this.serializableDataByType = {
|
||||
scrapeSchema: {},
|
||||
scrapeList: {},
|
||||
crawl: {},
|
||||
search: {},
|
||||
};
|
||||
this.binaryData = [];
|
||||
this.currentScrapeListIndex = 0;
|
||||
@@ -591,12 +591,20 @@ export class WorkflowInterpreter {
|
||||
typeKey = "scrapeList";
|
||||
} else if (this.currentActionType === "scrapeSchema") {
|
||||
typeKey = "scrapeSchema";
|
||||
} else if (this.currentActionType === "crawl") {
|
||||
typeKey = "crawl";
|
||||
} else if (this.currentActionType === "search") {
|
||||
typeKey = "search";
|
||||
}
|
||||
|
||||
if (typeKey === "scrapeList" && data.scrapeList) {
|
||||
data = data.scrapeList;
|
||||
} else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
|
||||
data = data.scrapeSchema;
|
||||
} else if (typeKey === "crawl" && data.crawl) {
|
||||
data = data.crawl;
|
||||
} else if (typeKey === "search" && data.search) {
|
||||
data = data.search;
|
||||
}
|
||||
|
||||
let actionName = "";
|
||||
@@ -609,38 +617,65 @@ export class WorkflowInterpreter {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
} else if (typeKey === "crawl" && data && typeof data === "object" && !Array.isArray(data)) {
|
||||
const keys = Object.keys(data);
|
||||
if (keys.length === 1) {
|
||||
actionName = keys[0];
|
||||
data = data[actionName];
|
||||
} else if (keys.length > 1) {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
} else if (typeKey === "search" && data && typeof data === "object" && !Array.isArray(data)) {
|
||||
const keys = Object.keys(data);
|
||||
if (keys.length === 1) {
|
||||
actionName = keys[0];
|
||||
data = data[actionName];
|
||||
} else if (keys.length > 1) {
|
||||
actionName = keys[keys.length - 1];
|
||||
data = data[actionName];
|
||||
}
|
||||
}
|
||||
|
||||
if (!actionName) {
|
||||
actionName = this.currentActionName || "";
|
||||
if (typeKey === "scrapeList" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "");
|
||||
} else if (typeKey === "crawl" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "Crawl Results");
|
||||
} else if (typeKey === "search" && !actionName) {
|
||||
actionName = this.getUniqueActionName(typeKey, "Search Results");
|
||||
}
|
||||
}
|
||||
|
||||
const flattened = Array.isArray(data)
|
||||
? data
|
||||
: (
|
||||
data?.List ??
|
||||
(data && typeof data === "object"
|
||||
? Object.values(data).flat?.() ?? data
|
||||
: [])
|
||||
);
|
||||
let processedData;
|
||||
if (typeKey === "search") {
|
||||
processedData = data;
|
||||
} else {
|
||||
processedData = Array.isArray(data)
|
||||
? data
|
||||
: (
|
||||
data?.List ??
|
||||
(data && typeof data === "object"
|
||||
? Object.values(data).flat?.() ?? data
|
||||
: [])
|
||||
);
|
||||
}
|
||||
|
||||
if (!this.serializableDataByType[typeKey]) {
|
||||
this.serializableDataByType[typeKey] = {};
|
||||
}
|
||||
|
||||
this.serializableDataByType[typeKey][actionName] = flattened;
|
||||
this.serializableDataByType[typeKey][actionName] = processedData;
|
||||
|
||||
await this.persistDataToDatabase(typeKey, {
|
||||
[actionName]: flattened,
|
||||
[actionName]: processedData,
|
||||
});
|
||||
|
||||
this.socket.emit("serializableCallback", {
|
||||
type: typeKey,
|
||||
name: actionName,
|
||||
data: flattened,
|
||||
data: processedData,
|
||||
});
|
||||
} catch (err: any) {
|
||||
logger.log('error', `serializableCallback handler failed: ${err.message}`);
|
||||
@@ -698,7 +733,6 @@ export class WorkflowInterpreter {
|
||||
|
||||
await this.flushPersistenceBuffer();
|
||||
|
||||
// Structure the output to maintain separate data for each action type
|
||||
const result = {
|
||||
log: this.debugMessages,
|
||||
result: status,
|
||||
@@ -794,7 +828,7 @@ export class WorkflowInterpreter {
|
||||
|
||||
const currentSerializableOutput = run.serializableOutput ?
|
||||
JSON.parse(JSON.stringify(run.serializableOutput)) :
|
||||
{ scrapeSchema: [], scrapeList: [] };
|
||||
{ scrapeSchema: {}, scrapeList: {}, crawl: {}, search: {} };
|
||||
|
||||
if (Array.isArray(currentSerializableOutput.scrapeList)) {
|
||||
currentSerializableOutput.scrapeList = {};
|
||||
@@ -802,6 +836,9 @@ export class WorkflowInterpreter {
|
||||
if (Array.isArray(currentSerializableOutput.scrapeSchema)) {
|
||||
currentSerializableOutput.scrapeSchema = {};
|
||||
}
|
||||
if (!currentSerializableOutput.search) {
|
||||
currentSerializableOutput.search = {};
|
||||
}
|
||||
|
||||
let hasUpdates = false;
|
||||
|
||||
@@ -827,6 +864,18 @@ export class WorkflowInterpreter {
|
||||
}
|
||||
mergeLists(currentSerializableOutput.scrapeList, item.data);
|
||||
hasUpdates = true;
|
||||
} else if (item.actionType === 'crawl') {
|
||||
currentSerializableOutput.crawl = {
|
||||
...(currentSerializableOutput.crawl || {}),
|
||||
...item.data
|
||||
};
|
||||
hasUpdates = true;
|
||||
} else if (item.actionType === 'search') {
|
||||
currentSerializableOutput.search = {
|
||||
...(currentSerializableOutput.search || {}),
|
||||
...item.data
|
||||
};
|
||||
hasUpdates = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,11 @@ interface AirtableUpdateTask {
|
||||
|
||||
/**
 * Shape of a run's serializable output as consumed by the Airtable integration.
 * Every section is optional.
 */
interface SerializableOutput {
  /** Captured schema (text) data, keyed by name. */
  scrapeSchema?: Record<string, any[]>;
  /** Captured list data, keyed by name. */
  scrapeList?: Record<string, any[]>;
  /** Markdown entries; each entry's `content` becomes a "Markdown" row. */
  markdown?: Array<{ content: string }>;
  /** HTML entries; each entry's `content` becomes an "HTML" row. */
  html?: Array<{ content: string }>;
  /** Crawl results: arrays of items keyed by crawl name. */
  crawl?: Record<string, any[]>;
  /**
   * Search output. Consumers accept an object with a `results` array,
   * a plain array, or a single result object.
   */
  search?: any;
}
|
||||
|
||||
const MAX_RETRIES = 3;
|
||||
@@ -67,6 +71,10 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
const schemaData: Array<{ Group: string; Field: string; Value: any }> = [];
|
||||
const listData: any[] = [];
|
||||
const screenshotData: Array<{ key: string; url: string }> = [];
|
||||
const markdownData: any[] = [];
|
||||
const htmlData: any[] = [];
|
||||
const crawlData: any[] = [];
|
||||
const searchData: any[] = [];
|
||||
|
||||
if (serializableOutput.scrapeSchema) {
|
||||
if (Array.isArray(serializableOutput.scrapeSchema)) {
|
||||
@@ -122,6 +130,66 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown)) {
|
||||
serializableOutput.markdown.forEach((item, index) => {
|
||||
if (item.content) {
|
||||
markdownData.push({
|
||||
"Index": index + 1,
|
||||
"Type": "Markdown",
|
||||
"Content": item.content
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (serializableOutput.html && Array.isArray(serializableOutput.html)) {
|
||||
serializableOutput.html.forEach((item, index) => {
|
||||
if (item.content) {
|
||||
htmlData.push({
|
||||
"Index": index + 1,
|
||||
"Type": "HTML",
|
||||
"Content": item.content
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
|
||||
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
|
||||
if (Array.isArray(crawlArray)) {
|
||||
crawlArray.forEach((crawlItem) => {
|
||||
const hasContent = Object.values(crawlItem || {}).some(
|
||||
(value) => value !== null && value !== undefined && value !== ""
|
||||
);
|
||||
if (hasContent) {
|
||||
crawlData.push({ "Crawl Type": crawlName, ...crawlItem });
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.search) {
|
||||
let results: any[] = [];
|
||||
|
||||
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
|
||||
results = serializableOutput.search.results;
|
||||
} else if (Array.isArray(serializableOutput.search)) {
|
||||
results = serializableOutput.search;
|
||||
} else {
|
||||
results = [serializableOutput.search];
|
||||
}
|
||||
|
||||
results.forEach((result) => {
|
||||
const hasContent = Object.values(result || {}).some(
|
||||
(value) => value !== null && value !== undefined && value !== ""
|
||||
);
|
||||
if (hasContent) {
|
||||
searchData.push(result);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Collect screenshot data (handles both string and object forms safely)
|
||||
// if (binaryOutput && Object.keys(binaryOutput).length > 0) {
|
||||
// Object.entries(binaryOutput).forEach(([key, rawValue]: [string, any]) => {
|
||||
@@ -152,7 +220,15 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
// }
|
||||
|
||||
// --- Merge all types into Airtable rows ---
|
||||
const maxLength = Math.max(schemaData.length, listData.length, screenshotData.length);
|
||||
const maxLength = Math.max(
|
||||
schemaData.length,
|
||||
listData.length,
|
||||
screenshotData.length,
|
||||
markdownData.length,
|
||||
htmlData.length,
|
||||
crawlData.length,
|
||||
searchData.length
|
||||
);
|
||||
|
||||
for (let i = 0; i < maxLength; i++) {
|
||||
const record: Record<string, any> = {};
|
||||
@@ -176,6 +252,38 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
record.Screenshot = screenshotData[i].url;
|
||||
}
|
||||
|
||||
if (i < markdownData.length) {
|
||||
Object.entries(markdownData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < htmlData.length) {
|
||||
Object.entries(htmlData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < crawlData.length) {
|
||||
Object.entries(crawlData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (i < searchData.length) {
|
||||
Object.entries(searchData[i] || {}).forEach(([key, value]) => {
|
||||
if (value !== null && value !== undefined && value !== "") {
|
||||
record[key] = value;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
allRecords.push(record);
|
||||
}
|
||||
@@ -194,6 +302,18 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
|
||||
Screenshot: screenshotData[i].url,
|
||||
});
|
||||
}
|
||||
for (let i = maxLength; i < markdownData.length; i++) {
|
||||
allRecords.push(markdownData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < htmlData.length; i++) {
|
||||
allRecords.push(htmlData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < crawlData.length; i++) {
|
||||
allRecords.push(crawlData[i]);
|
||||
}
|
||||
for (let i = maxLength; i < searchData.length; i++) {
|
||||
allRecords.push(searchData[i]);
|
||||
}
|
||||
|
||||
return allRecords;
|
||||
}
|
||||
|
||||
@@ -13,6 +13,10 @@ interface GoogleSheetUpdateTask {
|
||||
/**
 * Serializable output of a run as read by the Google Sheets integration.
 * All sections are optional.
 */
interface SerializableOutput {
  /** Schema (text) captures, grouped under a name. */
  scrapeSchema?: { [name: string]: any[] };
  /** List captures, grouped under a name. */
  scrapeList?: { [name: string]: any[] };
  /** Markdown entries, each exposing its text as `content`. */
  markdown?: { content: string }[];
  /** HTML entries, each exposing its text as `content`. */
  html?: { content: string }[];
  /** Crawl result arrays, grouped under the crawl's name. */
  crawl?: { [name: string]: any[] };
  /** Search output; an object with `results`, an array, or a single result. */
  search?: any;
}
|
||||
|
||||
|
||||
@@ -95,6 +99,72 @@ export async function updateGoogleSheet(robotId: string, runId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown) && serializableOutput.markdown.length > 0) {
|
||||
const markdownData = serializableOutput.markdown.map((item, index) => ({
|
||||
"Index": index + 1,
|
||||
"Content": item.content || ""
|
||||
}));
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'Markdown',
|
||||
markdownData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
|
||||
if (serializableOutput.html && Array.isArray(serializableOutput.html) && serializableOutput.html.length > 0) {
|
||||
const htmlData = serializableOutput.html.map((item, index) => ({
|
||||
"Index": index + 1,
|
||||
"Content": item.content || ""
|
||||
}));
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'HTML',
|
||||
htmlData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
|
||||
if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
|
||||
for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
|
||||
if (!Array.isArray(crawlArray) || crawlArray.length === 0) continue;
|
||||
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
`Crawl - ${crawlName}`,
|
||||
crawlArray,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (serializableOutput.search) {
|
||||
let searchData: any[] = [];
|
||||
|
||||
if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
|
||||
searchData = serializableOutput.search.results;
|
||||
} else if (Array.isArray(serializableOutput.search)) {
|
||||
searchData = serializableOutput.search;
|
||||
} else {
|
||||
searchData = [serializableOutput.search];
|
||||
}
|
||||
|
||||
if (searchData.length > 0) {
|
||||
await processOutputType(
|
||||
robotId,
|
||||
spreadsheetId,
|
||||
'Search Results',
|
||||
searchData,
|
||||
plainRobot
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (plainRun.binaryOutput && Object.keys(plainRun.binaryOutput).length > 0) {
|
||||
|
||||
@@ -484,6 +484,8 @@ async function executeRun(id: string, userId: string) {
|
||||
const categorizedOutput = {
|
||||
scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
|
||||
scrapeList: finalRun?.serializableOutput?.scrapeList || {},
|
||||
crawl: finalRun?.serializableOutput?.crawl || {},
|
||||
search: finalRun?.serializableOutput?.search || {}
|
||||
};
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
@@ -570,6 +572,8 @@ async function executeRun(id: string, userId: string) {
|
||||
}, {} as Record<string, any[]>)
|
||||
: {},
|
||||
captured_lists: categorizedOutput.scrapeList,
|
||||
crawl_data: categorizedOutput.crawl,
|
||||
search_data: categorizedOutput.search,
|
||||
captured_texts_count: totalSchemaItemsExtracted,
|
||||
captured_lists_count: totalListItemsExtracted,
|
||||
screenshots_count: extractedScreenshotsCount
|
||||
|
||||
Reference in New Issue
Block a user