Merge branch 'markdownify' of https://github.com/getmaxun/maxun into markdownify

This commit is contained in:
amhsirak
2025-11-20 15:35:45 +05:30
11 changed files with 630 additions and 31 deletions

View File

@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin());
@@ -344,7 +345,8 @@ function formatRunResponse(run: any) {
runByAPI: run.runByAPI,
data: {
textData: {},
listData: {}
listData: {},
markdown: ''
},
screenshots: [] as any[],
};
@@ -359,6 +361,10 @@ function formatRunResponse(run: any) {
formattedRun.data.listData = output.scrapeList;
}
if (output.markdown && Array.isArray(output.markdown)) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}
if (run.binaryOutput) {
Object.keys(run.binaryOutput).forEach(key => {
if (run.binaryOutput[key]) {
@@ -651,6 +657,105 @@ async function executeRun(id: string, userId: string) {
};
}
// Markdown-type robots are handled entirely inside this branch: the URL stored in
// recording_meta is converted to markdown, the run row is finalized, listeners are
// notified, and the branch either returns or rethrows.
if (recording.recording_meta.type === 'markdown') {
  logger.log('info', `Executing markdown robot for API run ${id}`);

  await run.update({
    status: 'running',
    log: 'Converting page to markdown'
  });

  try {
    const url = recording.recording_meta.url;

    if (!url) {
      throw new Error('No URL specified for markdown robot');
    }

    const markdown = await convertPageToMarkdown(url);

    // Captured once so the run record, the socket event and the webhook all
    // report the same finish time for this run.
    const finishedAt = new Date().toLocaleString();

    await run.update({
      status: 'success',
      finishedAt,
      log: 'Markdown conversion completed successfully',
      serializableOutput: {
        markdown: [{ content: markdown }]
      },
      binaryOutput: {},
    });

    logger.log('info', `Markdown robot execution completed for API run ${id}`);

    // Socket notification is best-effort: a failure is logged, not propagated.
    try {
      const completionData = {
        runId: plainRun.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'success',
        finishedAt
      };

      serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
    }

    const webhookPayload = {
      robot_id: plainRun.robotMetaId,
      run_id: plainRun.runId,
      robot_name: recording.recording_meta.name,
      status: 'success',
      started_at: plainRun.startedAt,
      finished_at: finishedAt,
      markdown: markdown,
      metadata: {
        browser_id: plainRun.browserId,
        user_id: userId,
      }
    };

    // Webhook delivery is best-effort as well.
    try {
      await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
      logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
    } catch (webhookError: any) {
      logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
    }

    await destroyRemoteBrowser(plainRun.browserId, userId);

    return {
      success: true,
      interpretationInfo: run.toJSON()
    };
  } catch (error: any) {
    logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);

    // Single failure timestamp shared by the run record and the socket event.
    const failedAt = new Date().toLocaleString();

    await run.update({
      status: 'failed',
      finishedAt: failedAt,
      log: `Markdown conversion failed: ${error.message}`,
    });

    try {
      const failureData = {
        runId: plainRun.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'failed',
        finishedAt: failedAt
      };

      serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
    }

    await destroyRemoteBrowser(plainRun.browserId, userId);

    // Rethrown so the caller still observes the failure.
    throw error;
  }
}
plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -889,12 +994,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
if (!run) throw new Error('Run not found');
if (run.status === 'success') {
return run.toJSON();
return run;
} else if (run.status === 'failed') {
throw new Error('Run failed');
}
// Wait for the next polling interval
await new Promise(resolve => setTimeout(resolve, interval));
}
}

View File

@@ -9,6 +9,8 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
url?: string;
}
interface RobotWorkflow {

View File

@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
import { io as serverIo } from "./server";
import { sendWebhook } from './routes/webhook';
import { BinaryOutputService } from './storage/mino';
import { convertPageToMarkdown } from './markdownify/scrape';
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -183,11 +184,103 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
try {
// Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
if (!recording) {
throw new Error(`Recording for run ${data.runId} not found`);
}
// Markdown-type robots are handled entirely inside this branch: the URL stored in
// recording_meta is converted to markdown, the run row is finalized, listeners are
// notified, and the branch either returns or rethrows.
if (recording.recording_meta.type === 'markdown') {
  logger.log('info', `Executing markdown robot for run ${data.runId}`);

  await run.update({
    status: 'running',
    log: 'Converting page to markdown'
  });

  try {
    const url = recording.recording_meta.url;

    if (!url) {
      throw new Error('No URL specified for markdown robot');
    }

    const markdown = await convertPageToMarkdown(url);

    // Captured once so the run record, the socket events and the webhook all
    // report the same finish time for this run.
    const finishedAt = new Date().toLocaleString();

    await run.update({
      status: 'success',
      finishedAt,
      log: 'Markdown conversion completed successfully',
      serializableOutput: {
        markdown: [{ content: markdown }]
      },
      binaryOutput: {},
    });

    logger.log('info', `Markdown robot execution completed for run ${data.runId}`);

    // Socket notifications are best-effort: a failure is logged, not propagated.
    try {
      const completionData = {
        runId: data.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'success',
        finishedAt
      };

      serverIo.of(browserId).emit('run-completed', completionData);
      serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
    }

    // Webhook delivery is best-effort as well.
    try {
      // snake_case keys plus started_at/metadata, matching the markdown
      // 'run_completed' payload sent by the API and scheduled run paths, so
      // webhook consumers receive one shape regardless of how the run started.
      const webhookPayload = {
        robot_id: plainRun.robotMetaId,
        run_id: data.runId,
        robot_name: recording.recording_meta.name,
        status: 'success',
        started_at: plainRun.startedAt,
        finished_at: finishedAt,
        markdown: markdown,
        metadata: {
          browser_id: browserId,
          user_id: data.userId,
        }
      };

      await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
      logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
    } catch (webhookError: any) {
      logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
    }

    await destroyRemoteBrowser(browserId, data.userId);

    return { success: true };
  } catch (error: any) {
    logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);

    // Single failure timestamp shared by the run record and the socket events.
    const failedAt = new Date().toLocaleString();

    await run.update({
      status: 'failed',
      finishedAt: failedAt,
      log: `Markdown conversion failed: ${error.message}`,
    });

    try {
      const failureData = {
        runId: data.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'failed',
        finishedAt: failedAt
      };

      serverIo.of(browserId).emit('run-completed', failureData);
      serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
    }

    await destroyRemoteBrowser(browserId, data.userId);

    // Rethrown so the enclosing handler still observes the failure.
    throw error;
  }
}
const isRunAborted = async (): Promise<boolean> => {
try {
const currentRun = await Run.findOne({ where: { runId: data.runId } });

View File

@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
}
if (targetUrl) {
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
const updatedWorkflow = [...robot.recording.workflow];
let foundGoto = false;
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
const step = updatedWorkflow[i];
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
robot.changed('recording', true);
foundGoto = true;
i = -1;
break;
}
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
}
};
if (name) {
if (name || targetUrl) {
updates.recording_meta = {
...robot.recording_meta,
name
...(name && { name }),
...(targetUrl && { url: targetUrl })
};
}
@@ -432,6 +437,78 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
}
});
/**
 * POST endpoint for creating a markdown robot.
 *
 * Request body: `url` (required, absolute http/https URL) and `name` (optional;
 * defaults to "Markdown Robot - <hostname>").
 *
 * Responses:
 *  - 201 with the created robot on success
 *  - 400 when `url` is missing, not a string, unparsable, or not http/https
 *  - 401 when the request carries no user
 *  - 500 when robot creation throws
 */
router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    const { url, name } = req.body ?? {};

    if (!url) {
      return res.status(400).json({ error: 'The "url" field is required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // Reject non-string values up front: `new URL()` stringifies its argument, so
    // e.g. a one-element array would parse and then be stored as a non-string url.
    if (typeof url !== 'string') {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // Validate URL format (parsed once and reused for the default name below)
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch (err) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // The stored URL is later fetched server-side when the robot runs, so only web
    // schemes are accepted; file:, javascript:, data: etc. are rejected here.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported' });
    }

    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString();
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 0,
        params: [],
        type: 'markdown',
        url: url,
      },
      // Markdown robots are created with an empty workflow.
      recording: { workflow: [] },
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      schedule: null,
    });

    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);

    capture(
      'maxun-oss-markdown-robot-created',
      {
        robot_meta: newRobot.recording_meta,
        url: url,
      }
    );

    return res.status(201).json({
      message: 'Markdown robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating markdown robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating markdown robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
/**
* DELETE endpoint for deleting a recording from the storage.
*/

View File

@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -207,6 +208,119 @@ async function executeRun(id: string, userId: string) {
}
}
// Markdown-type robots are handled entirely inside this branch: the URL stored in
// recording_meta is converted to markdown, the run row is finalized, listeners are
// notified, and the branch either returns true or rethrows.
if (recording.recording_meta.type === 'markdown') {
  logger.log('info', `Executing markdown robot for scheduled run ${id}`);

  await run.update({
    status: 'running',
    log: 'Converting page to markdown'
  });

  // Best-effort "run-started" notification; a socket failure must not stop the run.
  try {
    const runStartedData = {
      runId: plainRun.runId,
      robotMetaId: plainRun.robotMetaId,
      robotName: recording.recording_meta.name,
      status: 'running',
      startedAt: plainRun.startedAt
    };

    serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
    logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
  } catch (socketError: any) {
    logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
  }

  try {
    const url = recording.recording_meta.url;

    if (!url) {
      throw new Error('No URL specified for markdown robot');
    }

    const markdown = await convertPageToMarkdown(url);

    // Captured once so the run record, the socket events and the webhook all
    // report the same finish time for this run.
    const finishedAt = new Date().toLocaleString();

    await run.update({
      status: 'success',
      finishedAt,
      log: 'Markdown conversion completed successfully',
      serializableOutput: {
        markdown: [{ content: markdown }]
      },
      binaryOutput: {},
    });

    logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);

    // Socket notifications are best-effort: a failure is logged, not propagated.
    try {
      const completionData = {
        runId: plainRun.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'success',
        finishedAt
      };

      serverIo.of(plainRun.browserId).emit('run-completed', completionData);
      serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
    }

    const webhookPayload = {
      robot_id: plainRun.robotMetaId,
      run_id: plainRun.runId,
      robot_name: recording.recording_meta.name,
      status: 'success',
      started_at: plainRun.startedAt,
      finished_at: finishedAt,
      markdown: markdown,
      metadata: {
        browser_id: plainRun.browserId,
        user_id: userId,
      }
    };

    // Webhook delivery is best-effort as well.
    try {
      await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
      logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
    } catch (webhookError: any) {
      logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
    }

    await destroyRemoteBrowser(plainRun.browserId, userId);

    return true;
  } catch (error: any) {
    logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);

    // Single failure timestamp shared by the run record and the socket events.
    const failedAt = new Date().toLocaleString();

    await run.update({
      status: 'failed',
      finishedAt: failedAt,
      log: `Markdown conversion failed: ${error.message}`,
    });

    try {
      const failureData = {
        runId: plainRun.runId,
        robotMetaId: plainRun.robotMetaId,
        robotName: recording.recording_meta.name,
        status: 'failed',
        finishedAt: failedAt
      };

      serverIo.of(plainRun.browserId).emit('run-completed', failureData);
      serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
    } catch (socketError: any) {
      logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
    }

    await destroyRemoteBrowser(plainRun.browserId, userId);

    // Rethrown so the caller still observes the failure.
    throw error;
  }
}
plainRun.status = 'running';
try {
@@ -217,7 +331,7 @@ async function executeRun(id: string, userId: string) {
status: 'running',
startedAt: plainRun.startedAt
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
} catch (socketError: any) {