Merge branch 'markdownify' of https://github.com/getmaxun/maxun into markdownify
This commit is contained in:
@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
|
||||
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { sendWebhook } from "../routes/webhook";
|
||||
import { convertPageToMarkdown } from '../markdownify/scrape';
|
||||
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
@@ -344,7 +345,8 @@ function formatRunResponse(run: any) {
|
||||
runByAPI: run.runByAPI,
|
||||
data: {
|
||||
textData: {},
|
||||
listData: {}
|
||||
listData: {},
|
||||
markdown: ''
|
||||
},
|
||||
screenshots: [] as any[],
|
||||
};
|
||||
@@ -359,6 +361,10 @@ function formatRunResponse(run: any) {
|
||||
formattedRun.data.listData = output.scrapeList;
|
||||
}
|
||||
|
||||
if (output.markdown && Array.isArray(output.markdown)) {
|
||||
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
||||
}
|
||||
|
||||
if (run.binaryOutput) {
|
||||
Object.keys(run.binaryOutput).forEach(key => {
|
||||
if (run.binaryOutput[key]) {
|
||||
@@ -651,6 +657,105 @@ async function executeRun(id: string, userId: string) {
|
||||
};
|
||||
}
|
||||
|
||||
if (recording.recording_meta.type === 'markdown') {
|
||||
logger.log('info', `Executing markdown robot for API run ${id}`);
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: 'Converting page to markdown'
|
||||
});
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
const markdown = await convertPageToMarkdown(url);
|
||||
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: 'Markdown conversion completed successfully',
|
||||
serializableOutput: {
|
||||
markdown: [{ content: markdown }]
|
||||
},
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
||||
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
const webhookPayload = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
robot_name: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
started_at: plainRun.startedAt,
|
||||
finished_at: new Date().toLocaleString(),
|
||||
markdown: markdown,
|
||||
metadata: {
|
||||
browser_id: plainRun.browserId,
|
||||
user_id: userId,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
|
||||
} catch (webhookError: any) {
|
||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
interpretationInfo: run.toJSON()
|
||||
};
|
||||
} catch (error: any) {
|
||||
logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `Markdown conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
plainRun.status = 'running';
|
||||
|
||||
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
||||
@@ -889,12 +994,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
|
||||
if (!run) throw new Error('Run not found');
|
||||
|
||||
if (run.status === 'success') {
|
||||
return run.toJSON();
|
||||
return run;
|
||||
} else if (run.status === 'failed') {
|
||||
throw new Error('Run failed');
|
||||
}
|
||||
|
||||
// Wait for the next polling interval
|
||||
await new Promise(resolve => setTimeout(resolve, interval));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'traditional' | 'markdown';
|
||||
url?: string;
|
||||
}
|
||||
|
||||
interface RobotWorkflow {
|
||||
|
||||
@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
|
||||
import { io as serverIo } from "./server";
|
||||
import { sendWebhook } from './routes/webhook';
|
||||
import { BinaryOutputService } from './storage/mino';
|
||||
import { convertPageToMarkdown } from './markdownify/scrape';
|
||||
|
||||
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
|
||||
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
|
||||
@@ -183,11 +184,103 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
try {
|
||||
// Find the recording
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
||||
|
||||
|
||||
if (!recording) {
|
||||
throw new Error(`Recording for run ${data.runId} not found`);
|
||||
}
|
||||
|
||||
|
||||
if (recording.recording_meta.type === 'markdown') {
|
||||
logger.log('info', `Executing markdown robot for run ${data.runId}`);
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: 'Converting page to markdown'
|
||||
});
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
const markdown = await convertPageToMarkdown(url);
|
||||
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: 'Markdown conversion completed successfully',
|
||||
serializableOutput: {
|
||||
markdown: [{ content: markdown }]
|
||||
},
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
|
||||
|
||||
try {
|
||||
const completionData = {
|
||||
runId: data.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(browserId).emit('run-completed', completionData);
|
||||
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
try {
|
||||
const webhookPayload = {
|
||||
runId: data.runId,
|
||||
robotId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
markdown: markdown
|
||||
};
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
|
||||
} catch (webhookError: any) {
|
||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(browserId, data.userId);
|
||||
|
||||
return { success: true };
|
||||
} catch (error: any) {
|
||||
logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `Markdown conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
try {
|
||||
const failureData = {
|
||||
runId: data.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(browserId).emit('run-completed', failureData);
|
||||
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(browserId, data.userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
const isRunAborted = async (): Promise<boolean> => {
|
||||
try {
|
||||
const currentRun = await Run.findOne({ where: { runId: data.runId } });
|
||||
|
||||
@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
}
|
||||
|
||||
if (targetUrl) {
|
||||
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
|
||||
|
||||
const updatedWorkflow = [...robot.recording.workflow];
|
||||
let foundGoto = false;
|
||||
|
||||
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
|
||||
const step = updatedWorkflow[i];
|
||||
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
|
||||
robot.changed('recording', true);
|
||||
foundGoto = true;
|
||||
i = -1;
|
||||
break;
|
||||
}
|
||||
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
}
|
||||
};
|
||||
|
||||
if (name) {
|
||||
if (name || targetUrl) {
|
||||
updates.recording_meta = {
|
||||
...robot.recording_meta,
|
||||
name
|
||||
...(name && { name }),
|
||||
...(targetUrl && { url: targetUrl })
|
||||
};
|
||||
}
|
||||
|
||||
@@ -432,6 +437,78 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * POST endpoint for creating a markdown robot.
 *
 * Expects a JSON body with a required `url` and an optional `name`.
 * Stores a robot whose `recording_meta.type` is 'markdown', whose
 * `recording_meta.url` is the submitted URL, and whose workflow is empty.
 *
 * Responses:
 *  - 201 with the created robot on success
 *  - 400 when `url` is missing, is not a string, cannot be parsed, or does
 *    not use the http/https scheme
 *  - 401 when the request carries no authenticated user
 *  - 500 when robot creation throws
 */
router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    const { url, name } = req.body;

    if (!url) {
      return res.status(400).json({ error: 'The "url" field is required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // Validate URL format. The value is parsed once and the parsed form is
    // reused below for the default robot name.
    if (typeof url !== 'string') {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch (err) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // `new URL()` also accepts schemes such as file:, data: or javascript:.
    // The stored URL is user-supplied and is what a markdown robot is created
    // to convert, so only regular web URLs are allowed.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
    }

    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString();
    // Identifier stored in recording_meta.id; the row's own `id` below is a
    // separate uuid.
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 0,
        params: [],
        type: 'markdown',
        url: url,
      },
      // Markdown robots carry no recorded workflow steps.
      recording: { workflow: [] },
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      schedule: null,
    });

    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
    capture(
      'maxun-oss-markdown-robot-created',
      {
        robot_meta: newRobot.recording_meta,
        url: url,
      }
    );

    return res.status(201).json({
      message: 'Markdown robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating markdown robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating markdown robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
|
||||
|
||||
/**
|
||||
* DELETE endpoint for deleting a recording from the storage.
|
||||
*/
|
||||
|
||||
@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
|
||||
import { Page } from "playwright";
|
||||
import { sendWebhook } from "../../routes/webhook";
|
||||
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
|
||||
import { convertPageToMarkdown } from "../../markdownify/scrape";
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
@@ -207,6 +208,119 @@ async function executeRun(id: string, userId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
if (recording.recording_meta.type === 'markdown') {
|
||||
logger.log('info', `Executing markdown robot for scheduled run ${id}`);
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: 'Converting page to markdown'
|
||||
});
|
||||
|
||||
try {
|
||||
const runStartedData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'running',
|
||||
startedAt: plainRun.startedAt
|
||||
};
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
||||
logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
const markdown = await convertPageToMarkdown(url);
|
||||
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: 'Markdown conversion completed successfully',
|
||||
serializableOutput: {
|
||||
markdown: [{ content: markdown }]
|
||||
},
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
|
||||
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
const webhookPayload = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
robot_name: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
started_at: plainRun.startedAt,
|
||||
finished_at: new Date().toLocaleString(),
|
||||
markdown: markdown,
|
||||
metadata: {
|
||||
browser_id: plainRun.browserId,
|
||||
user_id: userId,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
|
||||
} catch (webhookError: any) {
|
||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
return true;
|
||||
} catch (error: any) {
|
||||
logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `Markdown conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
plainRun.status = 'running';
|
||||
|
||||
try {
|
||||
@@ -217,7 +331,7 @@ async function executeRun(id: string, userId: string) {
|
||||
status: 'running',
|
||||
startedAt: plainRun.startedAt
|
||||
};
|
||||
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
||||
logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
|
||||
} catch (socketError: any) {
|
||||
|
||||
Reference in New Issue
Block a user