feat: add html scrape support

This commit is contained in:
Rohit Rajan
2025-11-20 18:49:39 +05:30
parent fef038b8cf
commit e90cd9961e
12 changed files with 366 additions and 105 deletions

View File

@@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown } from "../../markdownify/scrape";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) {
}
}
if (recording.recording_meta.type === 'markdown') {
logger.log('info', `Executing markdown robot for scheduled run ${id}`);
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: 'Converting page to markdown'
log: `Converting page to: ${formats.join(', ')}`
});
try {
@@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) {
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
logger.log(
'info',
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
);
}
try {
@@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) {
throw new Error('No URL specified for markdown robot');
}
const markdown = await convertPageToMarkdown(url);
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: 'Markdown conversion completed successfully',
serializableOutput: {
markdown: [{ content: markdown }]
},
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
// Run-completed socket notifications
try {
const completionData = {
runId: plainRun.runId,
@@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) {
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
const webhookPayload = {
// Webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
markdown: markdown,
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
}
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
logger.log(
'info',
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);
return true;
} catch (error: any) {
logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `Markdown conversion failed: ${error.message}`,
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
try {
@@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) {
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);