feat: add html scrape support

This commit is contained in:
Rohit Rajan
2025-11-20 18:49:39 +05:30
parent fef038b8cf
commit e90cd9961e
12 changed files with 366 additions and 105 deletions

View File

@@ -18,7 +18,7 @@ import { WorkflowFile } from "maxun-core";
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToMarkdown } from '../markdownify/scrape';
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin());
@@ -346,7 +346,8 @@ function formatRunResponse(run: any) {
data: {
textData: {},
listData: {},
markdown: ''
markdown: '',
html: ''
},
screenshots: [] as any[],
};
@@ -365,6 +366,10 @@ function formatRunResponse(run: any) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}
if (output.html && Array.isArray(output.html)) {
formattedRun.data.html = output.html[0]?.content || '';
}
if (run.binaryOutput) {
Object.keys(run.binaryOutput).forEach(key => {
if (run.binaryOutput[key]) {
@@ -575,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
}
}
async function readyForRunHandler(browserId: string, id: string, userId: string){
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
try {
const result = await executeRun(id, userId);
const result = await executeRun(id, userId, requestedFormats);
if (result && result.success) {
logger.log('info', `Interpretation of ${id} succeeded`);
@@ -614,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
return copy;
};
async function executeRun(id: string, userId: string) {
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
let browser: any = null;
try {
@@ -657,12 +662,19 @@ async function executeRun(id: string, userId: string) {
};
}
if (recording.recording_meta.type === 'markdown') {
logger.log('info', `Executing markdown robot for API run ${id}`);
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);
let formats = recording.recording_meta.formats || ['markdown'];
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
}
await run.update({
status: 'running',
log: 'Converting page to markdown'
log: `Converting page to: ${formats.join(', ')}`
});
try {
@@ -672,20 +684,33 @@ async function executeRun(id: string, userId: string) {
throw new Error('No URL specified for markdown robot');
}
const markdown = await convertPageToMarkdown(url);
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: 'Markdown conversion completed successfully',
serializableOutput: {
markdown: [{ content: markdown }]
},
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for API run ${id}`);
// Push success socket event
try {
const completionData = {
runId: plainRun.runId,
@@ -695,30 +720,45 @@ async function executeRun(id: string, userId: string) {
finishedAt: new Date().toLocaleString()
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', completionData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
const webhookPayload = {
// Build webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
markdown: markdown,
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
}
},
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
logger.log(
'info',
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -728,14 +768,18 @@ async function executeRun(id: string, userId: string) {
interpretationInfo: run.toJSON()
};
} catch (error: any) {
logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);
logger.log(
'error',
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `Markdown conversion failed: ${error.message}`,
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
// Send failure socket event
try {
const failureData = {
runId: plainRun.runId,
@@ -745,9 +789,15 @@ async function executeRun(id: string, userId: string) {
finishedAt: new Date().toLocaleString()
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', failureData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -953,7 +1003,7 @@ async function executeRun(id: string, userId: string) {
}
}
export async function handleRunRecording(id: string, userId: string) {
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
try {
const result = await createWorkflowAndStoreMetadata(id, userId);
const { browserId, runId: newRunId } = result;
@@ -967,7 +1017,7 @@ export async function handleRunRecording(id: string, userId: string) {
rejectUnauthorized: false
});
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
logger.log('info', `Running Robot: ${id}`);
@@ -1018,6 +1068,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
* type: string
* required: true
* description: The ID of the robot to run.
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* formats:
* type: array
* items:
* type: string
* enum: [markdown, html]
* description: Optional override formats for this run.
* example:
* formats: ["html"]
* responses:
* 200:
* description: Robot run started successfully.
@@ -1076,7 +1141,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
if (!req.user) {
return res.status(401).json({ ok: false, error: 'Unauthorized' });
}
const runId = await handleRunRecording(req.params.id, req.user.id);
const requestedFormats = req.body.formats;
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
if (!runId) {
throw new Error('Run ID is undefined');

View File

@@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
}
/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
 *
 * The page is loaded in a freshly launched Chromium instance, the unwanted
 * nodes and inline event-handler attributes are removed from the live DOM,
 * and the resulting document is serialized.
 *
 * @param url - Address of the page to load.
 * @returns The `outerHTML` of the document element after cleanup.
 * @throws Propagates any error raised while launching the browser,
 *         navigating, or evaluating the cleanup; the browser is closed
 *         in every case.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle" });

    // The cleanup must run via evaluate(): an init script registered after
    // goto() only applies to documents created later, so it would never
    // touch the page that is already loaded.
    const cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      const all = document.querySelectorAll("*");
      all.forEach(el => {
        [...el.attributes].forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Serialize the DOM after cleanup
      return document.documentElement.outerHTML;
    });

    return cleanedHtml;
  } finally {
    // Always release the browser, including when navigation or evaluation fails.
    await browser.close();
  }
}

View File

@@ -9,8 +9,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {

View File

@@ -20,7 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
import { io as serverIo } from "./server";
import { sendWebhook } from './routes/webhook';
import { BinaryOutputService } from './storage/mino';
import { convertPageToMarkdown } from './markdownify/scrape';
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -189,12 +189,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
throw new Error(`Recording for run ${data.runId} not found`);
}
if (recording.recording_meta.type === 'markdown') {
logger.log('info', `Executing markdown robot for run ${data.runId}`);
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: 'Converting page to markdown'
log: `Converting page to ${formats.join(', ')}`
});
try {
@@ -204,20 +206,34 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
throw new Error('No URL specified for markdown robot');
}
const markdown = await convertPageToMarkdown(url);
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
// Success update
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: 'Markdown conversion completed successfully',
serializableOutput: {
markdown: [{ content: markdown }]
},
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
// Notify sockets
try {
const completionData = {
runId: data.runId,
@@ -233,15 +249,19 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
}
// Webhooks
try {
const webhookPayload = {
const webhookPayload: any = {
runId: data.runId,
robotId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString(),
markdown: markdown
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
} catch (webhookError: any) {
@@ -251,13 +271,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
await destroyRemoteBrowser(browserId, data.userId);
return { success: true };
} catch (error: any) {
logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);
logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `Markdown conversion failed: ${error.message}`,
log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
});
try {

View File

@@ -440,9 +440,9 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
/**
* POST endpoint for creating a scrape robot (markdown and/or HTML output)
*/
router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
try {
const { url, name } = req.body;
const { url, name, formats } = req.body;
if (!url) {
return res.status(400).json({ error: 'The "url" field is required.' });
@@ -459,6 +459,18 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
return res.status(400).json({ error: 'Invalid URL format' });
}
// Validate format
const validFormats = ['markdown', 'html'];
if (!Array.isArray(formats) || formats.length === 0) {
return res.status(400).json({ error: 'At least one output format must be selected.' });
}
const invalid = formats.filter(f => !validFormats.includes(f));
if (invalid.length > 0) {
return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
}
const robotName = name || `Markdown Robot - ${new URL(url).hostname}`;
const currentTimestamp = new Date().toLocaleString();
const robotId = uuid();
@@ -473,8 +485,9 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
updatedAt: currentTimestamp,
pairs: 0,
params: [],
type: 'markdown',
type: 'scrape',
url: url,
formats: formats,
},
recording: { workflow: [] },
google_sheet_email: null,

View File

@@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown } from "../../markdownify/scrape";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) {
}
}
if (recording.recording_meta.type === 'markdown') {
logger.log('info', `Executing markdown robot for scheduled run ${id}`);
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: 'Converting page to markdown'
log: `Converting page to: ${formats.join(', ')}`
});
try {
@@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) {
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
logger.log(
'info',
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
);
}
try {
@@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) {
throw new Error('No URL specified for markdown robot');
}
const markdown = await convertPageToMarkdown(url);
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: 'Markdown conversion completed successfully',
serializableOutput: {
markdown: [{ content: markdown }]
},
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
// Run-completed socket notifications
try {
const completionData = {
runId: plainRun.runId,
@@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) {
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
const webhookPayload = {
// Webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
markdown: markdown,
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
}
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
logger.log(
'info',
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);
return true;
} catch (error: any) {
logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `Markdown conversion failed: ${error.message}`,
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
try {
@@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) {
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
await destroyRemoteBrowser(plainRun.browserId, userId);