Merge pull request #889 from getmaxun/markdownify

feat: scrape [html + markdown]
This commit is contained in:
Karishma Shukla
2025-11-21 00:14:31 +05:30
committed by GitHub
18 changed files with 1422 additions and 210 deletions

View File

@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin());
@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
runByAPI: run.runByAPI,
data: {
textData: {},
listData: {}
listData: {},
markdown: '',
html: ''
},
screenshots: [] as any[],
};
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
formattedRun.data.listData = output.scrapeList;
}
if (output.markdown && Array.isArray(output.markdown)) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}
if (output.html && Array.isArray(output.html)) {
formattedRun.data.html = output.html[0]?.content || '';
}
if (run.binaryOutput) {
Object.keys(run.binaryOutput).forEach(key => {
if (run.binaryOutput[key]) {
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
}
}
async function readyForRunHandler(browserId: string, id: string, userId: string){
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
try {
const result = await executeRun(id, userId);
const result = await executeRun(id, userId, requestedFormats);
if (result && result.success) {
logger.log('info', `Interpretation of ${id} succeeded`);
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
return copy;
};
async function executeRun(id: string, userId: string) {
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
let browser: any = null;
try {
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
};
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);
let formats = recording.recording_meta.formats || ['markdown'];
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
}
await run.update({
status: 'running',
log: `Converting page to: ${formats.join(', ')}`
});
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for API run ${id}`);
// Push success socket event
try {
const completionData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', completionData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
// Build webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
},
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log(
'info',
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "success",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
return {
success: true,
interpretationInfo: run.toJSON()
};
} catch (error: any) {
logger.log(
'error',
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
// Send failure socket event
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', failureData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "failed",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
throw error;
}
}
plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
}
}
export async function handleRunRecording(id: string, userId: string) {
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
try {
const result = await createWorkflowAndStoreMetadata(id, userId);
const { browserId, runId: newRunId } = result;
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
rejectUnauthorized: false
});
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
logger.log('info', `Running Robot: ${id}`);
@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
if (!run) throw new Error('Run not found');
if (run.status === 'success') {
return run.toJSON();
return run;
} else if (run.status === 'failed') {
throw new Error('Run failed');
}
// Wait for the next polling interval
await new Promise(resolve => setTimeout(resolve, interval));
}
}
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
* type: string
* required: true
* description: The ID of the robot to run.
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* formats:
* type: array
* items:
* type: string
* enum: [markdown, html]
* description: Optional override formats for this run.
* example:
* formats: ["html"]
* responses:
* 200:
* description: Robot run started successfully.
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
if (!req.user) {
return res.status(401).json({ ok: false, error: 'Unauthorized' });
}
const runId = await handleRunRecording(req.params.id, req.user.id);
const requestedFormats = req.body.formats;
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
if (!runId) {
throw new Error('Run ID is undefined');

View File

@@ -0,0 +1,160 @@
/**
 * Converts an HTML string to Markdown using Turndown with the GFM plugin.
 *
 * The HTML is first passed through `tidyHtml`, then converted with custom
 * rules for headings, SVGs, paragraphs and links, and finally post-processed
 * with `fixBrokenLinks` and `stripSkipLinks`.
 *
 * @param html    Raw HTML to convert. Falsy input yields an empty string.
 * @param baseUrl Optional base URL used to resolve relative link targets.
 * @returns The trimmed Markdown, or an empty string when the input is empty
 *          or the conversion throws.
 */
export async function parseMarkdown(
  html: string | null | undefined,
  baseUrl?: string | null
): Promise<string> {
  // Nothing to convert: return before loading any converter dependencies.
  if (!html) return "";

  const TurndownService = require("turndown");
  const { gfm } = require("joplin-turndown-plugin-gfm");
  const { URL } = require("url");

  const tidiedHtml = tidyHtml(html);

  const t = new TurndownService({
    headingStyle: "atx", // ensures #### instead of ------
    codeBlockStyle: "fenced",
  });

  // ---------------------------------------------
  // Proper ATX headings #### instead of underline-style
  // ---------------------------------------------
  t.addRule("forceAtxHeadings", {
    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
    replacement: (content: string, node: any) => {
      // nodeName is "H1".."H6"; the second character is the heading level.
      const level = Number(node.nodeName.charAt(1));
      const clean = content.trim();
      return `\n${"#".repeat(level)} ${clean}\n`;
    },
  });

  // ---------------------------------------------
  // Remove SVGs
  // ---------------------------------------------
  t.addRule("truncate-svg", {
    filter: "svg",
    replacement: () => "",
  });

  // ---------------------------------------------
  // Improved paragraph cleanup
  // ---------------------------------------------
  t.addRule("improved-paragraph", {
    filter: "p",
    replacement: (innerText: string) => {
      const trimmed = innerText.trim();
      if (!trimmed) return "";
      // Collapse runs of 3+ newlines so a paragraph never contains more than one blank line.
      return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
    },
  });

  // ---------------------------------------------
  // Inline link with fallback text
  // ---------------------------------------------
  t.addRule("inlineLink", {
    filter: (node: any) =>
      node.nodeName === "A" && Boolean(node.getAttribute("href")),
    replacement: (content: string, node: any) => {
      let text = content.trim();

      // Fallback: aria-label → title → domain
      if (!text) {
        text =
          node.getAttribute("aria-label")?.trim() ||
          node.getAttribute("title")?.trim() ||
          getDomainFromUrl(node.getAttribute("href")) ||
          "link";
      }

      let href = node.getAttribute("href").trim();

      // relative → absolute
      if (baseUrl && isRelativeUrl(href)) {
        try {
          href = new URL(href, baseUrl).toString();
        } catch {
          // Best effort: keep the original href when it cannot be resolved.
        }
      }

      href = cleanUrl(href);
      return `[${text}](${href})`;
    },
  });

  t.use(gfm);

  // Convert HTML → Markdown
  try {
    // The result is used directly as a string, so no await is needed here.
    let out: string = t.turndown(tidiedHtml);
    out = fixBrokenLinks(out);
    out = stripSkipLinks(out);
    return out.trim();
  } catch (err) {
    console.error("HTML→Markdown failed", { err });
    return "";
  }
}
// -----------------------------------------------------
// Helpers
// -----------------------------------------------------
/**
 * Reports whether a link target should be resolved against a base URL.
 * A URL counts as relative when it has no "://" separator and does not
 * start with the `mailto:` or `tel:` schemes.
 */
function isRelativeUrl(url: string): boolean {
  if (url.includes("://")) {
    return false;
  }
  const standaloneSchemes = ["mailto:", "tel:"];
  return !standaloneSchemes.some((scheme) => url.startsWith(scheme));
}
/**
 * Extracts the hostname from an absolute URL, without a leading "www.".
 *
 * @param url Absolute URL to parse.
 * @returns The hostname, or `null` when `url` cannot be parsed as an
 *          absolute URL (e.g. a relative path).
 */
function getDomainFromUrl(url: string): string | null {
  try {
    const { hostname } = new URL(url);
    // Anchor the match so only a leading "www." label is stripped; an
    // unanchored replace would also mangle hosts such as "awww.example.com".
    return hostname.replace(/^www\./, "");
  } catch {
    return null;
  }
}
/**
 * Currently an identity function: returns the given URL unchanged.
 */
function cleanUrl(u: string): string {
  // No normalization is applied at present.
  const unchanged = u;
  return unchanged;
}
/**
 * Collapses every newline, together with any whitespace that follows it,
 * into a single newline. Falsy input yields an empty string.
 */
function cleanAttribute(attr: string) {
  if (!attr) {
    return "";
  }
  const newlineRuns = /(\n+\s*)+/g;
  return attr.replace(newlineRuns, "\n");
}
/**
 * Loads the HTML with cheerio, removes script/style/media/embed-style
 * elements, and returns the inner HTML of the resulting <body>.
 */
function tidyHtml(html: string): string {
  const cheerio = require("cheerio");
  const $ = cheerio.load(html);

  // Tags removed from the document before conversion.
  const unwantedTags = [
    "script",
    "style",
    "iframe",
    "noscript",
    "meta",
    "link",
    "object",
    "embed",
    "canvas",
    "audio",
    "video",
  ];

  for (const tagName of unwantedTags) {
    $(tagName).remove();
  }

  const body = $("body");
  return body.html();
}
/**
 * Escapes newlines that occur inside square brackets by prefixing them with
 * a backslash. Bracket nesting is tracked with a counter that never drops
 * below zero.
 */
function fixBrokenLinks(md: string): string {
  let openBrackets = 0;
  const pieces: string[] = [];

  for (const symbol of md) {
    switch (symbol) {
      case "[":
        openBrackets += 1;
        break;
      case "]":
        openBrackets = openBrackets > 0 ? openBrackets - 1 : 0;
        break;
    }

    const insideBrackets = openBrackets > 0;
    pieces.push(insideBrackets && symbol === "\n" ? "\\\n" : symbol);
  }

  return pieces.join("");
}
/**
 * Removes every "[Skip to Content](#...)" link (matched case-insensitively)
 * from the Markdown.
 */
function stripSkipLinks(md: string): string {
  const skipLinkPattern = /\[Skip to Content\]\(#[^\)]*\)/gi;
  const withoutSkipLinks = md.replace(skipLinkPattern, "");
  return withoutSkipLinks;
}

View File

@@ -0,0 +1,111 @@
import { chromium } from "playwright";
import { parseMarkdown } from "./markdown";
/**
 * Loads `url` in a fresh Chromium instance, strips scripts, styles, media
 * and inline event handlers from the live DOM, and returns the remaining
 * document as an HTML string.
 *
 * The cleanup runs through `page.evaluate` AFTER navigation. Registering it
 * with `page.addInitScript` after `page.goto` would leave the already-loaded
 * document untouched, because init scripts only apply to later navigations.
 *
 * The browser is always closed, even when navigation or evaluation throws.
 */
async function fetchCleanedHtml(url: string): Promise<string> {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });

    // `return await` keeps the evaluation inside the try so that `finally`
    // closes the browser only after the HTML has been extracted.
    return await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Copy the attributes first: removing while iterating the live
        // NamedNodeMap would skip entries.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      return document.documentElement.outerHTML;
    });
  } finally {
    await browser.close();
  }
}

/**
 * Fetches a webpage, strips scripts/styles/images/etc, and returns
 * clean Markdown produced by `parseMarkdown`.
 *
 * @param url Page to load; also used as the base URL for relative links.
 * @returns The Markdown rendering of the cleaned page.
 */
export async function convertPageToMarkdown(url: string): Promise<string> {
  const cleanedHtml = await fetchCleanedHtml(url);

  // Convert cleaned HTML → Markdown
  return parseMarkdown(cleanedHtml, url);
}

/**
 * Fetches a webpage, strips scripts/styles/images/etc, and returns
 * the cleaned HTML.
 *
 * @param url Page to load.
 * @returns The outer HTML of the cleaned document.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  return fetchCleanedHtml(url);
}

View File

@@ -0,0 +1,6 @@
import { convertPageToMarkdown } from "./scrape";
// Manual smoke test: converts a sample page to Markdown and prints the result.
async function main(): Promise<void> {
  const markdown = await convertPageToMarkdown("https://quotes.toscrape.com/");
  console.log(markdown);
}

void main();

View File

@@ -9,6 +9,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {

View File

@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
import { io as serverIo } from "./server";
import { sendWebhook } from './routes/webhook';
import { BinaryOutputService } from './storage/mino';
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -183,11 +184,140 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
try {
// Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
if (!recording) {
throw new Error(`Recording for run ${data.runId} not found`);
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: `Converting page to ${formats.join(', ')}`
});
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
// Success update
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
// Notify sockets
try {
const completionData = {
runId: data.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo.of(browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
}
// Webhooks
try {
const webhookPayload: any = {
runId: data.runId,
robotId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString(),
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
} catch (webhookError: any) {
logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
}
capture("maxun-oss-run-created-manual", {
runId: data.runId,
user_id: data.userId,
status: "success",
robot_type: "scrape",
formats,
});
await destroyRemoteBrowser(browserId, data.userId);
return { success: true };
} catch (error: any) {
logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
});
try {
const failureData = {
runId: data.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo.of(browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
}
capture("maxun-oss-run-created-manual", {
runId: data.runId,
user_id: data.userId,
status: "failed",
robot_type: "scrape",
formats,
});
await destroyRemoteBrowser(browserId, data.userId);
throw error;
}
}
const isRunAborted = async (): Promise<boolean> => {
try {
const currentRun = await Run.findOne({ where: { runId: data.runId } });

View File

@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
}
if (targetUrl) {
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
const updatedWorkflow = [...robot.recording.workflow];
let foundGoto = false;
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
const step = updatedWorkflow[i];
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
robot.changed('recording', true);
foundGoto = true;
i = -1;
break;
}
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
}
};
if (name) {
if (name || targetUrl) {
updates.recording_meta = {
...robot.recording_meta,
name
...(name && { name }),
...(targetUrl && { url: targetUrl })
};
}
@@ -432,6 +437,91 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
}
});
/**
 * POST endpoint for creating a scrape robot (page → markdown and/or HTML).
 *
 * Request body:
 *  - `url`     (required) absolute http(s) URL of the page to scrape.
 *  - `name`    (optional) robot name; defaults to "Markdown Robot - <hostname>".
 *  - `formats` (required) non-empty array containing 'markdown' and/or 'html'.
 *
 * Responses: 201 with the created robot, 400 on invalid input,
 * 401 when no user is attached to the request, 500 on unexpected errors.
 */
router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    // Authenticate before inspecting the body.
    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    const { url, name, formats } = req.body;

    if (!url) {
      return res.status(400).json({ error: 'The "url" field is required.' });
    }

    // Validate URL format
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch (err) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // The stored URL is later opened by a server-side browser, so reject
    // schemes such as file: that could expose local resources.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
    }

    // Validate format
    const validFormats = ['markdown', 'html'];

    if (!Array.isArray(formats) || formats.length === 0) {
      return res.status(400).json({ error: 'At least one output format must be selected.' });
    }

    const invalid = formats.filter(f => !validFormats.includes(f));
    if (invalid.length > 0) {
      return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
    }

    // Store each requested format once, preserving the caller's order.
    const uniqueFormats = Array.from(new Set(formats));

    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString();
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 0,
        params: [],
        type: 'scrape',
        url: url,
        formats: uniqueFormats,
      },
      // Scrape robots carry no recorded workflow steps.
      recording: { workflow: [] },
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      schedule: null,
    });

    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
    capture(
      'maxun-oss-robot-created',
      {
        robot_meta: newRobot.recording_meta,
        recording: newRobot.recording,
      }
    )

    return res.status(201).json({
      message: 'Markdown robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating markdown robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating markdown robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
/**
* DELETE endpoint for deleting a recording from the storage.
*/

View File

@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -207,6 +208,172 @@ async function executeRun(id: string, userId: string) {
}
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: `Converting page to: ${formats.join(', ')}`
});
try {
const runStartedData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'running',
startedAt: plainRun.startedAt
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log(
'info',
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
);
}
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
// Run-completed socket notifications
try {
const completionData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
// Webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
}
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log(
'info',
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
capture("maxun-oss-run-created-scheduled", {
runId: plainRun.runId,
user_id: userId,
status: "success",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
return true;
} catch (error: any) {
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
capture("maxun-oss-run-created-scheduled", {
runId: plainRun.runId,
user_id: userId,
status: "failed",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
throw error;
}
}
plainRun.status = 'running';
try {
@@ -217,7 +384,7 @@ async function executeRun(id: string, userId: string) {
status: 'running',
startedAt: plainRun.startedAt
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
} catch (socketError: any) {