Merge pull request #889 from getmaxun/markdownify
feat: scrape [html + markdown]
This commit is contained in:
@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
|
||||
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||
import { sendWebhook } from "../routes/webhook";
|
||||
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
|
||||
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
|
||||
runByAPI: run.runByAPI,
|
||||
data: {
|
||||
textData: {},
|
||||
listData: {}
|
||||
listData: {},
|
||||
markdown: '',
|
||||
html: ''
|
||||
},
|
||||
screenshots: [] as any[],
|
||||
};
|
||||
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
|
||||
formattedRun.data.listData = output.scrapeList;
|
||||
}
|
||||
|
||||
if (output.markdown && Array.isArray(output.markdown)) {
|
||||
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
||||
}
|
||||
|
||||
if (output.html && Array.isArray(output.html)) {
|
||||
formattedRun.data.html = output.html[0]?.content || '';
|
||||
}
|
||||
|
||||
if (run.binaryOutput) {
|
||||
Object.keys(run.binaryOutput).forEach(key => {
|
||||
if (run.binaryOutput[key]) {
|
||||
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
|
||||
}
|
||||
}
|
||||
|
||||
async function readyForRunHandler(browserId: string, id: string, userId: string){
|
||||
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
|
||||
try {
|
||||
const result = await executeRun(id, userId);
|
||||
const result = await executeRun(id, userId, requestedFormats);
|
||||
|
||||
if (result && result.success) {
|
||||
logger.log('info', `Interpretation of ${id} succeeded`);
|
||||
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
|
||||
return copy;
|
||||
};
|
||||
|
||||
async function executeRun(id: string, userId: string) {
|
||||
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
|
||||
let browser: any = null;
|
||||
|
||||
try {
|
||||
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
|
||||
};
|
||||
}
|
||||
|
||||
if (recording.recording_meta.type === 'scrape') {
|
||||
logger.log('info', `Executing scrape robot for API run ${id}`);
|
||||
|
||||
let formats = recording.recording_meta.formats || ['markdown'];
|
||||
|
||||
// Override if API request defines formats
|
||||
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
|
||||
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
|
||||
}
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: `Converting page to: ${formats.join(', ')}`
|
||||
});
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
let markdown = '';
|
||||
let html = '';
|
||||
const serializableOutput: any = {};
|
||||
|
||||
// Markdown conversion
|
||||
if (formats.includes('markdown')) {
|
||||
markdown = await convertPageToMarkdown(url);
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
}
|
||||
|
||||
// HTML conversion
|
||||
if (formats.includes('html')) {
|
||||
html = await convertPageToHTML(url);
|
||||
serializableOutput.html = [{ content: html }];
|
||||
}
|
||||
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ')} conversion completed successfully`,
|
||||
serializableOutput,
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
||||
|
||||
// Push success socket event
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo
|
||||
.of('/queued-run')
|
||||
.to(`user-${userId}`)
|
||||
.emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
// Build webhook payload
|
||||
const webhookPayload: any = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
robot_name: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
started_at: plainRun.startedAt,
|
||||
finished_at: new Date().toLocaleString(),
|
||||
metadata: {
|
||||
browser_id: plainRun.browserId,
|
||||
user_id: userId,
|
||||
},
|
||||
};
|
||||
|
||||
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||
if (formats.includes('html')) webhookPayload.html = html;
|
||||
|
||||
try {
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log(
|
||||
'info',
|
||||
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
|
||||
);
|
||||
} catch (webhookError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
status: "success",
|
||||
robot_type: "scrape",
|
||||
formats
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
interpretationInfo: run.toJSON()
|
||||
};
|
||||
} catch (error: any) {
|
||||
logger.log(
|
||||
'error',
|
||||
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
|
||||
);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
// Send failure socket event
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo
|
||||
.of('/queued-run')
|
||||
.to(`user-${userId}`)
|
||||
.emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-api", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
status: "failed",
|
||||
robot_type: "scrape",
|
||||
formats
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
plainRun.status = 'running';
|
||||
|
||||
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
||||
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
export async function handleRunRecording(id: string, userId: string) {
|
||||
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
|
||||
try {
|
||||
const result = await createWorkflowAndStoreMetadata(id, userId);
|
||||
const { browserId, runId: newRunId } = result;
|
||||
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
|
||||
rejectUnauthorized: false
|
||||
});
|
||||
|
||||
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
|
||||
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
|
||||
|
||||
logger.log('info', `Running Robot: ${id}`);
|
||||
|
||||
@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
|
||||
if (!run) throw new Error('Run not found');
|
||||
|
||||
if (run.status === 'success') {
|
||||
return run.toJSON();
|
||||
return run;
|
||||
} else if (run.status === 'failed') {
|
||||
throw new Error('Run failed');
|
||||
}
|
||||
|
||||
// Wait for the next polling interval
|
||||
await new Promise(resolve => setTimeout(resolve, interval));
|
||||
}
|
||||
}
|
||||
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
|
||||
* type: string
|
||||
* required: true
|
||||
* description: The ID of the robot to run.
|
||||
* requestBody:
|
||||
* required: false
|
||||
* content:
|
||||
* application/json:
|
||||
* schema:
|
||||
* type: object
|
||||
* properties:
|
||||
* formats:
|
||||
* type: array
|
||||
* items:
|
||||
* type: string
|
||||
* enum: [markdown, html]
|
||||
* description: Optional override formats for this run.
|
||||
* example:
|
||||
* formats: ["html"]
|
||||
* responses:
|
||||
* 200:
|
||||
* description: Robot run started successfully.
|
||||
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
|
||||
if (!req.user) {
|
||||
return res.status(401).json({ ok: false, error: 'Unauthorized' });
|
||||
}
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id);
|
||||
|
||||
const requestedFormats = req.body.formats;
|
||||
|
||||
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
|
||||
|
||||
if (!runId) {
|
||||
throw new Error('Run ID is undefined');
|
||||
|
||||
160
server/src/markdownify/markdown.ts
Normal file
160
server/src/markdownify/markdown.ts
Normal file
@@ -0,0 +1,160 @@
|
||||
/**
 * Converts an HTML string into GitHub-flavoured Markdown.
 *
 * The HTML is first passed through `tidyHtml`, then converted with Turndown
 * (ATX headings, fenced code blocks, GFM plugin) using custom rules for
 * headings, SVGs, paragraphs and links, and finally post-processed with
 * `fixBrokenLinks` and `stripSkipLinks`.
 *
 * @param html    Raw HTML; `null`, `undefined` or `""` yields `""`.
 * @param baseUrl Optional base used to resolve relative link targets.
 * @returns The trimmed Markdown, or `""` if the conversion step throws.
 */
export async function parseMarkdown(
  html: string | null | undefined,
  baseUrl?: string | null
): Promise<string> {
  // Nothing to convert: return before loading the converter modules.
  if (!html) return "";

  const TurndownService = require("turndown");
  const { gfm } = require("joplin-turndown-plugin-gfm");
  const { URL } = require("url");

  const tidiedHtml = tidyHtml(html);

  const t = new TurndownService({
    headingStyle: "atx", // ensures #### instead of ------
    codeBlockStyle: "fenced",
  });

  // ---------------------------------------------
  // Proper ATX headings #### instead of underline-style
  // ---------------------------------------------
  t.addRule("forceAtxHeadings", {
    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
    replacement: (content: string, node: any) => {
      // nodeName is "H1".."H6"; the digit is the heading level.
      const level = Number(node.nodeName.charAt(1));
      const clean = content.trim();
      return `\n${"#".repeat(level)} ${clean}\n`;
    },
  });

  // ---------------------------------------------
  // Remove SVGs
  // ---------------------------------------------
  t.addRule("truncate-svg", {
    filter: "svg",
    replacement: () => "",
  });

  // ---------------------------------------------
  // Improved paragraph cleanup
  // ---------------------------------------------
  t.addRule("improved-paragraph", {
    filter: "p",
    replacement: (innerText: string) => {
      const trimmed = innerText.trim();
      if (!trimmed) return "";
      // Collapse runs of 3+ newlines so a paragraph never contains blank gaps.
      return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
    },
  });

  // ---------------------------------------------
  // Inline link with fallback text
  // ---------------------------------------------
  t.addRule("inlineLink", {
    // Only anchors that actually carry an href are rewritten by this rule.
    filter: (node: any) =>
      node.nodeName === "A" && node.getAttribute("href"),

    replacement: (content: string, node: any) => {
      let text = content.trim();

      // Fallback: aria-label → title → domain
      if (!text) {
        text =
          node.getAttribute("aria-label")?.trim() ||
          node.getAttribute("title")?.trim() ||
          getDomainFromUrl(node.getAttribute("href")) ||
          "link";
      }

      let href = node.getAttribute("href").trim();

      // relative → absolute
      if (baseUrl && isRelativeUrl(href)) {
        try {
          href = new URL(href, baseUrl).toString();
        } catch {
          // Best effort: an unresolvable href is emitted unchanged.
        }
      }

      href = cleanUrl(href);

      return `[${text}](${href})`;
    },
  });

  t.use(gfm);

  // Convert HTML → Markdown
  try {
    // turndown() is synchronous, so there is nothing to await here.
    let out: string = t.turndown(tidiedHtml);
    out = fixBrokenLinks(out);
    out = stripSkipLinks(out);
    return out.trim();
  } catch (err) {
    console.error("HTML→Markdown failed", { err });
    return "";
  }
}
|
||||
|
||||
// -----------------------------------------------------
|
||||
// Helpers
|
||||
// -----------------------------------------------------
|
||||
/**
 * Reports whether `url` lacks a URL scheme and therefore has to be resolved
 * against a base URL.
 *
 * A scheme is a leading `letter (letter | digit | "+" | "." | "-")* ":"`.
 * Only the start of the string is inspected, so a relative URL that merely
 * contains an absolute one (e.g. `/redirect?to=https://x.com`) is still
 * reported as relative. Protocol-relative URLs (`//host/path`) have no scheme
 * and are reported as relative as well.
 *
 * @param url The (already trimmed) href to classify.
 * @returns `true` when `url` has no scheme prefix.
 */
function isRelativeUrl(url: string): boolean {
  return !/^[a-z][a-z0-9+.-]*:/i.test(url);
}
|
||||
|
||||
/**
 * Extracts the hostname of an absolute URL, without a leading `www.` label.
 *
 * Only a `www.` prefix at the very start of the hostname is removed; a
 * `www.` occurring elsewhere (e.g. `my-www.example.com`) is left intact.
 *
 * @param url Absolute URL to parse.
 * @returns The hostname, or `null` when `url` cannot be parsed as a URL.
 */
function getDomainFromUrl(url: string): string | null {
  try {
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Normalisation hook for link targets.
 *
 * Currently a pass-through: the URL is returned exactly as received.
 *
 * @param u Link target.
 * @returns `u`, unchanged.
 */
function cleanUrl(u: string): string {
  const unchanged = u;
  return unchanged;
}
|
||||
|
||||
/**
 * Collapses every run of newlines (together with the whitespace that follows
 * each newline) into a single `\n`.
 *
 * @param attr Attribute text; a falsy value yields `""`.
 * @returns The collapsed text.
 */
function cleanAttribute(attr: string) {
  if (!attr) {
    return "";
  }

  const newlineRuns = /(\n+\s*)+/g;
  return attr.replace(newlineRuns, "\n");
}
|
||||
|
||||
/**
 * Strips non-content elements from an HTML document and returns what is left
 * of its `<body>`.
 *
 * @param html HTML to clean; parsed with cheerio.
 * @returns The inner HTML of `<body>` after the listed tags are removed.
 */
function tidyHtml(html: string): string {
  const cheerio = require("cheerio");
  const $ = cheerio.load(html);

  // Tags whose content should never appear in the converted output.
  const tagsToDrop = [
    "script",
    "style",
    "iframe",
    "noscript",
    "meta",
    "link",
    "object",
    "embed",
    "canvas",
    "audio",
    "video",
  ];

  for (const tagName of tagsToDrop) {
    $(tagName).remove();
  }

  const body = $("body");
  return body.html();
}
|
||||
|
||||
/**
 * Keeps multi-line link text on one logical Markdown line by putting a
 * backslash before every newline that occurs inside `[...]`.
 *
 * Bracket nesting is tracked with a depth counter. A backslash-escaped
 * character (`\[`, `\]`, `\\`, …) is literal text, so it is copied through
 * untouched and never changes the depth; otherwise a single literal `[` in
 * the page text would cause every later newline to be rewritten.
 *
 * @param md Markdown produced by the converter.
 * @returns The Markdown with in-link newlines backslash-escaped.
 */
function fixBrokenLinks(md: string): string {
  let depth = 0;
  let escaped = false;
  const out: string[] = [];

  for (const ch of md) {
    if (escaped) {
      // Character following a backslash: literal, copy as-is.
      out.push(ch);
      escaped = false;
      continue;
    }

    if (ch === "\\") {
      out.push(ch);
      escaped = true;
      continue;
    }

    if (ch === "[") {
      depth++;
    } else if (ch === "]") {
      // Clamp at zero so a stray "]" cannot make the depth negative.
      depth = Math.max(0, depth - 1);
    }

    out.push(depth > 0 && ch === "\n" ? "\\\n" : ch);
  }

  return out.join("");
}
|
||||
|
||||
/**
 * Removes "Skip to Content" accessibility links from Markdown.
 *
 * The link target may be a bare fragment (`#main`) or an absolute URL ending
 * in a fragment (`https://site/#main`). The latter form appears when the
 * original fragment href has been resolved against a base URL.
 * Matching is case-insensitive.
 *
 * @param md Markdown to clean.
 * @returns The Markdown without skip links.
 */
function stripSkipLinks(md: string): string {
  return md.replace(/\[Skip to Content\]\([^)\s]*#[^)]*\)/gi, "");
}
|
||||
111
server/src/markdownify/scrape.ts
Normal file
111
server/src/markdownify/scrape.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
import { chromium } from "playwright";
|
||||
import { parseMarkdown } from "./markdown";
|
||||
|
||||
/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean Markdown using parser.
 *
 * @param url Page to load; also used as the base for resolving relative links.
 * @returns The page content as Markdown.
 */
export async function convertPageToMarkdown(url: string): Promise<string> {
  const cleanedHtml = await fetchCleanedHtml(url);

  // Convert cleaned HTML → Markdown
  return parseMarkdown(cleanedHtml, url);
}

/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
 *
 * @param url Page to load.
 * @returns The cleaned document's outer HTML.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  return fetchCleanedHtml(url);
}

/**
 * Loads `url` in a fresh headless Chromium instance, removes non-content
 * elements and inline event handlers from the live DOM, and returns the
 * resulting document HTML.
 *
 * The cleanup runs through `page.evaluate` after navigation. An init script
 * registered after `goto` would only run on a later navigation and would
 * leave the loaded page untouched. The browser is closed in a `finally`
 * block so that a navigation failure or timeout does not leak the process.
 */
async function fetchCleanedHtml(url: string): Promise<string> {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });

    // This callback executes inside the page, so it must be self-contained.
    return await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Copy first: removing attributes mutates the live NamedNodeMap.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Extract HTML after cleanup
      return document.documentElement.outerHTML;
    });
  } finally {
    await browser.close();
  }
}
|
||||
6
server/src/markdownify/test.ts
Normal file
6
server/src/markdownify/test.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
import { convertPageToMarkdown } from "./scrape";
|
||||
|
||||
// Manual smoke test: converts a public sample page to Markdown and prints it.
// A failure is logged and reflected in the exit code instead of surfacing as
// an unhandled promise rejection.
(async () => {
  const md = await convertPageToMarkdown("https://quotes.toscrape.com/");
  console.log(md);
})().catch((err: unknown) => {
  console.error("convertPageToMarkdown smoke test failed:", err);
  process.exitCode = 1;
});
|
||||
@@ -9,6 +9,9 @@ interface RobotMeta {
|
||||
pairs: number;
|
||||
updatedAt: string;
|
||||
params: any[];
|
||||
type?: 'extract' | 'scrape';
|
||||
url?: string;
|
||||
formats?: ('markdown' | 'html')[];
|
||||
}
|
||||
|
||||
interface RobotWorkflow {
|
||||
|
||||
@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
|
||||
import { io as serverIo } from "./server";
|
||||
import { sendWebhook } from './routes/webhook';
|
||||
import { BinaryOutputService } from './storage/mino';
|
||||
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
|
||||
|
||||
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
|
||||
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
|
||||
@@ -183,11 +184,140 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
||||
try {
|
||||
// Find the recording
|
||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
||||
|
||||
|
||||
if (!recording) {
|
||||
throw new Error(`Recording for run ${data.runId} not found`);
|
||||
}
|
||||
|
||||
|
||||
if (recording.recording_meta.type === 'scrape') {
|
||||
logger.log('info', `Executing scrape robot for run ${data.runId}`);
|
||||
|
||||
const formats = recording.recording_meta.formats || ['markdown'];
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: `Converting page to ${formats.join(', ')}`
|
||||
});
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
let markdown = '';
|
||||
let html = '';
|
||||
const serializableOutput: any = {};
|
||||
|
||||
// Markdown conversion
|
||||
if (formats.includes('markdown')) {
|
||||
markdown = await convertPageToMarkdown(url);
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
}
|
||||
|
||||
// HTML conversion
|
||||
if (formats.includes('html')) {
|
||||
html = await convertPageToHTML(url);
|
||||
serializableOutput.html = [{ content: html }];
|
||||
}
|
||||
|
||||
// Success update
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
|
||||
serializableOutput,
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
|
||||
|
||||
// Notify sockets
|
||||
try {
|
||||
const completionData = {
|
||||
runId: data.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(browserId).emit('run-completed', completionData);
|
||||
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
// Webhooks
|
||||
try {
|
||||
const webhookPayload: any = {
|
||||
runId: data.runId,
|
||||
robotId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
};
|
||||
|
||||
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||
if (formats.includes('html')) webhookPayload.html = html;
|
||||
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
|
||||
} catch (webhookError: any) {
|
||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-manual", {
|
||||
runId: data.runId,
|
||||
user_id: data.userId,
|
||||
status: "success",
|
||||
robot_type: "scrape",
|
||||
formats,
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(browserId, data.userId);
|
||||
|
||||
return { success: true };
|
||||
|
||||
} catch (error: any) {
|
||||
logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
try {
|
||||
const failureData = {
|
||||
runId: data.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(browserId).emit('run-completed', failureData);
|
||||
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-manual", {
|
||||
runId: data.runId,
|
||||
user_id: data.userId,
|
||||
status: "failed",
|
||||
robot_type: "scrape",
|
||||
formats,
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(browserId, data.userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
const isRunAborted = async (): Promise<boolean> => {
|
||||
try {
|
||||
const currentRun = await Run.findOne({ where: { runId: data.runId } });
|
||||
|
||||
@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
}
|
||||
|
||||
if (targetUrl) {
|
||||
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
|
||||
|
||||
const updatedWorkflow = [...robot.recording.workflow];
|
||||
let foundGoto = false;
|
||||
|
||||
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
|
||||
const step = updatedWorkflow[i];
|
||||
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
|
||||
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
|
||||
robot.changed('recording', true);
|
||||
foundGoto = true;
|
||||
i = -1;
|
||||
break;
|
||||
}
|
||||
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
|
||||
}
|
||||
};
|
||||
|
||||
if (name) {
|
||||
if (name || targetUrl) {
|
||||
updates.recording_meta = {
|
||||
...robot.recording_meta,
|
||||
name
|
||||
...(name && { name }),
|
||||
...(targetUrl && { url: targetUrl })
|
||||
};
|
||||
}
|
||||
|
||||
@@ -432,6 +437,91 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST endpoint for creating a markdown robot
|
||||
*/
|
||||
// Creates a "scrape" robot: a robot with an empty workflow whose metadata
// records the target URL and the requested output formats.
//
// Request body: { url: string, name?: string, formats: ('markdown' | 'html')[] }
// Responses: 201 with the created robot; 400 on invalid input; 401 when the
// request has no user; 500 on unexpected errors.
router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    const { url, name, formats } = req.body;

    if (!url) {
      return res.status(400).json({ error: 'The "url" field is required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // Validate URL format. Non-string values are rejected up front because
    // `new URL` would otherwise coerce them to a string.
    if (typeof url !== 'string') {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch (err) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // The URL is later opened by a headless browser on the server, so only
    // web schemes are allowed. `file:`, `javascript:`, `data:` etc. would
    // otherwise parse successfully and be loaded.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
    }

    // Validate format
    const validFormats = ['markdown', 'html'];

    if (!Array.isArray(formats) || formats.length === 0) {
      return res.status(400).json({ error: 'At least one output format must be selected.' });
    }

    const invalid = formats.filter(f => !validFormats.includes(f));
    if (invalid.length > 0) {
      return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
    }

    // Store each format once, preserving the order of first occurrence.
    const uniqueFormats = Array.from(new Set(formats));

    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString();
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 0,
        params: [],
        type: 'scrape',
        url: url,
        formats: uniqueFormats,
      },
      recording: { workflow: [] },
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      schedule: null,
    });

    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
    capture(
      'maxun-oss-robot-created',
      {
        robot_meta: newRobot.recording_meta,
        recording: newRobot.recording,
      }
    )

    return res.status(201).json({
      message: 'Markdown robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating markdown robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating markdown robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
|
||||
|
||||
/**
|
||||
* DELETE endpoint for deleting a recording from the storage.
|
||||
*/
|
||||
|
||||
@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
|
||||
import { Page } from "playwright";
|
||||
import { sendWebhook } from "../../routes/webhook";
|
||||
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
|
||||
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
@@ -207,6 +208,172 @@ async function executeRun(id: string, userId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
if (recording.recording_meta.type === 'scrape') {
|
||||
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
|
||||
|
||||
const formats = recording.recording_meta.formats || ['markdown'];
|
||||
|
||||
await run.update({
|
||||
status: 'running',
|
||||
log: `Converting page to: ${formats.join(', ')}`
|
||||
});
|
||||
|
||||
try {
|
||||
const runStartedData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'running',
|
||||
startedAt: plainRun.startedAt
|
||||
};
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
||||
logger.log(
|
||||
'info',
|
||||
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
|
||||
);
|
||||
} catch (socketError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
const url = recording.recording_meta.url;
|
||||
|
||||
if (!url) {
|
||||
throw new Error('No URL specified for markdown robot');
|
||||
}
|
||||
|
||||
let markdown = '';
|
||||
let html = '';
|
||||
const serializableOutput: any = {};
|
||||
|
||||
// Markdown conversion
|
||||
if (formats.includes('markdown')) {
|
||||
markdown = await convertPageToMarkdown(url);
|
||||
serializableOutput.markdown = [{ content: markdown }];
|
||||
}
|
||||
|
||||
// HTML conversion
|
||||
if (formats.includes('html')) {
|
||||
html = await convertPageToHTML(url);
|
||||
serializableOutput.html = [{ content: html }];
|
||||
}
|
||||
|
||||
await run.update({
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ')} conversion completed successfully`,
|
||||
serializableOutput,
|
||||
binaryOutput: {},
|
||||
});
|
||||
|
||||
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
|
||||
|
||||
// Run-completed socket notifications
|
||||
try {
|
||||
const completionData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
||||
} catch (socketError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
// Webhook payload
|
||||
const webhookPayload: any = {
|
||||
robot_id: plainRun.robotMetaId,
|
||||
run_id: plainRun.runId,
|
||||
robot_name: recording.recording_meta.name,
|
||||
status: 'success',
|
||||
started_at: plainRun.startedAt,
|
||||
finished_at: new Date().toLocaleString(),
|
||||
metadata: {
|
||||
browser_id: plainRun.browserId,
|
||||
user_id: userId,
|
||||
}
|
||||
};
|
||||
|
||||
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||
if (formats.includes('html')) webhookPayload.html = html;
|
||||
|
||||
try {
|
||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||
logger.log(
|
||||
'info',
|
||||
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
|
||||
);
|
||||
} catch (webhookError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-scheduled", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
status: "success",
|
||||
robot_type: "scrape",
|
||||
formats
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
return true;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
|
||||
|
||||
await run.update({
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString(),
|
||||
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||
});
|
||||
|
||||
try {
|
||||
const failureData = {
|
||||
runId: plainRun.runId,
|
||||
robotMetaId: plainRun.robotMetaId,
|
||||
robotName: recording.recording_meta.name,
|
||||
status: 'failed',
|
||||
finishedAt: new Date().toLocaleString()
|
||||
};
|
||||
|
||||
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
||||
} catch (socketError: any) {
|
||||
logger.log(
|
||||
'warn',
|
||||
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
|
||||
);
|
||||
}
|
||||
|
||||
capture("maxun-oss-run-created-scheduled", {
|
||||
runId: plainRun.runId,
|
||||
user_id: userId,
|
||||
status: "failed",
|
||||
robot_type: "scrape",
|
||||
formats
|
||||
});
|
||||
|
||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
plainRun.status = 'running';
|
||||
|
||||
try {
|
||||
@@ -217,7 +384,7 @@ async function executeRun(id: string, userId: string) {
|
||||
status: 'running',
|
||||
startedAt: plainRun.startedAt
|
||||
};
|
||||
|
||||
|
||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
||||
logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
|
||||
} catch (socketError: any) {
|
||||
|
||||
Reference in New Issue
Block a user