Merge pull request #889 from getmaxun/markdownify

feat: scrape [html + markdown]
This commit is contained in:
Karishma Shukla
2025-11-21 00:14:31 +05:30
committed by GitHub
18 changed files with 1422 additions and 210 deletions

View File

@@ -46,6 +46,7 @@
"idcac-playwright": "^0.1.3", "idcac-playwright": "^0.1.3",
"ioredis": "^5.4.1", "ioredis": "^5.4.1",
"joi": "^17.6.0", "joi": "^17.6.0",
"joplin-turndown-plugin-gfm": "^1.0.12",
"jsonwebtoken": "^9.0.2", "jsonwebtoken": "^9.0.2",
"jwt-decode": "^4.0.0", "jwt-decode": "^4.0.0",
"lodash": "^4.17.21", "lodash": "^4.17.21",
@@ -80,6 +81,7 @@
"styled-components": "^5.3.3", "styled-components": "^5.3.3",
"swagger-jsdoc": "^6.2.8", "swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.1", "swagger-ui-express": "^5.0.1",
"turndown": "^7.2.2",
"typedoc": "^0.23.8", "typedoc": "^0.23.8",
"typescript": "^5.0.0", "typescript": "^5.0.0",
"uuid": "^8.3.2", "uuid": "^8.3.2",
@@ -126,6 +128,7 @@
"@types/styled-components": "^5.1.23", "@types/styled-components": "^5.1.23",
"@types/swagger-jsdoc": "^6.0.4", "@types/swagger-jsdoc": "^6.0.4",
"@types/swagger-ui-express": "^4.1.6", "@types/swagger-ui-express": "^4.1.6",
"@types/turndown": "^5.0.6",
"@vitejs/plugin-react": "^4.3.3", "@vitejs/plugin-react": "^4.3.3",
"ajv": "^8.8.2", "ajv": "^8.8.2",
"concurrently": "^7.0.0", "concurrently": "^7.0.0",

View File

@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook"; import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin()); chromium.use(stealthPlugin());
@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
runByAPI: run.runByAPI, runByAPI: run.runByAPI,
data: { data: {
textData: {}, textData: {},
listData: {} listData: {},
markdown: '',
html: ''
}, },
screenshots: [] as any[], screenshots: [] as any[],
}; };
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
formattedRun.data.listData = output.scrapeList; formattedRun.data.listData = output.scrapeList;
} }
if (output.markdown && Array.isArray(output.markdown)) {
formattedRun.data.markdown = output.markdown[0]?.content || '';
}
if (output.html && Array.isArray(output.html)) {
formattedRun.data.html = output.html[0]?.content || '';
}
if (run.binaryOutput) { if (run.binaryOutput) {
Object.keys(run.binaryOutput).forEach(key => { Object.keys(run.binaryOutput).forEach(key => {
if (run.binaryOutput[key]) { if (run.binaryOutput[key]) {
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
} }
} }
async function readyForRunHandler(browserId: string, id: string, userId: string){ async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
try { try {
const result = await executeRun(id, userId); const result = await executeRun(id, userId, requestedFormats);
if (result && result.success) { if (result && result.success) {
logger.log('info', `Interpretation of ${id} succeeded`); logger.log('info', `Interpretation of ${id} succeeded`);
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
return copy; return copy;
}; };
async function executeRun(id: string, userId: string) { async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
let browser: any = null; let browser: any = null;
try { try {
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
}; };
} }
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);
let formats = recording.recording_meta.formats || ['markdown'];
// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
}
await run.update({
status: 'running',
log: `Converting page to: ${formats.join(', ')}`
});
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for API run ${id}`);
// Push success socket event
try {
const completionData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', completionData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
// Build webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
},
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log(
'info',
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "success",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
return {
success: true,
interpretationInfo: run.toJSON()
};
} catch (error: any) {
logger.log(
'error',
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
// Send failure socket event
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo
.of('/queued-run')
.to(`user-${userId}`)
.emit('run-completed', failureData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
capture("maxun-oss-run-created-api", {
runId: plainRun.runId,
user_id: userId,
status: "failed",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
throw error;
}
}
plainRun.status = 'running'; plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId); browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
} }
} }
export async function handleRunRecording(id: string, userId: string) { export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
try { try {
const result = await createWorkflowAndStoreMetadata(id, userId); const result = await createWorkflowAndStoreMetadata(id, userId);
const { browserId, runId: newRunId } = result; const { browserId, runId: newRunId } = result;
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
rejectUnauthorized: false rejectUnauthorized: false
}); });
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId)); socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
logger.log('info', `Running Robot: ${id}`); logger.log('info', `Running Robot: ${id}`);
@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
if (!run) throw new Error('Run not found'); if (!run) throw new Error('Run not found');
if (run.status === 'success') { if (run.status === 'success') {
return run.toJSON(); return run;
} else if (run.status === 'failed') { } else if (run.status === 'failed') {
throw new Error('Run failed'); throw new Error('Run failed');
} }
// Wait for the next polling interval
await new Promise(resolve => setTimeout(resolve, interval)); await new Promise(resolve => setTimeout(resolve, interval));
} }
} }
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
* type: string * type: string
* required: true * required: true
* description: The ID of the robot to run. * description: The ID of the robot to run.
* requestBody:
* required: false
* content:
* application/json:
* schema:
* type: object
* properties:
* formats:
* type: array
* items:
* type: string
* enum: [markdown, html]
* description: Optional override formats for this run.
* example:
* formats: ["html"]
* responses: * responses:
* 200: * 200:
* description: Robot run started successfully. * description: Robot run started successfully.
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
if (!req.user) { if (!req.user) {
return res.status(401).json({ ok: false, error: 'Unauthorized' }); return res.status(401).json({ ok: false, error: 'Unauthorized' });
} }
const runId = await handleRunRecording(req.params.id, req.user.id);
const requestedFormats = req.body.formats;
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
if (!runId) { if (!runId) {
throw new Error('Run ID is undefined'); throw new Error('Run ID is undefined');

View File

@@ -0,0 +1,160 @@
/**
 * Converts an HTML string into GitHub-flavoured Markdown.
 *
 * The HTML is first tidied (non-content elements removed), then run through
 * Turndown with a handful of custom rules: ATX headings, SVG removal,
 * paragraph whitespace cleanup and inline links with fallback text.
 *
 * @param html    Raw HTML to convert. `null`, `undefined` and `""` yield `""`.
 * @param baseUrl Optional base used to turn relative link targets into absolute URLs.
 * @returns The trimmed Markdown, or `""` when the conversion throws.
 */
export async function parseMarkdown(
  html: string | null | undefined,
  baseUrl?: string | null
): Promise<string> {
  // Nothing to convert: bail out before loading any converter modules.
  if (!html) return "";

  const TurndownService = require("turndown");
  const { gfm } = require("joplin-turndown-plugin-gfm");
  const { URL } = require("url");

  const tidiedHtml = tidyHtml(html);

  const t = new TurndownService({
    headingStyle: "atx", // ensures #### instead of ------
    codeBlockStyle: "fenced",
  });

  // ---------------------------------------------
  // Proper ATX headings #### instead of underline-style
  // ---------------------------------------------
  t.addRule("forceAtxHeadings", {
    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
    replacement: (content: string, node: any) => {
      // nodeName is "H1".."H6"; the digit is the heading level.
      const level = Number(node.nodeName.charAt(1));
      const clean = content.trim();
      return `\n${"#".repeat(level)} ${clean}\n`;
    },
  });

  // ---------------------------------------------
  // Remove SVGs
  // ---------------------------------------------
  t.addRule("truncate-svg", {
    filter: "svg",
    replacement: () => "",
  });

  // ---------------------------------------------
  // Improved paragraph cleanup
  // ---------------------------------------------
  t.addRule("improved-paragraph", {
    filter: "p",
    replacement: (innerText: string) => {
      const trimmed = innerText.trim();
      if (!trimmed) return "";
      // Collapse runs of 3+ newlines so paragraphs are separated by one blank line.
      return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
    },
  });

  // ---------------------------------------------
  // Inline link with fallback text
  // ---------------------------------------------
  t.addRule("inlineLink", {
    filter: (node: any) =>
      node.nodeName === "A" && Boolean(node.getAttribute("href")),
    replacement: (content: string, node: any) => {
      let href = node.getAttribute("href").trim();

      // relative → absolute
      if (baseUrl && isRelativeUrl(href)) {
        try {
          href = new URL(href, baseUrl).toString();
        } catch {
          // Best effort: an unresolvable href is emitted unchanged.
        }
      }

      href = cleanUrl(href);

      // Fallback: aria-label → title → domain. The domain is taken from the
      // resolved href so that relative links can still produce one.
      const text =
        content.trim() ||
        node.getAttribute("aria-label")?.trim() ||
        node.getAttribute("title")?.trim() ||
        getDomainFromUrl(href) ||
        "link";

      return `[${text}](${href})`;
    },
  });

  t.use(gfm);

  // Convert HTML → Markdown
  try {
    let out: string = t.turndown(tidiedHtml);
    out = fixBrokenLinks(out);
    out = stripSkipLinks(out);
    return out.trim();
  } catch (err) {
    console.error("HTML→Markdown failed", { err });
    return "";
  }
}
// -----------------------------------------------------
// Helpers
// -----------------------------------------------------
/**
 * Reports whether `url` has no scheme and therefore needs a base URL to be resolved.
 *
 * The check looks for a leading `scheme:` prefix rather than searching for
 * "://" anywhere in the string, so a relative URL whose query embeds an
 * absolute URL (e.g. `/redirect?next=https://example.com`) is still relative.
 * Scheme-only forms such as `mailto:` and `tel:` are treated as absolute.
 *
 * @param url Link target to classify.
 * @returns `true` when the URL is relative (including `//host/path` and `#fragment`).
 */
function isRelativeUrl(url: string): boolean {
  return !/^[a-z][a-z0-9+.-]*:/i.test(url.trim());
}
/**
 * Extracts the hostname of an absolute URL, without a leading `www.` label.
 *
 * Only a `www.` prefix at the very start of the hostname is removed; a
 * `www.` occurring elsewhere in the hostname is left intact.
 *
 * @param url Absolute URL to inspect.
 * @returns The hostname, or `null` when `url` cannot be parsed as an absolute URL.
 */
function getDomainFromUrl(url: string): string | null {
  try {
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    // Relative or malformed input has no domain to report.
    return null;
  }
}
/**
 * Normalisation hook for link targets.
 * Currently a pass-through: the URL is returned exactly as received.
 */
function cleanUrl(u: string): string {
  const normalised = u;
  return normalised;
}
/**
 * Collapses each run of newlines (plus any whitespace trailing them) in an
 * attribute value into a single newline. Falsy input yields an empty string.
 */
function cleanAttribute(attr: string) {
  if (!attr) {
    return "";
  }
  const newlineRuns = /(\n+\s*)+/g;
  return attr.replace(newlineRuns, "\n");
}
/**
 * Removes non-content elements (scripts, styles, embeds, media, metadata)
 * from an HTML document and returns the inner HTML of its `<body>`.
 *
 * @param html HTML source to clean.
 * @returns The cleaned body markup; `""` if cheerio reports no body content.
 */
function tidyHtml(html: string): string {
  const cheerio = require("cheerio");
  const $ = cheerio.load(html);

  const manuallyCleanedElements = [
    "script",
    "style",
    "iframe",
    "noscript",
    "meta",
    "link",
    "object",
    "embed",
    "canvas",
    "audio",
    "video",
  ];

  // One grouped selector removes every unwanted element in a single pass.
  $(manuallyCleanedElements.join(",")).remove();

  // cheerio's `.html()` may return null; honour the declared `string` return type.
  return $("body").html() ?? "";
}
/**
 * Escapes newlines that occur inside square brackets so that link text
 * spanning several lines does not break the Markdown link.
 *
 * Bracket nesting is tracked with a counter that never drops below zero;
 * while it is positive, every "\n" is emitted as a backslash followed by "\n".
 */
function fixBrokenLinks(md: string): string {
  const pieces: string[] = [];
  let openBrackets = 0;

  for (const symbol of md) {
    switch (symbol) {
      case "[":
        openBrackets += 1;
        break;
      case "]":
        if (openBrackets > 0) {
          openBrackets -= 1;
        }
        break;
      default:
        break;
    }

    const insideBrackets = openBrackets > 0;
    if (insideBrackets && symbol === "\n") {
      pieces.push("\\\n");
    } else {
      pieces.push(symbol);
    }
  }

  return pieces.join("");
}
/**
 * Removes "Skip to Content" accessibility links from Markdown.
 *
 * Matches the link text case-insensitively and accepts any target that
 * carries a fragment: both the bare form `(#main)` and an absolute form such
 * as `(https://site/#main)`, which is what a fragment-only href becomes once
 * it has been resolved against a base URL.
 *
 * @param md Markdown to clean.
 * @returns The Markdown with skip links removed.
 */
function stripSkipLinks(md: string): string {
  return md.replace(/\[Skip to Content\]\([^()\s]*#[^)]*\)/gi, "");
}

View File

@@ -0,0 +1,111 @@
import { chromium } from "playwright";
import { parseMarkdown } from "./markdown";
/**
 * Loads `url` in a headless Chromium instance, strips non-content elements
 * and inline event handlers from the live DOM, and returns the resulting
 * document HTML.
 *
 * The cleanup runs through `page.evaluate` after navigation has finished.
 * (An init script registered after `goto` would only apply to later
 * navigations and would leave the already-loaded page untouched.)
 * The browser is closed in `finally`, so it is released even when
 * navigation or evaluation throws.
 */
async function fetchCleanedHtml(url: string): Promise<string> {
  const selectors = [
    "script",
    "style",
    "link[rel='stylesheet']",
    "noscript",
    "meta",
    "svg",
    "img",
    "picture",
    "source",
    "video",
    "audio",
    "iframe",
    "object",
    "embed"
  ];

  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });

    return await page.evaluate((toRemove) => {
      toRemove.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        [...el.attributes].forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Serialise the document after cleanup
      return document.documentElement.outerHTML;
    }, selectors);
  } finally {
    await browser.close();
  }
}

/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean Markdown using parser.
 *
 * @param url Address of the page to load.
 * @returns Markdown produced by `parseMarkdown`, with relative links resolved against `url`.
 */
export async function convertPageToMarkdown(url: string): Promise<string> {
  const cleanedHtml = await fetchCleanedHtml(url);

  // Convert cleaned HTML → Markdown
  return parseMarkdown(cleanedHtml, url);
}

/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
 *
 * @param url Address of the page to load.
 * @returns The cleaned document's outer HTML.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  return fetchCleanedHtml(url);
}

View File

@@ -0,0 +1,6 @@
import { convertPageToMarkdown } from "./scrape";
// Manual smoke test: convert a sample page and print the Markdown.
// A failure is reported and reflected in the exit code instead of
// surfacing as an unhandled promise rejection.
(async () => {
  try {
    const md = await convertPageToMarkdown("https://quotes.toscrape.com/");
    console.log(md);
  } catch (err) {
    console.error("Markdown conversion failed", err);
    process.exitCode = 1;
  }
})();

View File

@@ -9,6 +9,9 @@ interface RobotMeta {
pairs: number; pairs: number;
updatedAt: string; updatedAt: string;
params: any[]; params: any[];
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
} }
interface RobotWorkflow { interface RobotWorkflow {

View File

@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
import { io as serverIo } from "./server"; import { io as serverIo } from "./server";
import { sendWebhook } from './routes/webhook'; import { sendWebhook } from './routes/webhook';
import { BinaryOutputService } from './storage/mino'; import { BinaryOutputService } from './storage/mino';
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) { if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -183,11 +184,140 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
try { try {
// Find the recording // Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
if (!recording) { if (!recording) {
throw new Error(`Recording for run ${data.runId} not found`); throw new Error(`Recording for run ${data.runId} not found`);
} }
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: `Converting page to ${formats.join(', ')}`
});
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
// Success update
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
// Notify sockets
try {
const completionData = {
runId: data.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo.of(browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
}
// Webhooks
try {
const webhookPayload: any = {
runId: data.runId,
robotId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString(),
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
} catch (webhookError: any) {
logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
}
capture("maxun-oss-run-created-manual", {
runId: data.runId,
user_id: data.userId,
status: "success",
robot_type: "scrape",
formats,
});
await destroyRemoteBrowser(browserId, data.userId);
return { success: true };
} catch (error: any) {
logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
});
try {
const failureData = {
runId: data.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo.of(browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
}
capture("maxun-oss-run-created-manual", {
runId: data.runId,
user_id: data.userId,
status: "failed",
robot_type: "scrape",
formats,
});
await destroyRemoteBrowser(browserId, data.userId);
throw error;
}
}
const isRunAborted = async (): Promise<boolean> => { const isRunAborted = async (): Promise<boolean> => {
try { try {
const currentRun = await Run.findOne({ where: { runId: data.runId } }); const currentRun = await Run.findOne({ where: { runId: data.runId } });

View File

@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
} }
if (targetUrl) { if (targetUrl) {
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
const updatedWorkflow = [...robot.recording.workflow]; const updatedWorkflow = [...robot.recording.workflow];
let foundGoto = false;
for (let i = updatedWorkflow.length - 1; i >= 0; i--) { for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
const step = updatedWorkflow[i]; const step = updatedWorkflow[i];
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow }); robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
robot.changed('recording', true); robot.changed('recording', true);
foundGoto = true;
i = -1; i = -1;
break; break;
} }
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
} }
}; };
if (name) { if (name || targetUrl) {
updates.recording_meta = { updates.recording_meta = {
...robot.recording_meta, ...robot.recording_meta,
name ...(name && { name }),
...(targetUrl && { url: targetUrl })
}; };
} }
@@ -432,6 +437,91 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
} }
}); });
/**
 * POST endpoint for creating a scrape robot (a robot of type 'scrape' that
 * converts a single URL to Markdown and/or HTML).
 *
 * Request body:
 *  - url     (required) absolute http(s) URL of the page to scrape
 *  - name    (optional) robot name; defaults to "Markdown Robot - <hostname>"
 *  - formats (required) non-empty array containing 'markdown' and/or 'html'
 *
 * Responses: 201 with the created robot, 400 on invalid input,
 * 401 when no user is attached to the request, 500 on unexpected errors.
 */
router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    // Authenticate before inspecting the body so unauthenticated callers learn nothing about validation.
    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    const { url, name, formats } = req.body;

    if (!url || typeof url !== 'string') {
      return res.status(400).json({ error: 'The "url" field is required.' });
    }

    // Validate URL format
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url);
    } catch (err) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // The URL is later opened by a server-side browser, so only web schemes
    // are accepted; this rejects file:, chrome:, data: and similar targets.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
    }

    // Validate format
    const validFormats = ['markdown', 'html'];

    if (!Array.isArray(formats) || formats.length === 0) {
      return res.status(400).json({ error: 'At least one output format must be selected.' });
    }

    const invalid = formats.filter(
      (f: unknown) => typeof f !== 'string' || !validFormats.includes(f)
    );
    if (invalid.length > 0) {
      return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
    }

    // Drop duplicates so each conversion is stored and executed once.
    const uniqueFormats = [...new Set<string>(formats)];

    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString();
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 0,
        params: [],
        type: 'scrape',
        url: url,
        formats: uniqueFormats,
      },
      // Scrape robots have no recorded workflow steps.
      recording: { workflow: [] },
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      schedule: null,
    });

    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
    capture(
      'maxun-oss-robot-created',
      {
        robot_meta: newRobot.recording_meta,
        recording: newRobot.recording,
      }
    )

    return res.status(201).json({
      message: 'Markdown robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating markdown robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating markdown robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
/** /**
* DELETE endpoint for deleting a recording from the storage. * DELETE endpoint for deleting a recording from the storage.
*/ */

View File

@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
import { Page } from "playwright"; import { Page } from "playwright";
import { sendWebhook } from "../../routes/webhook"; import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin()); chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) { async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -207,6 +208,172 @@ async function executeRun(id: string, userId: string) {
} }
} }
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
const formats = recording.recording_meta.formats || ['markdown'];
await run.update({
status: 'running',
log: `Converting page to: ${formats.join(', ')}`
});
try {
const runStartedData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'running',
startedAt: plainRun.startedAt
};
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log(
'info',
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
);
}
try {
const url = recording.recording_meta.url;
if (!url) {
throw new Error('No URL specified for markdown robot');
}
let markdown = '';
let html = '';
const serializableOutput: any = {};
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
serializableOutput.html = [{ content: html }];
}
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
});
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
// Run-completed socket notifications
try {
const completionData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'success',
finishedAt: new Date().toLocaleString()
};
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
);
}
// Webhook payload
const webhookPayload: any = {
robot_id: plainRun.robotMetaId,
run_id: plainRun.runId,
robot_name: recording.recording_meta.name,
status: 'success',
started_at: plainRun.startedAt,
finished_at: new Date().toLocaleString(),
metadata: {
browser_id: plainRun.browserId,
user_id: userId,
}
};
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log(
'info',
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
);
} catch (webhookError: any) {
logger.log(
'warn',
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
);
}
capture("maxun-oss-run-created-scheduled", {
runId: plainRun.runId,
user_id: userId,
status: "success",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
return true;
} catch (error: any) {
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion failed: ${error.message}`,
});
try {
const failureData = {
runId: plainRun.runId,
robotMetaId: plainRun.robotMetaId,
robotName: recording.recording_meta.name,
status: 'failed',
finishedAt: new Date().toLocaleString()
};
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
} catch (socketError: any) {
logger.log(
'warn',
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
);
}
capture("maxun-oss-run-created-scheduled", {
runId: plainRun.runId,
user_id: userId,
status: "failed",
robot_type: "scrape",
formats
});
await destroyRemoteBrowser(plainRun.browserId, userId);
throw error;
}
}
plainRun.status = 'running'; plainRun.status = 'running';
try { try {
@@ -217,7 +384,7 @@ async function executeRun(id: string, userId: string) {
status: 'running', status: 'running',
startedAt: plainRun.startedAt startedAt: plainRun.startedAt
}; };
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`); logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
} catch (socketError: any) { } catch (socketError: any) {

View File

@@ -28,6 +28,36 @@ export const getStoredRecordings = async (): Promise<string[] | null> => {
} }
}; };
/**
 * Asks the backend to create a scrape robot for the given page.
 *
 * @param url     Page the robot should scrape.
 * @param name    Optional robot name, sent as-is.
 * @param formats Output formats to request; defaults to `['markdown']`.
 * @returns The response body when the server answers 201; `null` on any
 *          other status or on a request error (the error is logged).
 */
export const createScrapeRobot = async (
  url: string,
  name?: string,
  formats: string[] = ['markdown']
): Promise<any> => {
  try {
    const endpoint = `${apiUrl}/storage/recordings/scrape`;
    const payload = { url, name, formats };
    const requestOptions = {
      headers: { 'Content-Type': 'application/json' },
      withCredentials: true,
    };

    const reply = await axios.post(endpoint, payload, requestOptions);

    // Anything other than "201 Created" is treated as a failure.
    if (reply.status !== 201) {
      throw new Error('Failed to create markdown robot');
    }
    return reply.data;
  } catch (error: any) {
    console.error('Error creating markdown robot:', error);
    return null;
  }
};
export const updateRecording = async (id: string, data: { export const updateRecording = async (id: string, data: {
name?: string; name?: string;
limits?: Array<{pairIndex: number, actionIndex: number, argIndex: number, limit: number}>; limits?: Array<{pairIndex: number, actionIndex: number, argIndex: number, limit: number}>;

View File

@@ -121,6 +121,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])} handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])}
handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])} handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])}
handleDelete={() => handlers.handleDelete(row.id)} handleDelete={() => handlers.handleDelete(row.id)}
robotType={row.type}
/> />
</MemoizedTableCell> </MemoizedTableCell>
); );
@@ -742,9 +743,10 @@ interface OptionsButtonProps {
handleEdit: () => void; handleEdit: () => void;
handleDelete: () => void; handleDelete: () => void;
handleDuplicate: () => void; handleDuplicate: () => void;
robotType: string;
} }
const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => { const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => {
const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null); const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null);
const handleClick = (event: React.MouseEvent<HTMLElement>) => { const handleClick = (event: React.MouseEvent<HTMLElement>) => {
@@ -771,34 +773,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat
open={Boolean(anchorEl)} open={Boolean(anchorEl)}
onClose={handleClose} onClose={handleClose}
> >
<MenuItem onClick={() => { handleRetrain(); handleClose(); }}> {robotType !== 'scrape' && (
<ListItemIcon> <MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
<Refresh fontSize="small" /> <ListItemIcon>
</ListItemIcon> <Refresh fontSize="small" />
<ListItemText>{t('recordingtable.retrain')}</ListItemText> </ListItemIcon>
</MenuItem> <ListItemText>Retrain</ListItemText>
</MenuItem>
)}
<MenuItem onClick={() => { handleEdit(); handleClose(); }}> <MenuItem onClick={() => { handleEdit(); handleClose(); }}>
<ListItemIcon> <ListItemIcon><Edit fontSize="small" /></ListItemIcon>
<Edit fontSize="small" /> <ListItemText>Edit</ListItemText>
</ListItemIcon>
<ListItemText>{t('recordingtable.edit')}</ListItemText>
</MenuItem> </MenuItem>
<MenuItem onClick={() => { handleDelete(); handleClose(); }}> <MenuItem onClick={() => { handleDelete(); handleClose(); }}>
<ListItemIcon> <ListItemIcon><DeleteForever fontSize="small" /></ListItemIcon>
<DeleteForever fontSize="small" /> <ListItemText>Delete</ListItemText>
</ListItemIcon>
<ListItemText>{t('recordingtable.delete')}</ListItemText>
</MenuItem> </MenuItem>
<MenuItem onClick={() => { handleDuplicate(); handleClose(); }}> {robotType !== 'scrape' && (
<ListItemIcon> <MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
<ContentCopy fontSize="small" /> <ListItemIcon><ContentCopy fontSize="small" /></ListItemIcon>
</ListItemIcon> <ListItemText>Duplicate</ListItemText>
<ListItemText>{t('recordingtable.duplicate')}</ListItemText> </MenuItem>
</MenuItem> )}
</Menu> </Menu>
</> </>
); );
}; };

View File

@@ -13,29 +13,65 @@ import {
Card, Card,
CircularProgress, CircularProgress,
Container, Container,
CardContent CardContent,
Tabs,
Tab,
RadioGroup,
Radio,
FormControl,
FormLabel
} from '@mui/material'; } from '@mui/material';
import { ArrowBack, PlayCircleOutline, Article } from '@mui/icons-material'; import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
import { useGlobalInfoStore } from '../../../context/globalInfo'; import { useGlobalInfoStore } from '../../../context/globalInfo';
import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
import { createScrapeRobot } from "../../../api/storage";
import { AuthContext } from '../../../context/auth'; import { AuthContext } from '../../../context/auth';
import { GenericModal } from '../../ui/GenericModal'; import { GenericModal } from '../../ui/GenericModal';
/** Props for the tab content wrapper used on the robot creation page. */
interface TabPanelProps {
  children?: React.ReactNode;
  index: number;
  value: number;
}

/**
 * Wrapper for the content of one tab.
 *
 * The wrapper `div` is always rendered, but it is marked `hidden` and its
 * children are omitted unless `value` equals this panel's `index`. Any extra
 * props are forwarded to the wrapper `div`.
 */
function TabPanel({ children, value, index, ...rest }: TabPanelProps) {
  const isSelected = value === index;

  return (
    <div
      role="tabpanel"
      hidden={!isSelected}
      id={`robot-tabpanel-${index}`}
      aria-labelledby={`robot-tab-${index}`}
      {...rest}
    >
      {isSelected && <Box>{children}</Box>}
    </div>
  );
}
const RobotCreate: React.FC = () => { const RobotCreate: React.FC = () => {
const { t } = useTranslation(); const { t } = useTranslation();
const navigate = useNavigate(); const navigate = useNavigate();
const { setBrowserId, setRecordingUrl, notify, setRecordingId } = useGlobalInfoStore(); const { setBrowserId, setRecordingUrl, notify, setRecordingId, setRerenderRobots } = useGlobalInfoStore();
const [tabValue, setTabValue] = useState(0);
const [url, setUrl] = useState(''); const [url, setUrl] = useState('');
const [scrapeRobotName, setScrapeRobotName] = useState('');
const [needsLogin, setNeedsLogin] = useState(false); const [needsLogin, setNeedsLogin] = useState(false);
const [isLoading, setIsLoading] = useState(false); const [isLoading, setIsLoading] = useState(false);
const [isWarningModalOpen, setWarningModalOpen] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false);
const [activeBrowserId, setActiveBrowserId] = useState(''); const [activeBrowserId, setActiveBrowserId] = useState('');
const [outputFormats, setOutputFormats] = useState<string[]>([]);
const { state } = React.useContext(AuthContext); const { state } = React.useContext(AuthContext);
const { user } = state; const { user } = state;
const handleTabChange = (event: React.SyntheticEvent, newValue: number) => {
setTabValue(newValue);
};
const handleStartRecording = async () => { const handleStartRecording = async () => {
if (!url.trim()) { if (!url.trim()) {
@@ -146,155 +182,307 @@ const RobotCreate: React.FC = () => {
<ArrowBack /> <ArrowBack />
</IconButton> </IconButton>
<Typography variant="h5" component="h1"> <Typography variant="h5" component="h1">
New Data Extraction Robot Create New Robot
</Typography> </Typography>
</Box> </Box>
<Card sx={{ mb: 4, p: 4, textAlign: 'center' }}> <Box sx={{ borderBottom: 1, borderColor: 'divider', mb: 2, mt: "-30px" }}>
<Box display="flex" flexDirection="column" alignItems="center"> <Tabs
{/* Logo (kept as original) */} value={tabValue}
<img centered
src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png" onChange={handleTabChange}
width={73} aria-label="robot type tabs"
height={65} sx={{
style={{ minHeight: 36,
borderRadius: '5px', '& .MuiTab-root': {
marginBottom: '30px' minHeight: 36,
}} paddingX: 2,
alt="Maxun Logo" paddingY: 1.5,
/> minWidth: 0,
},
'& .MuiTabs-indicator': {
height: 2,
},
}}
>
<Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
<Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
</Tabs>
</Box>
{/* Origin URL Input */}
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}> <TabPanel value={tabValue} index={0}>
<TextField <Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
placeholder="Example: https://www.ycombinator.com/companies/" <Box display="flex" flexDirection="column" alignItems="center">
variant="outlined" {/* Logo (kept as original) */}
<img
src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
width={73}
height={65}
style={{
borderRadius: '5px',
marginBottom: '30px'
}}
alt="Maxun Logo"
/>
<Typography variant="body2" color="text.secondary" mb={3}>
Extract structured data from websites in a few clicks.
</Typography>
{/* Origin URL Input */}
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
<TextField
placeholder="Example: https://www.ycombinator.com/companies/"
variant="outlined"
fullWidth
value={url}
onChange={(e) => setUrl(e.target.value)}
/>
</Box>
{/* Checkbox */}
<Box sx={{ width: '100%', maxWidth: 700, mb: 3, textAlign: 'left' }}>
<FormControlLabel
control={
<Checkbox
checked={needsLogin}
onChange={(e) => setNeedsLogin(e.target.checked)}
color="primary"
/>
}
label="This website needs logging in."
/>
</Box>
{/* Button */}
<Button
variant="contained"
fullWidth fullWidth
value={url} onClick={handleStartRecording}
onChange={(e) => setUrl(e.target.value)} disabled={!url.trim() || isLoading}
/> sx={{
bgcolor: '#ff00c3',
py: 1.4,
fontSize: '1rem',
textTransform: 'none',
maxWidth: 700,
borderRadius: 2
}}
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
>
{isLoading ? 'Starting...' : 'Start Recording'}
</Button>
</Box> </Box>
</Card>
{/* Checkbox */} <Box mt={6} textAlign="center">
<Box sx={{ width: '100%', maxWidth: 700, mb: 3, textAlign: 'left' }}> <Typography variant="h6" gutterBottom>
<FormControlLabel First time creating a robot?
control={ </Typography>
<Checkbox <Typography variant="body2" color="text.secondary" mb={3}>
checked={needsLogin} Get help and learn how to use Maxun effectively.
onChange={(e) => setNeedsLogin(e.target.checked)} </Typography>
color="primary"
/>
}
label="This website needs logging in."
/>
</Box>
{/* Button */} <Grid container spacing={3} justifyContent="center">
<Button
variant="contained" {/* YouTube Tutorials */}
fullWidth <Grid item xs={12} sm={6} md={4}>
onClick={handleStartRecording} <Card
disabled={!url.trim() || isLoading} sx={{
sx={{ height: 140,
bgcolor: '#ff00c3', cursor: "pointer",
py: 1.4, }}
fontSize: '1rem', onClick={() => window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")}
textTransform: 'none', >
maxWidth: 700, <CardContent
borderRadius: 2 sx={{
}} display: "flex",
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null} flexDirection: "column",
> alignItems: "center",
{isLoading ? 'Starting...' : 'Start Recording'} justifyContent: "center", // center content
</Button> height: "100%",
textAlign: "center",
p: 2,
color: (theme) =>
theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
}}
>
<PlayCircleOutline sx={{ fontSize: "32px", mb: 2 }} />
<Box sx={{ textAlign: "center" }}>
<Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
Video Tutorials
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
Watch step-by-step guides
</Typography>
</Box>
</CardContent>
</Card>
</Grid>
{/* Documentation */}
<Grid item xs={12} sm={6} md={4}>
<Card
sx={{
height: 140,
cursor: "pointer",
}}
onClick={() => window.open("https://docs.maxun.dev", "_blank")}
>
<CardContent
sx={{
display: "flex",
flexDirection: "column",
alignItems: "center",
justifyContent: "center", // center everything
height: "100%",
textAlign: "center",
p: 2,
color: (theme) =>
theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
}}
>
<Article sx={{ fontSize: "32px", mb: 2 }} />
<Box sx={{ textAlign: "center" }}>
<Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
Documentation
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
Explore detailed guides
</Typography>
</Box>
</CardContent>
</Card>
</Grid>
</Grid>
</Box> </Box>
</Card> </TabPanel>
<TabPanel value={tabValue} index={1}>
<Card sx={{ mb: 4, p: 4, textAlign: 'center' }}>
<Box mt={6} textAlign="center"> <Box display="flex" flexDirection="column" alignItems="center">
<Typography variant="h6" gutterBottom> <img
First time creating a robot? src="https://ik.imagekit.io/ys1blv5kv/maxunlogo.png"
</Typography> width={73}
<Typography variant="body2" color="text.secondary" mb={3}> height={65}
Get help and learn how to use Maxun effectively. style={{
</Typography> borderRadius: '5px',
marginBottom: '30px'
<Grid container spacing={3} justifyContent="center">
{/* YouTube Tutorials */}
<Grid item xs={12} sm={6} md={4}>
<Card
sx={{
height: 140,
cursor: "pointer",
}} }}
onClick={() => window.open("https://www.youtube.com/@MaxunOSS/videos", "_blank")} alt="Maxun Logo"
> />
<CardContent
sx={{
display: "flex",
flexDirection: "column",
alignItems: "center",
justifyContent: "center", // center content
height: "100%",
textAlign: "center",
p: 2,
color: (theme) =>
theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
}}
>
<PlayCircleOutline sx={{ fontSize: "32px", mb: 2 }} />
<Box sx={{ textAlign: "center" }}> <Typography variant="body2" color="text.secondary" mb={3}>
<Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}> Turn websites into LLM-ready Markdown & clean HTML for AI apps.
Video Tutorials </Typography>
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
Watch step-by-step guides
</Typography>
</Box>
</CardContent>
</Card>
</Grid>
{/* Documentation */} <Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
<Grid item xs={12} sm={6} md={4}> <TextField
<Card placeholder="Example: YC Companies Scraper"
sx={{ variant="outlined"
height: 140, fullWidth
cursor: "pointer", value={scrapeRobotName}
onChange={(e) => setScrapeRobotName(e.target.value)}
sx={{ mb: 2 }}
label="Robot Name"
/>
<TextField
placeholder="Example: https://www.ycombinator.com/companies/"
variant="outlined"
fullWidth
value={url}
onChange={(e) => setUrl(e.target.value)}
label="Website URL"
sx={{ mb: 2 }}
/>
<FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
<p>Output Format (Select at least one)</p>
<FormControlLabel
control={
<Checkbox
checked={outputFormats.includes('markdown')}
onChange={(e) => {
if (e.target.checked) {
setOutputFormats([...outputFormats, 'markdown']);
} else {
setOutputFormats(outputFormats.filter(f => f !== 'markdown'));
}
}}
/>
}
label="Markdown"
/>
<FormControlLabel
control={
<Checkbox
checked={outputFormats.includes('html')}
onChange={(e) => {
if (e.target.checked) {
setOutputFormats([...outputFormats, 'html']);
} else {
setOutputFormats(outputFormats.filter(f => f !== 'html'));
}
}}
/>
}
label="HTML"
/>
</FormControl>
</Box>
<Button
variant="contained"
fullWidth
onClick={async () => {
if (!url.trim()) {
notify('error', 'Please enter a valid URL');
return;
}
if (!scrapeRobotName.trim()) {
notify('error', 'Please enter a robot name');
return;
}
if (outputFormats.length === 0) {
notify('error', 'Please select at least one output format');
return;
}
setIsLoading(true);
const result = await createScrapeRobot(url, scrapeRobotName, outputFormats);
setIsLoading(false);
if (result) {
setRerenderRobots(true);
notify('success', `${scrapeRobotName} created successfully!`);
navigate('/robots');
} else {
notify('error', 'Failed to create markdown robot');
}
}} }}
onClick={() => window.open("https://docs.maxun.dev", "_blank")} disabled={!url.trim() || !scrapeRobotName.trim() || outputFormats.length === 0 || isLoading}
sx={{
bgcolor: '#ff00c3',
py: 1.4,
fontSize: '1rem',
textTransform: 'none',
maxWidth: 700,
borderRadius: 2
}}
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
> >
<CardContent {isLoading
sx={{ ? "Creating..."
display: "flex", : `Create Robot`
flexDirection: "column", }
alignItems: "center", </Button>
justifyContent: "center", // center everything </Box>
height: "100%", </Card>
textAlign: "center", </TabPanel>
p: 2,
color: (theme) =>
theme.palette.mode === 'light' ? 'rgba(0, 0, 0, 0.54)' : '',
}}
>
<Article sx={{ fontSize: "32px", mb: 2 }} />
<Box sx={{ textAlign: "center" }}>
<Typography variant="body1" fontWeight="600" sx={{ lineHeight: 1.2 }}>
Documentation
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ lineHeight: 1.4, mt: 1 }}>
Explore detailed guides
</Typography>
</Box>
</CardContent>
</Card>
</Grid>
</Grid>
</Box>
</Box> </Box>

View File

@@ -24,13 +24,9 @@ interface RobotMeta {
pairs: number; pairs: number;
updatedAt: string; updatedAt: string;
params: any[]; params: any[];
type?: string; type?: 'extract' | 'scrape';
description?: string;
usedByUsers?: number[];
subscriptionLevel?: number;
access?: string;
sample?: any[];
url?: string; url?: string;
formats?: ('markdown' | 'html')[];
} }
interface RobotWorkflow { interface RobotWorkflow {

View File

@@ -24,13 +24,9 @@ interface RobotMeta {
pairs: number; pairs: number;
updatedAt: string; updatedAt: string;
params: any[]; params: any[];
type?: string; type?: 'extract' | 'scrape';
description?: string;
usedByUsers?: number[];
subscriptionLevel?: number;
access?: string;
sample?: any[];
url?: string; url?: string;
formats?: ('markdown' | 'html')[];
} }
interface RobotWorkflow { interface RobotWorkflow {
@@ -795,11 +791,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
navigate(basePath); navigate(basePath);
}; };
const lastPair =
robot?.recording.workflow[robot?.recording.workflow.length - 1];
const targetUrl = lastPair?.what.find((action) => action.action === "goto")
?.args?.[0];
return ( return (
<RobotConfigPage <RobotConfigPage
title={t("robot_edit.title")} title={t("robot_edit.title")}
@@ -826,7 +817,7 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => {
<TextField <TextField
label={t("robot_duplication.fields.target_url")} label={t("robot_duplication.fields.target_url")}
key={t("robot_duplication.fields.target_url")} key={t("robot_duplication.fields.target_url")}
value={targetUrl || ""} value={getTargetUrl() || ""}
onChange={(e) => handleTargetUrlChange(e.target.value)} onChange={(e) => handleTargetUrlChange(e.target.value)}
style={{ marginBottom: "20px" }} style={{ marginBottom: "20px" }}
/> />

View File

@@ -128,6 +128,8 @@ export const RobotIntegrationPage = ({
"googleSheets" | "airtable" | "webhook" | null "googleSheets" | "airtable" | "webhook" | null
>(integrationType); >(integrationType);
const isScrapeRobot = recording?.recording_meta?.type === "scrape";
const authenticateWithGoogle = () => { const authenticateWithGoogle = () => {
if (!recordingId) { if (!recordingId) {
console.error("Cannot authenticate: recordingId is null"); console.error("Cannot authenticate: recordingId is null");
@@ -729,26 +731,61 @@ export const RobotIntegrationPage = ({
width: "100%", width: "100%",
}} }}
> >
<Button variant="outlined" onClick={() => { {!isScrapeRobot && (
if (!recordingId) return; <Button
setSelectedIntegrationType("googleSheets"); variant="outlined"
setSettings({ ...settings, integrationType: "googleSheets" }); onClick={() => {
const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots"; if (!recordingId) return;
navigate(`${basePath}/${recordingId}/integrate/googleSheets`); setSelectedIntegrationType("googleSheets");
}} style={{ display: "flex", flexDirection: "column", alignItems: "center", background: 'white', color: '#ff00c3' }}> setSettings({ ...settings, integrationType: "googleSheets" });
<img src="https://ik.imagekit.io/ys1blv5kv/gsheet.svg" alt="Google Sheets" style={{ margin: "6px" }} /> const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
Google Sheets navigate(`${basePath}/${recordingId}/integrate/googleSheets`);
</Button> }}
<Button variant="outlined" onClick={() => { style={{
if (!recordingId) return; display: "flex",
setSelectedIntegrationType("airtable"); flexDirection: "column",
setSettings({ ...settings, integrationType: "airtable" }); alignItems: "center",
const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots"; background: "white",
navigate(`${basePath}/${recordingId}/integrate/airtable`); color: "#ff00c3",
}} style={{ display: "flex", flexDirection: "column", alignItems: "center", background: 'white', color: '#ff00c3' }}> }}
<img src="https://ik.imagekit.io/ys1blv5kv/airtable.svg" alt="Airtable" style={{ margin: "6px" }} /> >
Airtable <img
</Button> src="https://ik.imagekit.io/ys1blv5kv/gsheet.svg"
alt="Google Sheets"
style={{ margin: "6px" }}
/>
Google Sheets
</Button>
)}
{!isScrapeRobot && (
<Button
variant="outlined"
onClick={() => {
if (!recordingId) return;
setSelectedIntegrationType("airtable");
setSettings({ ...settings, integrationType: "airtable" });
const basePath = robotPath === "prebuilt-robots" ? "/prebuilt-robots" : "/robots";
navigate(`${basePath}/${recordingId}/integrate/airtable`);
}}
style={{
display: "flex",
flexDirection: "column",
alignItems: "center",
background: "white",
color: "#ff00c3",
}}
>
<img
src="https://ik.imagekit.io/ys1blv5kv/airtable.svg"
alt="Airtable"
style={{ margin: "6px" }}
/>
Airtable
</Button>
)}
<Button variant="outlined" onClick={() => { <Button variant="outlined" onClick={() => {
if (!recordingId) return; if (!recordingId) return;
setSelectedIntegrationType("webhook"); setSelectedIntegrationType("webhook");

View File

@@ -16,7 +16,9 @@ interface RobotMeta {
pairs: number; pairs: number;
updatedAt: string; updatedAt: string;
params: any[]; params: any[];
type?: 'extract' | 'scrape';
url?: string; url?: string;
formats?: ('markdown' | 'html')[];
} }
interface RobotWorkflow { interface RobotWorkflow {

View File

@@ -37,6 +37,8 @@ interface RunContentProps {
export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => { export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => {
const { t } = useTranslation(); const { t } = useTranslation();
const [tab, setTab] = React.useState<string>('output'); const [tab, setTab] = React.useState<string>('output');
const [markdownContent, setMarkdownContent] = useState<string>('');
const [htmlContent, setHtmlContent] = useState<string>('');
const [schemaData, setSchemaData] = useState<any[]>([]); const [schemaData, setSchemaData] = useState<any[]>([]);
const [schemaColumns, setSchemaColumns] = useState<string[]>([]); const [schemaColumns, setSchemaColumns] = useState<string[]>([]);
@@ -63,6 +65,26 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
setTab(tab); setTab(tab);
}, [interpretationInProgress]); }, [interpretationInProgress]);
// Keep the markdown / HTML preview state in sync with the run's serializable
// output: both are cleared first, then refilled from the first entry of the
// corresponding array when that entry carries non-empty `content`.
useEffect(() => {
  setMarkdownContent('');
  setHtmlContent('');

  const output = row.serializableOutput;

  const markdownEntries = output?.markdown;
  // Array.isArray already rejects null/undefined, so no separate truthiness check is needed.
  if (Array.isArray(markdownEntries)) {
    const firstMarkdown = markdownEntries[0]?.content;
    if (firstMarkdown) {
      setMarkdownContent(firstMarkdown);
    }
  }

  const htmlEntries = output?.html;
  if (Array.isArray(htmlEntries)) {
    const firstHtml = htmlEntries[0]?.content;
    if (firstHtml) {
      setHtmlContent(firstHtml);
    }
  }
}, [row.serializableOutput]);
useEffect(() => { useEffect(() => {
if (row.status === 'running' || row.status === 'queued' || row.status === 'scheduled') { if (row.status === 'running' || row.status === 'queued' || row.status === 'scheduled') {
setSchemaData([]); setSchemaData([]);
@@ -374,6 +396,22 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
}, 100); }, 100);
}; };
/**
 * Triggers a browser download of text content as a file.
 *
 * Wraps `content` in a Blob, exposes it through a temporary object URL and
 * clicks a transient anchor element that is appended to the document body
 * only for the duration of the click.
 *
 * @param content - Text placed in the downloaded file.
 * @param filename - Value assigned to the anchor's `download` attribute.
 * @param mimeType - MIME type of the Blob. Defaults to Markdown, so existing
 *   two-argument callers behave exactly as before; other text formats
 *   (e.g. `'text/html;charset=utf-8;'`) can reuse the same helper.
 */
const downloadMarkdown = (
  content: string,
  filename: string,
  mimeType: string = 'text/markdown;charset=utf-8;'
) => {
  const blob = new Blob([content], { type: mimeType });
  const url = URL.createObjectURL(blob);

  const link = document.createElement("a");
  link.href = url;
  link.setAttribute("download", filename);
  document.body.appendChild(link);

  try {
    link.click();
  } finally {
    // Always detach the anchor and release the object URL, even if click()
    // throws, so neither the DOM node nor the Blob is leaked.
    document.body.removeChild(link);
    // Revocation is deferred briefly, after the click has been dispatched.
    setTimeout(() => {
      URL.revokeObjectURL(url);
    }, 100);
  }
};
const renderDataTable = ( const renderDataTable = (
data: any[], data: any[],
@@ -636,11 +674,77 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0; const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0;
const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0; const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0;
const hasMarkdown = markdownContent.length > 0;
const hasHTML = htmlContent.length > 0;
return ( return (
<Box sx={{ width: '100%' }}> <Box sx={{ width: '100%' }}>
<TabContext value={tab}> <TabContext value={tab}>
<TabPanel value='output' sx={{ width: '100%', maxWidth: '900px' }}> <TabPanel value='output' sx={{ width: '100%', maxWidth: '900px' }}>
{hasMarkdown || hasHTML ? (
<>
{hasMarkdown && (
<Accordion defaultExpanded sx={{ mb: 2 }}>
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
<Typography variant='h6'>Markdown</Typography>
</AccordionSummary>
<AccordionDetails>
<Paper sx={{ p: 2, maxHeight: '500px', overflow: 'auto' }}>
<Typography component="pre" sx={{ whiteSpace: 'pre-wrap', fontFamily: 'monospace' }}>
{markdownContent}
</Typography>
</Paper>
<Box sx={{ mt: 2 }}>
<Button
onClick={() => downloadMarkdown(markdownContent, 'output.md')}
sx={{ color: '#FF00C3', textTransform: 'none' }}
>
Download
</Button>
</Box>
</AccordionDetails>
</Accordion>
)}
{hasHTML && (
<Accordion defaultExpanded sx={{ mb: 2 }}>
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
<Typography variant='h6'>HTML</Typography>
</AccordionSummary>
<AccordionDetails>
<Paper sx={{ p: 2, maxHeight: '500px', overflow: 'auto' }}>
<Typography
component="pre"
sx={{ whiteSpace: 'pre-wrap', fontFamily: 'monospace' }}
>
{htmlContent}
</Typography>
</Paper>
<Box sx={{ mt: 2 }}>
<Button
onClick={() => {
const blob = new Blob([htmlContent], { type: 'text/html;charset=utf-8;' });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = "output.html";
link.click();
setTimeout(() => URL.revokeObjectURL(url), 100);
}}
sx={{ color: '#FF00C3', textTransform: 'none' }}
>
Download
</Button>
</Box>
</AccordionDetails>
</Accordion>
)}
</>
) : (
// Extract robot output
<>
{row.status === 'running' || row.status === 'queued' ? ( {row.status === 'running' || row.status === 'queued' ? (
<> <>
<Box sx={{ display: 'flex', alignItems: 'center', mb: 2 }}> <Box sx={{ display: 'flex', alignItems: 'center', mb: 2 }}>
@@ -939,6 +1043,8 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe
</AccordionDetails> </AccordionDetails>
</Accordion> </Accordion>
)} )}
</>
)}
</TabPanel> </TabPanel>
</TabContext> </TabContext>
</Box> </Box>

View File

@@ -27,6 +27,9 @@ interface RobotMeta {
pairs: number; pairs: number;
updatedAt: string; updatedAt: string;
params: any[]; params: any[];
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
} }
interface RobotWorkflow { interface RobotWorkflow {