Merge pull request #889 from getmaxun/markdownify

feat: scrape [html + markdown]
2025-11-21 00:14:31 +05:30
parent 1bb1cc8c16 a1b2117866
commit a1515c2abf
18 changed files with 1422 additions and 210 deletions
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core";
 import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
 import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
 import { sendWebhook } from "../routes/webhook";
+import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';

 chromium.use(stealthPlugin());

@@ -344,7 +345,9 @@ function formatRunResponse(run: any) {
        runByAPI: run.runByAPI,
        data: {
            textData: {},
-            listData: {}
+            listData: {},
+            markdown: '',
+            html: ''
        },
        screenshots: [] as any[],
    };
@@ -359,6 +362,14 @@ function formatRunResponse(run: any) {
        formattedRun.data.listData = output.scrapeList;
    }

+    if (output.markdown && Array.isArray(output.markdown)) {
+        formattedRun.data.markdown = output.markdown[0]?.content || '';
+    }
+
+    if (output.html && Array.isArray(output.html)) {
+        formattedRun.data.html = output.html[0]?.content || '';
+    }
+
    if (run.binaryOutput) {
        Object.keys(run.binaryOutput).forEach(key => {
            if (run.binaryOutput[key]) {
@@ -569,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
  }
 }

-async function readyForRunHandler(browserId: string, id: string, userId: string){
+async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
    try {
-        const result = await executeRun(id, userId);
+        const result = await executeRun(id, userId, requestedFormats);

        if (result && result.success) {
            logger.log('info', `Interpretation of ${id} succeeded`);
@@ -608,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
    return copy;
 };

-async function executeRun(id: string, userId: string) {
+async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
    let browser: any = null;
    
    try {
@@ -651,6 +662,166 @@ async function executeRun(id: string, userId: string) {
            };
        }

+        if (recording.recording_meta.type === 'scrape') {
+            logger.log('info', `Executing scrape robot for API run ${id}`);
+
+            let formats = recording.recording_meta.formats || ['markdown'];
+
+            // Override if API request defines formats
+            if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
+                formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
+            }
+
+            await run.update({
+                status: 'running',
+                log: `Converting page to: ${formats.join(', ')}`
+            });
+
+            try {
+                const url = recording.recording_meta.url;
+
+                if (!url) {
+                    throw new Error('No URL specified for markdown robot');
+                }
+
+                let markdown = '';
+                let html = '';
+                const serializableOutput: any = {};
+
+                // Markdown conversion
+                if (formats.includes('markdown')) {
+                    markdown = await convertPageToMarkdown(url);
+                    serializableOutput.markdown = [{ content: markdown }];
+                }
+
+                // HTML conversion
+                if (formats.includes('html')) {
+                    html = await convertPageToHTML(url);
+                    serializableOutput.html = [{ content: html }];
+                }
+
+                await run.update({
+                    status: 'success',
+                    finishedAt: new Date().toLocaleString(),
+                    log: `${formats.join(', ')} conversion completed successfully`,
+                    serializableOutput,
+                    binaryOutput: {},
+                });
+
+                logger.log('info', `Markdown robot execution completed for API run ${id}`);
+
+                // Push success socket event
+                try {
+                    const completionData = {
+                        runId: plainRun.runId,
+                        robotMetaId: plainRun.robotMetaId,
+                        robotName: recording.recording_meta.name,
+                        status: 'success',
+                        finishedAt: new Date().toLocaleString()
+                    };
+
+                    serverIo
+                        .of('/queued-run')
+                        .to(`user-${userId}`)
+                        .emit('run-completed', completionData);
+                } catch (socketError: any) {
+                    logger.log(
+                        'warn',
+                        `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
+                    );
+                }
+
+                // Build webhook payload
+                const webhookPayload: any = {
+                    robot_id: plainRun.robotMetaId,
+                    run_id: plainRun.runId,
+                    robot_name: recording.recording_meta.name,
+                    status: 'success',
+                    started_at: plainRun.startedAt,
+                    finished_at: new Date().toLocaleString(),
+                    metadata: {
+                        browser_id: plainRun.browserId,
+                        user_id: userId,
+                    },
+                };
+
+                if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+                if (formats.includes('html')) webhookPayload.html = html;
+
+                try {
+                    await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+                    logger.log(
+                        'info',
+                        `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
+                    );
+                } catch (webhookError: any) {
+                    logger.log(
+                        'warn',
+                        `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
+                    );
+                }
+
+                capture("maxun-oss-run-created-api", {
+                    runId: plainRun.runId,
+                    user_id: userId,
+                    status: "success",
+                    robot_type: "scrape",
+                    formats
+                });
+
+                await destroyRemoteBrowser(plainRun.browserId, userId);
+
+                return {
+                    success: true,
+                    interpretationInfo: run.toJSON()
+                };
+            } catch (error: any) {
+                logger.log(
+                    'error',
+                    `${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
+                );
+
+                await run.update({
+                    status: 'failed',
+                    finishedAt: new Date().toLocaleString(),
+                    log: `${formats.join(', ')} conversion failed: ${error.message}`,
+                });
+
+                // Send failure socket event
+                try {
+                    const failureData = {
+                        runId: plainRun.runId,
+                        robotMetaId: plainRun.robotMetaId,
+                        robotName: recording.recording_meta.name,
+                        status: 'failed',
+                        finishedAt: new Date().toLocaleString()
+                    };
+
+                    serverIo
+                        .of('/queued-run')
+                        .to(`user-${userId}`)
+                        .emit('run-completed', failureData);
+                } catch (socketError: any) {
+                    logger.log(
+                        'warn',
+                        `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
+                    );
+                }
+
+                capture("maxun-oss-run-created-api", {
+                    runId: plainRun.runId,
+                    user_id: userId,
+                    status: "failed",
+                    robot_type: "scrape",
+                    formats
+                });
+
+                await destroyRemoteBrowser(plainRun.browserId, userId);
+
+                throw error;
+            }
+        }
+
        plainRun.status = 'running';

        browser = browserPool.getRemoteBrowser(plainRun.browserId);
@@ -848,7 +1019,7 @@ async function executeRun(id: string, userId: string) {
    }
 }

-export async function handleRunRecording(id: string, userId: string) {
+export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
    try {
        const result = await createWorkflowAndStoreMetadata(id, userId);
        const { browserId, runId: newRunId } = result;
@@ -862,7 +1033,7 @@ export async function handleRunRecording(id: string, userId: string) {
            rejectUnauthorized: false
        });

-        socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
+        socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));

        logger.log('info', `Running Robot: ${id}`);

@@ -889,12 +1060,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
        if (!run) throw new Error('Run not found');

        if (run.status === 'success') {
-            return run.toJSON();
+            return run;
        } else if (run.status === 'failed') {
            throw new Error('Run failed');
        }

-        // Wait for the next polling interval
        await new Promise(resolve => setTimeout(resolve, interval));
    }
 }
@@ -914,6 +1084,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
 *           type: string
 *         required: true
 *         description: The ID of the robot to run.
+ *     requestBody:
+ *       required: false
+ *       content:
+ *         application/json:
+ *           schema:
+ *             type: object
+ *             properties:
+ *               formats:
+ *                 type: array
+ *                 items:
+ *                   type: string
+ *                   enum: [markdown, html]
+ *                 description: Optional override formats for this run.
+ *           example:
+ *             formats: ["html"]
 *     responses:
 *       200:
 *         description: Robot run started successfully.
@@ -972,7 +1157,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
        if (!req.user) {
            return res.status(401).json({ ok: false, error: 'Unauthorized' });
        }
-        const runId = await handleRunRecording(req.params.id, req.user.id);
+
+        const requestedFormats = req.body.formats;
+
+        const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);

        if (!runId) {
            throw new Error('Run ID is undefined');
--- a/server/src/markdownify/markdown.ts
+++ b/server/src/markdownify/markdown.ts
@@ -0,0 +1,160 @@
+/**
+ * Converts an HTML document into Markdown using Turndown with the GFM plugin.
+ *
+ * The markup is first passed through `tidyHtml`, rendered with custom heading,
+ * SVG, paragraph and link rules, and finally post-processed by
+ * `fixBrokenLinks` and `stripSkipLinks`.
+ *
+ * @param html - Raw HTML; `null`, `undefined` or an empty string yields `""`.
+ * @param baseUrl - Optional URL against which relative link targets are resolved.
+ * @returns The trimmed Markdown, or `""` when the conversion throws.
+ */
+export async function parseMarkdown(
+  html: string | null | undefined,
+  baseUrl?: string | null
+): Promise<string> {
+  // Dependencies are loaded on every call, before the input is inspected.
+  const TurndownService = require("turndown");
+  const { gfm } = require("joplin-turndown-plugin-gfm");
+  require("cheerio"); // loaded alongside the other dependencies; not used directly here
+  const { URL } = require("url");
+
+  if (!html) {
+    return "";
+  }
+
+  const tidiedHtml = tidyHtml(html);
+
+  const service = new TurndownService({
+    headingStyle: "atx", // "#"-prefixed headings rather than underlined ones
+    codeBlockStyle: "fenced",
+  });
+
+  // Headings: always "#"-prefixed, one "#" per heading level.
+  const headingRule = {
+    filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
+    replacement: (content: string, node: any) => {
+      const hashes = "#".repeat(Number(node.nodeName.charAt(1)));
+      return `\n${hashes} ${content.trim()}\n`;
+    },
+  };
+
+  // SVG elements contribute nothing to the Markdown output.
+  const svgRule = {
+    filter: "svg",
+    replacement: () => "",
+  };
+
+  // Paragraphs: drop empty ones and collapse runs of 3+ newlines to a blank line.
+  const paragraphRule = {
+    filter: "p",
+    replacement: (innerText: string) => {
+      const body = innerText.trim();
+      return body ? `${body.replace(/\n{3,}/g, "\n\n")}\n\n` : "";
+    },
+  };
+
+  // Links: inline style, with a fallback label when the anchor has no text.
+  const linkRule = {
+    filter: (node: any) => node.nodeName === "A" && node.getAttribute("href"),
+
+    replacement: (content: string, node: any) => {
+      // Label fallback order: link text → aria-label → title → domain → "link".
+      const label =
+        content.trim() ||
+        node.getAttribute("aria-label")?.trim() ||
+        node.getAttribute("title")?.trim() ||
+        getDomainFromUrl(node.getAttribute("href")) ||
+        "link";
+
+      let target = node.getAttribute("href").trim();
+
+      // Relative targets become absolute when a base URL is available.
+      if (baseUrl && isRelativeUrl(target)) {
+        try {
+          target = new URL(target, baseUrl).toString();
+        } catch {
+          // Unresolvable targets are emitted unchanged.
+        }
+      }
+
+      return `[${label}](${cleanUrl(target)})`;
+    },
+  };
+
+  // Registration order is significant and is kept as: headings, SVG, paragraphs, links, GFM.
+  service.addRule("forceAtxHeadings", headingRule);
+  service.addRule("truncate-svg", svgRule);
+  service.addRule("improved-paragraph", paragraphRule);
+  service.addRule("inlineLink", linkRule);
+  service.use(gfm);
+
+  // Convert HTML → Markdown, then repair and strip link artefacts.
+  try {
+    const rendered = await service.turndown(tidiedHtml);
+    return stripSkipLinks(fixBrokenLinks(rendered)).trim();
+  } catch (err) {
+    console.error("HTML→Markdown failed", { err });
+    return "";
+  }
+}
+
+// -----------------------------------------------------
+// Helpers
+// -----------------------------------------------------
+/**
+ * Reports whether `url` has no scheme and therefore needs a base URL to be resolved.
+ *
+ * The scheme is looked for only at the very start of the string, so a relative
+ * path whose query string embeds an absolute URL (for example
+ * `/go?to=https://example.com`) is still classified as relative.
+ *
+ * @param url - The href value to classify.
+ * @returns `true` for scheme-less references (paths, `//host/…`, `#fragment`),
+ *          `false` for anything that starts with `scheme:` (`https:`, `mailto:`, `tel:`, …).
+ */
+function isRelativeUrl(url: string): boolean {
+  // RFC 3986 scheme: a letter followed by letters, digits, "+", "-" or ".", then ":".
+  return !/^[a-z][a-z0-9+.-]*:/i.test(url);
+}
+
+/**
+ * Extracts the hostname of an absolute URL, without a leading `www.` label.
+ *
+ * @param url - The URL to inspect.
+ * @returns The hostname, or `null` when `url` cannot be parsed as an absolute URL.
+ */
+function getDomainFromUrl(url: string): string | null {
+  try {
+    // Anchored so that only a leading "www." is removed, never one inside another label.
+    return new URL(url).hostname.replace(/^www\./, "");
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Normalisation hook for link targets. It currently passes the URL through unchanged.
+ *
+ * @param u - The link target.
+ * @returns The same string that was passed in.
+ */
+function cleanUrl(u: string): string {
+  const cleaned = u;
+  return cleaned;
+}
+
+/**
+ * Collapses every run of newlines (and the whitespace following them) in an
+ * attribute value into a single newline.
+ *
+ * @param attr - The attribute value; falsy input yields `""`.
+ * @returns The collapsed value.
+ */
+function cleanAttribute(attr: string) {
+  if (!attr) {
+    return "";
+  }
+  const newlineRuns = /(\n+\s*)+/g;
+  return attr.replace(newlineRuns, "\n");
+}
+
+/**
+ * Parses `html` with cheerio, removes elements that carry no readable content,
+ * and returns the inner HTML of the resulting `<body>`.
+ *
+ * @param html - The markup to clean.
+ * @returns Whatever cheerio's `.html()` yields for the `body` selection.
+ */
+function tidyHtml(html: string): string {
+  const $ = require("cheerio").load(html);
+
+  // Tags removed outright, in this order.
+  const unwantedTags = [
+    "script",
+    "style",
+    "iframe",
+    "noscript",
+    "meta",
+    "link",
+    "object",
+    "embed",
+    "canvas",
+    "audio",
+    "video",
+  ];
+
+  for (const tagName of unwantedTags) {
+    $(tagName).remove();
+  }
+
+  return $("body").html();
+}
+
+/**
+ * Escapes newlines that occur inside square brackets so that link text which
+ * spans several lines stays a single Markdown link.
+ *
+ * Bracket nesting is tracked character by character; an unmatched `]` never
+ * drives the depth below zero.
+ *
+ * @param md - Markdown text.
+ * @returns The text with each in-bracket newline preceded by a backslash.
+ */
+function fixBrokenLinks(md: string): string {
+  const pieces: string[] = [];
+  let openBrackets = 0;
+
+  for (const symbol of md) {
+    switch (symbol) {
+      case "[":
+        openBrackets += 1;
+        break;
+      case "]":
+        if (openBrackets > 0) openBrackets -= 1;
+        break;
+    }
+
+    const insideBrackets = openBrackets > 0;
+    pieces.push(insideBrackets && symbol === "\n" ? "\\\n" : symbol);
+  }
+
+  return pieces.join("");
+}
+
+/**
+ * Removes "Skip to Content" accessibility links from Markdown.
+ *
+ * A link is removed when its text is "Skip to Content" (case-insensitive) and
+ * its target contains a fragment. The target may be a bare fragment (`#main`)
+ * or a URL ending in one (`https://site/page#main`), which is what a
+ * fragment-only href looks like after it has been resolved against a base URL.
+ *
+ * @param md - Markdown text.
+ * @returns The text without skip links.
+ */
+function stripSkipLinks(md: string): string {
+  // Optional URL prefix (no ")", whitespace or "#"), then "#fragment".
+  return md.replace(/\[Skip to Content\]\([^)\s#]*#[^)]*\)/gi, "");
+}
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -0,0 +1,111 @@
+import { chromium } from "playwright";
+import { parseMarkdown } from "./markdown";
+
+/** Elements removed from the loaded page before its HTML is extracted. */
+const STRIPPED_SELECTORS = [
+  "script",
+  "style",
+  "link[rel='stylesheet']",
+  "noscript",
+  "meta",
+  "svg",
+  "img",
+  "picture",
+  "source",
+  "video",
+  "audio",
+  "iframe",
+  "object",
+  "embed"
+];
+
+/**
+ * Loads `url` in a freshly launched Chromium instance, strips non-content
+ * elements and inline event handlers from the live document, and returns the
+ * document's outer HTML.
+ *
+ * The cleanup runs through `page.evaluate` after navigation has finished. An
+ * init script registered at that point would only apply to later navigations
+ * and would leave the current document untouched.
+ *
+ * @param url - The page to load.
+ * @returns The cleaned `document.documentElement.outerHTML`.
+ * @throws Whatever Playwright throws on launch, navigation or evaluation; the
+ *         browser is closed in every case.
+ */
+async function fetchCleanedHtml(url: string): Promise<string> {
+  const browser = await chromium.launch();
+
+  try {
+    const page = await browser.newPage();
+
+    await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+
+    return await page.evaluate((selectors: string[]) => {
+      selectors.forEach(sel => {
+        document.querySelectorAll(sel).forEach(e => e.remove());
+      });
+
+      // Remove inline event handlers (onclick, onload…)
+      document.querySelectorAll("*").forEach(el => {
+        [...el.attributes].forEach(attr => {
+          if (attr.name.startsWith("on")) {
+            el.removeAttribute(attr.name);
+          }
+        });
+      });
+
+      return document.documentElement.outerHTML;
+    }, STRIPPED_SELECTORS);
+  } finally {
+    // Always release the browser, including when navigation or evaluation fails.
+    await browser.close();
+  }
+}
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean Markdown using parser.
+ *
+ * @param url - The page to convert; also used as the base for relative links.
+ * @returns The Markdown produced by `parseMarkdown`.
+ */
+export async function convertPageToMarkdown(url: string): Promise<string> {
+  const cleanedHtml = await fetchCleanedHtml(url);
+
+  // Convert cleaned HTML → Markdown
+  return parseMarkdown(cleanedHtml, url);
+}
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean HTML.
+ *
+ * @param url - The page to fetch.
+ * @returns The cleaned HTML of the whole document.
+ */
+export async function convertPageToHTML(url: string): Promise<string> {
+  return fetchCleanedHtml(url);
+}
--- a/server/src/markdownify/test.ts
+++ b/server/src/markdownify/test.ts
@@ -0,0 +1,6 @@
+import { convertPageToMarkdown } from "./scrape";
+
+/**
+ * Manual smoke check: converts the quotes.toscrape.com landing page to
+ * Markdown and prints the result to stdout.
+ */
+async function printSampleMarkdown(): Promise<void> {
+  console.log(await convertPageToMarkdown("https://quotes.toscrape.com/"));
+}
+
+printSampleMarkdown();
--- a/server/src/models/Robot.ts
+++ b/server/src/models/Robot.ts
@@ -9,6 +9,9 @@ interface RobotMeta {
  pairs: number;
  updatedAt: string;
  params: any[];
+  type?: 'extract' | 'scrape';
+  url?: string;
+  formats?: ('markdown' | 'html')[];
 }

 interface RobotWorkflow {
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
 import { io as serverIo } from "./server";
 import { sendWebhook } from './routes/webhook';
 import { BinaryOutputService } from './storage/mino';
+import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';

 if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
    throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
@@ -183,11 +184,140 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
    try {  
      // Find the recording
      const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
-      
+
      if (!recording) {
        throw new Error(`Recording for run ${data.runId} not found`);
      }
-      
+
+      if (recording.recording_meta.type === 'scrape') {
+        logger.log('info', `Executing scrape robot for run ${data.runId}`);
+
+        const formats = recording.recording_meta.formats || ['markdown'];
+
+        await run.update({
+          status: 'running',
+          log: `Converting page to ${formats.join(', ')}`
+        });
+
+        try {
+          const url = recording.recording_meta.url;
+
+          if (!url) {
+            throw new Error('No URL specified for markdown robot');
+          }
+
+          let markdown = '';
+          let html = '';
+          const serializableOutput: any = {};
+
+          // Markdown conversion
+          if (formats.includes('markdown')) {
+            markdown = await convertPageToMarkdown(url);
+            serializableOutput.markdown = [{ content: markdown }];
+          }
+
+          // HTML conversion
+          if (formats.includes('html')) {
+            html = await convertPageToHTML(url);
+            serializableOutput.html = [{ content: html }];
+          }
+
+          // Success update
+          await run.update({
+            status: 'success',
+            finishedAt: new Date().toLocaleString(),
+            log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
+            serializableOutput,
+            binaryOutput: {},
+          });
+
+          logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
+
+          // Notify sockets
+          try {
+            const completionData = {
+              runId: data.runId,
+              robotMetaId: plainRun.robotMetaId,
+              robotName: recording.recording_meta.name,
+              status: 'success',
+              finishedAt: new Date().toLocaleString()
+            };
+
+            serverIo.of(browserId).emit('run-completed', completionData);
+            serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData);
+          } catch (socketError: any) {
+            logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
+          }
+
+          // Webhooks
+          try {
+            const webhookPayload: any = {
+              runId: data.runId,
+              robotId: plainRun.robotMetaId,
+              robotName: recording.recording_meta.name,
+              status: 'success',
+              finishedAt: new Date().toLocaleString(),
+            };
+
+            if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+            if (formats.includes('html')) webhookPayload.html = html;
+
+            await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+            logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
+          } catch (webhookError: any) {
+            logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`);
+          }
+
+          capture("maxun-oss-run-created-manual", {
+            runId: data.runId,
+            user_id: data.userId,
+            status: "success",
+            robot_type: "scrape",
+            formats,
+          });
+
+          await destroyRemoteBrowser(browserId, data.userId);
+
+          return { success: true };
+
+        } catch (error: any) {
+          logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
+
+          await run.update({
+            status: 'failed',
+            finishedAt: new Date().toLocaleString(),
+            log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
+          });
+
+          try {
+            const failureData = {
+              runId: data.runId,
+              robotMetaId: plainRun.robotMetaId,
+              robotName: recording.recording_meta.name,
+              status: 'failed',
+              finishedAt: new Date().toLocaleString()
+            };
+
+            serverIo.of(browserId).emit('run-completed', failureData);
+            serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData);
+          } catch (socketError: any) {
+            logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`);
+          }
+
+          capture("maxun-oss-run-created-manual", {
+            runId: data.runId,
+            user_id: data.userId,
+            status: "failed",
+            robot_type: "scrape",
+            formats,
+          });
+
+          await destroyRemoteBrowser(browserId, data.userId);
+
+          throw error;
+        }
+      }
+
      const isRunAborted = async (): Promise<boolean> => {
        try {
          const currentRun = await Run.findOne({ where: { runId: data.runId } });
--- a/server/src/routes/storage.ts
+++ b/server/src/routes/storage.ts
@@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
    }

    if (targetUrl) {
+      robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
+
      const updatedWorkflow = [...robot.recording.workflow];
+      let foundGoto = false;

      for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
        const step = updatedWorkflow[i];
@@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r

            robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
            robot.changed('recording', true);
+            foundGoto = true;
            i = -1;
            break;
          }
@@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
      }
    };

-    if (name) {
+    if (name || targetUrl) {
      updates.recording_meta = {
        ...robot.recording_meta,
-        name
+        ...(name && { name }),
+        ...(targetUrl && { url: targetUrl })
      };
    }

@@ -432,6 +437,91 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
  }
 });

+/**
+ * POST endpoint for creating a scrape robot, i.e. a robot of type `scrape`
+ * that converts a single URL to Markdown and/or HTML.
+ *
+ * Request body:
+ *  - `url` (required): absolute http(s) URL of the page to convert.
+ *  - `name` (optional): robot name; defaults to `Markdown Robot - <hostname>`.
+ *  - `formats` (required): non-empty array containing `markdown` and/or `html`.
+ *
+ * Responds with 201 and the created robot, 400 for invalid input, 401 when no
+ * user is attached to the request, and 500 on unexpected errors.
+ */
+router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
+  try {
+    const { url, name, formats } = req.body;
+
+    if (!url) {
+      return res.status(400).json({ error: 'The "url" field is required.' });
+    }
+
+    if (!req.user) {
+      return res.status(401).send({ error: 'Unauthorized' });
+    }
+
+    // Validate URL format
+    let parsedUrl: URL;
+    try {
+      parsedUrl = new URL(url);
+    } catch {
+      return res.status(400).json({ error: 'Invalid URL format' });
+    }
+
+    // The stored URL is later opened by a server-side browser, so only web
+    // schemes are accepted; `file:`, `chrome:` and similar are rejected.
+    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
+      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
+    }
+
+    // Validate format
+    const validFormats = ['markdown', 'html'];
+
+    if (!Array.isArray(formats) || formats.length === 0) {
+      return res.status(400).json({ error: 'At least one output format must be selected.' });
+    }
+
+    const invalid = formats.filter(f => !validFormats.includes(f));
+    if (invalid.length > 0) {
+      return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
+    }
+
+    const robotName = name || `Markdown Robot - ${parsedUrl.hostname}`;
+    const currentTimestamp = new Date().toLocaleString();
+    // Identifier stored in recording_meta; the row's own `id` is generated separately below.
+    const robotId = uuid();
+
+    const newRobot = await Robot.create({
+      id: uuid(),
+      userId: req.user.id,
+      recording_meta: {
+        name: robotName,
+        id: robotId,
+        createdAt: currentTimestamp,
+        updatedAt: currentTimestamp,
+        pairs: 0,
+        params: [],
+        type: 'scrape',
+        url: url,
+        formats: formats,
+      },
+      // Scrape robots carry no recorded workflow steps.
+      recording: { workflow: [] },
+      google_sheet_email: null,
+      google_sheet_name: null,
+      google_sheet_id: null,
+      google_access_token: null,
+      google_refresh_token: null,
+      schedule: null,
+    });
+
+    logger.log('info', `Markdown robot created with id: ${newRobot.id}`);
+    capture(
+      'maxun-oss-robot-created',
+      {
+        robot_meta: newRobot.recording_meta,
+        recording: newRobot.recording,
+      }
+    );
+
+    return res.status(201).json({
+      message: 'Markdown robot created successfully.',
+      robot: newRobot,
+    });
+  } catch (error) {
+    if (error instanceof Error) {
+      logger.log('error', `Error creating markdown robot: ${error.message}`);
+      return res.status(500).json({ error: error.message });
+    } else {
+      logger.log('error', 'Unknown error creating markdown robot');
+      return res.status(500).json({ error: 'An unknown error occurred.' });
+    }
+  }
+});
+
 /**
 * DELETE endpoint for deleting a recording from the storage.
 */
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core";
 import { Page } from "playwright";
 import { sendWebhook } from "../../routes/webhook";
 import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
+import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
 chromium.use(stealthPlugin());

 async function createWorkflowAndStoreMetadata(id: string, userId: string) {
@@ -207,6 +208,172 @@ async function executeRun(id: string, userId: string) {
      }
    }

+    if (recording.recording_meta.type === 'scrape') {
+      logger.log('info', `Executing scrape robot for scheduled run ${id}`);
+
+      const formats = recording.recording_meta.formats || ['markdown'];
+
+      await run.update({
+        status: 'running',
+        log: `Converting page to: ${formats.join(', ')}`
+      });
+
+      try {
+        const runStartedData = {
+          runId: plainRun.runId,
+          robotMetaId: plainRun.robotMetaId,
+          robotName: recording.recording_meta.name,
+          status: 'running',
+          startedAt: plainRun.startedAt
+        };
+
+        serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
+        logger.log(
+          'info',
+          `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
+        );
+      } catch (socketError: any) {
+        logger.log(
+          'warn',
+          `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
+        );
+      }
+
+      try {
+        const url = recording.recording_meta.url;
+
+        if (!url) {
+          throw new Error('No URL specified for markdown robot');
+        }
+
+        let markdown = '';
+        let html = '';
+        const serializableOutput: any = {};
+
+        // Markdown conversion
+        if (formats.includes('markdown')) {
+          markdown = await convertPageToMarkdown(url);
+          serializableOutput.markdown = [{ content: markdown }];
+        }
+
+        // HTML conversion
+        if (formats.includes('html')) {
+          html = await convertPageToHTML(url);
+          serializableOutput.html = [{ content: html }];
+        }
+
+        await run.update({
+          status: 'success',
+          finishedAt: new Date().toLocaleString(),
+          log: `${formats.join(', ')} conversion completed successfully`,
+          serializableOutput,
+          binaryOutput: {},
+        });
+
+        logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
+
+        // Run-completed socket notifications
+        try {
+          const completionData = {
+            runId: plainRun.runId,
+            robotMetaId: plainRun.robotMetaId,
+            robotName: recording.recording_meta.name,
+            status: 'success',
+            finishedAt: new Date().toLocaleString()
+          };
+
+          serverIo.of(plainRun.browserId).emit('run-completed', completionData);
+          serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
+        } catch (socketError: any) {
+          logger.log(
+            'warn',
+            `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
+          );
+        }
+
+        // Webhook payload
+        const webhookPayload: any = {
+          robot_id: plainRun.robotMetaId,
+          run_id: plainRun.runId,
+          robot_name: recording.recording_meta.name,
+          status: 'success',
+          started_at: plainRun.startedAt,
+          finished_at: new Date().toLocaleString(),
+          metadata: {
+            browser_id: plainRun.browserId,
+            user_id: userId,
+          }
+        };
+
+        if (formats.includes('markdown')) webhookPayload.markdown = markdown;
+        if (formats.includes('html')) webhookPayload.html = html;
+
+        try {
+          await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
+          logger.log(
+            'info',
+            `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
+          );
+        } catch (webhookError: any) {
+          logger.log(
+            'warn',
+            `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
+          );
+        }
+
+        capture("maxun-oss-run-created-scheduled", {
+          runId: plainRun.runId,
+          user_id: userId,
+          status: "success",
+          robot_type: "scrape",
+          formats
+        });
+
+        await destroyRemoteBrowser(plainRun.browserId, userId);
+
+        return true;
+
+      } catch (error: any) {
+        logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
+
+        await run.update({
+          status: 'failed',
+          finishedAt: new Date().toLocaleString(),
+          log: `${formats.join(', ')} conversion failed: ${error.message}`,
+        });
+
+        try {
+          const failureData = {
+            runId: plainRun.runId,
+            robotMetaId: plainRun.robotMetaId,
+            robotName: recording.recording_meta.name,
+            status: 'failed',
+            finishedAt: new Date().toLocaleString()
+          };
+
+          serverIo.of(plainRun.browserId).emit('run-completed', failureData);
+          serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
+        } catch (socketError: any) {
+          logger.log(
+            'warn',
+            `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
+          );
+        }
+
+        capture("maxun-oss-run-created-scheduled", {
+          runId: plainRun.runId,
+          user_id: userId,
+          status: "failed",
+          robot_type: "scrape",
+          formats
+        });
+
+        await destroyRemoteBrowser(plainRun.browserId, userId);
+
+        throw error;
+      }
+    }
+
    plainRun.status = 'running';

    try {
@@ -217,7 +384,7 @@ async function executeRun(id: string, userId: string) {
        status: 'running',
        startedAt: plainRun.startedAt
      };
-      
+
      serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
      logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`);
    } catch (socketError: any) {