feat: use parser to scrape

2025-11-20 03:01:18 +05:30
parent 66d8291282
commit 1d65f90033
1 changed files with 57 additions and 0 deletions
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -0,0 +1,57 @@
+import { chromium } from "playwright";
+import { parseMarkdown } from "./markdown";
+
+/**
+ * Loads `url` in a Playwright-launched Chromium, strips non-content elements
+ * (scripts, styles, media, embeds) and inline `on*` handler attributes from
+ * the rendered DOM, then converts the remaining HTML via `parseMarkdown`.
+ *
+ * @param url - Address of the page to fetch and convert.
+ * @returns Markdown produced by `parseMarkdown` from the cleaned HTML.
+ * @throws Propagates navigation/evaluation errors; the browser is still closed.
+ */
+export async function convertPageToMarkdown(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  let cleanedHtml: string;
+  try {
+    const page = await browser.newPage();
+    await page.goto(url, { waitUntil: "networkidle" });
+    // Clean the already-loaded document. `evaluate` runs right now, whereas
+    // `addInitScript` would only take effect on a later navigation.
+    cleanedHtml = await page.evaluate(() => {
+      const selectors = [
+        "script",
+        "style",
+        "link[rel='stylesheet']",
+        "noscript",
+        "meta",
+        "svg",
+        "img",
+        "picture",
+        "source",
+        "video",
+        "audio",
+        "iframe",
+        "object",
+        "embed"
+      ];
+      document.querySelectorAll(selectors.join(",")).forEach(el => el.remove());
+
+      // Remove inline event handlers (onclick, onload…)
+      document.querySelectorAll("*").forEach(el => {
+        [...el.attributes].forEach(attr => {
+          if (attr.name.startsWith("on")) {
+            el.removeAttribute(attr.name);
+          }
+        });
+      });
+      return document.documentElement.outerHTML;
+    });
+  } finally {
+    // Always release the browser process, even when navigation fails.
+    await browser.close();
+  }
+
+  // Convert cleaned HTML → Markdown
+  return parseMarkdown(cleanedHtml || "");
+}