From 1d65f900339831d081a336bd44998442a6f24f97 Mon Sep 17 00:00:00 2001
From: amhsirak
Date: Thu, 20 Nov 2025 03:01:18 +0530
Subject: [PATCH] feat: use parser to scrape

---
 server/src/markdownify/scrape.ts | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 server/src/markdownify/scrape.ts

diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
new file mode 100644
index 00000000..39d1fa3a
--- /dev/null
+++ b/server/src/markdownify/scrape.ts
@@ -0,0 +1,68 @@
+import { chromium } from "playwright";
+import { parseMarkdown } from "./markdown";
+
+/**
+ * Loads a webpage in headless Chromium, strips scripts/styles/images/etc.
+ * and inline event-handler attributes from the DOM, and returns the
+ * remaining HTML converted to Markdown by `parseMarkdown`.
+ *
+ * @param url - Address of the page to load.
+ * @returns The Markdown produced from the cleaned page HTML.
+ * @throws Rethrows any Playwright navigation/evaluation error; the browser
+ *   is closed before the error propagates.
+ */
+export async function convertPageToMarkdown(url: string): Promise<string> {
+  const browser = await chromium.launch();
+  let cleanedHtml: string;
+
+  try {
+    const page = await browser.newPage();
+
+    // Wait until the network is idle before touching the DOM.
+    await page.goto(url, { waitUntil: "networkidle" });
+
+    // Clean the document that is already loaded. This must be `evaluate`:
+    // `addInitScript` only runs on future navigations, so registering it
+    // after `goto` would leave the current page untouched.
+    cleanedHtml = await page.evaluate(() => {
+      const selectors = [
+        "script",
+        "style",
+        "link[rel='stylesheet']",
+        "noscript",
+        "meta",
+        "svg",
+        "img",
+        "picture",
+        "source",
+        "video",
+        "audio",
+        "iframe",
+        "object",
+        "embed",
+      ];
+
+      document
+        .querySelectorAll(selectors.join(","))
+        .forEach((el) => el.remove());
+
+      // Remove inline event handlers (onclick, onload…). Snapshot the
+      // attribute list first: it is live and shrinks as attributes go.
+      document.querySelectorAll("*").forEach((el) => {
+        for (const attr of Array.from(el.attributes)) {
+          if (attr.name.startsWith("on")) {
+            el.removeAttribute(attr.name);
+          }
+        }
+      });
+
+      return document.documentElement.outerHTML;
+    });
+  } finally {
+    // Always release the browser process, even when navigation fails.
+    await browser.close();
+  }
+
+  // Convert cleaned HTML → Markdown
+  return parseMarkdown(cleanedHtml || "");
+}