feat: use parser to scrape

This commit is contained in:
amhsirak
2025-11-20 03:01:18 +05:30
parent 66d8291282
commit 1d65f90033

View File

@@ -0,0 +1,57 @@
import { chromium } from "playwright";
import { parseMarkdown } from "./markdown";
/**
 * Fetches a webpage, strips scripts/styles/images/etc from the loaded DOM,
 * and returns clean Markdown using the parser.
 *
 * @param url - Address of the page to load in a headless Chromium instance.
 * @returns The Markdown produced by `parseMarkdown` from the cleaned HTML.
 * @throws Propagates any Playwright launch/navigation/evaluation error and
 *   any error raised by `parseMarkdown`. The browser is closed in every case.
 */
export async function convertPageToMarkdown(url: string): Promise<string> {
  const browser = await chromium.launch();
  let cleanedHtml: string;

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle" });

    // The cleanup must run via evaluate(): addInitScript() only applies to
    // documents created by *future* navigations, so registering it after
    // goto() would leave the already-loaded page untouched.
    cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      // One combined selector list removes every unwanted node in a single pass.
      document.querySelectorAll(selectors.join(",")).forEach(node => node.remove());

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Snapshot the attributes first: removing while iterating the live
        // NamedNodeMap would skip entries.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Serialize the document after cleanup.
      return document.documentElement.outerHTML;
    });
  } finally {
    // Always release the Chromium process, even when navigation or
    // evaluation fails.
    await browser.close();
  }

  // Convert cleaned HTML → Markdown
  return await parseMarkdown(cleanedHtml);
}