feat: add html scrape support

This commit is contained in:
Rohit Rajan
2025-11-20 18:49:39 +05:30
parent fef038b8cf
commit e90cd9961e
12 changed files with 366 additions and 105 deletions

View File

@@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
}
/**
 * Loads a webpage in Chromium and returns its HTML with non-content
 * elements stripped.
 *
 * After the page reaches network idle, the following are removed from the
 * live DOM: scripts, styles, stylesheet links, `noscript`, `meta`, and
 * embedded media/containers (`svg`, `img`, `picture`, `source`, `video`,
 * `audio`, `iframe`, `object`, `embed`). Every attribute whose name starts
 * with "on" (inline event handlers such as `onclick`) is removed as well.
 *
 * @param url - Address of the page to load.
 * @returns The `outerHTML` of the document element after cleanup.
 * @throws Propagates any error raised while launching the browser,
 *   navigating, or evaluating the cleanup; the browser is closed regardless.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle" });

    // Run the cleanup with evaluate(), not addInitScript(): an init script
    // registered after goto() only applies to *future* navigations, so it
    // would never touch the document that is already loaded.
    return await page.evaluate(() => {
      // Declared inside the callback because the function is serialized and
      // executed in the page context, where outer variables are unavailable.
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];
      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Snapshot the attribute list first: it is live and shrinks as
        // attributes are removed.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Serialize the document after cleanup.
      return document.documentElement.outerHTML;
    });
  } finally {
    // Always release the browser process, even when navigation or
    // evaluation fails.
    await browser.close();
  }
}