feat: add html scrape support
This commit is contained in:
@@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
|
||||
const markdown = await parseMarkdown(cleanedHtml, url);
|
||||
return markdown;
|
||||
}
|
||||
|
||||
/**
 * Fetches a webpage in headless Chromium, strips scripts, styles, media,
 * embeds and inline event handlers from the rendered DOM, and returns the
 * resulting HTML.
 *
 * @param url - Address of the page to load.
 * @returns The serialized `outerHTML` of the document element after cleanup.
 * @throws Propagates any error raised while launching the browser, navigating
 *   to `url`, or evaluating the cleanup in the page. The browser is closed in
 *   all cases.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: "networkidle" });

    // The cleanup must run via `evaluate`: an init script registered after
    // navigation would only apply to *future* navigations and would leave the
    // already-loaded document untouched.
    const cleanedHtml: string = await page.evaluate(() => {
      // Declared inside the callback because it is executed in the browser
      // context and cannot close over Node-side variables.
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Snapshot the attributes first: the live NamedNodeMap shrinks as
        // attributes are removed.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      // Serialize the DOM after cleanup.
      return document.documentElement.outerHTML;
    });

    return cleanedHtml;
  } finally {
    // Always release the browser process, even when navigation or
    // evaluation fails.
    await browser.close();
  }
}
|
||||
|
||||
Reference in New Issue
Block a user