fix: lesser restrictions

This commit is contained in:
amhsirak
2025-11-20 17:22:33 +05:30
parent 930c7b6c74
commit 691dedc351

View File

@@ -11,13 +11,34 @@ export async function parseMarkdown(
const tidiedHtml = tidyHtml(html); const tidiedHtml = tidyHtml(html);
const t = new TurndownService(); const t = new TurndownService({
headingStyle: "atx", // ensures #### instead of ------
codeBlockStyle: "fenced",
});
// ---------------------------------------------
// Fix 1: Proper ATX headings #### instead of underline-style
// ---------------------------------------------
t.addRule("forceAtxHeadings", {
filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
replacement: (content: string, node: any) => {
const level = Number(node.nodeName.charAt(1));
const clean = content.trim();
return `\n${"#".repeat(level)} ${clean}\n`;
},
});
// ---------------------------------------------
// Remove SVGs
// ---------------------------------------------
t.addRule("truncate-svg", { t.addRule("truncate-svg", {
filter: "svg", filter: "svg",
replacement: () => "", replacement: () => "",
}); });
// ---------------------------------------------
// Improved paragraph cleanup
// ---------------------------------------------
t.addRule("improved-paragraph", { t.addRule("improved-paragraph", {
filter: "p", filter: "p",
replacement: (innerText: string) => { replacement: (innerText: string) => {
@@ -27,16 +48,28 @@ export async function parseMarkdown(
}, },
}); });
// ---------------------------------------------
// Fix 2: Inline link with fallback text
// ---------------------------------------------
t.addRule("inlineLink", { t.addRule("inlineLink", {
filter: (node: any, opts: any) => filter: (node: any, opts: any) =>
opts.linkStyle === "inlined" && node.nodeName === "A" && node.getAttribute("href"),
node.nodeName === "A" &&
node.getAttribute("href"),
replacement: (content: string, node: any) => { replacement: (content: string, node: any) => {
let text = content.trim();
// Fallback: aria-label → title → domain
if (!text) {
text =
node.getAttribute("aria-label")?.trim() ||
node.getAttribute("title")?.trim() ||
getDomainFromUrl(node.getAttribute("href")) ||
"link";
}
let href = node.getAttribute("href").trim(); let href = node.getAttribute("href").trim();
// Relative → absolute // relative → absolute
if (baseUrl && isRelativeUrl(href)) { if (baseUrl && isRelativeUrl(href)) {
try { try {
const u = new URL(href, baseUrl); const u = new URL(href, baseUrl);
@@ -44,45 +77,46 @@ export async function parseMarkdown(
} catch {} } catch {}
} }
// Clean URL
href = cleanUrl(href); href = cleanUrl(href);
const title = node.title ? ` "${cleanAttribute(node.title)}"` : ""; return `[${text}](${href})`;
return `[${content.trim()}](${href}${title})\n`;
}, },
}); });
t.use(gfm); t.use(gfm);
// --------------------------------------------------- // Convert HTML → Markdown
// Convert
// ---------------------------------------------------
try { try {
let out = await t.turndown(tidiedHtml); let out = await t.turndown(tidiedHtml);
out = fixBrokenLinks(out); out = fixBrokenLinks(out);
out = stripSkipLinks(out); out = stripSkipLinks(out);
return out; return out.trim();
} catch (err) { } catch (err) {
console.error("HTML→Markdown failed", { err }); console.error("HTML→Markdown failed", { err });
return ""; return "";
} }
} }
// --------------------------------------------- // -----------------------------------------------------
// Helpers // Helpers
// --------------------------------------------- // -----------------------------------------------------
/**
 * Reports whether `url` has no scheme of its own and therefore has to be
 * resolved against a base URL.
 *
 * A URL counts as absolute only when it *starts* with a scheme such as
 * `https:`, `mailto:` or `tel:` (matched case-insensitively). Looking for
 * "://" anywhere in the string is not enough: a relative path like
 * `/redirect?next=https://example.com` contains it in the query string but is
 * still relative.
 *
 * @param url - Raw `href` value.
 * @returns `true` for path-, query-, fragment- and protocol-relative
 *   references; `false` for anything that begins with a scheme.
 */
function isRelativeUrl(url: string): boolean {
  // RFC 3986 scheme: a letter, then letters / digits / "+" / "-" / ".", then ":".
  const startsWithScheme = /^[a-z][a-z0-9+.-]*:/i;
  return !startsWithScheme.test(url.trimStart());
}
/**
 * Extracts the host name of an absolute URL, minus a leading "www." label.
 *
 * Used as last-resort link text when an anchor has no visible content,
 * `aria-label` or `title`.
 *
 * @param url - URL string to parse; it must be absolute for parsing to succeed.
 * @returns The host name, or `null` when `url` cannot be parsed (for example
 *   a relative path or an empty string).
 */
function getDomainFromUrl(url: string): string | null {
  try {
    // Anchor the pattern: a plain string search for "www." would also strip it
    // from the middle of a host ("awww.example.com" -> "aexample.com").
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    // `new URL` throws on input it cannot parse as an absolute URL.
    return null;
  }
}
/**
 * Normalisation hook for link targets.
 *
 * Currently a pass-through: the URL is handed back unchanged.
 *
 * @param u - Link target.
 * @returns The same string that was passed in.
 */
function cleanUrl(u: string): string {
  const cleaned = u;
  return cleaned;
}
/**
 * Collapses line breaks inside an attribute value.
 *
 * Every newline, together with all whitespace that immediately follows it
 * (including further newlines), is replaced by a single "\n". Whitespace that
 * precedes a newline is left in place.
 *
 * @param attr - Attribute text; a falsy value yields an empty string.
 * @returns The collapsed text.
 */
function cleanAttribute(attr: string): string {
  if (!attr) {
    return "";
  }
  // Equivalent to /(\n+\s*)+/g: because \s also matches "\n", a single
  // "\n\s*" already consumes the whole run, without nested quantifiers.
  return attr.replace(/\n\s*/g, "\n");
}
@@ -92,24 +126,23 @@ function tidyHtml(html: string): string {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const manuallyCleanedElements = [ const manuallyCleanedElements = [
"script", "script",
"style", "style",
"iframe", "iframe",
"noscript", "noscript",
"meta", "meta",
"link", "link",
"object", "object",
"embed", "embed",
"canvas", "canvas",
"audio", "audio",
"video" "video",
]; ];
manuallyCleanedElements.forEach((tag) => $(tag).remove()); manuallyCleanedElements.forEach((tag) => $(tag).remove());
return $("body").html(); return $("body").html();
} }
function fixBrokenLinks(md: string): string { function fixBrokenLinks(md: string): string {
let depth = 0; let depth = 0;
let result = ""; let result = "";