fix: lesser restrictions

This commit is contained in:
amhsirak
2025-11-20 17:22:33 +05:30
parent 930c7b6c74
commit 691dedc351

View File

@@ -11,13 +11,34 @@ export async function parseMarkdown(
const tidiedHtml = tidyHtml(html);
const t = new TurndownService();
const t = new TurndownService({
headingStyle: "atx", // ensures #### instead of ------
codeBlockStyle: "fenced",
});
// ---------------------------------------------
// Fix 1: Proper ATX headings #### instead of underline-style
// ---------------------------------------------
t.addRule("forceAtxHeadings", {
filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
replacement: (content: string, node: any) => {
const level = Number(node.nodeName.charAt(1));
const clean = content.trim();
return `\n${"#".repeat(level)} ${clean}\n`;
},
});
// ---------------------------------------------
// Remove SVGs
// ---------------------------------------------
t.addRule("truncate-svg", {
filter: "svg",
replacement: () => "",
});
// ---------------------------------------------
// Improved paragraph cleanup
// ---------------------------------------------
t.addRule("improved-paragraph", {
filter: "p",
replacement: (innerText: string) => {
@@ -27,16 +48,28 @@ export async function parseMarkdown(
},
});
// ---------------------------------------------
// Fix 2: Inline link with fallback text
// ---------------------------------------------
t.addRule("inlineLink", {
filter: (node: any, opts: any) =>
opts.linkStyle === "inlined" &&
node.nodeName === "A" &&
node.getAttribute("href"),
node.nodeName === "A" && node.getAttribute("href"),
replacement: (content: string, node: any) => {
let text = content.trim();
// Fallback: aria-label → title → domain
if (!text) {
text =
node.getAttribute("aria-label")?.trim() ||
node.getAttribute("title")?.trim() ||
getDomainFromUrl(node.getAttribute("href")) ||
"link";
}
let href = node.getAttribute("href").trim();
// Relative → absolute
// relative → absolute
if (baseUrl && isRelativeUrl(href)) {
try {
const u = new URL(href, baseUrl);
@@ -44,45 +77,46 @@ export async function parseMarkdown(
} catch {}
}
// Clean URL
href = cleanUrl(href);
const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
return `[${content.trim()}](${href}${title})\n`;
return `[${text}](${href})`;
},
});
t.use(gfm);
// ---------------------------------------------------
// Convert
// ---------------------------------------------------
// Convert HTML → Markdown
try {
let out = await t.turndown(tidiedHtml);
out = fixBrokenLinks(out);
out = stripSkipLinks(out);
return out;
return out.trim();
} catch (err) {
console.error("HTML→Markdown failed", { err });
return "";
}
}
// ---------------------------------------------
// -----------------------------------------------------
// Helpers
// ---------------------------------------------
// -----------------------------------------------------
function isRelativeUrl(url: string): boolean {
return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
}
function cleanUrl(u: string): string {
function getDomainFromUrl(url: string): string | null {
try {
return u;
const u = new URL(url);
return u.hostname.replace("www.", "");
} catch {
return u;
return null;
}
}
function cleanUrl(u: string): string {
return u;
}
function cleanAttribute(attr: string) {
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
}
@@ -92,24 +126,23 @@ function tidyHtml(html: string): string {
const $ = cheerio.load(html);
const manuallyCleanedElements = [
"script",
"style",
"iframe",
"noscript",
"meta",
"link",
"object",
"embed",
"canvas",
"audio",
"video"
"script",
"style",
"iframe",
"noscript",
"meta",
"link",
"object",
"embed",
"canvas",
"audio",
"video",
];
manuallyCleanedElements.forEach((tag) => $(tag).remove());
return $("body").html();
}
function fixBrokenLinks(md: string): string {
let depth = 0;
let result = "";