feat: extend turndown + clean

This commit is contained in:
amhsirak
2025-11-20 15:35:31 +05:30
parent 9257b1564e
commit ddcb3dfe4b

View File

@@ -4,36 +4,76 @@ export async function parseMarkdown(
): Promise<string> {
const TurndownService = require("turndown");
const { gfm } = require("joplin-turndown-plugin-gfm");
const { URL } = require('url');
const cheerio = require("cheerio");
const { URL } = require("url");
if (!html) return "";
const tidiedHtml = tidyHtml(html);
const t = new TurndownService();
// Remove irrelevant tags
const elementsToRemove = [
"meta",
"style",
"script",
"noscript",
"link",
"textarea",
];
t.addRule("remove-irrelevant", {
filter: elementsToRemove,
replacement: () => "",
});
t.addRule("truncate-svg", {
filter: "svg",
replacement: () => "",
});
t.addRule("improved-paragraph", {
filter: "p",
replacement: (innerText: string) => {
const trimmed = innerText.trim();
if (!trimmed) return "";
return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
},
});
t.addRule("inlineLink", {
filter: (node: any, opts: any) =>
opts.linkStyle === "inlined" &&
node.nodeName === "A" &&
node.getAttribute("href"),
replacement: (content: string, node: any) => {
let href = node.getAttribute("href").trim();
// Convert relative URLs to absolute if baseUrl is provided
// Relative absolute
if (baseUrl && isRelativeUrl(href)) {
try {
const url = new URL(href, baseUrl);
href = url.toString();
} catch (err) {
// If URL construction fails, keep the original href
}
const u = new URL(href, baseUrl);
href = u.toString();
} catch {}
}
const title = node.title ? ` "${node.title}"` : "";
// Clean URL
href = cleanUrl(href);
const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
return `[${content.trim()}](${href}${title})\n`;
},
});
t.use(gfm);
// ---------------------------------------------------
// Convert
// ---------------------------------------------------
try {
let out = await t.turndown(html);
let out = await t.turndown(tidiedHtml);
out = fixBrokenLinks(out);
out = stripSkipLinks(out);
return out;
@@ -43,13 +83,98 @@ export async function parseMarkdown(
}
}
function isRelativeUrl(url: string): boolean {
return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:');
}
// ---------------------------------------------
// Helpers
// ---------------------------------------------
function isRelativeUrl(url: string): boolean {
return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
}
function cleanUrl(u: string): string {
try {
return u;
} catch {
return u;
}
}
// CODE 1: attribute cleaner
function cleanAttribute(attr: string) {
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
}
// ---------------------------------------------------------
// CODE 1: Full tidyHtml cleaning logic (ported verbatim)
// ---------------------------------------------------------
function tidyHtml(html: string): string {
const cheerio = require("cheerio");
const $ = cheerio.load(html);
// Fix broken attributes
$("*").each(function (this: any) {
const element = $(this);
const attributes = Object.keys(this.attribs);
for (let i = 0; i < attributes.length; i++) {
let attr = attributes[i];
if (attr.includes('"')) {
element.remove();
}
}
});
const manuallyCleanedElements = [
"aside",
"embed",
"head",
"iframe",
"menu",
"object",
"script",
"applet",
"audio",
"canvas",
"map",
"svg",
"video",
"area",
"blink",
"datalist",
"dialog",
"frame",
"frameset",
"link",
"input",
"ins",
"legend",
"marquee",
"math",
"menuitem",
"nav",
"noscript",
"optgroup",
"output",
"param",
"progress",
"rp",
"rt",
"rtc",
"source",
"style",
"track",
"textarea",
"time",
"use",
"img",
"picture",
"figure",
];
manuallyCleanedElements.forEach((tag) => $(tag).remove());
return $("body").html();
}
function fixBrokenLinks(md: string): string {
let depth = 0;
let result = "";
@@ -57,12 +182,7 @@ function fixBrokenLinks(md: string): string {
for (const ch of md) {
if (ch === "[") depth++;
if (ch === "]") depth = Math.max(0, depth - 1);
if (depth > 0 && ch === "\n") {
result += "\\\n";
} else {
result += ch;
}
result += depth > 0 && ch === "\n" ? "\\\n" : ch;
}
return result;
}
@@ -70,5 +190,3 @@ function fixBrokenLinks(md: string): string {
function stripSkipLinks(md: string): string {
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
}