From ddcb3dfe4b9c99144b9b14cd0b197f52c3d81d6b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 15:35:31 +0530 Subject: [PATCH] feat: extend turndown + clean --- server/src/markdownify/markdown.ts | 164 +++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 23 deletions(-) diff --git a/server/src/markdownify/markdown.ts b/server/src/markdownify/markdown.ts index 9ee7c720..e660679d 100644 --- a/server/src/markdownify/markdown.ts +++ b/server/src/markdownify/markdown.ts @@ -4,36 +4,76 @@ export async function parseMarkdown( ): Promise { const TurndownService = require("turndown"); const { gfm } = require("joplin-turndown-plugin-gfm"); - const { URL } = require('url'); + const cheerio = require("cheerio"); + const { URL } = require("url"); + + if (!html) return ""; + + const tidiedHtml = tidyHtml(html); const t = new TurndownService(); + + // Remove irrelevant tags + const elementsToRemove = [ + "meta", + "style", + "script", + "noscript", + "link", + "textarea", + ]; + + t.addRule("remove-irrelevant", { + filter: elementsToRemove, + replacement: () => "", + }); + + t.addRule("truncate-svg", { + filter: "svg", + replacement: () => "", + }); + + t.addRule("improved-paragraph", { + filter: "p", + replacement: (innerText: string) => { + const trimmed = innerText.trim(); + if (!trimmed) return ""; + return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`; + }, + }); + t.addRule("inlineLink", { filter: (node: any, opts: any) => opts.linkStyle === "inlined" && node.nodeName === "A" && node.getAttribute("href"), + replacement: (content: string, node: any) => { let href = node.getAttribute("href").trim(); - - // Convert relative URLs to absolute if baseUrl is provided + + // Relative → absolute if (baseUrl && isRelativeUrl(href)) { try { - const url = new URL(href, baseUrl); - href = url.toString(); - } catch (err) { - // If URL construction fails, keep the original href - } + const u = new URL(href, baseUrl); + href = u.toString(); + } catch {} } - - const title = node.title ? ` "${node.title}"` : ""; + + // Clean URL + href = cleanUrl(href); + + const title = node.title ? ` "${cleanAttribute(node.title)}"` : ""; return `[${content.trim()}](${href}${title})\n`; }, }); t.use(gfm); + // --------------------------------------------------- + // Convert + // --------------------------------------------------- try { - let out = await t.turndown(html); + let out = await t.turndown(tidiedHtml); out = fixBrokenLinks(out); out = stripSkipLinks(out); return out; @@ -43,13 +83,98 @@ export async function parseMarkdown( } } -function isRelativeUrl(url: string): boolean { - return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:'); -} - // --------------------------------------------- // Helpers // --------------------------------------------- +function isRelativeUrl(url: string): boolean { + return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:"); +} + +function cleanUrl(u: string): string { + try { + return u; + } catch { + return u; + } +} + +// CODE 1: attribute cleaner +function cleanAttribute(attr: string) { + return attr ? attr.replace(/(\n+\s*)+/g, "\n") : ""; +} + +// --------------------------------------------------------- +// CODE 1: Full tidyHtml cleaning logic (ported verbatim) +// --------------------------------------------------------- +function tidyHtml(html: string): string { + const cheerio = require("cheerio"); + const $ = cheerio.load(html); + + // Fix broken attributes + $("*").each(function (this: any) { + const element = $(this); + const attributes = Object.keys(this.attribs); + + for (let i = 0; i < attributes.length; i++) { + let attr = attributes[i]; + if (attr.includes('"')) { + element.remove(); + } + } + }); + + const manuallyCleanedElements = [ + "aside", + "embed", + "head", + "iframe", + "menu", + "object", + "script", + "applet", + "audio", + "canvas", + "map", + "svg", + "video", + "area", + "blink", + "datalist", + "dialog", + "frame", + "frameset", + "link", + "input", + "ins", + "legend", + "marquee", + "math", + "menuitem", + "nav", + "noscript", + "optgroup", + "output", + "param", + "progress", + "rp", + "rt", + "rtc", + "source", + "style", + "track", + "textarea", + "time", + "use", + "img", + "picture", + "figure", + ]; + + manuallyCleanedElements.forEach((tag) => $(tag).remove()); + return $("body").html(); +} + + function fixBrokenLinks(md: string): string { let depth = 0; let result = ""; @@ -57,12 +182,7 @@ function fixBrokenLinks(md: string): string { for (const ch of md) { if (ch === "[") depth++; if (ch === "]") depth = Math.max(0, depth - 1); - - if (depth > 0 && ch === "\n") { - result += "\\\n"; - } else { - result += ch; - } + result += depth > 0 && ch === "\n" ? "\\\n" : ch; } return result; } @@ -70,5 +190,3 @@ function fixBrokenLinks(md: string): string { function stripSkipLinks(md: string): string { return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, ""); } - -