feat: extend turndown + clean
This commit is contained in:
@@ -4,36 +4,76 @@ export async function parseMarkdown(
|
||||
): Promise<string> {
|
||||
const TurndownService = require("turndown");
|
||||
const { gfm } = require("joplin-turndown-plugin-gfm");
|
||||
const { URL } = require('url');
|
||||
const cheerio = require("cheerio");
|
||||
const { URL } = require("url");
|
||||
|
||||
if (!html) return "";
|
||||
|
||||
const tidiedHtml = tidyHtml(html);
|
||||
|
||||
const t = new TurndownService();
|
||||
|
||||
// Remove irrelevant tags
|
||||
const elementsToRemove = [
|
||||
"meta",
|
||||
"style",
|
||||
"script",
|
||||
"noscript",
|
||||
"link",
|
||||
"textarea",
|
||||
];
|
||||
|
||||
t.addRule("remove-irrelevant", {
|
||||
filter: elementsToRemove,
|
||||
replacement: () => "",
|
||||
});
|
||||
|
||||
t.addRule("truncate-svg", {
|
||||
filter: "svg",
|
||||
replacement: () => "",
|
||||
});
|
||||
|
||||
t.addRule("improved-paragraph", {
|
||||
filter: "p",
|
||||
replacement: (innerText: string) => {
|
||||
const trimmed = innerText.trim();
|
||||
if (!trimmed) return "";
|
||||
return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
|
||||
},
|
||||
});
|
||||
|
||||
t.addRule("inlineLink", {
|
||||
filter: (node: any, opts: any) =>
|
||||
opts.linkStyle === "inlined" &&
|
||||
node.nodeName === "A" &&
|
||||
node.getAttribute("href"),
|
||||
|
||||
replacement: (content: string, node: any) => {
|
||||
let href = node.getAttribute("href").trim();
|
||||
|
||||
// Convert relative URLs to absolute if baseUrl is provided
|
||||
|
||||
// Relative → absolute
|
||||
if (baseUrl && isRelativeUrl(href)) {
|
||||
try {
|
||||
const url = new URL(href, baseUrl);
|
||||
href = url.toString();
|
||||
} catch (err) {
|
||||
// If URL construction fails, keep the original href
|
||||
}
|
||||
const u = new URL(href, baseUrl);
|
||||
href = u.toString();
|
||||
} catch {}
|
||||
}
|
||||
|
||||
const title = node.title ? ` "${node.title}"` : "";
|
||||
|
||||
// Clean URL
|
||||
href = cleanUrl(href);
|
||||
|
||||
const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
|
||||
return `[${content.trim()}](${href}${title})\n`;
|
||||
},
|
||||
});
|
||||
|
||||
t.use(gfm);
|
||||
|
||||
// ---------------------------------------------------
|
||||
// Convert
|
||||
// ---------------------------------------------------
|
||||
try {
|
||||
let out = await t.turndown(html);
|
||||
let out = await t.turndown(tidiedHtml);
|
||||
out = fixBrokenLinks(out);
|
||||
out = stripSkipLinks(out);
|
||||
return out;
|
||||
@@ -43,13 +83,98 @@ export async function parseMarkdown(
|
||||
}
|
||||
}
|
||||
|
||||
function isRelativeUrl(url: string): boolean {
|
||||
return !url.includes('://') && !url.startsWith('mailto:') && !url.startsWith('tel:');
|
||||
}
|
||||
|
||||
// ---------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------
|
||||
function isRelativeUrl(url: string): boolean {
|
||||
return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
|
||||
}
|
||||
|
||||
function cleanUrl(u: string): string {
|
||||
try {
|
||||
return u;
|
||||
} catch {
|
||||
return u;
|
||||
}
|
||||
}
|
||||
|
||||
// CODE 1: attribute cleaner
|
||||
function cleanAttribute(attr: string) {
|
||||
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// CODE 1: Full tidyHtml cleaning logic (ported verbatim)
|
||||
// ---------------------------------------------------------
|
||||
function tidyHtml(html: string): string {
|
||||
const cheerio = require("cheerio");
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// Fix broken attributes
|
||||
$("*").each(function (this: any) {
|
||||
const element = $(this);
|
||||
const attributes = Object.keys(this.attribs);
|
||||
|
||||
for (let i = 0; i < attributes.length; i++) {
|
||||
let attr = attributes[i];
|
||||
if (attr.includes('"')) {
|
||||
element.remove();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const manuallyCleanedElements = [
|
||||
"aside",
|
||||
"embed",
|
||||
"head",
|
||||
"iframe",
|
||||
"menu",
|
||||
"object",
|
||||
"script",
|
||||
"applet",
|
||||
"audio",
|
||||
"canvas",
|
||||
"map",
|
||||
"svg",
|
||||
"video",
|
||||
"area",
|
||||
"blink",
|
||||
"datalist",
|
||||
"dialog",
|
||||
"frame",
|
||||
"frameset",
|
||||
"link",
|
||||
"input",
|
||||
"ins",
|
||||
"legend",
|
||||
"marquee",
|
||||
"math",
|
||||
"menuitem",
|
||||
"nav",
|
||||
"noscript",
|
||||
"optgroup",
|
||||
"output",
|
||||
"param",
|
||||
"progress",
|
||||
"rp",
|
||||
"rt",
|
||||
"rtc",
|
||||
"source",
|
||||
"style",
|
||||
"track",
|
||||
"textarea",
|
||||
"time",
|
||||
"use",
|
||||
"img",
|
||||
"picture",
|
||||
"figure",
|
||||
];
|
||||
|
||||
manuallyCleanedElements.forEach((tag) => $(tag).remove());
|
||||
return $("body").html();
|
||||
}
|
||||
|
||||
|
||||
function fixBrokenLinks(md: string): string {
|
||||
let depth = 0;
|
||||
let result = "";
|
||||
@@ -57,12 +182,7 @@ function fixBrokenLinks(md: string): string {
|
||||
for (const ch of md) {
|
||||
if (ch === "[") depth++;
|
||||
if (ch === "]") depth = Math.max(0, depth - 1);
|
||||
|
||||
if (depth > 0 && ch === "\n") {
|
||||
result += "\\\n";
|
||||
} else {
|
||||
result += ch;
|
||||
}
|
||||
result += depth > 0 && ch === "\n" ? "\\\n" : ch;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -70,5 +190,3 @@ function fixBrokenLinks(md: string): string {
|
||||
function stripSkipLinks(md: string): string {
|
||||
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user