fix: lesser restrictions
This commit is contained in:
@@ -11,13 +11,34 @@ export async function parseMarkdown(
|
|||||||
|
|
||||||
const tidiedHtml = tidyHtml(html);
|
const tidiedHtml = tidyHtml(html);
|
||||||
|
|
||||||
const t = new TurndownService();
|
const t = new TurndownService({
|
||||||
|
headingStyle: "atx", // ensures #### instead of ------
|
||||||
|
codeBlockStyle: "fenced",
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------
|
||||||
|
// Fix 1: Proper ATX headings #### instead of underline-style
|
||||||
|
// ---------------------------------------------
|
||||||
|
t.addRule("forceAtxHeadings", {
|
||||||
|
filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
|
||||||
|
replacement: (content: string, node: any) => {
|
||||||
|
const level = Number(node.nodeName.charAt(1));
|
||||||
|
const clean = content.trim();
|
||||||
|
return `\n${"#".repeat(level)} ${clean}\n`;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------
|
||||||
|
// Remove SVGs
|
||||||
|
// ---------------------------------------------
|
||||||
t.addRule("truncate-svg", {
|
t.addRule("truncate-svg", {
|
||||||
filter: "svg",
|
filter: "svg",
|
||||||
replacement: () => "",
|
replacement: () => "",
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------
|
||||||
|
// Improved paragraph cleanup
|
||||||
|
// ---------------------------------------------
|
||||||
t.addRule("improved-paragraph", {
|
t.addRule("improved-paragraph", {
|
||||||
filter: "p",
|
filter: "p",
|
||||||
replacement: (innerText: string) => {
|
replacement: (innerText: string) => {
|
||||||
@@ -27,16 +48,28 @@ export async function parseMarkdown(
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------
|
||||||
|
// Fix 2: Inline link with fallback text
|
||||||
|
// ---------------------------------------------
|
||||||
t.addRule("inlineLink", {
|
t.addRule("inlineLink", {
|
||||||
filter: (node: any, opts: any) =>
|
filter: (node: any, opts: any) =>
|
||||||
opts.linkStyle === "inlined" &&
|
node.nodeName === "A" && node.getAttribute("href"),
|
||||||
node.nodeName === "A" &&
|
|
||||||
node.getAttribute("href"),
|
|
||||||
|
|
||||||
replacement: (content: string, node: any) => {
|
replacement: (content: string, node: any) => {
|
||||||
|
let text = content.trim();
|
||||||
|
|
||||||
|
// Fallback: aria-label → title → domain
|
||||||
|
if (!text) {
|
||||||
|
text =
|
||||||
|
node.getAttribute("aria-label")?.trim() ||
|
||||||
|
node.getAttribute("title")?.trim() ||
|
||||||
|
getDomainFromUrl(node.getAttribute("href")) ||
|
||||||
|
"link";
|
||||||
|
}
|
||||||
|
|
||||||
let href = node.getAttribute("href").trim();
|
let href = node.getAttribute("href").trim();
|
||||||
|
|
||||||
// Relative → absolute
|
// relative → absolute
|
||||||
if (baseUrl && isRelativeUrl(href)) {
|
if (baseUrl && isRelativeUrl(href)) {
|
||||||
try {
|
try {
|
||||||
const u = new URL(href, baseUrl);
|
const u = new URL(href, baseUrl);
|
||||||
@@ -44,45 +77,46 @@ export async function parseMarkdown(
|
|||||||
} catch {}
|
} catch {}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean URL
|
|
||||||
href = cleanUrl(href);
|
href = cleanUrl(href);
|
||||||
|
|
||||||
const title = node.title ? ` "${cleanAttribute(node.title)}"` : "";
|
return `[${text}](${href})`;
|
||||||
return `[${content.trim()}](${href}${title})\n`;
|
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
t.use(gfm);
|
t.use(gfm);
|
||||||
|
|
||||||
// ---------------------------------------------------
|
// Convert HTML → Markdown
|
||||||
// Convert
|
|
||||||
// ---------------------------------------------------
|
|
||||||
try {
|
try {
|
||||||
let out = await t.turndown(tidiedHtml);
|
let out = await t.turndown(tidiedHtml);
|
||||||
out = fixBrokenLinks(out);
|
out = fixBrokenLinks(out);
|
||||||
out = stripSkipLinks(out);
|
out = stripSkipLinks(out);
|
||||||
return out;
|
return out.trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error("HTML→Markdown failed", { err });
|
console.error("HTML→Markdown failed", { err });
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------
|
// -----------------------------------------------------
|
||||||
// Helpers
|
// Helpers
|
||||||
// ---------------------------------------------
|
// -----------------------------------------------------
|
||||||
/**
 * Reports whether `url` is a relative reference, i.e. one that has to be
 * resolved against a base URL before it can be used on its own.
 *
 * A reference is treated as absolute only when it *starts* with a URI scheme
 * (`http:`, `https:`, `mailto:`, `tel:`, `data:` ...), matched
 * case-insensitively. Inspecting only the start matters: a relative link such
 * as `/login?next=https://example.com` contains "://" but is still relative.
 * Protocol-relative (`//host/path`), path-only, query-only, fragment-only and
 * empty references all count as relative.
 *
 * @param url - The raw `href` value; surrounding whitespace is ignored.
 * @returns `true` when `url` has no leading scheme, `false` otherwise.
 */
function isRelativeUrl(url: string): boolean {
  // RFC 3986 §3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":"
  return !/^[a-z][a-z0-9+.-]*:/i.test(url.trim());
}
|
||||||
|
|
||||||
/**
 * Extracts a display-friendly host name from an absolute URL, for use as
 * fallback link text when an anchor has no visible content.
 *
 * @param url - The URL to inspect. It must be absolute; anything the WHATWG
 *   `URL` parser rejects (relative paths, empty or missing values) yields `null`.
 * @returns The host name with a single leading `www.` label removed, or `null`
 *   when `url` cannot be parsed.
 */
function getDomainFromUrl(url: string): string | null {
  try {
    // Anchor the pattern so that only a leading "www." label is dropped;
    // a plain string search would also mangle hosts like "awww.example.com".
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    return null;
  }
}

/**
 * Normalisation hook applied to every link target before it is emitted.
 * It currently returns the URL unchanged.
 *
 * @param u - The link target.
 * @returns `u`, untouched.
 */
function cleanUrl(u: string): string {
  return u;
}
|
||||||
|
|
||||||
// Normalises an attribute value for use inside Markdown: every run consisting of
// a newline plus any whitespace that follows it is collapsed into a single "\n".
// A falsy `attr` (e.g. the empty string) yields "".
function cleanAttribute(attr: string) {
|
function cleanAttribute(attr: string) {
|
||||||
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
|
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
|
||||||
}
|
}
|
||||||
@@ -92,24 +126,23 @@ function tidyHtml(html: string): string {
|
|||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
const manuallyCleanedElements = [
|
const manuallyCleanedElements = [
|
||||||
"script",
|
"script",
|
||||||
"style",
|
"style",
|
||||||
"iframe",
|
"iframe",
|
||||||
"noscript",
|
"noscript",
|
||||||
"meta",
|
"meta",
|
||||||
"link",
|
"link",
|
||||||
"object",
|
"object",
|
||||||
"embed",
|
"embed",
|
||||||
"canvas",
|
"canvas",
|
||||||
"audio",
|
"audio",
|
||||||
"video"
|
"video",
|
||||||
];
|
];
|
||||||
|
|
||||||
manuallyCleanedElements.forEach((tag) => $(tag).remove());
|
manuallyCleanedElements.forEach((tag) => $(tag).remove());
|
||||||
return $("body").html();
|
return $("body").html();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function fixBrokenLinks(md: string): string {
|
function fixBrokenLinks(md: string): string {
|
||||||
let depth = 0;
|
let depth = 0;
|
||||||
let result = "";
|
let result = "";
|
||||||
|
|||||||
Reference in New Issue
Block a user