feat: use turndown
This commit is contained in:
@@ -1,109 +1,27 @@
|
|||||||
import koffi from "koffi";
|
import TurndownService from "turndown";
|
||||||
import dotenv from "dotenv";
|
import { gfm } from "joplin-turndown-plugin-gfm";
|
||||||
import { stat } from "fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import os from "node:os";
|
|
||||||
|
|
||||||
const exts = {
|
|
||||||
win32: ".dll",
|
|
||||||
darwin: ".dylib",
|
|
||||||
default: ".so",
|
|
||||||
};
|
|
||||||
|
|
||||||
const ext =
|
|
||||||
exts[os.platform() as keyof typeof exts] || exts.default;
|
|
||||||
|
|
||||||
// Build path to the binary **inside the same folder**
|
|
||||||
export const GO_MARKDOWN_PARSER_PATH = path.join(
|
|
||||||
__dirname,
|
|
||||||
"html-to-markdown",
|
|
||||||
`html-to-markdown${ext}`
|
|
||||||
);
|
|
||||||
|
|
||||||
dotenv.config();
|
|
||||||
|
|
||||||
// ---------------------------------------------
|
|
||||||
// Native Go binding wrapper
|
|
||||||
// ---------------------------------------------
|
|
||||||
class NativeMarkdownBridge {
|
|
||||||
private static singleton: NativeMarkdownBridge;
|
|
||||||
private fnConvert: any;
|
|
||||||
|
|
||||||
private constructor() {
|
|
||||||
const lib = koffi.load(GO_MARKDOWN_PARSER_PATH);
|
|
||||||
|
|
||||||
const freeFn = lib.func("FreeCString", "void", ["string"]);
|
|
||||||
const trackedType = "CString:" + crypto.randomUUID();
|
|
||||||
const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn);
|
|
||||||
|
|
||||||
this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [
|
|
||||||
"string",
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
static async load(): Promise<NativeMarkdownBridge> {
|
|
||||||
if (!NativeMarkdownBridge.singleton) {
|
|
||||||
try {
|
|
||||||
await stat(GO_MARKDOWN_PARSER_PATH);
|
|
||||||
} catch {
|
|
||||||
throw new Error("Go shared library not found");
|
|
||||||
}
|
|
||||||
NativeMarkdownBridge.singleton = new NativeMarkdownBridge();
|
|
||||||
}
|
|
||||||
return NativeMarkdownBridge.singleton;
|
|
||||||
}
|
|
||||||
|
|
||||||
async run(html: string): Promise<string> {
|
|
||||||
return new Promise((resolve, reject) => {
|
|
||||||
this.fnConvert.async(html, (err: Error, output: string) => {
|
|
||||||
err ? reject(err) : resolve(output);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------
|
|
||||||
// Main exposed function
|
|
||||||
// ---------------------------------------------
|
|
||||||
export async function parseMarkdown(
|
export async function parseMarkdown(
|
||||||
html: string | null | undefined,
|
html: string | null | undefined,
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
if (!html) return "";
|
if (!html) return "";
|
||||||
|
|
||||||
// Try Go library first (if enabled)
|
|
||||||
try {
|
|
||||||
const engine = await NativeMarkdownBridge.load();
|
|
||||||
let md = await engine.run(html);
|
|
||||||
|
|
||||||
md = fixBrokenLinks(md);
|
|
||||||
md = stripSkipLinks(md);
|
|
||||||
|
|
||||||
return md;
|
|
||||||
} catch (err: any) {
|
|
||||||
if (err?.message !== "Go shared library not found") {
|
|
||||||
console.log("Go markdown parser failed, falling back to JS parser:", err);
|
|
||||||
} else {
|
|
||||||
console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback parser
|
|
||||||
const TurndownService = require("turndown");
|
|
||||||
const { gfm } = require("joplin-turndown-plugin-gfm");
|
|
||||||
|
|
||||||
const t = new TurndownService();
|
const t = new TurndownService();
|
||||||
|
|
||||||
|
// Custom rule for inline links
|
||||||
t.addRule("inlineLink", {
|
t.addRule("inlineLink", {
|
||||||
filter: (node: any, opts: any) =>
|
filter: (node: any, opts: any) =>
|
||||||
opts.linkStyle === "inlined" &&
|
opts.linkStyle === "inlined" &&
|
||||||
node.nodeName === "A" &&
|
node.nodeName === "A" &&
|
||||||
node.getAttribute("href"),
|
node.getAttribute("href"),
|
||||||
replacement: (content: string, node: any) => {
|
replacement: (content: string, node: any) => {
|
||||||
const href = node.getAttribute("href").trim();
|
const href = node.getAttribute("href")?.trim() || "";
|
||||||
const title = node.title ? ` "${node.title}"` : "";
|
const title = node.title ? ` "${node.title}"` : "";
|
||||||
return `[${content.trim()}](${href}${title})\n`;
|
return `[${content.trim()}](${href}${title})\n`;
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// GitHub-flavored markdown features
|
||||||
t.use(gfm);
|
t.use(gfm);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -134,9 +52,11 @@ function fixBrokenLinks(md: string): string {
|
|||||||
result += ch;
|
result += ch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
function stripSkipLinks(md: string): string {
|
function stripSkipLinks(md: string): string {
|
||||||
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
|
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user