feat: use turndown

This commit is contained in:
amhsirak
2025-11-20 03:51:27 +05:30
parent 767fa5fe4f
commit b4644ba106

View File

@@ -1,109 +1,27 @@
import koffi from "koffi";
import dotenv from "dotenv";
import { stat } from "fs/promises";
import path from "node:path";
import os from "node:os";
import TurndownService from "turndown";
import { gfm } from "joplin-turndown-plugin-gfm";
/**
 * Shared-library file extension keyed by `os.platform()` value.
 * `default` is used for every platform that has no entry of its own.
 */
const exts = {
  win32: ".dll",
  darwin: ".dylib",
  default: ".so",
};

// Look up the extension for the host platform; unknown platforms fall back to `exts.default`.
const hostPlatform = os.platform() as keyof typeof exts;
const ext = exts[hostPlatform] || exts.default;
/**
 * Absolute path of the Go shared library: it is expected in an
 * `html-to-markdown` sub-folder next to this module, with the
 * platform-specific extension appended to the file name.
 */
export const GO_MARKDOWN_PARSER_PATH = path.join(__dirname, "html-to-markdown", "html-to-markdown" + ext);
// Run dotenv with its default configuration at module load time.
// NOTE(review): nothing in the visible code reads process.env — confirm this call is still needed here.
dotenv.config();
// ---------------------------------------------
// Native Go binding wrapper
// ---------------------------------------------
/**
 * Wrapper around the native Go HTML-to-Markdown shared library, bound through koffi.
 *
 * The constructor is private; callers obtain the shared instance with
 * `NativeMarkdownBridge.load()` and convert documents with `run()`.
 */
class NativeMarkdownBridge {
  /** Lazily created shared instance; stays unset until the first successful `load()`. */
  private static singleton: NativeMarkdownBridge | undefined;

  /** Bound `ConvertHTMLToMarkdown` export; only its callback-style `async` entry point is used. */
  private readonly fnConvert: {
    async(input: string, done: (err: Error | null, output: string) => void): void;
  };

  /** Loads the shared library and binds the exports this wrapper needs. */
  private constructor() {
    const nativeLib = koffi.load(GO_MARKDOWN_PARSER_PATH);
    const releaseCString = nativeLib.func("FreeCString", "void", ["string"]);

    // The disposable type is registered under a name with a random UUID suffix —
    // presumably to keep the name unique if the constructor ever runs more than once.
    const disposableTypeName = "CString:" + crypto.randomUUID();
    const selfFreeingString = koffi.disposable(disposableTypeName, "string", releaseCString);

    // The converter returns the disposable string type, so `FreeCString` is the
    // registered disposer for every returned value.
    this.fnConvert = nativeLib.func("ConvertHTMLToMarkdown", selfFreeingString, ["string"]);
  }

  /**
   * Returns the shared bridge, creating it on first use.
   *
   * @returns The shared `NativeMarkdownBridge` instance.
   * @throws Error with message "Go shared library not found" when `stat` on
   *   `GO_MARKDOWN_PARSER_PATH` fails.
   */
  static async load(): Promise<NativeMarkdownBridge> {
    const cached = NativeMarkdownBridge.singleton;
    if (cached) {
      return cached;
    }

    // Probe the file first so a missing binary surfaces as a stable, recognisable error message.
    try {
      await stat(GO_MARKDOWN_PARSER_PATH);
    } catch {
      throw new Error("Go shared library not found");
    }

    const bridge = new NativeMarkdownBridge();
    NativeMarkdownBridge.singleton = bridge;
    return bridge;
  }

  /**
   * Converts an HTML string to Markdown using the native library.
   *
   * @param html - HTML source to convert.
   * @returns Promise resolving to the converter's output; rejects with the
   *   error reported through the native call's callback.
   */
  async run(html: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.fnConvert.async(html, (err, output) => {
        if (err) {
          reject(err);
        } else {
          resolve(output);
        }
      });
    });
  }
}
// ---------------------------------------------
// Main exposed function
// ---------------------------------------------
export async function parseMarkdown(
html: string | null | undefined,
): Promise<string> {
if (!html) return "";
// Try Go library first (if enabled)
try {
const engine = await NativeMarkdownBridge.load();
let md = await engine.run(html);
md = fixBrokenLinks(md);
md = stripSkipLinks(md);
return md;
} catch (err: any) {
if (err?.message !== "Go shared library not found") {
console.log("Go markdown parser failed, falling back to JS parser:", err);
} else {
console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
}
}
// Fallback parser
const TurndownService = require("turndown");
const { gfm } = require("joplin-turndown-plugin-gfm");
const t = new TurndownService();
// Custom rule for inline links
t.addRule("inlineLink", {
filter: (node: any, opts: any) =>
opts.linkStyle === "inlined" &&
node.nodeName === "A" &&
node.getAttribute("href"),
replacement: (content: string, node: any) => {
const href = node.getAttribute("href").trim();
const href = node.getAttribute("href")?.trim() || "";
const title = node.title ? ` "${node.title}"` : "";
return `[${content.trim()}](${href}${title})\n`;
},
});
// GitHub-flavored markdown features
t.use(gfm);
try {
@@ -134,9 +52,11 @@ function fixBrokenLinks(md: string): string {
result += ch;
}
}
return result;
}
/**
 * Removes "Skip to Content" accessibility links from Markdown.
 *
 * A link is removed when its text is "Skip to Content" (case-insensitive)
 * and its target starts with `#`. Only the link itself is removed;
 * surrounding whitespace is left as-is.
 *
 * @param md - Markdown text to clean.
 * @returns The Markdown with every matching link deleted.
 */
function stripSkipLinks(md: string): string {
  const skipLinkPattern = /\[Skip to Content\]\(#[^\)]*\)/gi;
  const withoutSkipLinks = md.replace(skipLinkPattern, "");
  return withoutSkipLinks;
}