feat: use turndown
This commit is contained in:
@@ -1,109 +1,27 @@
|
||||
import koffi from "koffi";
|
||||
import dotenv from "dotenv";
|
||||
import { stat } from "fs/promises";
|
||||
import path from "node:path";
|
||||
import os from "node:os";
|
||||
import TurndownService from "turndown";
|
||||
import { gfm } from "joplin-turndown-plugin-gfm";
|
||||
|
||||
const exts = {
|
||||
win32: ".dll",
|
||||
darwin: ".dylib",
|
||||
default: ".so",
|
||||
};
|
||||
|
||||
const ext =
|
||||
exts[os.platform() as keyof typeof exts] || exts.default;
|
||||
|
||||
// Build path to the binary **inside the same folder**
|
||||
export const GO_MARKDOWN_PARSER_PATH = path.join(
|
||||
__dirname,
|
||||
"html-to-markdown",
|
||||
`html-to-markdown${ext}`
|
||||
);
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// ---------------------------------------------
|
||||
// Native Go binding wrapper
|
||||
// ---------------------------------------------
|
||||
class NativeMarkdownBridge {
|
||||
private static singleton: NativeMarkdownBridge;
|
||||
private fnConvert: any;
|
||||
|
||||
private constructor() {
|
||||
const lib = koffi.load(GO_MARKDOWN_PARSER_PATH);
|
||||
|
||||
const freeFn = lib.func("FreeCString", "void", ["string"]);
|
||||
const trackedType = "CString:" + crypto.randomUUID();
|
||||
const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn);
|
||||
|
||||
this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [
|
||||
"string",
|
||||
]);
|
||||
}
|
||||
|
||||
static async load(): Promise<NativeMarkdownBridge> {
|
||||
if (!NativeMarkdownBridge.singleton) {
|
||||
try {
|
||||
await stat(GO_MARKDOWN_PARSER_PATH);
|
||||
} catch {
|
||||
throw new Error("Go shared library not found");
|
||||
}
|
||||
NativeMarkdownBridge.singleton = new NativeMarkdownBridge();
|
||||
}
|
||||
return NativeMarkdownBridge.singleton;
|
||||
}
|
||||
|
||||
async run(html: string): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
this.fnConvert.async(html, (err: Error, output: string) => {
|
||||
err ? reject(err) : resolve(output);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------
|
||||
// Main exposed function
|
||||
// ---------------------------------------------
|
||||
export async function parseMarkdown(
|
||||
html: string | null | undefined,
|
||||
): Promise<string> {
|
||||
if (!html) return "";
|
||||
|
||||
// Try Go library first (if enabled)
|
||||
try {
|
||||
const engine = await NativeMarkdownBridge.load();
|
||||
let md = await engine.run(html);
|
||||
|
||||
md = fixBrokenLinks(md);
|
||||
md = stripSkipLinks(md);
|
||||
|
||||
return md;
|
||||
} catch (err: any) {
|
||||
if (err?.message !== "Go shared library not found") {
|
||||
console.log("Go markdown parser failed, falling back to JS parser:", err);
|
||||
} else {
|
||||
console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback parser
|
||||
const TurndownService = require("turndown");
|
||||
const { gfm } = require("joplin-turndown-plugin-gfm");
|
||||
|
||||
const t = new TurndownService();
|
||||
|
||||
// Custom rule for inline links
|
||||
t.addRule("inlineLink", {
|
||||
filter: (node: any, opts: any) =>
|
||||
opts.linkStyle === "inlined" &&
|
||||
node.nodeName === "A" &&
|
||||
node.getAttribute("href"),
|
||||
replacement: (content: string, node: any) => {
|
||||
const href = node.getAttribute("href").trim();
|
||||
const href = node.getAttribute("href")?.trim() || "";
|
||||
const title = node.title ? ` "${node.title}"` : "";
|
||||
return `[${content.trim()}](${href}${title})\n`;
|
||||
},
|
||||
});
|
||||
|
||||
// GitHub-flavored markdown features
|
||||
t.use(gfm);
|
||||
|
||||
try {
|
||||
@@ -134,9 +52,11 @@ function fixBrokenLinks(md: string): string {
|
||||
result += ch;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
function stripSkipLinks(md: string): string {
|
||||
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user