feat: html -> markdown

This commit is contained in:
amhsirak
2025-11-20 02:42:44 +05:30
parent 713d37465d
commit 6c93cbc9a2

View File

@@ -0,0 +1,141 @@
import koffi from "koffi";
import dotenv from "dotenv";
import { stat } from "fs/promises";
import path from "node:path";
import os from "node:os";
const exts = {
win32: ".dll",
darwin: ".dylib",
default: ".so",
};
const ext =
exts[os.platform() as keyof typeof exts] || exts.default;
// Build path to the binary **inside the same folder**
export const GO_MARKDOWN_PARSER_PATH = path.join(
__dirname,
`html-to-markdown${ext}`
);
dotenv.config();
// ---------------------------------------------
// Native Go binding wrapper
// ---------------------------------------------
class NativeMarkdownBridge {
private static singleton: NativeMarkdownBridge;
private fnConvert: any;
private constructor() {
const lib = koffi.load(GO_MARKDOWN_PARSER_PATH);
const freeFn = lib.func("FreeCString", "void", ["string"]);
const trackedType = "CString:" + crypto.randomUUID();
const autoReleasedStr = koffi.disposable(trackedType, "string", freeFn);
this.fnConvert = lib.func("ConvertHTMLToMarkdown", autoReleasedStr, [
"string",
]);
}
static async load(): Promise<NativeMarkdownBridge> {
if (!NativeMarkdownBridge.singleton) {
try {
await stat(GO_MARKDOWN_PARSER_PATH);
} catch {
throw new Error("Go shared library not found");
}
NativeMarkdownBridge.singleton = new NativeMarkdownBridge();
}
return NativeMarkdownBridge.singleton;
}
async run(html: string): Promise<string> {
return new Promise((resolve, reject) => {
this.fnConvert.async(html, (err: Error, output: string) => {
err ? reject(err) : resolve(output);
});
});
}
}
// ---------------------------------------------
// Main exposed function
// ---------------------------------------------
export async function parseMarkdown(
html: string | null | undefined,
): Promise<string> {
if (!html) return "";
// Try Go library first (if enabled)
try {
const engine = await NativeMarkdownBridge.load();
let md = await engine.run(html);
md = fixBrokenLinks(md);
md = stripSkipLinks(md);
return md;
} catch (err: any) {
if (err?.message !== "Go shared library not found") {
console.log("Go markdown parser failed, falling back to JS parser:", err);
} else {
console.log("Go parser missing.", { GO_MARKDOWN_PARSER_PATH });
}
}
// Fallback parser
const TurndownService = require("turndown");
const { gfm } = require("joplin-turndown-plugin-gfm");
const t = new TurndownService();
t.addRule("inlineLink", {
filter: (node: any, opts: any) =>
opts.linkStyle === "inlined" &&
node.nodeName === "A" &&
node.getAttribute("href"),
replacement: (content: string, node: any) => {
const href = node.getAttribute("href").trim();
const title = node.title ? ` "${node.title}"` : "";
return `[${content.trim()}](${href}${title})\n`;
},
});
t.use(gfm);
try {
let out = await t.turndown(html);
out = fixBrokenLinks(out);
out = stripSkipLinks(out);
return out;
} catch (err) {
console.error("HTML→Markdown failed", { err });
return "";
}
}
// ---------------------------------------------
// Helpers
// ---------------------------------------------
function fixBrokenLinks(md: string): string {
let depth = 0;
let result = "";
for (const ch of md) {
if (ch === "[") depth++;
if (ch === "]") depth = Math.max(0, depth - 1);
if (depth > 0 && ch === "\n") {
result += "\\\n";
} else {
result += ch;
}
}
return result;
}
function stripSkipLinks(md: string): string {
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
}