From 7da464755d0dd77dedacc827e9f1d389ef514f32 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 19 Nov 2025 22:50:46 +0530 Subject: [PATCH] wip: to markdown --- .../html-to-markdown/html-to-markdown.go | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 server/src/markdownify/html-to-markdown/html-to-markdown.go diff --git a/server/src/markdownify/html-to-markdown/html-to-markdown.go b/server/src/markdownify/html-to-markdown/html-to-markdown.go new file mode 100644 index 00000000..dce011c1 --- /dev/null +++ b/server/src/markdownify/html-to-markdown/html-to-markdown.go @@ -0,0 +1,157 @@ +package main + +/* +#include +*/ +import "C" + +import ( + "strings" + "unsafe" + "unicode/utf8" + + "github.com/PuerkitoBio/goquery" + md "github.com/getmaxun/html-to-markdown/v2" + "github.com/getmaxun/html-to-markdown/v2/plugin" + converter "github.com/getmaxun/html-to-markdown/v2/converter" + "golang.org/x/net/html" +) + +// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C. +func ConvertHTMLToMarkdown(input *C.char) *C.char { + engine := converter.NewConverter("", true, nil) + // engine.Use(plugin.GitHubFlavored()) + + registerPreHandler(engine) + + result, err := engine.ConvertString(C.GoString(input)) + if err != nil { + // swallow conversion error (same as original) + } + + return C.CString(result) +} + +//export FreeCString +// Frees C string memory. +func FreeCString(str *C.char) { + C.free(unsafe.Pointer(str)) +} + +func main() { + // Required empty main for CGO. +} + +// registerPreHandler configures a specialized PRE/code block rule +// to properly extract nested content and detect languages. +func registerPreHandler(conv *converter.Converter) { + isNoiseNode := func(class string) bool { + l := strings.ToLower(class) + return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers") + } + + findLanguage := func(sel *goquery.Selection) string { + cls := strings.ToLower(sel.AttrOr("class", "")) + for _, chunk := range strings.Fields(cls) { + if strings.HasPrefix(chunk, "language-") { + return strings.TrimPrefix(chunk, "language-") + } + if strings.HasPrefix(chunk, "lang-") { + return strings.TrimPrefix(chunk, "lang-") + } + } + return "" + } + + // Walk nodes and extract visible text, injecting newlines at block boundaries. + var scrape func(n *html.Node, out *strings.Builder) + scrape = func(n *html.Node, out *strings.Builder) { + if n == nil { + return + } + + switch n.Type { + case html.TextNode: + out.WriteString(n.Data) + + case html.ElementNode: + tag := strings.ToLower(n.Data) + + // skip gutter/line number elements + for _, attr := range n.Attr { + if attr.Key == "class" && isNoiseNode(attr.Val) { + return + } + } + + if tag == "br" { + out.WriteString("\n") + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + scrape(child, out) + } + + switch tag { + case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot", + "section", "article", "blockquote", "pre", + "h1", "h2", "h3", "h4", "h5", "h6": + out.WriteString("\n") + } + } + } + + // PRE blocks + conv.AddRules(md.Rule{ + Filter: []string{"pre"}, + Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { + codeTag := s.Find("code").First() + lang := findLanguage(codeTag) + if lang == "" { + lang = findLanguage(s) + } + + var buf strings.Builder + for _, node := range s.Nodes { + scrape(node, &buf) + } + + raw := strings.TrimRight(buf.String(), "\n") + + fRune, _ := utf8.DecodeRuneInString(opt.Fence) + fence := md.CalculateCodeFence(fRune, raw) + + block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n" + return md.String(block) + }, + }) + + // Inline code rule + conv.AddRules(md.Rule{ + Filter: []string{"code"}, + Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string { + // do nothing when inside PRE + if s.ParentsFiltered("pre").Length() > 0 { + return nil + } + + var buf strings.Builder + for _, node := range s.Nodes { + scrape(node, &buf) + } + + text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n")) + + fence := "`" + if strings.Contains(text, "`") { + fence = "``" + if strings.Contains(text, "``") { + fence = "```" + } + } + + inline := fence + text + fence + return md.String(inline) + }, + }) +}