wip: to markdown

This commit is contained in:
amhsirak
2025-11-19 22:50:46 +05:30
parent 6c8850a0a7
commit 7da464755d

View File

@@ -0,0 +1,157 @@
package main
/*
#include <stdlib.h>
*/
import "C"
import (
"strings"
"unsafe"
"unicode/utf8"
"github.com/PuerkitoBio/goquery"
md "github.com/getmaxun/html-to-markdown/v2"
"github.com/getmaxun/html-to-markdown/v2/plugin"
converter "github.com/getmaxun/html-to-markdown/v2/converter"
"golang.org/x/net/html"
)
// ConvertHTMLToMarkdown receives HTML and returns a markdown string allocated for C.
func ConvertHTMLToMarkdown(input *C.char) *C.char {
engine := converter.NewConverter("", true, nil)
// engine.Use(plugin.GitHubFlavored())
registerPreHandler(engine)
result, err := engine.ConvertString(C.GoString(input))
if err != nil {
// swallow conversion error (same as original)
}
return C.CString(result)
}
//export FreeCString
// Frees C string memory.
func FreeCString(str *C.char) {
C.free(unsafe.Pointer(str))
}
// main does nothing. Package main has to declare it, and the original author
// notes that an empty main is required for the cgo build of this file.
func main() {}
// registerPreHandler adds two custom rules to conv:
//
//   - a "pre" rule that renders the element as a fenced code block, tagged
//     with a language taken from a "language-*" or "lang-*" class, and
//   - a "code" rule that renders <code> elements outside of <pre> as inline
//     code spans.
//
// Both rules ignore the already-converted child content they are handed and
// rebuild their text directly from the HTML nodes, skipping elements whose
// class marks them as gutter/line-number decoration.
func registerPreHandler(conv *converter.Converter) {
	// isNoiseNode reports whether a class attribute value identifies a
	// gutter or line-number element (case-insensitive substring match).
	isNoiseNode := func(class string) bool {
		l := strings.ToLower(class)
		return strings.Contains(l, "gutter") || strings.Contains(l, "line-numbers")
	}

	// findLanguage returns the lowercased suffix of the first "language-" or
	// "lang-" class token on sel, or "" when there is none.
	findLanguage := func(sel *goquery.Selection) string {
		cls := strings.ToLower(sel.AttrOr("class", ""))
		for _, chunk := range strings.Fields(cls) {
			if strings.HasPrefix(chunk, "language-") {
				return strings.TrimPrefix(chunk, "language-")
			}
			if strings.HasPrefix(chunk, "lang-") {
				return strings.TrimPrefix(chunk, "lang-")
			}
		}
		return ""
	}

	// scrape appends the text found under n to out. Elements flagged by
	// isNoiseNode are dropped together with their subtree, <br> becomes a
	// newline, and a newline is written after each block-level element.
	var scrape func(n *html.Node, out *strings.Builder)
	scrape = func(n *html.Node, out *strings.Builder) {
		if n == nil {
			return
		}
		switch n.Type {
		case html.TextNode:
			out.WriteString(n.Data)
		case html.ElementNode:
			tag := strings.ToLower(n.Data)
			// Skip gutter/line-number elements entirely.
			for _, attr := range n.Attr {
				if attr.Key == "class" && isNoiseNode(attr.Val) {
					return
				}
			}
			if tag == "br" {
				out.WriteString("\n")
			}
			for child := n.FirstChild; child != nil; child = child.NextSibling {
				scrape(child, out)
			}
			switch tag {
			case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot",
				"section", "article", "blockquote", "pre",
				"h1", "h2", "h3", "h4", "h5", "h6":
				out.WriteString("\n")
			}
		}
	}

	// longestBacktickRun returns the length of the longest run of
	// consecutive backticks in text (0 when there are none).
	longestBacktickRun := func(text string) int {
		longest, current := 0, 0
		for i := 0; i < len(text); i++ {
			if text[i] != '`' {
				current = 0
				continue
			}
			current++
			if current > longest {
				longest = current
			}
		}
		return longest
	}

	// PRE blocks
	conv.AddRules(md.Rule{
		Filter: []string{"pre"},
		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
			// Prefer the language declared on a nested <code>, then on the
			// <pre> itself.
			lang := findLanguage(s.Find("code").First())
			if lang == "" {
				lang = findLanguage(s)
			}

			var buf strings.Builder
			for _, node := range s.Nodes {
				if node == nil {
					continue
				}
				// Walk the children instead of the matched <pre> so the noise
				// filter never discards the block itself: markup such as
				// <pre class="line-numbers"> carries that class on the <pre>,
				// and filtering it would produce an empty code block.
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					scrape(child, &buf)
				}
				// Same separator scrape writes after a <pre> element.
				buf.WriteString("\n")
			}
			raw := strings.TrimRight(buf.String(), "\n")

			// Use the first rune of the configured fence, falling back to a
			// backtick when no usable fence is configured.
			fRune := '`'
			if opt != nil && opt.Fence != "" {
				if r, _ := utf8.DecodeRuneInString(opt.Fence); r != utf8.RuneError {
					fRune = r
				}
			}
			fence := md.CalculateCodeFence(fRune, raw)

			block := "\n\n" + fence + lang + "\n" + raw + "\n" + fence + "\n\n"
			return md.String(block)
		},
	})

	// Inline code rule
	conv.AddRules(md.Rule{
		Filter: []string{"code"},
		Replacement: func(_ string, s *goquery.Selection, opt *md.Options) *string {
			// Code nested in a <pre> is rendered by the PRE rule, which reads
			// the nodes itself, so this rule produces nothing for it.
			// NOTE(review): presumably a nil result makes the converter fall
			// back to its other rules — confirm against the library.
			if s.ParentsFiltered("pre").Length() > 0 {
				return nil
			}

			var buf strings.Builder
			for _, node := range s.Nodes {
				scrape(node, &buf)
			}
			text := md.TrimTrailingSpaces(strings.ReplaceAll(buf.String(), "\r\n", "\n"))

			// The delimiter must be longer than any backtick run inside the
			// span, otherwise the span would be closed early.
			fence := strings.Repeat("`", longestBacktickRun(text)+1)

			// A backtick touching the delimiter would merge with it. Pad both
			// sides: CommonMark strips one leading and one trailing space from
			// a code span when both are present.
			if strings.HasPrefix(text, "`") || strings.HasSuffix(text, "`") {
				text = " " + text + " "
			}

			return md.String(fence + text + fence)
		},
	})
}