package htmlconv import ( "fmt" "strings" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) // Convert takes MediaWiki-rendered HTML and returns clean markdown. func Convert(rawHTML string) string { doc, err := html.Parse(strings.NewReader(rawHTML)) if err != nil { return rawHTML } var w walker w.walk(doc) return w.finish() } // ExtractSection extracts a single section by heading text and converts it to markdown. // Matches case-insensitively. Returns empty string if not found. func ExtractSection(rawHTML string, sectionName string) string { doc, err := html.Parse(strings.NewReader(rawHTML)) if err != nil { return "" } needle := strings.ToLower(strings.TrimSpace(sectionName)) // Find the mw-parser-output container. container := findParserOutput(doc) if container == nil { return "" } // Walk top-level children to find the target heading div, then collect // siblings until the next heading of equal or lesser depth. var ( collecting bool targetLvl int collected []*html.Node ) for c := container.FirstChild; c != nil; c = c.NextSibling { if isHeadingDiv(c) { lvl, text := headingInfo(c) if collecting { if lvl <= targetLvl { break } } if !collecting && strings.ToLower(strings.TrimSpace(text)) == needle { collecting = true targetLvl = lvl continue } } if collecting { collected = append(collected, c) } } if len(collected) == 0 { return "" } var w walker for _, n := range collected { w.walk(n) } return w.finish() } // ListSections returns section names and levels from the HTML. func ListSections(rawHTML string) []SectionInfo { doc, err := html.Parse(strings.NewReader(rawHTML)) if err != nil { return nil } container := findParserOutput(doc) if container == nil { return nil } var sections []SectionInfo for c := container.FirstChild; c != nil; c = c.NextSibling { if isHeadingDiv(c) { lvl, text := headingInfo(c) if text != "" { sections = append(sections, SectionInfo{Name: text, Level: lvl}) } } } return sections } // SectionInfo holds a section heading name and level. type SectionInfo struct { Name string Level int } // --- DOM helpers --- func findParserOutput(n *html.Node) *html.Node { if n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClass(n, "mw-parser-output") { return n } for c := n.FirstChild; c != nil; c = c.NextSibling { if found := findParserOutput(c); found != nil { return found } } return nil } func isHeadingDiv(n *html.Node) bool { return n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClassPrefix(n, "mw-heading") } func headingInfo(div *html.Node) (level int, text string) { for c := div.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode { switch c.DataAtom { case atom.H1: return 1, textContent(c) case atom.H2: return 2, textContent(c) case atom.H3: return 3, textContent(c) case atom.H4: return 4, textContent(c) case atom.H5: return 5, textContent(c) case atom.H6: return 6, textContent(c) } } } return 0, "" } func textContent(n *html.Node) string { if n.Type == html.TextNode { return n.Data } // Skip edit section spans. if n.Type == html.ElementNode && hasClass(n, "mw-editsection") { return "" } var sb strings.Builder for c := n.FirstChild; c != nil; c = c.NextSibling { sb.WriteString(textContent(c)) } return sb.String() } func hasClass(n *html.Node, cls string) bool { for _, a := range n.Attr { if a.Key == "class" { for _, c := range strings.Fields(a.Val) { if c == cls { return true } } } } return false } func hasClassPrefix(n *html.Node, prefix string) bool { for _, a := range n.Attr { if a.Key == "class" { for _, c := range strings.Fields(a.Val) { if strings.HasPrefix(c, prefix) { return true } } } } return false } func getAttr(n *html.Node, key string) string { for _, a := range n.Attr { if a.Key == key { return a.Val } } return "" } func isHidden(n *html.Node) bool { style := getAttr(n, "style") return strings.Contains(style, "display:none") || strings.Contains(style, "display: none") } // --- walker: recursive HTML-to-markdown converter --- type walker struct { sb strings.Builder listCtx []listContext } type listContext struct { ordered bool index int } func (w *walker) finish() string { out := w.sb.String() // Collapse runs of 3+ newlines to 2. for strings.Contains(out, "\n\n\n") { out = strings.ReplaceAll(out, "\n\n\n", "\n\n") } return strings.TrimSpace(out) + "\n" } func (w *walker) walk(n *html.Node) { switch n.Type { case html.TextNode: w.sb.WriteString(n.Data) return case html.ElementNode: // skip case html.DocumentNode: w.walkChildren(n) return default: return } if isHidden(n) { return } if w.shouldSkip(n) { return } switch n.DataAtom { case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6: w.renderHeading(n) case atom.P: w.ensureNewline() w.walkChildren(n) w.sb.WriteString("\n\n") case atom.Br: w.sb.WriteString("\n") case atom.B, atom.Strong: w.sb.WriteString("**") w.walkChildren(n) w.sb.WriteString("**") case atom.I, atom.Em: w.sb.WriteString("*") w.walkChildren(n) w.sb.WriteString("*") case atom.A: w.walkChildren(n) case atom.Ul: w.renderList(n, false) case atom.Ol: w.renderList(n, true) case atom.Li: w.renderListItem(n) case atom.Table: w.renderTable(n) case atom.Img: alt := getAttr(n, "alt") if alt != "" { w.sb.WriteString(alt) } case atom.Div: if isHeadingDiv(n) { lvl, text := headingInfo(n) if text != "" { w.ensureNewline() w.sb.WriteString(strings.Repeat("#", lvl)) w.sb.WriteString(" ") w.sb.WriteString(text) w.sb.WriteString("\n\n") } return } w.walkChildren(n) default: w.walkChildren(n) } } func (w *walker) walkChildren(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { w.walk(c) } } func (w *walker) shouldSkip(n *html.Node) bool { if n.Type != html.ElementNode { return false } if getAttr(n, "id") == "toc" || getAttr(n, "id") == "toctemplate" { return true } if hasClass(n, "navigation-not-searchable") { return true } if hasClass(n, "mw-editsection") { return true } // Skip infobox tables — noisy in CLI. RS3 uses "rsw-infobox", OSRS uses "infobox". if n.DataAtom == atom.Table && (hasClass(n, "infobox") || hasClassPrefix(n, "rsw-infobox")) { return true } // Skip navbox (bottom-of-page navigation templates). if n.DataAtom == atom.Table && hasClass(n, "navbox") { return true } // Skip messagebox / disambig boxes. if n.DataAtom == atom.Table && hasClass(n, "messagebox") { return true } // Skip "clear" divs. if n.DataAtom == atom.Div && hasClass(n, "clear-template") { return true } return false } func (w *walker) ensureNewline() { s := w.sb.String() if len(s) > 0 && s[len(s)-1] != '\n' { w.sb.WriteString("\n") } } func (w *walker) renderHeading(n *html.Node) { level := 0 switch n.DataAtom { case atom.H1: level = 1 case atom.H2: level = 2 case atom.H3: level = 3 case atom.H4: level = 4 case atom.H5: level = 5 case atom.H6: level = 6 } w.ensureNewline() w.sb.WriteString(strings.Repeat("#", level)) w.sb.WriteString(" ") w.sb.WriteString(textContent(n)) w.sb.WriteString("\n\n") } func (w *walker) renderList(n *html.Node, ordered bool) { w.listCtx = append(w.listCtx, listContext{ordered: ordered}) w.ensureNewline() for c := n.FirstChild; c != nil; c = c.NextSibling { w.walk(c) } w.listCtx = w.listCtx[:len(w.listCtx)-1] if len(w.listCtx) == 0 { w.sb.WriteString("\n") } } func (w *walker) renderListItem(n *html.Node) { depth := len(w.listCtx) if depth == 0 { depth = 1 } indent := strings.Repeat(" ", depth-1) ctx := &w.listCtx[len(w.listCtx)-1] if ctx.ordered { ctx.index++ fmt.Fprintf(&w.sb, "%s%d. ", indent, ctx.index) } else { w.sb.WriteString(indent + "- ") } // Walk children inline — but nested lists should go on their own lines. for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && (c.DataAtom == atom.Ul || c.DataAtom == atom.Ol) { w.sb.WriteString("\n") w.walk(c) } else { w.walk(c) } } w.ensureNewline() } func (w *walker) renderTable(n *html.Node) { rows := collectTableRows(n) if len(rows) == 0 { return } w.ensureNewline() w.sb.WriteString("\n") var headers []string var dataRows [][]string for i, row := range rows { var cells []string allHeaders := true for _, cell := range row { text := strings.TrimSpace(textContent(cell)) text = strings.ReplaceAll(text, "|", "/") text = strings.ReplaceAll(text, "\n", " ") cells = append(cells, text) if cell.DataAtom != atom.Th { allHeaders = false } } if i == 0 && allHeaders { headers = cells } else { dataRows = append(dataRows, cells) } } if headers == nil && len(dataRows) > 0 { headers = dataRows[0] dataRows = dataRows[1:] } if len(headers) == 0 { return } // Header row. w.sb.WriteString("| " + strings.Join(headers, " | ") + " |\n") seps := make([]string, len(headers)) for i := range seps { seps[i] = "---" } w.sb.WriteString("| " + strings.Join(seps, " | ") + " |\n") for _, row := range dataRows { for len(row) < len(headers) { row = append(row, "") } w.sb.WriteString("| " + strings.Join(row[:len(headers)], " | ") + " |\n") } w.sb.WriteString("\n") } func collectTableRows(table *html.Node) [][]*html.Node { var rows [][]*html.Node var visit func(*html.Node) visit = func(n *html.Node) { if n.Type == html.ElementNode && n.DataAtom == atom.Tr { if isHidden(n) { return } var cells []*html.Node for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && (c.DataAtom == atom.Th || c.DataAtom == atom.Td) { cells = append(cells, c) } } if len(cells) > 0 { rows = append(rows, cells) } return } for c := n.FirstChild; c != nil; c = c.NextSibling { visit(c) } } visit(table) return rows }