484 lines
9.9 KiB
Go
484 lines
9.9 KiB
Go
package htmlconv
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
)
|
|
|
|
// Convert takes MediaWiki-rendered HTML and returns clean markdown.
|
|
func Convert(rawHTML string) string {
|
|
doc, err := html.Parse(strings.NewReader(rawHTML))
|
|
if err != nil {
|
|
return rawHTML
|
|
}
|
|
|
|
var w walker
|
|
w.walk(doc)
|
|
return w.finish()
|
|
}
|
|
|
|
// ExtractSection extracts a single section by heading text and converts it to markdown.
|
|
// Matches case-insensitively. Returns empty string if not found.
|
|
func ExtractSection(rawHTML string, sectionName string) string {
|
|
doc, err := html.Parse(strings.NewReader(rawHTML))
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
needle := strings.ToLower(strings.TrimSpace(sectionName))
|
|
|
|
// Find the mw-parser-output container.
|
|
container := findParserOutput(doc)
|
|
if container == nil {
|
|
return ""
|
|
}
|
|
|
|
// Walk top-level children to find the target heading div, then collect
|
|
// siblings until the next heading of equal or lesser depth.
|
|
var (
|
|
collecting bool
|
|
targetLvl int
|
|
collected []*html.Node
|
|
)
|
|
for c := container.FirstChild; c != nil; c = c.NextSibling {
|
|
if isHeadingDiv(c) {
|
|
lvl, text := headingInfo(c)
|
|
if collecting {
|
|
if lvl <= targetLvl {
|
|
break
|
|
}
|
|
}
|
|
if !collecting && strings.ToLower(strings.TrimSpace(text)) == needle {
|
|
collecting = true
|
|
targetLvl = lvl
|
|
continue
|
|
}
|
|
}
|
|
if collecting {
|
|
collected = append(collected, c)
|
|
}
|
|
}
|
|
|
|
if len(collected) == 0 {
|
|
return ""
|
|
}
|
|
|
|
var w walker
|
|
for _, n := range collected {
|
|
w.walk(n)
|
|
}
|
|
return w.finish()
|
|
}
|
|
|
|
// ListSections returns section names and levels from the HTML.
|
|
func ListSections(rawHTML string) []SectionInfo {
|
|
doc, err := html.Parse(strings.NewReader(rawHTML))
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
container := findParserOutput(doc)
|
|
if container == nil {
|
|
return nil
|
|
}
|
|
|
|
var sections []SectionInfo
|
|
for c := container.FirstChild; c != nil; c = c.NextSibling {
|
|
if isHeadingDiv(c) {
|
|
lvl, text := headingInfo(c)
|
|
if text != "" {
|
|
sections = append(sections, SectionInfo{Name: text, Level: lvl})
|
|
}
|
|
}
|
|
}
|
|
return sections
|
|
}
|
|
|
|
// SectionInfo describes one section heading found in a page.
type SectionInfo struct {
	// Name is the heading's text as extracted from the heading element.
	Name string
	// Level is the heading depth: 1 for <h1> through 6 for <h6>.
	Level int
}
|
|
|
|
// --- DOM helpers ---
|
|
|
|
func findParserOutput(n *html.Node) *html.Node {
|
|
if n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClass(n, "mw-parser-output") {
|
|
return n
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
if found := findParserOutput(c); found != nil {
|
|
return found
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func isHeadingDiv(n *html.Node) bool {
|
|
return n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClassPrefix(n, "mw-heading")
|
|
}
|
|
|
|
func headingInfo(div *html.Node) (level int, text string) {
|
|
for c := div.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode {
|
|
switch c.DataAtom {
|
|
case atom.H1:
|
|
return 1, textContent(c)
|
|
case atom.H2:
|
|
return 2, textContent(c)
|
|
case atom.H3:
|
|
return 3, textContent(c)
|
|
case atom.H4:
|
|
return 4, textContent(c)
|
|
case atom.H5:
|
|
return 5, textContent(c)
|
|
case atom.H6:
|
|
return 6, textContent(c)
|
|
}
|
|
}
|
|
}
|
|
return 0, ""
|
|
}
|
|
|
|
func textContent(n *html.Node) string {
|
|
if n.Type == html.TextNode {
|
|
return n.Data
|
|
}
|
|
// Skip edit section spans.
|
|
if n.Type == html.ElementNode && hasClass(n, "mw-editsection") {
|
|
return ""
|
|
}
|
|
var sb strings.Builder
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
sb.WriteString(textContent(c))
|
|
}
|
|
return sb.String()
|
|
}
|
|
|
|
func hasClass(n *html.Node, cls string) bool {
|
|
for _, a := range n.Attr {
|
|
if a.Key == "class" {
|
|
for _, c := range strings.Fields(a.Val) {
|
|
if c == cls {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func hasClassPrefix(n *html.Node, prefix string) bool {
|
|
for _, a := range n.Attr {
|
|
if a.Key == "class" {
|
|
for _, c := range strings.Fields(a.Val) {
|
|
if strings.HasPrefix(c, prefix) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func getAttr(n *html.Node, key string) string {
|
|
for _, a := range n.Attr {
|
|
if a.Key == key {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func isHidden(n *html.Node) bool {
|
|
style := getAttr(n, "style")
|
|
return strings.Contains(style, "display:none") || strings.Contains(style, "display: none")
|
|
}
|
|
|
|
// --- walker: recursive HTML-to-markdown converter ---
|
|
|
|
type walker struct {
|
|
sb strings.Builder
|
|
listCtx []listContext
|
|
}
|
|
|
|
// listContext describes one list on the walker's list stack.
type listContext struct {
	// ordered is true for numbered lists and false for bulleted lists.
	ordered bool
	// index is the number given to the most recently rendered item of an
	// ordered list; it starts at zero and is incremented before each item.
	index int
}
|
|
|
|
func (w *walker) finish() string {
|
|
out := w.sb.String()
|
|
// Collapse runs of 3+ newlines to 2.
|
|
for strings.Contains(out, "\n\n\n") {
|
|
out = strings.ReplaceAll(out, "\n\n\n", "\n\n")
|
|
}
|
|
return strings.TrimSpace(out) + "\n"
|
|
}
|
|
|
|
func (w *walker) walk(n *html.Node) {
|
|
switch n.Type {
|
|
case html.TextNode:
|
|
w.sb.WriteString(n.Data)
|
|
return
|
|
case html.ElementNode:
|
|
// skip
|
|
case html.DocumentNode:
|
|
w.walkChildren(n)
|
|
return
|
|
default:
|
|
return
|
|
}
|
|
|
|
if isHidden(n) {
|
|
return
|
|
}
|
|
|
|
if w.shouldSkip(n) {
|
|
return
|
|
}
|
|
|
|
switch n.DataAtom {
|
|
case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
|
|
w.renderHeading(n)
|
|
case atom.P:
|
|
w.ensureNewline()
|
|
w.walkChildren(n)
|
|
w.sb.WriteString("\n\n")
|
|
case atom.Br:
|
|
w.sb.WriteString("\n")
|
|
case atom.B, atom.Strong:
|
|
w.sb.WriteString("**")
|
|
w.walkChildren(n)
|
|
w.sb.WriteString("**")
|
|
case atom.I, atom.Em:
|
|
w.sb.WriteString("*")
|
|
w.walkChildren(n)
|
|
w.sb.WriteString("*")
|
|
case atom.A:
|
|
w.walkChildren(n)
|
|
case atom.Ul:
|
|
w.renderList(n, false)
|
|
case atom.Ol:
|
|
w.renderList(n, true)
|
|
case atom.Li:
|
|
w.renderListItem(n)
|
|
case atom.Table:
|
|
w.renderTable(n)
|
|
case atom.Img:
|
|
alt := getAttr(n, "alt")
|
|
if alt != "" {
|
|
w.sb.WriteString(alt)
|
|
}
|
|
case atom.Div:
|
|
if isHeadingDiv(n) {
|
|
lvl, text := headingInfo(n)
|
|
if text != "" {
|
|
w.ensureNewline()
|
|
w.sb.WriteString(strings.Repeat("#", lvl))
|
|
w.sb.WriteString(" ")
|
|
w.sb.WriteString(text)
|
|
w.sb.WriteString("\n\n")
|
|
}
|
|
return
|
|
}
|
|
w.walkChildren(n)
|
|
default:
|
|
w.walkChildren(n)
|
|
}
|
|
}
|
|
|
|
func (w *walker) walkChildren(n *html.Node) {
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
w.walk(c)
|
|
}
|
|
}
|
|
|
|
func (w *walker) shouldSkip(n *html.Node) bool {
|
|
if n.Type != html.ElementNode {
|
|
return false
|
|
}
|
|
if getAttr(n, "id") == "toc" || getAttr(n, "id") == "toctemplate" {
|
|
return true
|
|
}
|
|
if hasClass(n, "navigation-not-searchable") {
|
|
return true
|
|
}
|
|
if hasClass(n, "mw-editsection") {
|
|
return true
|
|
}
|
|
// Skip infobox tables — noisy in CLI. RS3 uses "rsw-infobox", OSRS uses "infobox".
|
|
if n.DataAtom == atom.Table && (hasClass(n, "infobox") || hasClassPrefix(n, "rsw-infobox")) {
|
|
return true
|
|
}
|
|
// Skip navbox (bottom-of-page navigation templates).
|
|
if n.DataAtom == atom.Table && hasClass(n, "navbox") {
|
|
return true
|
|
}
|
|
// Skip messagebox / disambig boxes.
|
|
if n.DataAtom == atom.Table && hasClass(n, "messagebox") {
|
|
return true
|
|
}
|
|
// Skip "clear" divs.
|
|
if n.DataAtom == atom.Div && hasClass(n, "clear-template") {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (w *walker) ensureNewline() {
|
|
s := w.sb.String()
|
|
if len(s) > 0 && s[len(s)-1] != '\n' {
|
|
w.sb.WriteString("\n")
|
|
}
|
|
}
|
|
|
|
func (w *walker) renderHeading(n *html.Node) {
|
|
level := 0
|
|
switch n.DataAtom {
|
|
case atom.H1:
|
|
level = 1
|
|
case atom.H2:
|
|
level = 2
|
|
case atom.H3:
|
|
level = 3
|
|
case atom.H4:
|
|
level = 4
|
|
case atom.H5:
|
|
level = 5
|
|
case atom.H6:
|
|
level = 6
|
|
}
|
|
w.ensureNewline()
|
|
w.sb.WriteString(strings.Repeat("#", level))
|
|
w.sb.WriteString(" ")
|
|
w.sb.WriteString(textContent(n))
|
|
w.sb.WriteString("\n\n")
|
|
}
|
|
|
|
func (w *walker) renderList(n *html.Node, ordered bool) {
|
|
w.listCtx = append(w.listCtx, listContext{ordered: ordered})
|
|
w.ensureNewline()
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
w.walk(c)
|
|
}
|
|
w.listCtx = w.listCtx[:len(w.listCtx)-1]
|
|
if len(w.listCtx) == 0 {
|
|
w.sb.WriteString("\n")
|
|
}
|
|
}
|
|
|
|
func (w *walker) renderListItem(n *html.Node) {
|
|
depth := len(w.listCtx)
|
|
if depth == 0 {
|
|
depth = 1
|
|
}
|
|
indent := strings.Repeat(" ", depth-1)
|
|
|
|
ctx := &w.listCtx[len(w.listCtx)-1]
|
|
if ctx.ordered {
|
|
ctx.index++
|
|
fmt.Fprintf(&w.sb, "%s%d. ", indent, ctx.index)
|
|
} else {
|
|
w.sb.WriteString(indent + "- ")
|
|
}
|
|
|
|
// Walk children inline — but nested lists should go on their own lines.
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode && (c.DataAtom == atom.Ul || c.DataAtom == atom.Ol) {
|
|
w.sb.WriteString("\n")
|
|
w.walk(c)
|
|
} else {
|
|
w.walk(c)
|
|
}
|
|
}
|
|
w.ensureNewline()
|
|
}
|
|
|
|
func (w *walker) renderTable(n *html.Node) {
|
|
rows := collectTableRows(n)
|
|
if len(rows) == 0 {
|
|
return
|
|
}
|
|
|
|
w.ensureNewline()
|
|
w.sb.WriteString("\n")
|
|
|
|
var headers []string
|
|
var dataRows [][]string
|
|
|
|
for i, row := range rows {
|
|
var cells []string
|
|
allHeaders := true
|
|
for _, cell := range row {
|
|
text := strings.TrimSpace(textContent(cell))
|
|
text = strings.ReplaceAll(text, "|", "/")
|
|
text = strings.ReplaceAll(text, "\n", " ")
|
|
cells = append(cells, text)
|
|
if cell.DataAtom != atom.Th {
|
|
allHeaders = false
|
|
}
|
|
}
|
|
if i == 0 && allHeaders {
|
|
headers = cells
|
|
} else {
|
|
dataRows = append(dataRows, cells)
|
|
}
|
|
}
|
|
|
|
if headers == nil && len(dataRows) > 0 {
|
|
headers = dataRows[0]
|
|
dataRows = dataRows[1:]
|
|
}
|
|
|
|
if len(headers) == 0 {
|
|
return
|
|
}
|
|
|
|
// Header row.
|
|
w.sb.WriteString("| " + strings.Join(headers, " | ") + " |\n")
|
|
seps := make([]string, len(headers))
|
|
for i := range seps {
|
|
seps[i] = "---"
|
|
}
|
|
w.sb.WriteString("| " + strings.Join(seps, " | ") + " |\n")
|
|
|
|
for _, row := range dataRows {
|
|
for len(row) < len(headers) {
|
|
row = append(row, "")
|
|
}
|
|
w.sb.WriteString("| " + strings.Join(row[:len(headers)], " | ") + " |\n")
|
|
}
|
|
w.sb.WriteString("\n")
|
|
}
|
|
|
|
func collectTableRows(table *html.Node) [][]*html.Node {
|
|
var rows [][]*html.Node
|
|
var visit func(*html.Node)
|
|
visit = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.DataAtom == atom.Tr {
|
|
if isHidden(n) {
|
|
return
|
|
}
|
|
var cells []*html.Node
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode && (c.DataAtom == atom.Th || c.DataAtom == atom.Td) {
|
|
cells = append(cells, c)
|
|
}
|
|
}
|
|
if len(cells) > 0 {
|
|
rows = append(rows, cells)
|
|
}
|
|
return
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
visit(c)
|
|
}
|
|
}
|
|
visit(table)
|
|
return rows
|
|
}
|