Update parser
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
module github.com/runescape-wiki/rsw
|
||||
|
||||
go 1.22
|
||||
go 1.25.0
|
||||
|
||||
require github.com/spf13/cobra v1.8.1
|
||||
|
||||
require (
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
golang.org/x/net v0.51.0 // indirect
|
||||
)
|
||||
|
||||
@@ -6,5 +6,7 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
|
||||
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
|
||||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
|
||||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/runescape-wiki/rsw/internal/extract"
|
||||
"github.com/runescape-wiki/rsw/internal/htmlconv"
|
||||
"github.com/runescape-wiki/rsw/internal/render"
|
||||
"github.com/runescape-wiki/rsw/internal/wiki"
|
||||
"github.com/spf13/cobra"
|
||||
@@ -27,80 +27,54 @@ Examples:
|
||||
title := args[0]
|
||||
client := wiki.NewClient(GameBaseURL())
|
||||
|
||||
page, err := client.GetPage(title)
|
||||
if Raw() {
|
||||
page, err := client.GetPage(title)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
fmt.Println(page.Wikitext)
|
||||
return nil
|
||||
}
|
||||
|
||||
page, err := client.GetPageHTML(title)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch page: %w", err)
|
||||
}
|
||||
|
||||
wikitext := page.Wikitext
|
||||
|
||||
if pageSection != "" {
|
||||
idx := wiki.FindSectionIndex(page.Sections, pageSection)
|
||||
if idx == -1 {
|
||||
needle := strings.ToLower(pageSection)
|
||||
// Case-insensitive exact match
|
||||
for _, s := range page.Sections {
|
||||
if strings.ToLower(s.Line) == needle {
|
||||
fmt.Sscanf(s.Index, "%d", &idx)
|
||||
break
|
||||
}
|
||||
}
|
||||
// Case-insensitive prefix match (e.g. "Location" → "Locations")
|
||||
if idx == -1 {
|
||||
for _, s := range page.Sections {
|
||||
if strings.HasPrefix(strings.ToLower(s.Line), needle) {
|
||||
fmt.Sscanf(s.Index, "%d", &idx)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
// Case-insensitive contains match
|
||||
if idx == -1 {
|
||||
for _, s := range page.Sections {
|
||||
if strings.Contains(strings.ToLower(s.Line), needle) {
|
||||
fmt.Sscanf(s.Index, "%d", &idx)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if idx == -1 {
|
||||
body := htmlconv.ExtractSection(page.HTML, pageSection)
|
||||
if body == "" {
|
||||
sections := htmlconv.ListSections(page.HTML)
|
||||
return fmt.Errorf("section %q not found. Available sections: %s",
|
||||
pageSection, listSections(page.Sections))
|
||||
pageSection, formatSectionNames(sections))
|
||||
}
|
||||
sectionPage, err := client.GetPageSection(title, idx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch section: %w", err)
|
||||
}
|
||||
wikitext = sectionPage.Wikitext
|
||||
}
|
||||
|
||||
if Raw() {
|
||||
fmt.Println(wikitext)
|
||||
md := render.New()
|
||||
md.H1(page.Title)
|
||||
md.Line(body)
|
||||
fmt.Print(md.String())
|
||||
return nil
|
||||
}
|
||||
|
||||
md := render.New()
|
||||
md.H1(page.Title)
|
||||
|
||||
if pageSection == "" && len(page.Sections) > 0 {
|
||||
sections := htmlconv.ListSections(page.HTML)
|
||||
if len(sections) > 0 {
|
||||
md.H2("Sections")
|
||||
for _, s := range page.Sections {
|
||||
for _, s := range sections {
|
||||
indent := ""
|
||||
if s.Level == "3" {
|
||||
if s.Level == 3 {
|
||||
indent = " "
|
||||
} else if s.Level == "4" {
|
||||
} else if s.Level >= 4 {
|
||||
indent = " "
|
||||
}
|
||||
md.Line(fmt.Sprintf("%s- %s", indent, s.Line))
|
||||
md.Line(fmt.Sprintf("%s- %s", indent, s.Name))
|
||||
}
|
||||
md.Newline()
|
||||
md.HR()
|
||||
}
|
||||
|
||||
plain := extract.ExtractPlainText(wikitext)
|
||||
md.P(plain)
|
||||
|
||||
md.Line(htmlconv.Convert(page.HTML))
|
||||
fmt.Print(md.String())
|
||||
return nil
|
||||
},
|
||||
@@ -110,10 +84,10 @@ Examples:
|
||||
return cmd
|
||||
}
|
||||
|
||||
func listSections(sections []wiki.Section) string {
|
||||
func formatSectionNames(sections []htmlconv.SectionInfo) string {
|
||||
names := make([]string, len(sections))
|
||||
for i, s := range sections {
|
||||
names[i] = s.Line
|
||||
names[i] = s.Name
|
||||
}
|
||||
return strings.Join(names, ", ")
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/runescape-wiki/rsw/internal/extract"
|
||||
"github.com/runescape-wiki/rsw/internal/htmlconv"
|
||||
"github.com/runescape-wiki/rsw/internal/render"
|
||||
"github.com/runescape-wiki/rsw/internal/wiki"
|
||||
"github.com/spf13/cobra"
|
||||
@@ -31,19 +31,27 @@ Examples:
|
||||
wikiClient := wiki.NewClient(GameBaseURL())
|
||||
|
||||
trainingTitle := capitalizeFirst(strings.ToLower(skillName)) + " training"
|
||||
page, err := wikiClient.GetPage(trainingTitle)
|
||||
|
||||
if Raw() {
|
||||
page, err := wikiClient.GetPage(trainingTitle)
|
||||
if err != nil {
|
||||
page, err = wikiClient.GetPage(capitalizeFirst(strings.ToLower(skillName)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch skill page: %w", err)
|
||||
}
|
||||
}
|
||||
fmt.Println(page.Wikitext)
|
||||
return nil
|
||||
}
|
||||
|
||||
page, err := wikiClient.GetPageHTML(trainingTitle)
|
||||
if err != nil {
|
||||
page, err = wikiClient.GetPage(capitalizeFirst(strings.ToLower(skillName)))
|
||||
page, err = wikiClient.GetPageHTML(capitalizeFirst(strings.ToLower(skillName)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch skill page: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if Raw() {
|
||||
fmt.Println(page.Wikitext)
|
||||
return nil
|
||||
}
|
||||
|
||||
md := render.New()
|
||||
md.H1(fmt.Sprintf("%s Training Guide", page.Title))
|
||||
|
||||
@@ -51,11 +59,13 @@ Examples:
|
||||
md.P("*Showing methods suitable for ironman accounts (no GE access).*")
|
||||
}
|
||||
|
||||
if len(page.Sections) > 0 {
|
||||
sections := htmlconv.ListSections(page.HTML)
|
||||
|
||||
if len(sections) > 0 {
|
||||
md.H2("Contents")
|
||||
for _, s := range page.Sections {
|
||||
if s.Level == "2" {
|
||||
md.Bullet(s.Line)
|
||||
for _, s := range sections {
|
||||
if s.Level == 2 {
|
||||
md.Bullet(s.Name)
|
||||
}
|
||||
}
|
||||
md.Newline()
|
||||
@@ -63,30 +73,23 @@ Examples:
|
||||
|
||||
if levelRange != "" {
|
||||
found := false
|
||||
for _, s := range page.Sections {
|
||||
if strings.Contains(strings.ToLower(s.Line), strings.ToLower(levelRange)) ||
|
||||
sectionMatchesLevelRange(s.Line, levelRange) {
|
||||
idx := 0
|
||||
fmt.Sscanf(s.Index, "%d", &idx)
|
||||
if idx > 0 {
|
||||
sectionPage, err := wikiClient.GetPageSection(page.Title, idx)
|
||||
if err == nil {
|
||||
plain := extract.ExtractPlainText(sectionPage.Wikitext)
|
||||
if strings.TrimSpace(plain) != "" {
|
||||
md.H2(s.Line)
|
||||
md.P(plain)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
for _, s := range sections {
|
||||
if strings.Contains(strings.ToLower(s.Name), strings.ToLower(levelRange)) ||
|
||||
sectionMatchesLevelRange(s.Name, levelRange) {
|
||||
body := htmlconv.ExtractSection(page.HTML, s.Name)
|
||||
if strings.TrimSpace(body) != "" {
|
||||
md.H2(s.Name)
|
||||
md.Line(body)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
md.P(fmt.Sprintf("*No section found matching level range %q. Showing full guide.*", levelRange))
|
||||
renderFullGuide(md, page, wikiClient)
|
||||
renderFullGuideHTML(md, page.HTML, sections)
|
||||
}
|
||||
} else {
|
||||
renderFullGuide(md, page, wikiClient)
|
||||
renderFullGuideHTML(md, page.HTML, sections)
|
||||
}
|
||||
|
||||
fmt.Print(md.String())
|
||||
@@ -98,24 +101,15 @@ Examples:
|
||||
return cmd
|
||||
}
|
||||
|
||||
func renderFullGuide(md *render.Builder, page *wiki.ParsedPage, client *wiki.Client) {
|
||||
for _, s := range page.Sections {
|
||||
if s.Level != "2" {
|
||||
func renderFullGuideHTML(md *render.Builder, pageHTML string, sections []htmlconv.SectionInfo) {
|
||||
for _, s := range sections {
|
||||
if s.Level != 2 {
|
||||
continue
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(s.Index, "%d", &idx)
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
sectionPage, err := client.GetPageSection(page.Title, idx)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
plain := extract.ExtractPlainText(sectionPage.Wikitext)
|
||||
if strings.TrimSpace(plain) != "" {
|
||||
md.H2(s.Line)
|
||||
md.P(plain)
|
||||
body := htmlconv.ExtractSection(pageHTML, s.Name)
|
||||
if strings.TrimSpace(body) != "" {
|
||||
md.H2(s.Name)
|
||||
md.Line(body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,6 +202,7 @@ func cleanWikiLinks(s string) string {
|
||||
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
|
||||
// and converts wiki list markers (* items) to readable bullet points.
|
||||
func CleanWikitext(s string) string {
|
||||
s = StripTransclusionTags(s)
|
||||
s = expandKnownTemplates(s)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
@@ -239,6 +240,7 @@ func CleanWikitext(s string) string {
|
||||
// ExtractPlainText strips all wikitext markup to produce plain text.
|
||||
func ExtractPlainText(wikitext string) string {
|
||||
s := wikitext
|
||||
s = StripTransclusionTags(s)
|
||||
s = expandKnownTemplates(s)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
@@ -371,6 +373,48 @@ func tryExpandTemplate(inner string) (string, bool) {
|
||||
return "", false
|
||||
}
|
||||
|
||||
// StripTransclusionTags handles MediaWiki transclusion directives in raw wikitext.
|
||||
// When viewing a page directly (not transcluding):
|
||||
// - <noinclude>content</noinclude> → keep content (strip tags only)
|
||||
// - <includeonly>content</includeonly> → remove entirely (tags + content)
|
||||
// - <onlyinclude>content</onlyinclude> → keep content (strip tags only)
|
||||
func StripTransclusionTags(s string) string {
|
||||
// Remove <includeonly>...</includeonly> blocks entirely
|
||||
for {
|
||||
lower := strings.ToLower(s)
|
||||
start := strings.Index(lower, "<includeonly>")
|
||||
if start == -1 {
|
||||
break
|
||||
}
|
||||
end := strings.Index(lower[start:], "</includeonly>")
|
||||
if end == -1 {
|
||||
// Unclosed tag — remove to end of string
|
||||
s = s[:start]
|
||||
break
|
||||
}
|
||||
s = s[:start] + s[start+end+len("</includeonly>"):]
|
||||
}
|
||||
|
||||
// Unwrap <noinclude> and <onlyinclude> (keep content, remove tags)
|
||||
for _, tag := range []string{"noinclude", "onlyinclude"} {
|
||||
s = removeTagKeepContent(s, tag)
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// removeTagKeepContent strips every opening "<tag>" and closing "</tag>"
// occurrence from s while keeping the text between them. Matching is fully
// case-insensitive (the original only handled all-lower and ALL-UPPER
// spellings, so mixed casings like "<NoInclude>" leaked through).
func removeTagKeepContent(s, tag string) string {
	needles := []string{
		strings.ToLower("<" + tag + ">"),
		strings.ToLower("</" + tag + ">"),
	}
	for _, needle := range needles {
		for {
			// Search on a lowered copy so any casing of the tag matches,
			// but splice the original string to preserve surrounding text.
			idx := strings.Index(strings.ToLower(s), needle)
			if idx < 0 {
				break
			}
			s = s[:idx] + s[idx+len(needle):]
		}
	}
	return s
}
|
||||
|
||||
func removeTemplates(s string) string {
|
||||
var b strings.Builder
|
||||
depth := 0
|
||||
|
||||
483
scripts/rsw/internal/htmlconv/htmlconv.go
Normal file
483
scripts/rsw/internal/htmlconv/htmlconv.go
Normal file
@@ -0,0 +1,483 @@
|
||||
package htmlconv
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
)
|
||||
|
||||
// Convert takes MediaWiki-rendered HTML and returns clean markdown.
|
||||
func Convert(rawHTML string) string {
|
||||
doc, err := html.Parse(strings.NewReader(rawHTML))
|
||||
if err != nil {
|
||||
return rawHTML
|
||||
}
|
||||
|
||||
var w walker
|
||||
w.walk(doc)
|
||||
return w.finish()
|
||||
}
|
||||
|
||||
// ExtractSection extracts a single section by heading text and converts it to markdown.
|
||||
// Matches case-insensitively. Returns empty string if not found.
|
||||
func ExtractSection(rawHTML string, sectionName string) string {
|
||||
doc, err := html.Parse(strings.NewReader(rawHTML))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
needle := strings.ToLower(strings.TrimSpace(sectionName))
|
||||
|
||||
// Find the mw-parser-output container.
|
||||
container := findParserOutput(doc)
|
||||
if container == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Walk top-level children to find the target heading div, then collect
|
||||
// siblings until the next heading of equal or lesser depth.
|
||||
var (
|
||||
collecting bool
|
||||
targetLvl int
|
||||
collected []*html.Node
|
||||
)
|
||||
for c := container.FirstChild; c != nil; c = c.NextSibling {
|
||||
if isHeadingDiv(c) {
|
||||
lvl, text := headingInfo(c)
|
||||
if collecting {
|
||||
if lvl <= targetLvl {
|
||||
break
|
||||
}
|
||||
}
|
||||
if !collecting && strings.ToLower(strings.TrimSpace(text)) == needle {
|
||||
collecting = true
|
||||
targetLvl = lvl
|
||||
continue
|
||||
}
|
||||
}
|
||||
if collecting {
|
||||
collected = append(collected, c)
|
||||
}
|
||||
}
|
||||
|
||||
if len(collected) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
var w walker
|
||||
for _, n := range collected {
|
||||
w.walk(n)
|
||||
}
|
||||
return w.finish()
|
||||
}
|
||||
|
||||
// ListSections returns section names and levels from the HTML.
|
||||
func ListSections(rawHTML string) []SectionInfo {
|
||||
doc, err := html.Parse(strings.NewReader(rawHTML))
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
container := findParserOutput(doc)
|
||||
if container == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var sections []SectionInfo
|
||||
for c := container.FirstChild; c != nil; c = c.NextSibling {
|
||||
if isHeadingDiv(c) {
|
||||
lvl, text := headingInfo(c)
|
||||
if text != "" {
|
||||
sections = append(sections, SectionInfo{Name: text, Level: lvl})
|
||||
}
|
||||
}
|
||||
}
|
||||
return sections
|
||||
}
|
||||
|
||||
// SectionInfo holds a section heading name and level.
type SectionInfo struct {
	Name  string // heading text, with edit-section widgets stripped
	Level int    // heading depth: 2 for h2, 3 for h3, and so on
}
|
||||
|
||||
// --- DOM helpers ---
|
||||
|
||||
func findParserOutput(n *html.Node) *html.Node {
|
||||
if n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClass(n, "mw-parser-output") {
|
||||
return n
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if found := findParserOutput(c); found != nil {
|
||||
return found
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func isHeadingDiv(n *html.Node) bool {
|
||||
return n.Type == html.ElementNode && n.DataAtom == atom.Div && hasClassPrefix(n, "mw-heading")
|
||||
}
|
||||
|
||||
func headingInfo(div *html.Node) (level int, text string) {
|
||||
for c := div.FirstChild; c != nil; c = c.NextSibling {
|
||||
if c.Type == html.ElementNode {
|
||||
switch c.DataAtom {
|
||||
case atom.H1:
|
||||
return 1, textContent(c)
|
||||
case atom.H2:
|
||||
return 2, textContent(c)
|
||||
case atom.H3:
|
||||
return 3, textContent(c)
|
||||
case atom.H4:
|
||||
return 4, textContent(c)
|
||||
case atom.H5:
|
||||
return 5, textContent(c)
|
||||
case atom.H6:
|
||||
return 6, textContent(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, ""
|
||||
}
|
||||
|
||||
func textContent(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
}
|
||||
// Skip edit section spans.
|
||||
if n.Type == html.ElementNode && hasClass(n, "mw-editsection") {
|
||||
return ""
|
||||
}
|
||||
var sb strings.Builder
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
sb.WriteString(textContent(c))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
func hasClass(n *html.Node, cls string) bool {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == "class" {
|
||||
for _, c := range strings.Fields(a.Val) {
|
||||
if c == cls {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func hasClassPrefix(n *html.Node, prefix string) bool {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == "class" {
|
||||
for _, c := range strings.Fields(a.Val) {
|
||||
if strings.HasPrefix(c, prefix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func getAttr(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func isHidden(n *html.Node) bool {
|
||||
style := getAttr(n, "style")
|
||||
return strings.Contains(style, "display:none") || strings.Contains(style, "display: none")
|
||||
}
|
||||
|
||||
// --- walker: recursive HTML-to-markdown converter ---
|
||||
|
||||
// walker accumulates markdown output while recursively visiting an HTML tree.
type walker struct {
	sb      strings.Builder // rendered markdown so far
	listCtx []listContext   // stack of enclosing <ul>/<ol> elements
}
|
||||
|
||||
// listContext records one level of list nesting during rendering.
type listContext struct {
	ordered bool // true for <ol>, false for <ul>
	index   int  // next item number within an ordered list
}
|
||||
|
||||
func (w *walker) finish() string {
|
||||
out := w.sb.String()
|
||||
// Collapse runs of 3+ newlines to 2.
|
||||
for strings.Contains(out, "\n\n\n") {
|
||||
out = strings.ReplaceAll(out, "\n\n\n", "\n\n")
|
||||
}
|
||||
return strings.TrimSpace(out) + "\n"
|
||||
}
|
||||
|
||||
// walk renders node n (and, for container elements, its subtree) into the
// builder. Dispatch is by element kind; unrecognized elements fall through
// to their children so no text is lost.
func (w *walker) walk(n *html.Node) {
	switch n.Type {
	case html.TextNode:
		w.sb.WriteString(n.Data)
		return
	case html.ElementNode:
		// skip
	case html.DocumentNode:
		w.walkChildren(n)
		return
	default:
		// Comments, doctypes, etc. contribute nothing.
		return
	}

	// Elements styled display:none are invisible in a browser; drop them.
	if isHidden(n) {
		return
	}

	// Page furniture (TOC, infoboxes, navboxes, edit links) is skipped.
	if w.shouldSkip(n) {
		return
	}

	switch n.DataAtom {
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		w.renderHeading(n)
	case atom.P:
		w.ensureNewline()
		w.walkChildren(n)
		w.sb.WriteString("\n\n")
	case atom.Br:
		w.sb.WriteString("\n")
	case atom.B, atom.Strong:
		w.sb.WriteString("**")
		w.walkChildren(n)
		w.sb.WriteString("**")
	case atom.I, atom.Em:
		w.sb.WriteString("*")
		w.walkChildren(n)
		w.sb.WriteString("*")
	case atom.A:
		// Links flatten to their text; URLs would be noise in a terminal.
		w.walkChildren(n)
	case atom.Ul:
		w.renderList(n, false)
	case atom.Ol:
		w.renderList(n, true)
	case atom.Li:
		w.renderListItem(n)
	case atom.Table:
		w.renderTable(n)
	case atom.Img:
		// Images degrade to their alt text when one is present.
		alt := getAttr(n, "alt")
		if alt != "" {
			w.sb.WriteString(alt)
		}
	case atom.Div:
		// MediaWiki wraps headings in a "mw-heading" div; render those as
		// markdown headings and do not descend further. Other divs are
		// transparent containers.
		if isHeadingDiv(n) {
			lvl, text := headingInfo(n)
			if text != "" {
				w.ensureNewline()
				w.sb.WriteString(strings.Repeat("#", lvl))
				w.sb.WriteString(" ")
				w.sb.WriteString(text)
				w.sb.WriteString("\n\n")
			}
			return
		}
		w.walkChildren(n)
	default:
		w.walkChildren(n)
	}
}
|
||||
|
||||
func (w *walker) walkChildren(n *html.Node) {
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
w.walk(c)
|
||||
}
|
||||
}
|
||||
|
||||
func (w *walker) shouldSkip(n *html.Node) bool {
|
||||
if n.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
if getAttr(n, "id") == "toc" || getAttr(n, "id") == "toctemplate" {
|
||||
return true
|
||||
}
|
||||
if hasClass(n, "navigation-not-searchable") {
|
||||
return true
|
||||
}
|
||||
if hasClass(n, "mw-editsection") {
|
||||
return true
|
||||
}
|
||||
// Skip infobox tables — noisy in CLI. RS3 uses "rsw-infobox", OSRS uses "infobox".
|
||||
if n.DataAtom == atom.Table && (hasClass(n, "infobox") || hasClassPrefix(n, "rsw-infobox")) {
|
||||
return true
|
||||
}
|
||||
// Skip navbox (bottom-of-page navigation templates).
|
||||
if n.DataAtom == atom.Table && hasClass(n, "navbox") {
|
||||
return true
|
||||
}
|
||||
// Skip messagebox / disambig boxes.
|
||||
if n.DataAtom == atom.Table && hasClass(n, "messagebox") {
|
||||
return true
|
||||
}
|
||||
// Skip "clear" divs.
|
||||
if n.DataAtom == atom.Div && hasClass(n, "clear-template") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (w *walker) ensureNewline() {
|
||||
s := w.sb.String()
|
||||
if len(s) > 0 && s[len(s)-1] != '\n' {
|
||||
w.sb.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
func (w *walker) renderHeading(n *html.Node) {
|
||||
level := 0
|
||||
switch n.DataAtom {
|
||||
case atom.H1:
|
||||
level = 1
|
||||
case atom.H2:
|
||||
level = 2
|
||||
case atom.H3:
|
||||
level = 3
|
||||
case atom.H4:
|
||||
level = 4
|
||||
case atom.H5:
|
||||
level = 5
|
||||
case atom.H6:
|
||||
level = 6
|
||||
}
|
||||
w.ensureNewline()
|
||||
w.sb.WriteString(strings.Repeat("#", level))
|
||||
w.sb.WriteString(" ")
|
||||
w.sb.WriteString(textContent(n))
|
||||
w.sb.WriteString("\n\n")
|
||||
}
|
||||
|
||||
func (w *walker) renderList(n *html.Node, ordered bool) {
|
||||
w.listCtx = append(w.listCtx, listContext{ordered: ordered})
|
||||
w.ensureNewline()
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
w.walk(c)
|
||||
}
|
||||
w.listCtx = w.listCtx[:len(w.listCtx)-1]
|
||||
if len(w.listCtx) == 0 {
|
||||
w.sb.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
func (w *walker) renderListItem(n *html.Node) {
|
||||
depth := len(w.listCtx)
|
||||
if depth == 0 {
|
||||
depth = 1
|
||||
}
|
||||
indent := strings.Repeat(" ", depth-1)
|
||||
|
||||
ctx := &w.listCtx[len(w.listCtx)-1]
|
||||
if ctx.ordered {
|
||||
ctx.index++
|
||||
w.sb.WriteString(fmt.Sprintf("%s%d. ", indent, ctx.index))
|
||||
} else {
|
||||
w.sb.WriteString(indent + "- ")
|
||||
}
|
||||
|
||||
// Walk children inline — but nested lists should go on their own lines.
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if c.Type == html.ElementNode && (c.DataAtom == atom.Ul || c.DataAtom == atom.Ol) {
|
||||
w.sb.WriteString("\n")
|
||||
w.walk(c)
|
||||
} else {
|
||||
w.walk(c)
|
||||
}
|
||||
}
|
||||
w.ensureNewline()
|
||||
}
|
||||
|
||||
// renderTable renders an HTML table as a markdown pipe table. The first row
// whose cells are all <th> becomes the header; otherwise the first data row
// is promoted. Cell text is flattened to one line and "|" is replaced with
// "/" so cell content cannot break the table syntax.
func (w *walker) renderTable(n *html.Node) {
	rows := collectTableRows(n)
	if len(rows) == 0 {
		return
	}

	w.ensureNewline()
	w.sb.WriteString("\n")

	var headers []string
	var dataRows [][]string

	for i, row := range rows {
		var cells []string
		allHeaders := true
		for _, cell := range row {
			text := strings.TrimSpace(textContent(cell))
			// Sanitize characters that would corrupt pipe-table markup.
			text = strings.ReplaceAll(text, "|", "/")
			text = strings.ReplaceAll(text, "\n", " ")
			cells = append(cells, text)
			if cell.DataAtom != atom.Th {
				allHeaders = false
			}
		}
		if i == 0 && allHeaders {
			headers = cells
		} else {
			dataRows = append(dataRows, cells)
		}
	}

	// No <th> header row found: promote the first data row to header.
	if headers == nil && len(dataRows) > 0 {
		headers = dataRows[0]
		dataRows = dataRows[1:]
	}

	if len(headers) == 0 {
		return
	}

	// Header row.
	w.sb.WriteString("| " + strings.Join(headers, " | ") + " |\n")
	seps := make([]string, len(headers))
	for i := range seps {
		seps[i] = "---"
	}
	w.sb.WriteString("| " + strings.Join(seps, " | ") + " |\n")

	// Data rows, padded or truncated to the header width.
	for _, row := range dataRows {
		for len(row) < len(headers) {
			row = append(row, "")
		}
		w.sb.WriteString("| " + strings.Join(row[:len(headers)], " | ") + " |\n")
	}
	w.sb.WriteString("\n")
}
|
||||
|
||||
func collectTableRows(table *html.Node) [][]*html.Node {
|
||||
var rows [][]*html.Node
|
||||
var visit func(*html.Node)
|
||||
visit = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && n.DataAtom == atom.Tr {
|
||||
if isHidden(n) {
|
||||
return
|
||||
}
|
||||
var cells []*html.Node
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if c.Type == html.ElementNode && (c.DataAtom == atom.Th || c.DataAtom == atom.Td) {
|
||||
cells = append(cells, c)
|
||||
}
|
||||
}
|
||||
if len(cells) > 0 {
|
||||
rows = append(rows, cells)
|
||||
}
|
||||
return
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
visit(c)
|
||||
}
|
||||
}
|
||||
visit(table)
|
||||
return rows
|
||||
}
|
||||
@@ -97,9 +97,10 @@ func (c *Client) GetPageSection(title string, sectionIndex int) (*ParsedPage, er
|
||||
// GetPageHTML fetches the rendered HTML for a page.
|
||||
func (c *Client) GetPageHTML(title string) (*ParsedPage, error) {
|
||||
params := url.Values{
|
||||
"action": {"parse"},
|
||||
"page": {title},
|
||||
"prop": {"text|sections"},
|
||||
"action": {"parse"},
|
||||
"page": {title},
|
||||
"prop": {"text|sections"},
|
||||
"redirects": {"1"},
|
||||
}
|
||||
|
||||
var resp parseResponse
|
||||
@@ -133,3 +134,4 @@ func FindSectionIndex(sections []Section, name string) int {
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user