546 lines
13 KiB
Go
546 lines
13 KiB
Go
package extract
|
|
|
|
import (
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// Infobox represents a parsed wiki infobox (or any template) as key-value pairs.
type Infobox struct {
	// TemplateName is the text before the template's first top-level '|',
	// with surrounding whitespace trimmed.
	TemplateName string
	// Params maps parameter names to values. Named parameters are stored
	// under their trimmed key; unnamed parameters are stored under "1",
	// "2", ... in order of appearance.
	Params map[string]string
}
|
|
|
|
// ParseTemplates extracts all top-level templates from wikitext.
|
|
// It handles nested templates (e.g., {{formatnum:{{GEPrice|...}}}}) by tracking brace depth.
|
|
func ParseTemplates(wikitext string) []Infobox {
|
|
var results []Infobox
|
|
i := 0
|
|
for i < len(wikitext)-1 {
|
|
if wikitext[i] == '{' && wikitext[i+1] == '{' {
|
|
end := findTemplateEnd(wikitext, i)
|
|
if end == -1 {
|
|
i++
|
|
continue
|
|
}
|
|
inner := wikitext[i+2 : end]
|
|
if box := parseTemplate(inner); box != nil {
|
|
results = append(results, *box)
|
|
}
|
|
i = end + 2
|
|
} else {
|
|
i++
|
|
}
|
|
}
|
|
return results
|
|
}
|
|
|
|
// FindTemplate searches parsed templates for one matching the given name (case-insensitive).
|
|
func FindTemplate(templates []Infobox, name string) *Infobox {
|
|
target := strings.ToLower(strings.TrimSpace(name))
|
|
for i, t := range templates {
|
|
if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target {
|
|
return &templates[i]
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FindAllTemplates returns all templates matching the given name.
|
|
func FindAllTemplates(templates []Infobox, name string) []Infobox {
|
|
target := strings.ToLower(strings.TrimSpace(name))
|
|
var matches []Infobox
|
|
for _, t := range templates {
|
|
if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target {
|
|
matches = append(matches, t)
|
|
}
|
|
}
|
|
return matches
|
|
}
|
|
|
|
// findTemplateEnd returns the index of the first character of the "}}" that
// closes the template opened at pos, or -1 if no such close exists.
//
// Scanning starts at pos (expected to point at "{{") and counts "{{" / "}}"
// pairs; the close that brings the count back to zero is the answer.
func findTemplateEnd(s string, pos int) int {
	open := 0
	last := len(s) - 1
	for cur := pos; cur < last; {
		switch s[cur : cur+2] {
		case "{{":
			open++
			cur += 2
		case "}}":
			open--
			if open == 0 {
				return cur
			}
			cur += 2
		default:
			cur++
		}
	}
	return -1
}
|
|
|
|
// parseTemplate parses the inner content of a {{...}} template.
|
|
func parseTemplate(inner string) *Infobox {
|
|
// Split on '|' but respect nested templates
|
|
parts := splitOnPipes(inner)
|
|
if len(parts) == 0 {
|
|
return nil
|
|
}
|
|
|
|
name := strings.TrimSpace(parts[0])
|
|
// Skip parser functions like #if, #switch, etc.
|
|
if strings.HasPrefix(name, "#") {
|
|
return nil
|
|
}
|
|
|
|
params := make(map[string]string)
|
|
positional := 1
|
|
for _, part := range parts[1:] {
|
|
eqIdx := strings.Index(part, "=")
|
|
if eqIdx > 0 {
|
|
key := strings.TrimSpace(part[:eqIdx])
|
|
val := strings.TrimSpace(part[eqIdx+1:])
|
|
// Clean up common wikitext artifacts
|
|
val = cleanValue(val)
|
|
if key != "" {
|
|
params[key] = val
|
|
}
|
|
} else {
|
|
// Positional parameter
|
|
params[strconv.Itoa(positional)] = strings.TrimSpace(part)
|
|
positional++
|
|
}
|
|
}
|
|
|
|
return &Infobox{
|
|
TemplateName: name,
|
|
Params: params,
|
|
}
|
|
}
|
|
|
|
// splitOnPipes splits s on '|' while respecting nested {{...}} and [[...]]:
// a pipe only separates segments when it is outside every such pair.
//
// The delimiters themselves are kept in the output segments. An unmatched
// "}}" or "]]" is treated as literal text and never pushes the nesting depth
// below zero, so pipes that follow a stray closer still split. The result
// always has at least one element (a single "" for empty input).
func splitOnPipes(s string) []string {
	var parts []string
	var current strings.Builder
	braceDepth, bracketDepth := 0, 0

	for i := 0; i < len(s); i++ {
		ch := s[i]
		// Two-byte lookahead for the paired delimiters; "" at end of input.
		pair := ""
		if i+1 < len(s) {
			pair = s[i : i+2]
		}
		switch {
		case pair == "{{":
			braceDepth++
			current.WriteString(pair)
			i++
		case pair == "}}":
			if braceDepth > 0 {
				braceDepth--
			}
			current.WriteString(pair)
			i++
		case pair == "[[":
			bracketDepth++
			current.WriteString(pair)
			i++
		case pair == "]]":
			if bracketDepth > 0 {
				bracketDepth--
			}
			current.WriteString(pair)
			i++
		case ch == '|' && braceDepth == 0 && bracketDepth == 0:
			parts = append(parts, current.String())
			current.Reset()
		default:
			current.WriteByte(ch)
		}
	}
	parts = append(parts, current.String())
	return parts
}
|
|
|
|
// cleanValue strips common wikitext formatting from a value.
|
|
func cleanValue(s string) string {
|
|
// Remove [[ ]] wiki links, keeping the display text
|
|
s = cleanWikiLinks(s)
|
|
// Remove '' and ''' (bold/italic)
|
|
s = strings.ReplaceAll(s, "'''", "")
|
|
s = strings.ReplaceAll(s, "''", "")
|
|
// Trim whitespace
|
|
s = strings.TrimSpace(s)
|
|
return s
|
|
}
|
|
|
|
// cleanWikiLinks rewrites [[Target|Display]] as Display and [[Target]] as
// Target. Everything after the first '|' inside a link is kept, so
// [[a|b|c]] becomes "b|c". A "[[" with no later "]]" is left untouched.
func cleanWikiLinks(s string) string {
	var out strings.Builder
	for pos := 0; pos < len(s); {
		// closeRel is the offset of the matching "]]" relative to pos,
		// or -1 when pos does not start a closed link.
		closeRel := -1
		if strings.HasPrefix(s[pos:], "[[") {
			closeRel = strings.Index(s[pos:], "]]")
		}
		if closeRel < 0 {
			out.WriteByte(s[pos])
			pos++
			continue
		}
		body := s[pos+2 : pos+closeRel]
		if bar := strings.IndexByte(body, '|'); bar >= 0 {
			body = body[bar+1:]
		}
		out.WriteString(body)
		pos += closeRel + 2
	}
	return out.String()
}
|
|
|
|
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
|
|
// and converts wiki list markers (* items) to readable bullet points.
|
|
func CleanWikitext(s string) string {
|
|
s = expandKnownTemplates(s)
|
|
s = removeTemplates(s)
|
|
s = cleanWikiLinks(s)
|
|
s = strings.ReplaceAll(s, "'''", "")
|
|
s = strings.ReplaceAll(s, "''", "")
|
|
s = removeWikiTables(s)
|
|
s = stripHTMLTags(s)
|
|
s = removeRefs(s)
|
|
s = removeSectionHeadings(s)
|
|
s = removeFileAndCategoryLines(s)
|
|
s = removeFileRefs(s)
|
|
|
|
// Process line by line to preserve list structure
|
|
lines := strings.Split(s, "\n")
|
|
var out []string
|
|
for _, line := range lines {
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed == "" {
|
|
continue
|
|
}
|
|
// Convert wiki list markers to markdown bullets
|
|
if strings.HasPrefix(trimmed, "***") {
|
|
out = append(out, " - "+strings.TrimSpace(trimmed[3:]))
|
|
} else if strings.HasPrefix(trimmed, "**") {
|
|
out = append(out, " - "+strings.TrimSpace(trimmed[2:]))
|
|
} else if strings.HasPrefix(trimmed, "*") {
|
|
out = append(out, "- "+strings.TrimSpace(trimmed[1:]))
|
|
} else {
|
|
out = append(out, trimmed)
|
|
}
|
|
}
|
|
return strings.Join(out, "\n")
|
|
}
|
|
|
|
// ExtractPlainText strips all wikitext markup to produce plain text.
|
|
func ExtractPlainText(wikitext string) string {
|
|
s := wikitext
|
|
s = expandKnownTemplates(s)
|
|
s = removeTemplates(s)
|
|
s = cleanWikiLinks(s)
|
|
s = strings.ReplaceAll(s, "'''", "")
|
|
s = strings.ReplaceAll(s, "''", "")
|
|
// Remove wiki tables {| ... |}
|
|
s = removeWikiTables(s)
|
|
// Remove section headings (== Foo ==, === Bar ===, etc.)
|
|
s = removeSectionHeadings(s)
|
|
// Remove HTML tags
|
|
s = stripHTMLTags(s)
|
|
// Remove references
|
|
s = removeRefs(s)
|
|
// Remove file/image links and category lines
|
|
s = removeFileAndCategoryLines(s)
|
|
// Remove leftover File: references
|
|
s = removeFileRefs(s)
|
|
// Collapse whitespace
|
|
s = collapseWhitespace(s)
|
|
return strings.TrimSpace(s)
|
|
}
|
|
|
|
// expandKnownTemplates replaces well-known templates with plain text equivalents
|
|
// before the generic template removal pass. This preserves useful data from
|
|
// templates like {{Skillreq|Mining|75}} → "75 Mining".
|
|
func expandKnownTemplates(s string) string {
|
|
var b strings.Builder
|
|
i := 0
|
|
for i < len(s)-1 {
|
|
if s[i] == '{' && s[i+1] == '{' {
|
|
end := findTemplateEnd(s, i)
|
|
if end == -1 {
|
|
b.WriteByte(s[i])
|
|
i++
|
|
continue
|
|
}
|
|
inner := s[i+2 : end]
|
|
if expanded, ok := tryExpandTemplate(inner); ok {
|
|
b.WriteString(expanded)
|
|
} else {
|
|
// Leave it for removeTemplates to handle
|
|
b.WriteString(s[i : end+2])
|
|
}
|
|
i = end + 2
|
|
} else {
|
|
b.WriteByte(s[i])
|
|
i++
|
|
}
|
|
}
|
|
if i < len(s) {
|
|
b.WriteByte(s[i])
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func tryExpandTemplate(inner string) (string, bool) {
|
|
parts := splitOnPipes(inner)
|
|
if len(parts) == 0 {
|
|
return "", false
|
|
}
|
|
name := strings.TrimSpace(parts[0])
|
|
lower := strings.ToLower(name)
|
|
|
|
switch lower {
|
|
case "skillreq", "scp":
|
|
// {{Skillreq|Skill|Level}} or {{SCP|Level|Skill}} → "Level Skill"
|
|
if len(parts) >= 3 {
|
|
if lower == "scp" {
|
|
return strings.TrimSpace(parts[1]) + " " + strings.TrimSpace(parts[2]), true
|
|
}
|
|
return strings.TrimSpace(parts[2]) + " " + strings.TrimSpace(parts[1]), true
|
|
}
|
|
if len(parts) == 2 {
|
|
return strings.TrimSpace(parts[1]), true
|
|
}
|
|
case "fairycode":
|
|
if len(parts) >= 2 {
|
|
return strings.TrimSpace(parts[1]), true
|
|
}
|
|
case "coins", "coins detail":
|
|
if len(parts) >= 2 {
|
|
return strings.TrimSpace(parts[1]) + " coins", true
|
|
}
|
|
|
|
case "loctablehead", "loctablebottom":
|
|
return "", true
|
|
|
|
case "locline":
|
|
paramMap := map[string]string{}
|
|
for _, p := range parts[1:] {
|
|
if idx := strings.Index(p, "="); idx > 0 {
|
|
k := strings.ToLower(strings.TrimSpace(p[:idx]))
|
|
v := cleanWikiLinks(strings.TrimSpace(p[idx+1:]))
|
|
paramMap[k] = v
|
|
}
|
|
}
|
|
// Support both OSRS (location/levels/members) and RS3 (loc/lvls/mem) param names
|
|
locVal := paramMap["location"]
|
|
if locVal == "" {
|
|
locVal = paramMap["loc"]
|
|
}
|
|
lvlVal := paramMap["levels"]
|
|
if lvlVal == "" {
|
|
lvlVal = paramMap["lvls"]
|
|
}
|
|
memVal := paramMap["members"]
|
|
if memVal == "" {
|
|
memVal = paramMap["mem"]
|
|
}
|
|
|
|
var segments []string
|
|
if locVal != "" {
|
|
segments = append(segments, locVal)
|
|
}
|
|
if lvlVal != "" {
|
|
segments = append(segments, "level "+lvlVal)
|
|
}
|
|
if memVal != "" {
|
|
if strings.EqualFold(memVal, "yes") {
|
|
segments = append(segments, "members only")
|
|
} else {
|
|
segments = append(segments, "F2P")
|
|
}
|
|
}
|
|
if len(segments) == 0 {
|
|
return "", true
|
|
}
|
|
return "\n- " + strings.Join(segments, ", "), true
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
// removeTemplates deletes every {{...}} template from s, including nested
// ones, and returns the remaining text. Unmatched "}}" are dropped without
// affecting nesting; text after an unclosed "{{" is discarded.
func removeTemplates(s string) string {
	var out strings.Builder
	nesting := 0
	for pos := 0; pos < len(s); pos++ {
		// Two-byte lookahead; "" on the final byte.
		pair := ""
		if pos+1 < len(s) {
			pair = s[pos : pos+2]
		}
		switch {
		case pair == "{{":
			nesting++
			pos++
		case pair == "}}":
			if nesting > 0 {
				nesting--
			}
			pos++
		case nesting == 0:
			out.WriteByte(s[pos])
		}
	}
	return out.String()
}
|
|
|
|
// stripHTMLTags removes everything from a '<' up to and including the next
// '>' and returns the remaining text. A '>' that does not close a tag (for
// example in "5 > 3") is ordinary text and is kept. Text after a '<' that is
// never closed is dropped.
func stripHTMLTags(s string) string {
	var b strings.Builder
	inTag := false
	for _, r := range s {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			inTag = false
		case !inTag:
			b.WriteRune(r)
		}
	}
	return b.String()
}
|
|
|
|
// removeRefs removes <ref>...</ref> elements (including their content) and
// self-closing <ref ... /> tags from s.
//
// Whether a tag is self-closing is decided from the opening tag alone (a '/'
// directly before its '>'), so a "/>" that occurs inside the reference body,
// such as <br/>, does not cut the reference short. Any tag whose name starts
// with "<ref" is matched. Processing stops at the first reference that has
// no closing '>' or no "</ref>"; the text from there on is returned as-is.
func removeRefs(s string) string {
	const closeTag = "</ref>"
	for {
		start := strings.Index(s, "<ref")
		if start == -1 {
			break
		}
		// Offset (from start) of the '>' that ends the opening tag. It is
		// always >= 4 because "<ref" itself contains no '>'.
		gt := strings.IndexByte(s[start:], '>')
		if gt == -1 {
			break
		}
		afterOpen := start + gt + 1
		if s[afterOpen-2] == '/' {
			// Self-closing: drop just the tag.
			s = s[:start] + s[afterOpen:]
			continue
		}
		end := strings.Index(s[afterOpen:], closeTag)
		if end == -1 {
			break
		}
		s = s[:start] + s[afterOpen+end+len(closeTag):]
	}
	return s
}
|
|
|
|
// removeWikiTables drops wiki tables from s. A line whose trimmed text starts
// with "{|" opens a table and one starting with "|}" closes it; those marker
// lines and every line in between are removed, with nesting respected. Each
// kept line is written back followed by "\n", so the result always ends with
// a newline.
func removeWikiTables(s string) string {
	var out strings.Builder
	nesting := 0
	for _, raw := range strings.Split(s, "\n") {
		marker := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(marker, "{|"):
			nesting++
		case strings.HasPrefix(marker, "|}"):
			// A stray closer outside any table is dropped but not counted.
			if nesting > 0 {
				nesting--
			}
		case nesting == 0:
			out.WriteString(raw)
			out.WriteByte('\n')
		}
	}
	return out.String()
}
|
|
|
|
// removeFileAndCategoryLines drops whole lines that are file/image embeds or
// category tags and removes inline [[Category:...]] links from the remaining
// lines. All matching is case-insensitive.
//
// A line is dropped when its trimmed, lowercased text starts with "[[file:",
// "[[image:", "[[category:", "category:" or "thumb|", equals "thumb", or
// contains "px|" together with "thumb", "right" or "left". Each kept line is
// written back followed by "\n".
func removeFileAndCategoryLines(s string) string {
	const categoryOpen = "[[category:"
	// indexCategory finds categoryOpen in line ignoring case. It compares
	// against the original bytes, so the returned offset is always valid for
	// slicing line.
	indexCategory := func(line string) int {
		for i := 0; i+len(categoryOpen) <= len(line); i++ {
			if strings.EqualFold(line[i:i+len(categoryOpen)], categoryOpen) {
				return i
			}
		}
		return -1
	}

	var b strings.Builder
	for _, line := range strings.Split(s, "\n") {
		lower := strings.ToLower(strings.TrimSpace(line))
		// Skip lines that are file/image embeds or category tags.
		if strings.HasPrefix(lower, "[[file:") || strings.HasPrefix(lower, "[[image:") ||
			strings.HasPrefix(lower, "[[category:") || strings.HasPrefix(lower, "category:") ||
			strings.HasPrefix(lower, "thumb|") || lower == "thumb" ||
			(strings.Contains(lower, "px|") && (strings.Contains(lower, "thumb") || strings.Contains(lower, "right") || strings.Contains(lower, "left"))) {
			continue
		}
		// Remove inline [[Category:...]] references.
		for {
			start := indexCategory(line)
			if start == -1 {
				break
			}
			end := strings.Index(line[start:], "]]")
			if end == -1 {
				// Unterminated link: leave the rest of the line alone.
				break
			}
			line = line[:start] + line[start+end+2:]
		}
		b.WriteString(line)
		b.WriteByte('\n')
	}
	return b.String()
}
|
|
|
|
// removeSectionHeadings drops every line whose trimmed text both starts and
// ends with "==" (wiki headings such as "== Foo ==" or "=== Bar ==="). Each
// kept line is written back followed by "\n".
func removeSectionHeadings(s string) string {
	isHeading := func(line string) bool {
		t := strings.TrimSpace(line)
		return strings.HasPrefix(t, "==") && strings.HasSuffix(t, "==")
	}

	var out strings.Builder
	for _, raw := range strings.Split(s, "\n") {
		if isHeading(raw) {
			continue
		}
		out.WriteString(raw)
		out.WriteByte('\n')
	}
	return out.String()
}
|
|
|
|
// removeFileRefs removes inline "File:" references (left over after wiki
// link cleaning) from every line of s. Matching is case-insensitive; each
// reference extends from "file:" to the next space or the end of the line.
// Every line is written back followed by "\n".
//
// The search runs on the original bytes rather than on a lowercased copy:
// lowercasing can change a string's byte length, which would make offsets
// found in the copy invalid for slicing the original.
func removeFileRefs(s string) string {
	const marker = "file:"
	var b strings.Builder
	for _, line := range strings.Split(s, "\n") {
		cleaned := line
		for {
			idx := indexFoldASCII(cleaned, marker)
			if idx == -1 {
				break
			}
			// Find the end of this reference: it usually ends at
			// whitespace or the next sentence.
			end := idx + len(marker)
			for end < len(cleaned) && cleaned[end] != ' ' && cleaned[end] != '\n' {
				end++
			}
			cleaned = cleaned[:idx] + cleaned[end:]
		}
		b.WriteString(cleaned)
		b.WriteByte('\n')
	}
	return b.String()
}

// indexFoldASCII returns the byte index of the first case-insensitive match
// of the ASCII string sub in s, or -1 if there is none.
func indexFoldASCII(s, sub string) int {
	for i := 0; i+len(sub) <= len(s); i++ {
		if strings.EqualFold(s[i:i+len(sub)], sub) {
			return i
		}
	}
	return -1
}
|
|
|
|
// collapseWhitespace squeezes runs of whitespace in s. Consecutive newlines
// (optionally separated by other whitespace) become one "\n", runs of other
// whitespace become one space, and whitespace directly after a newline is
// dropped. Whitespace before a newline is not removed.
func collapseWhitespace(s string) string {
	var out strings.Builder
	afterSpace, afterNewline := false, false
	for _, r := range s {
		switch {
		case r == '\n':
			if !afterNewline {
				out.WriteByte('\n')
			}
			afterNewline, afterSpace = true, false
		case unicode.IsSpace(r):
			if !(afterSpace || afterNewline) {
				out.WriteByte(' ')
			}
			// afterNewline is deliberately left as-is so that
			// "\n  \n" still collapses to a single newline.
			afterSpace = true
		default:
			out.WriteRune(r)
			afterSpace, afterNewline = false, false
		}
	}
	return out.String()
}
|