Files
2026-03-05 12:51:31 -06:00

590 lines
14 KiB
Go

package extract
import (
"strconv"
"strings"
"unicode"
)
// Infobox represents a parsed wiki infobox (or any template) as key-value pairs.
type Infobox struct {
// TemplateName is the whitespace-trimmed text that precedes the first
// top-level '|' of the template body.
TemplateName string
// Params holds the template arguments. Named arguments are stored under
// their trimmed key; positional arguments are stored under their decimal
// position, counted from "1".
Params map[string]string
}
// ParseTemplates scans wikitext for {{...}} templates and returns every
// outermost one that parseTemplate accepts, in order of appearance.
//
// Nesting (e.g. {{formatnum:{{GEPrice|...}}}}) is handled by brace-depth
// tracking: an inner template is part of the outer template's body and is not
// reported on its own. An opening "{{" that never closes is stepped over one
// byte at a time, so templates that follow it are still found.
func ParseTemplates(wikitext string) []Infobox {
	var found []Infobox
	for pos := 0; pos+1 < len(wikitext); {
		if wikitext[pos] != '{' || wikitext[pos+1] != '{' {
			pos++
			continue
		}
		closeAt := findTemplateEnd(wikitext, pos)
		if closeAt == -1 {
			// Unbalanced opener: treat it as ordinary text.
			pos++
			continue
		}
		if tpl := parseTemplate(wikitext[pos+2 : closeAt]); tpl != nil {
			found = append(found, *tpl)
		}
		// Resume right after the closing "}}".
		pos = closeAt + 2
	}
	return found
}
// FindTemplate returns a pointer to the first entry of templates whose name
// equals name after trimming whitespace and lower-casing both sides, or nil
// when there is no such entry. The pointer refers to the element inside the
// given slice, not to a copy.
func FindTemplate(templates []Infobox, name string) *Infobox {
	want := strings.ToLower(strings.TrimSpace(name))
	for idx := range templates {
		have := strings.ToLower(strings.TrimSpace(templates[idx].TemplateName))
		if have == want {
			return &templates[idx]
		}
	}
	return nil
}
// FindAllTemplates returns copies of every entry of templates whose name
// equals name after trimming whitespace and lower-casing both sides, in their
// original order. The result is nil when nothing matches.
func FindAllTemplates(templates []Infobox, name string) []Infobox {
	want := strings.ToLower(strings.TrimSpace(name))
	var hits []Infobox
	for idx := range templates {
		candidate := templates[idx]
		if strings.ToLower(strings.TrimSpace(candidate.TemplateName)) != want {
			continue
		}
		hits = append(hits, candidate)
	}
	return hits
}
// findTemplateEnd returns the index of the "}}" that balances the "{{" found
// at pos, or -1 if the braces never balance before the end of s.
//
// Scanning starts at pos; each "{{" raises the nesting level and each "}}"
// lowers it, and the position of the "}}" that brings the level back to zero
// is returned.
func findTemplateEnd(s string, pos int) int {
	nesting := 0
	for cur := pos; cur+1 < len(s); {
		switch s[cur : cur+2] {
		case "{{":
			nesting++
			cur += 2
		case "}}":
			nesting--
			if nesting == 0 {
				return cur
			}
			cur += 2
		default:
			cur++
		}
	}
	return -1
}
// parseTemplate parses the inner content of a {{...}} template (the text
// between the outer braces) into an Infobox.
//
// The content is split on top-level pipes. The first piece, trimmed, becomes
// the template name; names starting with '#' (parser functions such as #if or
// #switch) are rejected and nil is returned. Each remaining piece is either:
//
//   - a named argument "key=value", recognised by an '=' that is not inside a
//     nested {{...}} or [[...]] and is not the first byte; the value is
//     trimmed and passed through cleanValue, and a blank key is discarded; or
//   - a positional argument, stored trimmed under "1", "2", ... in order.
//
// Only a top-level '=' separates key from value, so a positional argument
// such as {{Inner|x=1}} is kept whole instead of being cut at the nested '='.
func parseTemplate(inner string) *Infobox {
	parts := splitOnPipes(inner)
	if len(parts) == 0 {
		return nil
	}
	name := strings.TrimSpace(parts[0])
	// Skip parser functions like #if, #switch, etc.
	if strings.HasPrefix(name, "#") {
		return nil
	}
	params := make(map[string]string)
	positional := 1
	for _, part := range parts[1:] {
		if eqIdx := indexTopLevelEquals(part); eqIdx > 0 {
			if key := strings.TrimSpace(part[:eqIdx]); key != "" {
				// Clean up common wikitext artifacts in named values.
				params[key] = cleanValue(strings.TrimSpace(part[eqIdx+1:]))
			}
			continue
		}
		// Positional parameter.
		params[strconv.Itoa(positional)] = strings.TrimSpace(part)
		positional++
	}
	return &Infobox{
		TemplateName: name,
		Params:       params,
	}
}

// indexTopLevelEquals returns the index of the first '=' in s that lies
// outside every {{...}} and [[...]] pair, or -1 if there is none. Unmatched
// closers are ignored rather than driving the depth negative.
func indexTopLevelEquals(s string) int {
	braces, brackets := 0, 0
	for i := 0; i < len(s); i++ {
		ch := s[i]
		doubled := i+1 < len(s) && s[i+1] == ch
		switch {
		case doubled && ch == '{':
			braces++
			i++
		case doubled && ch == '}':
			if braces > 0 {
				braces--
			}
			i++
		case doubled && ch == '[':
			brackets++
			i++
		case doubled && ch == ']':
			if brackets > 0 {
				brackets--
			}
			i++
		case ch == '=' && braces == 0 && brackets == 0:
			return i
		}
	}
	return -1
}
// splitOnPipes splits s on '|' while respecting nested {{...}} and [[...]]:
// a pipe only separates pieces when it is outside every template and link.
//
// The delimiters themselves are kept in the returned pieces. The result
// always has at least one element (an empty input yields [""]).
//
// An unmatched "}}" or "]]" is copied through but never lowers a depth below
// zero. Without that clamp a single stray closer would leave the counter
// negative, so later top-level pipes would stop splitting and pipes inside a
// following link could split by mistake.
func splitOnPipes(s string) []string {
	var parts []string
	var current strings.Builder
	braceDepth, bracketDepth := 0, 0
	for i := 0; i < len(s); i++ {
		ch := s[i]
		doubled := i+1 < len(s) && s[i+1] == ch
		switch {
		case doubled && ch == '{':
			braceDepth++
		case doubled && ch == '}':
			if braceDepth > 0 {
				braceDepth--
			}
		case doubled && ch == '[':
			bracketDepth++
		case doubled && ch == ']':
			if bracketDepth > 0 {
				bracketDepth--
			}
		case ch == '|' && braceDepth == 0 && bracketDepth == 0:
			parts = append(parts, current.String())
			current.Reset()
			continue
		default:
			current.WriteByte(ch)
			continue
		}
		// A two-byte delimiter was recognised: copy both bytes and skip the
		// second one so it is not examined again.
		current.WriteString(s[i : i+2])
		i++
	}
	parts = append(parts, current.String())
	return parts
}
// cleanValue strips common wikitext formatting from a value: wiki links are
// reduced to their display text, bold (''') and italic ('') quote runs are
// deleted, and surrounding whitespace is trimmed.
func cleanValue(s string) string {
	cleaned := cleanWikiLinks(s)
	// Order matters: the three-quote marker has to go before the two-quote one.
	for _, marker := range []string{"'''", "''"} {
		cleaned = strings.ReplaceAll(cleaned, marker, "")
	}
	return strings.TrimSpace(cleaned)
}
// cleanWikiLinks converts [[Target|Display]] to Display, and [[Target]] to
// Target. Everything after the first '|' inside a link is kept, and a link is
// taken to end at the first "]]" that follows its "[[". A "[[" with no "]]"
// after it is copied through unchanged.
func cleanWikiLinks(s string) string {
	var out strings.Builder
	for pos := 0; pos < len(s); {
		if !strings.HasPrefix(s[pos:], "[[") {
			out.WriteByte(s[pos])
			pos++
			continue
		}
		closeRel := strings.Index(s[pos:], "]]")
		if closeRel == -1 {
			// No closing brackets anywhere ahead: emit the '[' literally.
			out.WriteByte(s[pos])
			pos++
			continue
		}
		label := s[pos+2 : pos+closeRel]
		if bar := strings.Index(label, "|"); bar >= 0 {
			label = label[bar+1:]
		}
		out.WriteString(label)
		pos += closeRel + 2
	}
	return out.String()
}
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
// and converts wiki list markers (* items) to readable bullet points.
//
// Processing order: transclusion tags, known-template expansion, remaining
// templates, wiki links, bold/italic quotes, wiki tables, <ref> elements,
// other HTML tags, section headings, file/category lines, leftover File:
// references. Blank lines are then dropped, each remaining line is trimmed,
// and lines are joined with "\n".
//
// References are removed before the generic HTML pass on purpose: once
// stripHTMLTags has deleted the <ref> and </ref> tags, removeRefs can no
// longer find them and the citation text would stay in the output.
func CleanWikitext(s string) string {
	s = StripTransclusionTags(s)
	s = expandKnownTemplates(s)
	s = removeTemplates(s)
	s = cleanWikiLinks(s)
	s = strings.ReplaceAll(s, "'''", "")
	s = strings.ReplaceAll(s, "''", "")
	s = removeWikiTables(s)
	s = removeRefs(s)
	s = stripHTMLTags(s)
	s = removeSectionHeadings(s)
	s = removeFileAndCategoryLines(s)
	s = removeFileRefs(s)

	// Process line by line to preserve list structure.
	var out []string
	for _, line := range strings.Split(s, "\n") {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}
		// Convert wiki list markers to markdown bullets, deepest marker first.
		switch {
		case strings.HasPrefix(trimmed, "***"):
			out = append(out, " - "+strings.TrimSpace(trimmed[3:]))
		case strings.HasPrefix(trimmed, "**"):
			out = append(out, " - "+strings.TrimSpace(trimmed[2:]))
		case strings.HasPrefix(trimmed, "*"):
			out = append(out, "- "+strings.TrimSpace(trimmed[1:]))
		default:
			out = append(out, trimmed)
		}
	}
	return strings.Join(out, "\n")
}
// ExtractPlainText strips all wikitext markup to produce plain text.
//
// Processing order: transclusion tags, known-template expansion, remaining
// templates, wiki links, bold/italic quotes, wiki tables, section headings,
// <ref> elements, other HTML tags, file/category lines, leftover File:
// references. Whitespace is then collapsed and the result trimmed.
//
// References are removed before the generic HTML pass on purpose: once
// stripHTMLTags has deleted the <ref> and </ref> tags, removeRefs can no
// longer find them and the citation text would stay in the output.
func ExtractPlainText(wikitext string) string {
	s := wikitext
	s = StripTransclusionTags(s)
	s = expandKnownTemplates(s)
	s = removeTemplates(s)
	s = cleanWikiLinks(s)
	s = strings.ReplaceAll(s, "'''", "")
	s = strings.ReplaceAll(s, "''", "")
	// Remove wiki tables {| ... |}
	s = removeWikiTables(s)
	// Remove section headings (== Foo ==, === Bar ===, etc.)
	s = removeSectionHeadings(s)
	// Remove references, including their content, while the tags still exist
	s = removeRefs(s)
	// Remove the remaining HTML tags
	s = stripHTMLTags(s)
	// Remove file/image links and category lines
	s = removeFileAndCategoryLines(s)
	// Remove leftover File: references
	s = removeFileRefs(s)
	// Collapse whitespace
	s = collapseWhitespace(s)
	return strings.TrimSpace(s)
}
// expandKnownTemplates rewrites the templates that tryExpandTemplate
// recognises into plain text, e.g. {{Skillreq|Mining|75}} → "75 Mining", so
// their data survives the generic template removal that runs afterwards.
//
// Balanced templates that are not recognised are copied through verbatim,
// braces included, and so is all other text. A "{{" without a balanced close
// is emitted one byte at a time like any other text.
func expandKnownTemplates(s string) string {
	var out strings.Builder
	pos := 0
	for pos+1 < len(s) {
		closeAt := -1
		if s[pos] == '{' && s[pos+1] == '{' {
			closeAt = findTemplateEnd(s, pos)
		}
		if closeAt == -1 {
			// Ordinary byte, or an opener that never closes.
			out.WriteByte(s[pos])
			pos++
			continue
		}
		if text, known := tryExpandTemplate(s[pos+2 : closeAt]); known {
			out.WriteString(text)
		} else {
			// Leave it for removeTemplates to handle.
			out.WriteString(s[pos : closeAt+2])
		}
		pos = closeAt + 2
	}
	// The loop stops one byte early so it can peek ahead; flush that byte.
	if pos < len(s) {
		out.WriteByte(s[pos])
	}
	return out.String()
}
// tryExpandTemplate converts the inner text of a recognised template into a
// plain-text replacement. The boolean result reports whether the template was
// recognised; when it is false the returned string is empty and the caller
// should leave the template alone.
//
// Recognised names (matched case-insensitively after trimming):
//   - skillreq: {{Skillreq|Skill|Level}} → "Level Skill"; with a single
//     argument, that argument
//   - scp: {{SCP|Level|Skill}} → "Level Skill"; with a single argument, that
//     argument
//   - fairycode: the first argument
//   - coins, coins detail: the first argument followed by " coins"
//   - loctablehead, loctablebottom: the empty string
//   - locline: "\n- " followed by the comma-separated location, "level N" and
//     membership segments that are present, or the empty string if none are
//
// A recognised name with too few arguments is reported as not recognised.
func tryExpandTemplate(inner string) (string, bool) {
	fields := splitOnPipes(inner)
	if len(fields) == 0 {
		return "", false
	}
	argc := len(fields)
	arg := func(n int) string { return strings.TrimSpace(fields[n]) }

	switch strings.ToLower(arg(0)) {
	case "scp":
		// {{SCP|Level|Skill}} → "Level Skill"
		switch {
		case argc >= 3:
			return arg(1) + " " + arg(2), true
		case argc == 2:
			return arg(1), true
		}
	case "skillreq":
		// {{Skillreq|Skill|Level}} → "Level Skill"
		switch {
		case argc >= 3:
			return arg(2) + " " + arg(1), true
		case argc == 2:
			return arg(1), true
		}
	case "fairycode":
		if argc >= 2 {
			return arg(1), true
		}
	case "coins", "coins detail":
		if argc >= 2 {
			return arg(1) + " coins", true
		}
	case "loctablehead", "loctablebottom":
		return "", true
	case "locline":
		// Collect named arguments; keys are lower-cased, values have their
		// wiki links reduced to display text.
		named := map[string]string{}
		for _, field := range fields[1:] {
			eq := strings.Index(field, "=")
			if eq <= 0 {
				continue
			}
			key := strings.ToLower(strings.TrimSpace(field[:eq]))
			named[key] = cleanWikiLinks(strings.TrimSpace(field[eq+1:]))
		}
		// Support both OSRS (location/levels/members) and RS3 (loc/lvls/mem)
		// param names, preferring the long form when it is non-empty.
		firstSet := func(long, short string) string {
			if v := named[long]; v != "" {
				return v
			}
			return named[short]
		}
		var pieces []string
		if where := firstSet("location", "loc"); where != "" {
			pieces = append(pieces, where)
		}
		if levels := firstSet("levels", "lvls"); levels != "" {
			pieces = append(pieces, "level "+levels)
		}
		if members := firstSet("members", "mem"); members != "" {
			access := "F2P"
			if strings.EqualFold(members, "yes") {
				access = "members only"
			}
			pieces = append(pieces, access)
		}
		if len(pieces) == 0 {
			return "", true
		}
		return "\n- " + strings.Join(pieces, ", "), true
	}
	return "", false
}
// StripTransclusionTags handles MediaWiki transclusion directives in raw wikitext.
// When viewing a page directly (not transcluding):
//   - <noinclude>content</noinclude> → keep content (strip tags only)
//   - <includeonly>content</includeonly> → remove entirely (tags + content)
//   - <onlyinclude>content</onlyinclude> → keep content (strip tags only)
//
// An <includeonly> with no closing tag removes everything from the opening
// tag to the end of the string. The includeonly tags are matched without
// regard to ASCII case.
//
// Matching is done on s itself rather than on strings.ToLower(s): lower-casing
// can change the byte length of non-ASCII text, so offsets found in the
// lowered copy would not be valid positions in s.
func StripTransclusionTags(s string) string {
	const openTag, closeTag = "<includeonly>", "</includeonly>"

	// indexTag returns the byte offset of the first occurrence of tag in
	// text, ignoring ASCII case, or -1. tag must start with '<'.
	indexTag := func(text, tag string) int {
		for from := 0; ; {
			off := strings.IndexByte(text[from:], '<')
			if off == -1 {
				return -1
			}
			at := from + off
			if at+len(tag) > len(text) {
				return -1
			}
			if strings.EqualFold(text[at:at+len(tag)], tag) {
				return at
			}
			from = at + 1
		}
	}

	// Remove <includeonly>...</includeonly> blocks entirely.
	for {
		start := indexTag(s, openTag)
		if start == -1 {
			break
		}
		end := indexTag(s[start:], closeTag)
		if end == -1 {
			// Unclosed tag — remove to end of string.
			s = s[:start]
			break
		}
		s = s[:start] + s[start+end+len(closeTag):]
	}
	// Unwrap <noinclude> and <onlyinclude> (keep content, remove tags).
	for _, tag := range []string{"noinclude", "onlyinclude"} {
		s = removeTagKeepContent(s, tag)
	}
	return s
}
// removeTagKeepContent deletes every <tag> and </tag> marker from s while
// leaving the text between them in place. tag is the bare element name
// (e.g. "noinclude") and is matched without regard to letter case, so
// <noinclude>, <NOINCLUDE> and <NoInclude> are all removed.
//
// Only the exact forms "<tag>" and "</tag>" are recognised; tags carrying
// attributes or inner whitespace are left untouched.
func removeTagKeepContent(s, tag string) string {
	openTag := "<" + tag + ">"
	closeTag := "</" + tag + ">"

	// matchesAt reports whether pat occurs in s at byte offset at, ignoring case.
	matchesAt := func(at int, pat string) bool {
		return at+len(pat) <= len(s) && strings.EqualFold(s[at:at+len(pat)], pat)
	}

	var b strings.Builder
	pos := 0
	for pos < len(s) {
		off := strings.IndexByte(s[pos:], '<')
		if off == -1 {
			break
		}
		at := pos + off
		b.WriteString(s[pos:at])
		switch {
		case matchesAt(at, openTag):
			pos = at + len(openTag)
		case matchesAt(at, closeTag):
			pos = at + len(closeTag)
		default:
			// Some other '<': keep it and move on.
			b.WriteByte('<')
			pos = at + 1
		}
	}
	b.WriteString(s[pos:])
	return b.String()
}
// removeTemplates deletes every {{...}} span from s, including nested ones,
// and returns what is left.
//
// Each "{{" raises the nesting level and each "}}" lowers it (never below
// zero); bytes are copied only while the level is zero. The brace pairs
// themselves are always dropped, so a stray "}}" disappears and an unclosed
// "{{" discards the rest of the input.
func removeTemplates(s string) string {
	var kept strings.Builder
	level := 0
	for pos := 0; pos < len(s); pos++ {
		pair := ""
		if pos+1 < len(s) {
			pair = s[pos : pos+2]
		}
		switch {
		case pair == "{{":
			level++
			pos++ // consume the second brace as well
		case pair == "}}":
			if level > 0 {
				level--
			}
			pos++ // consume the second brace as well
		case level == 0:
			kept.WriteByte(s[pos])
		}
	}
	return kept.String()
}
// stripHTMLTags removes HTML/XML-style tags and comments from s and returns
// the remaining text.
//
// A '<' starts a tag only when it is followed by an ASCII letter, '/' or '!';
// the tag then runs to the next '>'. A "<!--" comment runs to the next "-->".
// Any other '<', and any '>' outside a tag, is ordinary text and is kept, so
// prose such as "5 < 10" or "a > b" passes through intact. An unterminated
// tag or comment discards the rest of the input. Text outside tags is copied
// byte for byte.
func stripHTMLTags(s string) string {
	// isTagStart reports whether c can follow '<' at the start of a tag.
	isTagStart := func(c byte) bool {
		switch {
		case c == '/', c == '!':
			return true
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
			return true
		}
		return false
	}

	var b strings.Builder
	for len(s) > 0 {
		lt := strings.IndexByte(s, '<')
		if lt == -1 {
			b.WriteString(s)
			break
		}
		b.WriteString(s[:lt])
		s = s[lt:]
		switch {
		case strings.HasPrefix(s, "<!--"):
			// Comments may contain '>', so skip to the real terminator.
			end := strings.Index(s, "-->")
			if end == -1 {
				return b.String()
			}
			s = s[end+len("-->"):]
		case len(s) > 1 && isTagStart(s[1]):
			gt := strings.IndexByte(s, '>')
			if gt == -1 {
				return b.String()
			}
			s = s[gt+1:]
		default:
			// A lone '<' is literal text.
			b.WriteByte('<')
			s = s[1:]
		}
	}
	return b.String()
}
// removeRefs deletes citation elements from s: <ref ...>content</ref> is
// removed together with its content, and self-closing <ref ... /> tags are
// removed on their own. Tag names are matched without regard to ASCII case.
//
// Only genuine ref tags are touched: "<ref" must be followed by '>', '/' or
// whitespace, so <references> and <references /> are left alone. Whether a
// tag is self-closing is decided from its own closing "/>", not from some
// later "/>" in the text, so markup such as <ref>a<br/>b</ref> is removed as
// a whole. An opening tag with no matching </ref> is left in place, and an
// opening tag that never reaches '>' ends processing with the remaining text
// kept as is.
func removeRefs(s string) string {
	const openPrefix, closeTag = "<ref", "</ref>"

	// findFold returns the offset of the first occurrence of pat at or after
	// from, ignoring ASCII case, or -1. pat must start with '<'.
	findFold := func(from int, pat string) int {
		for from < len(s) {
			off := strings.IndexByte(s[from:], '<')
			if off == -1 {
				return -1
			}
			at := from + off
			if at+len(pat) > len(s) {
				return -1
			}
			if strings.EqualFold(s[at:at+len(pat)], pat) {
				return at
			}
			from = at + 1
		}
		return -1
	}
	// endsName reports whether c may directly follow the element name.
	endsName := func(c byte) bool {
		switch c {
		case '>', '/', ' ', '\t', '\n', '\r':
			return true
		}
		return false
	}

	var b strings.Builder
	pos := 0
	// Once one search for </ref> fails, no later search can succeed, so stop
	// looking; this keeps the scan linear on input full of unclosed refs.
	closable := true
	for {
		open := findFold(pos, openPrefix)
		if open == -1 {
			break
		}
		nameEnd := open + len(openPrefix)
		if nameEnd >= len(s) || !endsName(s[nameEnd]) {
			// Some other element that merely starts with "<ref".
			b.WriteString(s[pos:nameEnd])
			pos = nameEnd
			continue
		}
		gt := strings.IndexByte(s[nameEnd:], '>')
		if gt == -1 {
			break
		}
		tagEnd := nameEnd + gt + 1
		b.WriteString(s[pos:open])
		if s[tagEnd-2] == '/' {
			// Self-closing reference: drop just the tag.
			pos = tagEnd
			continue
		}
		if closable {
			if end := findFold(tagEnd, closeTag); end != -1 {
				pos = end + len(closeTag)
				continue
			}
			closable = false
		}
		// Unmatched opening tag: keep it as ordinary text.
		b.WriteString(s[open:tagEnd])
		pos = tagEnd
	}
	b.WriteString(s[pos:])
	return b.String()
}
// removeWikiTables drops wiki table markup from s. A line whose trimmed text
// starts with "{|" opens a table and one starting with "|}" closes it; those
// marker lines and every line inside a table (tables may nest) are omitted.
// A "|}" line with no open table is omitted too.
//
// Every surviving line is written back followed by "\n", so the result always
// ends with a newline.
func removeWikiTables(s string) string {
	var out strings.Builder
	nesting := 0
	for _, raw := range strings.Split(s, "\n") {
		switch text := strings.TrimSpace(raw); {
		case strings.HasPrefix(text, "{|"):
			nesting++
		case strings.HasPrefix(text, "|}"):
			if nesting > 0 {
				nesting--
			}
		case nesting == 0:
			out.WriteString(raw)
			out.WriteByte('\n')
		}
	}
	return out.String()
}
// removeFileAndCategoryLines drops whole lines that look like file/image
// embeds or category tags, and cuts inline [[Category:...]] links out of the
// lines it keeps.
//
// A line is dropped when its trimmed, lower-cased text starts with
// "[[file:", "[[image:", "[[category:", "category:" or "thumb|", equals
// "thumb", or contains "px|" together with "thumb", "right" or "left".
// Every kept line is written back followed by "\n".
func removeFileAndCategoryLines(s string) string {
	const inlineCategory = "[[Category:"
	embedPrefixes := []string{"[[file:", "[[image:", "[[category:", "category:", "thumb|"}

	// isEmbedLine applies the drop rules to an already trimmed, lowered line.
	isEmbedLine := func(lower string) bool {
		for _, prefix := range embedPrefixes {
			if strings.HasPrefix(lower, prefix) {
				return true
			}
		}
		if lower == "thumb" {
			return true
		}
		if !strings.Contains(lower, "px|") {
			return false
		}
		return strings.Contains(lower, "thumb") ||
			strings.Contains(lower, "right") ||
			strings.Contains(lower, "left")
	}

	var out strings.Builder
	for _, line := range strings.Split(s, "\n") {
		if isEmbedLine(strings.ToLower(strings.TrimSpace(line))) {
			continue
		}
		// Remove inline [[Category:...]] references; an unterminated one is
		// left as it is.
		for {
			start := strings.Index(line, inlineCategory)
			if start == -1 {
				break
			}
			end := strings.Index(line[start:], "]]")
			if end == -1 {
				break
			}
			line = line[:start] + line[start+end+2:]
		}
		out.WriteString(line)
		out.WriteByte('\n')
	}
	return out.String()
}
// removeSectionHeadings drops wiki section heading lines (== Foo ==,
// === Bar ===, ...): any line whose trimmed text both starts and ends with
// "==". Every other line is written back unchanged followed by "\n".
func removeSectionHeadings(s string) string {
	var out strings.Builder
	for _, raw := range strings.Split(s, "\n") {
		text := strings.TrimSpace(raw)
		isHeading := strings.HasPrefix(text, "==") && strings.HasSuffix(text, "==")
		if !isHeading {
			out.WriteString(raw)
			out.WriteByte('\n')
		}
	}
	return out.String()
}
// removeFileRefs removes inline File: references (leftovers of wiki link
// cleaning) from every line of s. Each occurrence of "file:", matched without
// regard to ASCII case, is deleted together with the characters that follow
// it up to the next space or the end of the line. Every line is written back
// followed by "\n".
//
// Matching is done on the line itself rather than on a strings.ToLower copy:
// lower-casing can change the byte length of non-ASCII text, so offsets found
// in the lowered copy would not be valid positions in the original line.
func removeFileRefs(s string) string {
	const marker = "file:"

	// indexMarker returns the offset of the first marker in text, ignoring
	// ASCII case, or -1.
	indexMarker := func(text string) int {
		for at := 0; at+len(marker) <= len(text); at++ {
			if strings.EqualFold(text[at:at+len(marker)], marker) {
				return at
			}
		}
		return -1
	}

	var b strings.Builder
	for _, line := range strings.Split(s, "\n") {
		rest := line
		for {
			idx := indexMarker(rest)
			if idx == -1 {
				break
			}
			b.WriteString(rest[:idx])
			// The reference ends at the next space or at the end of the line.
			end := idx + len(marker)
			for end < len(rest) && rest[end] != ' ' {
				end++
			}
			rest = rest[end:]
		}
		b.WriteString(rest)
		b.WriteByte('\n')
	}
	return b.String()
}
// collapseWhitespace squeezes runs of whitespace in s. Consecutive newlines
// become a single "\n", and a run of other whitespace becomes a single space.
// Whitespace that follows a newline is dropped, including further newlines
// separated from it only by whitespace. Whitespace that precedes a newline is
// kept as one space.
func collapseWhitespace(s string) string {
	var out strings.Builder
	afterSpace, afterBreak := false, false
	for _, r := range s {
		switch {
		case r == '\n':
			if !afterBreak {
				out.WriteRune('\n')
			}
			afterBreak, afterSpace = true, false
		case unicode.IsSpace(r):
			if !(afterSpace || afterBreak) {
				out.WriteRune(' ')
			}
			afterSpace = true
		default:
			out.WriteRune(r)
			afterSpace, afterBreak = false, false
		}
	}
	return out.String()
}