Initial commit
This commit is contained in:
492
scripts/rsw/internal/extract/infobox.go
Normal file
492
scripts/rsw/internal/extract/infobox.go
Normal file
@@ -0,0 +1,492 @@
|
||||
package extract
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Infobox is the parsed form of one wikitext template invocation (an infobox
// or any other {{...}} template), stored as a name plus key/value parameters.
type Infobox struct {
	// TemplateName is the template's name as written in the wikitext, with
	// surrounding whitespace trimmed.
	TemplateName string
	// Params maps parameter names to their values. Positional parameters are
	// stored under their 1-based index rendered as a decimal string ("1", "2", ...).
	Params map[string]string
}
|
||||
|
||||
// ParseTemplates extracts all top-level templates from wikitext.
|
||||
// It handles nested templates (e.g., {{formatnum:{{GEPrice|...}}}}) by tracking brace depth.
|
||||
func ParseTemplates(wikitext string) []Infobox {
|
||||
var results []Infobox
|
||||
i := 0
|
||||
for i < len(wikitext)-1 {
|
||||
if wikitext[i] == '{' && wikitext[i+1] == '{' {
|
||||
end := findTemplateEnd(wikitext, i)
|
||||
if end == -1 {
|
||||
i++
|
||||
continue
|
||||
}
|
||||
inner := wikitext[i+2 : end]
|
||||
if box := parseTemplate(inner); box != nil {
|
||||
results = append(results, *box)
|
||||
}
|
||||
i = end + 2
|
||||
} else {
|
||||
i++
|
||||
}
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// FindTemplate searches parsed templates for one matching the given name (case-insensitive).
|
||||
func FindTemplate(templates []Infobox, name string) *Infobox {
|
||||
target := strings.ToLower(strings.TrimSpace(name))
|
||||
for i, t := range templates {
|
||||
if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target {
|
||||
return &templates[i]
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// FindAllTemplates returns all templates matching the given name.
|
||||
func FindAllTemplates(templates []Infobox, name string) []Infobox {
|
||||
target := strings.ToLower(strings.TrimSpace(name))
|
||||
var matches []Infobox
|
||||
for _, t := range templates {
|
||||
if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target {
|
||||
matches = append(matches, t)
|
||||
}
|
||||
}
|
||||
return matches
|
||||
}
|
||||
|
||||
// findTemplateEnd scans s starting at pos, which is expected to be the index
// of an opening "{{", and returns the index of the first byte of the "}}" that
// brings the brace depth back to zero. Nested "{{"/"}}" pairs are counted so
// the returned index belongs to the outermost template. It returns -1 when the
// depth never returns to zero before the end of s.
func findTemplateEnd(s string, pos int) int {
	open := 0
	for cur := pos; cur+1 < len(s); {
		switch pair := s[cur : cur+2]; pair {
		case "{{":
			open++
			cur += 2
		case "}}":
			open--
			if open == 0 {
				return cur
			}
			cur += 2
		default:
			cur++
		}
	}
	return -1
}
|
||||
|
||||
// parseTemplate parses the inner content of a {{...}} template.
|
||||
func parseTemplate(inner string) *Infobox {
|
||||
// Split on '|' but respect nested templates
|
||||
parts := splitOnPipes(inner)
|
||||
if len(parts) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
name := strings.TrimSpace(parts[0])
|
||||
// Skip parser functions like #if, #switch, etc.
|
||||
if strings.HasPrefix(name, "#") {
|
||||
return nil
|
||||
}
|
||||
|
||||
params := make(map[string]string)
|
||||
positional := 1
|
||||
for _, part := range parts[1:] {
|
||||
eqIdx := strings.Index(part, "=")
|
||||
if eqIdx > 0 {
|
||||
key := strings.TrimSpace(part[:eqIdx])
|
||||
val := strings.TrimSpace(part[eqIdx+1:])
|
||||
// Clean up common wikitext artifacts
|
||||
val = cleanValue(val)
|
||||
if key != "" {
|
||||
params[key] = val
|
||||
}
|
||||
} else {
|
||||
// Positional parameter
|
||||
params[strconv.Itoa(positional)] = strings.TrimSpace(part)
|
||||
positional++
|
||||
}
|
||||
}
|
||||
|
||||
return &Infobox{
|
||||
TemplateName: name,
|
||||
Params: params,
|
||||
}
|
||||
}
|
||||
|
||||
// splitOnPipes splits s on '|' characters that are not enclosed in a nested
// {{...}} template or [[...]] link. The delimiters themselves are kept in the
// returned parts. The result always has at least one element; an empty input
// yields a single empty string.
//
// Depth counters are clamped at zero, so a stray "}}" or "]]" without a
// matching opener does not prevent later top-level pipes from splitting.
func splitOnPipes(s string) []string {
	var parts []string
	var current strings.Builder
	braceDepth, bracketDepth := 0, 0

	for i := 0; i < len(s); i++ {
		ch := s[i]
		// next is 0 at the end of the string, which matches none of the
		// two-byte delimiters below.
		var next byte
		if i+1 < len(s) {
			next = s[i+1]
		}
		switch {
		case ch == '{' && next == '{':
			braceDepth++
			current.WriteString("{{")
			i++
		case ch == '}' && next == '}':
			if braceDepth > 0 {
				braceDepth--
			}
			current.WriteString("}}")
			i++
		case ch == '[' && next == '[':
			bracketDepth++
			current.WriteString("[[")
			i++
		case ch == ']' && next == ']':
			if bracketDepth > 0 {
				bracketDepth--
			}
			current.WriteString("]]")
			i++
		case ch == '|' && braceDepth == 0 && bracketDepth == 0:
			parts = append(parts, current.String())
			current.Reset()
		default:
			current.WriteByte(ch)
		}
	}
	parts = append(parts, current.String())
	return parts
}
|
||||
|
||||
// cleanValue strips common wikitext formatting from a value.
|
||||
func cleanValue(s string) string {
|
||||
// Remove [[ ]] wiki links, keeping the display text
|
||||
s = cleanWikiLinks(s)
|
||||
// Remove '' and ''' (bold/italic)
|
||||
s = strings.ReplaceAll(s, "'''", "")
|
||||
s = strings.ReplaceAll(s, "''", "")
|
||||
// Trim whitespace
|
||||
s = strings.TrimSpace(s)
|
||||
return s
|
||||
}
|
||||
|
||||
// cleanWikiLinks replaces each [[...]] link in s with its visible text:
// [[Target]] becomes "Target" and [[Target|Display]] becomes everything after
// the first '|' ("Display"). A link runs from "[[" to the next "]]". Once a
// "[[" has no "]]" after it, the remainder of s is copied through unchanged.
func cleanWikiLinks(s string) string {
	var out strings.Builder
	rest := s
	for {
		open := strings.Index(rest, "[[")
		if open == -1 {
			break
		}
		closeRel := strings.Index(rest[open:], "]]")
		if closeRel == -1 {
			// No closing brackets anywhere ahead: nothing more to rewrite.
			break
		}
		out.WriteString(rest[:open])
		label := rest[open+2 : open+closeRel]
		if bar := strings.Index(label, "|"); bar >= 0 {
			label = label[bar+1:]
		}
		out.WriteString(label)
		rest = rest[open+closeRel+2:]
	}
	out.WriteString(rest)
	return out.String()
}
|
||||
|
||||
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
|
||||
// and converts wiki list markers (* items) to readable bullet points.
|
||||
func CleanWikitext(s string) string {
|
||||
s = expandKnownTemplates(s)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
s = strings.ReplaceAll(s, "'''", "")
|
||||
s = strings.ReplaceAll(s, "''", "")
|
||||
s = removeWikiTables(s)
|
||||
s = stripHTMLTags(s)
|
||||
s = removeRefs(s)
|
||||
s = removeSectionHeadings(s)
|
||||
s = removeFileAndCategoryLines(s)
|
||||
s = removeFileRefs(s)
|
||||
|
||||
// Process line by line to preserve list structure
|
||||
lines := strings.Split(s, "\n")
|
||||
var out []string
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
// Convert wiki list markers to markdown bullets
|
||||
if strings.HasPrefix(trimmed, "***") {
|
||||
out = append(out, " - "+strings.TrimSpace(trimmed[3:]))
|
||||
} else if strings.HasPrefix(trimmed, "**") {
|
||||
out = append(out, " - "+strings.TrimSpace(trimmed[2:]))
|
||||
} else if strings.HasPrefix(trimmed, "*") {
|
||||
out = append(out, "- "+strings.TrimSpace(trimmed[1:]))
|
||||
} else {
|
||||
out = append(out, trimmed)
|
||||
}
|
||||
}
|
||||
return strings.Join(out, "\n")
|
||||
}
|
||||
|
||||
// ExtractPlainText strips all wikitext markup to produce plain text.
|
||||
func ExtractPlainText(wikitext string) string {
|
||||
s := wikitext
|
||||
// Remove templates (simplified — just removes {{ ... }} at depth 0)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
s = strings.ReplaceAll(s, "'''", "")
|
||||
s = strings.ReplaceAll(s, "''", "")
|
||||
// Remove wiki tables {| ... |}
|
||||
s = removeWikiTables(s)
|
||||
// Remove section headings (== Foo ==, === Bar ===, etc.)
|
||||
s = removeSectionHeadings(s)
|
||||
// Remove HTML tags
|
||||
s = stripHTMLTags(s)
|
||||
// Remove references
|
||||
s = removeRefs(s)
|
||||
// Remove file/image links and category lines
|
||||
s = removeFileAndCategoryLines(s)
|
||||
// Remove leftover File: references
|
||||
s = removeFileRefs(s)
|
||||
// Collapse whitespace
|
||||
s = collapseWhitespace(s)
|
||||
return strings.TrimSpace(s)
|
||||
}
|
||||
|
||||
// expandKnownTemplates replaces well-known templates with plain text equivalents
|
||||
// before the generic template removal pass. This preserves useful data from
|
||||
// templates like {{Skillreq|Mining|75}} → "75 Mining".
|
||||
func expandKnownTemplates(s string) string {
|
||||
var b strings.Builder
|
||||
i := 0
|
||||
for i < len(s)-1 {
|
||||
if s[i] == '{' && s[i+1] == '{' {
|
||||
end := findTemplateEnd(s, i)
|
||||
if end == -1 {
|
||||
b.WriteByte(s[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
inner := s[i+2 : end]
|
||||
if expanded, ok := tryExpandTemplate(inner); ok {
|
||||
b.WriteString(expanded)
|
||||
} else {
|
||||
// Leave it for removeTemplates to handle
|
||||
b.WriteString(s[i : end+2])
|
||||
}
|
||||
i = end + 2
|
||||
} else {
|
||||
b.WriteByte(s[i])
|
||||
i++
|
||||
}
|
||||
}
|
||||
if i < len(s) {
|
||||
b.WriteByte(s[i])
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func tryExpandTemplate(inner string) (string, bool) {
|
||||
parts := splitOnPipes(inner)
|
||||
if len(parts) == 0 {
|
||||
return "", false
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
lower := strings.ToLower(name)
|
||||
|
||||
switch lower {
|
||||
case "skillreq", "scp":
|
||||
// {{Skillreq|Skill|Level}} or {{SCP|Level|Skill}} → "Level Skill"
|
||||
if len(parts) >= 3 {
|
||||
if lower == "scp" {
|
||||
return strings.TrimSpace(parts[1]) + " " + strings.TrimSpace(parts[2]), true
|
||||
}
|
||||
return strings.TrimSpace(parts[2]) + " " + strings.TrimSpace(parts[1]), true
|
||||
}
|
||||
if len(parts) == 2 {
|
||||
return strings.TrimSpace(parts[1]), true
|
||||
}
|
||||
case "fairycode":
|
||||
if len(parts) >= 2 {
|
||||
return strings.TrimSpace(parts[1]), true
|
||||
}
|
||||
case "coins", "coins detail":
|
||||
if len(parts) >= 2 {
|
||||
return strings.TrimSpace(parts[1]) + " coins", true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// removeTemplates returns s with every {{...}} template, including nested
// ones, deleted. Bytes are kept only while the brace depth is zero. A "}}"
// with no opener is dropped without letting the depth go negative, and an
// unclosed "{{" discards everything after it.
func removeTemplates(s string) string {
	var kept strings.Builder
	nesting := 0
	for pos := 0; pos < len(s); pos++ {
		// Two-byte window at pos; empty when pos is the last byte.
		pair := ""
		if pos+1 < len(s) {
			pair = s[pos : pos+2]
		}
		switch {
		case pair == "{{":
			nesting++
			pos++
		case pair == "}}":
			if nesting > 0 {
				nesting--
			}
			pos++
		case nesting == 0:
			kept.WriteByte(s[pos])
		}
	}
	return kept.String()
}
|
||||
|
||||
// stripHTMLTags removes HTML/XML-style tags from s and returns the remaining
// text. A tag is a '<' immediately followed by an ASCII letter, '/' or '!'
// (covering opening tags, closing tags and <!-- comments -->), up to and
// including the next '>'.
//
// Any other '<' or '>' is kept as literal text, so comparisons such as
// "HP < 50" survive, and a tag-like '<' with no closing '>' is left in place
// instead of swallowing the rest of the input.
func stripHTMLTags(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		if s[i] == '<' && i+1 < len(s) && isHTMLTagStart(s[i+1]) {
			if end := strings.IndexByte(s[i:], '>'); end != -1 {
				// Skip the whole tag, including the closing '>'.
				i += end + 1
				continue
			}
		}
		// Copying byte by byte keeps multi-byte UTF-8 sequences intact
		// because only ASCII tag spans are ever skipped.
		b.WriteByte(s[i])
		i++
	}
	return b.String()
}

// isHTMLTagStart reports whether c can follow '<' at the start of a tag.
func isHTMLTagStart(c byte) bool {
	return c == '/' || c == '!' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}
|
||||
|
||||
// removeRefs deletes footnote markup from s: paired <ref ...>...</ref>
// elements together with their content, and self-closing <ref ... /> tags.
//
// Whether a tag is self-closing is decided from the opening tag itself (a '/'
// directly before its '>'), so a "/>" belonging to another element inside the
// footnote body does not cut the footnote short. "<ref" only counts as a ref
// tag when followed by '>', '/' or whitespace; other elements such as
// <references> are copied through untouched. An opening <ref> with no
// matching </ref> is left in place, and a "<ref" with no closing '>' ends
// processing with the remainder kept as-is.
func removeRefs(s string) string {
	const (
		openPrefix = "<ref"
		closeTag   = "</ref>"
	)
	var b strings.Builder
	for {
		start := strings.Index(s, openPrefix)
		if start == -1 {
			break
		}
		tag := s[start:]

		// Reject longer tag names that merely begin with "ref".
		if len(tag) > len(openPrefix) {
			switch tag[len(openPrefix)] {
			case '>', '/', ' ', '\t', '\n', '\r':
			default:
				b.WriteString(s[:start+len(openPrefix)])
				s = s[start+len(openPrefix):]
				continue
			}
		}

		gt := strings.IndexByte(tag, '>')
		if gt == -1 {
			// Malformed: the opening tag never ends.
			break
		}
		end := gt + 1
		// gt is at least len(openPrefix), so tag[gt-1] is always in range.
		if tag[gt-1] != '/' {
			closeAt := strings.Index(tag[end:], closeTag)
			if closeAt == -1 {
				// Unclosed footnote: keep the opening tag and move on.
				b.WriteString(s[:start+end])
				s = s[start+end:]
				continue
			}
			end += closeAt + len(closeTag)
		}
		b.WriteString(s[:start])
		s = tag[end:]
	}
	b.WriteString(s)
	return b.String()
}
|
||||
|
||||
// removeWikiTables drops wiki table markup from s line by line. A line whose
// trimmed text starts with "{|" opens a table and one starting with "|}"
// closes it; those lines and everything between them (tables may nest) are
// discarded. Every surviving line is written back followed by "\n", so the
// result always ends with a newline when any line is kept.
func removeWikiTables(s string) string {
	var kept strings.Builder
	nesting := 0
	for _, raw := range strings.Split(s, "\n") {
		marker := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(marker, "{|"):
			nesting++
		case strings.HasPrefix(marker, "|}"):
			// A stray closer outside any table is simply dropped.
			if nesting > 0 {
				nesting--
			}
		case nesting == 0:
			kept.WriteString(raw)
			kept.WriteByte('\n')
		}
	}
	return kept.String()
}
|
||||
|
||||
// removeFileAndCategoryLines filters s line by line. A line is dropped when
// its trimmed, lower-cased text
//   - starts with "[[file:", "[[image:", "[[category:", "category:" or "thumb|",
//   - is exactly "thumb", or
//   - contains "px|" together with any of "thumb", "right" or "left"
//     (typical leftovers of an image embed).
//
// In the lines that remain, every inline "[[Category:...]]" (matched with this
// exact capitalisation) is cut out. Each kept line is written followed by "\n".
func removeFileAndCategoryLines(s string) string {
	const categoryOpen = "[[Category:"
	dropPrefixes := []string{"[[file:", "[[image:", "[[category:", "category:", "thumb|"}

	var out strings.Builder
	for _, line := range strings.Split(s, "\n") {
		probe := strings.ToLower(strings.TrimSpace(line))

		// Skip lines that are file/image embeds or category tags.
		drop := probe == "thumb"
		for _, prefix := range dropPrefixes {
			if drop {
				break
			}
			drop = strings.HasPrefix(probe, prefix)
		}
		if !drop && strings.Contains(probe, "px|") {
			drop = strings.Contains(probe, "thumb") ||
				strings.Contains(probe, "right") ||
				strings.Contains(probe, "left")
		}
		if drop {
			continue
		}

		// Remove inline [[Category:...]] references.
		for {
			open := strings.Index(line, categoryOpen)
			if open == -1 {
				break
			}
			closeRel := strings.Index(line[open:], "]]")
			if closeRel == -1 {
				break
			}
			line = line[:open] + line[open+closeRel+2:]
		}
		out.WriteString(line)
		out.WriteByte('\n')
	}
	return out.String()
}
|
||||
|
||||
// removeSectionHeadings drops every line of s whose trimmed text both starts
// and ends with "==" (wiki headings such as "== Foo ==" or "=== Bar ===").
// All other lines are written back unchanged, each followed by "\n".
func removeSectionHeadings(s string) string {
	var body strings.Builder
	for _, raw := range strings.Split(s, "\n") {
		text := strings.TrimSpace(raw)
		isHeading := strings.HasPrefix(text, "==") && strings.HasSuffix(text, "==")
		if !isHeading {
			body.WriteString(raw)
			body.WriteByte('\n')
		}
	}
	return body.String()
}
|
||||
|
||||
// removeFileRefs deletes inline "File:" references (leftovers after wiki link
// cleaning) from every line of s. A reference starts at "file:" in any letter
// case and extends up to, but not including, the next space or the end of the
// line. Each line is written back followed by "\n".
//
// The marker is matched directly against the original bytes, so text whose
// lower-cased form has a different byte length cannot shift the cut position.
// It must also sit at the start of the line or after a byte that is not an
// ASCII letter or digit, so words that merely end in "file:" (for example
// "profile:") are left alone.
func removeFileRefs(s string) string {
	const marker = "file:"
	var b strings.Builder
	b.Grow(len(s) + 1)
	for _, line := range strings.Split(s, "\n") {
		for i := 0; i < len(line); {
			if !startsFileRef(line, i) {
				b.WriteByte(line[i])
				i++
				continue
			}
			// Skip the marker and everything up to the next space.
			i += len(marker)
			for i < len(line) && line[i] != ' ' {
				i++
			}
		}
		b.WriteByte('\n')
	}
	return b.String()
}

// startsFileRef reports whether a case-insensitive "file:" begins at line[i]
// and is not the tail of a longer ASCII word.
func startsFileRef(line string, i int) bool {
	const marker = "file:"
	if i+len(marker) > len(line) || !strings.EqualFold(line[i:i+len(marker)], marker) {
		return false
	}
	if i == 0 {
		return true
	}
	prev := line[i-1]
	isWord := (prev >= 'a' && prev <= 'z') || (prev >= 'A' && prev <= 'Z') || (prev >= '0' && prev <= '9')
	return !isWord
}
|
||||
|
||||
// collapseWhitespace returns s with every run of Unicode whitespace (as
// defined by unicode.IsSpace) replaced by a single ASCII space. Leading and
// trailing runs are collapsed too, not removed.
func collapseWhitespace(s string) string {
	var out strings.Builder
	inRun := false
	for _, r := range s {
		space := unicode.IsSpace(r)
		switch {
		case !space:
			out.WriteRune(r)
		case !inRun:
			// First whitespace rune of a run stands in for the whole run.
			out.WriteRune(' ')
		}
		inRun = space
	}
	return out.String()
}
|
||||
Reference in New Issue
Block a user