package extract import ( "strconv" "strings" "unicode" ) // Infobox represents a parsed wiki infobox (or any template) as key-value pairs. type Infobox struct { TemplateName string Params map[string]string } // ParseTemplates extracts all top-level templates from wikitext. // It handles nested templates (e.g., {{formatnum:{{GEPrice|...}}}}) by tracking brace depth. func ParseTemplates(wikitext string) []Infobox { var results []Infobox i := 0 for i < len(wikitext)-1 { if wikitext[i] == '{' && wikitext[i+1] == '{' { end := findTemplateEnd(wikitext, i) if end == -1 { i++ continue } inner := wikitext[i+2 : end] if box := parseTemplate(inner); box != nil { results = append(results, *box) } i = end + 2 } else { i++ } } return results } // FindTemplate searches parsed templates for one matching the given name (case-insensitive). func FindTemplate(templates []Infobox, name string) *Infobox { target := strings.ToLower(strings.TrimSpace(name)) for i, t := range templates { if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target { return &templates[i] } } return nil } // FindAllTemplates returns all templates matching the given name. func FindAllTemplates(templates []Infobox, name string) []Infobox { target := strings.ToLower(strings.TrimSpace(name)) var matches []Infobox for _, t := range templates { if strings.ToLower(strings.TrimSpace(t.TemplateName)) == target { matches = append(matches, t) } } return matches } // findTemplateEnd finds the index of the closing '}}' for a template starting at pos. func findTemplateEnd(s string, pos int) int { depth := 0 i := pos for i < len(s)-1 { if s[i] == '{' && s[i+1] == '{' { depth++ i += 2 } else if s[i] == '}' && s[i+1] == '}' { depth-- if depth == 0 { return i } i += 2 } else { i++ } } return -1 } // parseTemplate parses the inner content of a {{...}} template. func parseTemplate(inner string) *Infobox { // Split on '|' but respect nested templates parts := splitOnPipes(inner) if len(parts) == 0 { return nil } name := strings.TrimSpace(parts[0]) // Skip parser functions like #if, #switch, etc. if strings.HasPrefix(name, "#") { return nil } params := make(map[string]string) positional := 1 for _, part := range parts[1:] { eqIdx := strings.Index(part, "=") if eqIdx > 0 { key := strings.TrimSpace(part[:eqIdx]) val := strings.TrimSpace(part[eqIdx+1:]) // Clean up common wikitext artifacts val = cleanValue(val) if key != "" { params[key] = val } } else { // Positional parameter params[strconv.Itoa(positional)] = strings.TrimSpace(part) positional++ } } return &Infobox{ TemplateName: name, Params: params, } } // splitOnPipes splits a string on '|' while respecting nested {{...}} and [[...]]. func splitOnPipes(s string) []string { var parts []string var current strings.Builder braceDepth := 0 bracketDepth := 0 for i := 0; i < len(s); i++ { ch := s[i] switch { case ch == '{' && i+1 < len(s) && s[i+1] == '{': braceDepth++ current.WriteByte('{') current.WriteByte('{') i++ case ch == '}' && i+1 < len(s) && s[i+1] == '}': braceDepth-- current.WriteByte('}') current.WriteByte('}') i++ case ch == '[' && i+1 < len(s) && s[i+1] == '[': bracketDepth++ current.WriteByte('[') current.WriteByte('[') i++ case ch == ']' && i+1 < len(s) && s[i+1] == ']': bracketDepth-- current.WriteByte(']') current.WriteByte(']') i++ case ch == '|' && braceDepth == 0 && bracketDepth == 0: parts = append(parts, current.String()) current.Reset() default: current.WriteByte(ch) } } parts = append(parts, current.String()) return parts } // cleanValue strips common wikitext formatting from a value. func cleanValue(s string) string { // Remove [[ ]] wiki links, keeping the display text s = cleanWikiLinks(s) // Remove '' and ''' (bold/italic) s = strings.ReplaceAll(s, "'''", "") s = strings.ReplaceAll(s, "''", "") // Trim whitespace s = strings.TrimSpace(s) return s } // cleanWikiLinks converts [[Target|Display]] to Display, and [[Target]] to Target. func cleanWikiLinks(s string) string { var b strings.Builder i := 0 for i < len(s) { if i+1 < len(s) && s[i] == '[' && s[i+1] == '[' { end := strings.Index(s[i:], "]]") if end == -1 { b.WriteByte(s[i]) i++ continue } inner := s[i+2 : i+end] if pipeIdx := strings.Index(inner, "|"); pipeIdx >= 0 { b.WriteString(inner[pipeIdx+1:]) } else { b.WriteString(inner) } i = i + end + 2 } else { b.WriteByte(s[i]) i++ } } return b.String() } // CleanWikitext strips templates, wiki links, and HTML but preserves line structure // and converts wiki list markers (* items) to readable bullet points. func CleanWikitext(s string) string { s = expandKnownTemplates(s) s = removeTemplates(s) s = cleanWikiLinks(s) s = strings.ReplaceAll(s, "'''", "") s = strings.ReplaceAll(s, "''", "") s = removeWikiTables(s) s = stripHTMLTags(s) s = removeRefs(s) s = removeSectionHeadings(s) s = removeFileAndCategoryLines(s) s = removeFileRefs(s) // Process line by line to preserve list structure lines := strings.Split(s, "\n") var out []string for _, line := range lines { trimmed := strings.TrimSpace(line) if trimmed == "" { continue } // Convert wiki list markers to markdown bullets if strings.HasPrefix(trimmed, "***") { out = append(out, " - "+strings.TrimSpace(trimmed[3:])) } else if strings.HasPrefix(trimmed, "**") { out = append(out, " - "+strings.TrimSpace(trimmed[2:])) } else if strings.HasPrefix(trimmed, "*") { out = append(out, "- "+strings.TrimSpace(trimmed[1:])) } else { out = append(out, trimmed) } } return strings.Join(out, "\n") } // ExtractPlainText strips all wikitext markup to produce plain text. func ExtractPlainText(wikitext string) string { s := wikitext s = expandKnownTemplates(s) s = removeTemplates(s) s = cleanWikiLinks(s) s = strings.ReplaceAll(s, "'''", "") s = strings.ReplaceAll(s, "''", "") // Remove wiki tables {| ... |} s = removeWikiTables(s) // Remove section headings (== Foo ==, === Bar ===, etc.) s = removeSectionHeadings(s) // Remove HTML tags s = stripHTMLTags(s) // Remove references s = removeRefs(s) // Remove file/image links and category lines s = removeFileAndCategoryLines(s) // Remove leftover File: references s = removeFileRefs(s) // Collapse whitespace s = collapseWhitespace(s) return strings.TrimSpace(s) } // expandKnownTemplates replaces well-known templates with plain text equivalents // before the generic template removal pass. This preserves useful data from // templates like {{Skillreq|Mining|75}} → "75 Mining". func expandKnownTemplates(s string) string { var b strings.Builder i := 0 for i < len(s)-1 { if s[i] == '{' && s[i+1] == '{' { end := findTemplateEnd(s, i) if end == -1 { b.WriteByte(s[i]) i++ continue } inner := s[i+2 : end] if expanded, ok := tryExpandTemplate(inner); ok { b.WriteString(expanded) } else { // Leave it for removeTemplates to handle b.WriteString(s[i : end+2]) } i = end + 2 } else { b.WriteByte(s[i]) i++ } } if i < len(s) { b.WriteByte(s[i]) } return b.String() } func tryExpandTemplate(inner string) (string, bool) { parts := splitOnPipes(inner) if len(parts) == 0 { return "", false } name := strings.TrimSpace(parts[0]) lower := strings.ToLower(name) switch lower { case "skillreq", "scp": // {{Skillreq|Skill|Level}} or {{SCP|Level|Skill}} → "Level Skill" if len(parts) >= 3 { if lower == "scp" { return strings.TrimSpace(parts[1]) + " " + strings.TrimSpace(parts[2]), true } return strings.TrimSpace(parts[2]) + " " + strings.TrimSpace(parts[1]), true } if len(parts) == 2 { return strings.TrimSpace(parts[1]), true } case "fairycode": if len(parts) >= 2 { return strings.TrimSpace(parts[1]), true } case "coins", "coins detail": if len(parts) >= 2 { return strings.TrimSpace(parts[1]) + " coins", true } case "loctablehead", "loctablebottom": return "", true case "locline": paramMap := map[string]string{} for _, p := range parts[1:] { if idx := strings.Index(p, "="); idx > 0 { k := strings.ToLower(strings.TrimSpace(p[:idx])) v := cleanWikiLinks(strings.TrimSpace(p[idx+1:])) paramMap[k] = v } } // Support both OSRS (location/levels/members) and RS3 (loc/lvls/mem) param names locVal := paramMap["location"] if locVal == "" { locVal = paramMap["loc"] } lvlVal := paramMap["levels"] if lvlVal == "" { lvlVal = paramMap["lvls"] } memVal := paramMap["members"] if memVal == "" { memVal = paramMap["mem"] } var segments []string if locVal != "" { segments = append(segments, locVal) } if lvlVal != "" { segments = append(segments, "level "+lvlVal) } if memVal != "" { if strings.EqualFold(memVal, "yes") { segments = append(segments, "members only") } else { segments = append(segments, "F2P") } } if len(segments) == 0 { return "", true } return "\n- " + strings.Join(segments, ", "), true } return "", false } func removeTemplates(s string) string { var b strings.Builder depth := 0 for i := 0; i < len(s); i++ { if i+1 < len(s) && s[i] == '{' && s[i+1] == '{' { depth++ i++ } else if i+1 < len(s) && s[i] == '}' && s[i+1] == '}' { depth-- if depth < 0 { depth = 0 } i++ } else if depth == 0 { b.WriteByte(s[i]) } } return b.String() } func stripHTMLTags(s string) string { var b strings.Builder inTag := false for _, r := range s { if r == '<' { inTag = true } else if r == '>' { inTag = false } else if !inTag { b.WriteRune(r) } } return b.String() } func removeRefs(s string) string { // Remove ... and for { start := strings.Index(s, "") endTag := strings.Index(s[start:], "") if selfClose != -1 && (endTag == -1 || selfClose < endTag) { s = s[:start] + s[start+selfClose+2:] } else if endTag != -1 { s = s[:start] + s[start+endTag+6:] } else { break } } return s } func removeWikiTables(s string) string { var b strings.Builder depth := 0 lines := strings.Split(s, "\n") for _, line := range lines { trimmed := strings.TrimSpace(line) if strings.HasPrefix(trimmed, "{|") { depth++ continue } if strings.HasPrefix(trimmed, "|}") { if depth > 0 { depth-- } continue } if depth == 0 { b.WriteString(line) b.WriteByte('\n') } } return b.String() } func removeFileAndCategoryLines(s string) string { var b strings.Builder lines := strings.Split(s, "\n") for _, line := range lines { trimmed := strings.TrimSpace(line) lower := strings.ToLower(trimmed) // Skip lines that are file/image embeds or category tags if strings.HasPrefix(lower, "[[file:") || strings.HasPrefix(lower, "[[image:") || strings.HasPrefix(lower, "[[category:") || strings.HasPrefix(lower, "category:") || strings.HasPrefix(lower, "thumb|") || lower == "thumb" || (strings.Contains(lower, "px|") && (strings.Contains(lower, "thumb") || strings.Contains(lower, "right") || strings.Contains(lower, "left"))) { continue } // Remove inline [[Category:...]] references for strings.Contains(line, "[[Category:") { start := strings.Index(line, "[[Category:") end := strings.Index(line[start:], "]]") if end == -1 { break } line = line[:start] + line[start+end+2:] } b.WriteString(line) b.WriteByte('\n') } return b.String() } func removeSectionHeadings(s string) string { var b strings.Builder lines := strings.Split(s, "\n") for _, line := range lines { trimmed := strings.TrimSpace(line) if strings.HasPrefix(trimmed, "==") && strings.HasSuffix(trimmed, "==") { continue } b.WriteString(line) b.WriteByte('\n') } return b.String() } func removeFileRefs(s string) string { var b strings.Builder lines := strings.Split(s, "\n") for _, line := range lines { // Remove inline File: references (leftover after wiki link cleaning) cleaned := line for { lower := strings.ToLower(cleaned) idx := strings.Index(lower, "file:") if idx == -1 { break } // Find the end of this reference — usually ends at whitespace or next sentence end := idx + 5 for end < len(cleaned) && cleaned[end] != ' ' && cleaned[end] != '\n' { end++ } cleaned = cleaned[:idx] + cleaned[end:] } b.WriteString(cleaned) b.WriteByte('\n') } return b.String() } func collapseWhitespace(s string) string { var b strings.Builder prevSpace := false prevNewline := false for _, r := range s { if r == '\n' { if !prevNewline { b.WriteRune('\n') } prevNewline = true prevSpace = false } else if unicode.IsSpace(r) { if !prevSpace && !prevNewline { b.WriteRune(' ') } prevSpace = true } else { b.WriteRune(r) prevSpace = false prevNewline = false } } return b.String() }