Update parser

This commit is contained in:
2026-03-05 12:51:31 -06:00
parent b20d68c9f0
commit c283444d68
7 changed files with 603 additions and 103 deletions

View File

@@ -202,6 +202,7 @@ func cleanWikiLinks(s string) string {
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
// and converts wiki list markers (* items) to readable bullet points.
func CleanWikitext(s string) string {
s = StripTransclusionTags(s)
s = expandKnownTemplates(s)
s = removeTemplates(s)
s = cleanWikiLinks(s)
@@ -239,6 +240,7 @@ func CleanWikitext(s string) string {
// ExtractPlainText strips all wikitext markup to produce plain text.
func ExtractPlainText(wikitext string) string {
s := wikitext
s = StripTransclusionTags(s)
s = expandKnownTemplates(s)
s = removeTemplates(s)
s = cleanWikiLinks(s)
@@ -371,6 +373,48 @@ func tryExpandTemplate(inner string) (string, bool) {
return "", false
}
// StripTransclusionTags handles MediaWiki transclusion directives in raw wikitext.
// When viewing a page directly (not transcluding):
//   - <noinclude>content</noinclude> → keep content (strip tags only)
//   - <includeonly>content</includeonly> → remove entirely (tags + content)
//   - <onlyinclude>content</onlyinclude> → keep content (strip tags only)
//
// Tag names are matched without regard to ASCII letter case, so <NoInclude>
// and <INCLUDEONLY> are handled the same way as their lowercase forms. An
// <includeonly> that is never closed removes everything from the tag to the
// end of the string.
func StripTransclusionTags(s string) string {
	const (
		openMarker  = "<includeonly>"
		closeMarker = "</includeonly>"
	)

	// Remove <includeonly>...</includeonly> blocks entirely. Offsets are
	// computed on s itself (never on a lowercased copy): strings.ToLower can
	// change the byte length of non-ASCII text, which would make offsets taken
	// from the copy point at the wrong bytes of s.
	for {
		start := indexMarkerFold(s, openMarker)
		if start == -1 {
			break
		}
		end := indexMarkerFold(s[start:], closeMarker)
		if end == -1 {
			// Unclosed tag — remove to end of string.
			s = s[:start]
			break
		}
		s = s[:start] + s[start+end+len(closeMarker):]
	}

	// Unwrap <noinclude> and <onlyinclude> (keep content, remove tags).
	for _, tag := range []string{"noinclude", "onlyinclude"} {
		s = removeTagKeepContent(s, tag)
	}
	return s
}

// removeTagKeepContent deletes every <tag> and </tag> marker from s while
// leaving the text between them untouched. The tag name is matched without
// regard to ASCII letter case. Markers carrying attributes or inner
// whitespace (for example "<noinclude >") are not recognised.
func removeTagKeepContent(s, tag string) string {
	s = removeMarkerFold(s, "<"+tag+">")
	return removeMarkerFold(s, "</"+tag+">")
}

// removeMarkerFold returns s with every ASCII-case-insensitive occurrence of
// marker removed. When marker does not occur, s is returned unchanged without
// allocating.
func removeMarkerFold(s, marker string) string {
	at := indexMarkerFold(s, marker)
	if at == -1 {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for at != -1 {
		b.WriteString(s[:at])
		s = s[at+len(marker):]
		at = indexMarkerFold(s, marker)
	}
	b.WriteString(s)
	return b.String()
}

// indexMarkerFold returns the byte offset of the first occurrence of marker in
// s, comparing ASCII letters case-insensitively, or -1 if there is none.
// marker must be non-empty and start with a byte that is not an ASCII letter
// (every caller passes a marker starting with '<'), which lets the scan jump
// between candidate positions with strings.IndexByte.
func indexMarkerFold(s, marker string) int {
	if len(marker) == 0 {
		return -1
	}
	for from := 0; from+len(marker) <= len(s); {
		rel := strings.IndexByte(s[from:], marker[0])
		if rel == -1 {
			return -1
		}
		at := from + rel
		if at+len(marker) > len(s) {
			return -1
		}
		if asciiEqualFold(s[at:at+len(marker)], marker) {
			return at
		}
		from = at + 1
	}
	return -1
}

// asciiEqualFold reports whether a and b are equal when the ASCII letters A-Z
// are treated as equal to a-z. All other bytes must match exactly.
func asciiEqualFold(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	for i := 0; i < len(a); i++ {
		x, y := a[i], b[i]
		if 'A' <= x && x <= 'Z' {
			x += 'a' - 'A'
		}
		if 'A' <= y && y <= 'Z' {
			y += 'a' - 'A'
		}
		if x != y {
			return false
		}
	}
	return true
}
func removeTemplates(s string) string {
var b strings.Builder
depth := 0