Update parser
This commit is contained in:
@@ -202,6 +202,7 @@ func cleanWikiLinks(s string) string {
|
||||
// CleanWikitext strips templates, wiki links, and HTML but preserves line structure
|
||||
// and converts wiki list markers (* items) to readable bullet points.
|
||||
func CleanWikitext(s string) string {
|
||||
s = StripTransclusionTags(s)
|
||||
s = expandKnownTemplates(s)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
@@ -239,6 +240,7 @@ func CleanWikitext(s string) string {
|
||||
// ExtractPlainText strips all wikitext markup to produce plain text.
|
||||
func ExtractPlainText(wikitext string) string {
|
||||
s := wikitext
|
||||
s = StripTransclusionTags(s)
|
||||
s = expandKnownTemplates(s)
|
||||
s = removeTemplates(s)
|
||||
s = cleanWikiLinks(s)
|
||||
@@ -371,6 +373,48 @@ func tryExpandTemplate(inner string) (string, bool) {
|
||||
return "", false
|
||||
}
|
||||
|
||||
// StripTransclusionTags handles MediaWiki transclusion directives in raw wikitext.
// When viewing a page directly (not transcluding):
//   - <noinclude>content</noinclude> → keep content (strip tags only)
//   - <includeonly>content</includeonly> → remove entirely (tags + content)
//   - <onlyinclude>content</onlyinclude> → keep content (strip tags only)
//
// Tag names are matched ASCII case-insensitively in any letter-case mix
// (e.g. <NoInclude>). An <includeonly> without a closing tag removes
// everything from the opening tag to the end of the string.
func StripTransclusionTags(s string) string {
	s = removeIncludeOnlyBlocks(s)

	// Unwrap <noinclude> and <onlyinclude> (keep content, remove tags).
	for _, tag := range []string{"noinclude", "onlyinclude"} {
		s = removeTagKeepContent(s, tag)
	}
	return s
}

// removeIncludeOnlyBlocks deletes every <includeonly>...</includeonly> block,
// tags and content included. The search restarts from the beginning after each
// removal so a tag formed by joining the text around a removed block is also
// handled.
func removeIncludeOnlyBlocks(s string) string {
	const (
		openTag  = "<includeonly>"
		closeTag = "</includeonly>"
	)
	for {
		// Offsets are computed on s itself. Searching a strings.ToLower copy
		// is unsafe because lowercasing can change the byte length of
		// non-ASCII runes, so its offsets may not line up with s.
		start := indexFoldASCII(s, openTag)
		if start == -1 {
			return s
		}
		end := indexFoldASCII(s[start:], closeTag)
		if end == -1 {
			// Unclosed tag: remove to end of string.
			return s[:start]
		}
		s = s[:start] + s[start+end+len(closeTag):]
	}
}

// removeTagKeepContent strips the opening <tag> and closing </tag> markers for
// the given tag name from s, leaving the enclosed content in place. Matching is
// ASCII case-insensitive. Opening markers are removed first, then closing ones.
func removeTagKeepContent(s, tag string) string {
	for _, marker := range []string{"<" + tag + ">", "</" + tag + ">"} {
		s = removeAllFoldASCII(s, marker)
	}
	return s
}

// removeAllFoldASCII returns s with every non-overlapping, ASCII
// case-insensitive occurrence of marker removed. s is returned unchanged when
// marker is empty or does not occur.
func removeAllFoldASCII(s, marker string) string {
	if marker == "" {
		return s
	}
	i := indexFoldASCII(s, marker)
	if i == -1 {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for i != -1 {
		b.WriteString(s[:i])
		s = s[i+len(marker):]
		i = indexFoldASCII(s, marker)
	}
	b.WriteString(s)
	return b.String()
}

// indexFoldASCII returns the byte index of the first occurrence of sub in s,
// comparing bytes with ASCII letters folded to lower case, or -1 if sub is not
// present. Only A-Z/a-z are folded; all other bytes must match exactly, so the
// returned index is always a valid offset into s.
func indexFoldASCII(s, sub string) int {
	n := len(sub)
	for i := 0; i+n <= len(s); i++ {
		j := 0
		for j < n && lowerASCII(s[i+j]) == lowerASCII(sub[j]) {
			j++
		}
		if j == n {
			return i
		}
	}
	return -1
}

// lowerASCII maps 'A'-'Z' to 'a'-'z' and returns every other byte unchanged.
func lowerASCII(c byte) byte {
	if 'A' <= c && c <= 'Z' {
		return c + ('a' - 'A')
	}
	return c
}
|
||||
|
||||
func removeTemplates(s string) string {
|
||||
var b strings.Builder
|
||||
depth := 0
|
||||
|
||||
Reference in New Issue
Block a user