Jump to content

Module:Plain text/sandbox: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
No edit summary
Tag: Reverted
some performance and results improvements
Tag: Reverted
Line 13: Line 13:
:gsub('&nbsp;', ' ') --replace nbsp spaces with regular spaces
:gsub('&nbsp;', ' ') --replace nbsp spaces with regular spaces
:gsub('<br ?/?>', ', ') --replace br with commas
:gsub('<br ?/?>', ', ') --replace br with commas
:gsub('<(%a+)[^>]+>(.-)</%1>', function(tag, contents)
:gsub('^%s*<span.->(.-)</span>%s*$', '%1') --remove outer spans while keeping text inside
if tag:lower() == 'span' then
:gsub('<span.->(.-)</span>', '%1') --repeat for nested span tags.
return contents
:gsub('<i.->(.-)</i>', '%1') --remove italics while keeping text inside
else
:gsub('<.->.-<.->', '') --strip out remaining tags and the text inside
return ''
:gsub('<.->', '') --remove any other tag markup
end
:gsub('%[%[%s*[Ff]ile%s*:.-%]%]', '') --strip out files
end)
:gsub('%[%[%s*[Ii]mage%s*:.-%]%]', '') --strip out use of image:
:gsub('%[%[%s*[Cc]ategory%s*:.-%]%]', '') --strip out categories
:gsub('<i[^>]+>([^<]+)</i>', '%1') --remove italics while keeping text inside
:gsub('%[%[[^%]]-|', '') --strip out piped link text
:gsub('<[^>]+>[^<]+<[^>]+>', '') --strip out remaining tags and the text inside
:gsub('%b<>', '') --remove any other tag markup
:gsub('__[^_]+__', '') --remove __ markups
:gsub('^=+[^=]+=+', ''):gsub('\n=+[^=]+=+', '') --remove section titles
:gsub('%b[]',
function(bracketed)
return bracketed:gsub('^%[%[%s*(%a+):.-%]%]$',
function(link_prefix)
link_prefix = link_prefix:lower()
if link_prefix == 'image' or link_prefix == 'file'
or link_prefix == 'media' or link_prefix == 'category' then
return ""
end -- otherwise leave it alone
end)
end)
:gsub('%[%[[^%]|]+|', '') --strip out piped link text
:gsub('[%[%]]', '') --then strip out remaining [ and ]
:gsub('[%[%]]', '') --then strip out remaining [ and ]
:gsub("'''''", "") --strip out bold italic markup
:gsub("'''''", "") --strip out bold italic markup
:gsub("'''?", "") --not stripping out '''' gives correct output for bolded text in quotes
:gsub("'''?", "") --not stripping out '''' gives correct output for bolded text in quotes
:gsub('----', '') --remove ---- lines
:gsub('----', '') --remove ---- lines
:gsub("^%s+", "") --strip leading
:gsub('^%s+', ''):gsub('\n%s+', '\n') --strip leading
:gsub("%s+$", "") --and trailing spaces
:gsub('%s+$', ''):gsub('%s+\n', '\n') --and trailing spaces
:gsub("%s+", " ") --strip redundant spaces
:gsub('(%s)%s+', '%1') --strip redundant spaces
return text
return text
end
end

Revision as of 13:37, 21 June 2021

--converts text with wikilinks to plain text, e.g. "[[foo|gah]] is [[bar]]" to "gah is bar"
--removes anything enclosed in tags that isn't nested, MediaWiki strip markers (references etc.), files, italic and bold markup
--module table: the functions below are attached to it and it is returned at the end of the file
local p = {}

--entry point taking a frame object: passes the first positional argument
--(frame.args[1]) to p._main and returns p._main's result unchanged
function p.main(frame)
	return p._main(frame.args[1])
end

--reduces wikitext to plain text by running it through a fixed sequence of gsub passes
--text: the wikitext string to clean; when it is nil or false the function returns nothing
--returns: the cleaned string only (the replacement count of the last gsub is dropped by the assignment)
function p._main(text)
	if not text then return end
	text = mw.text.killMarkers(text)
		:gsub('&nbsp;', ' ') --replace nbsp spaces with regular spaces
		:gsub('<br ?/?>', ', ') --replace br with commas
		--paired tags: [^>]* (rather than [^>]+) lets attribute-less tags such as <span> and <i> match;
		--%f[%A] forces %a+ to capture the whole tag name, so it cannot backtrack to a prefix (e.g. "i" out of <img ...>)
		:gsub('<(%a+)%f[%A][^>]*>(.-)</%1>', function(tag, contents)
			tag = tag:lower()
			if tag == 'span' or tag == 'i' then
				return contents --unwrap: drop the tag pair, keep the text inside
			end
			return '' --any other paired tag is dropped together with its contents
		end)
		--the pass above does not rescan what it returns, so italics exposed by unwrapping a span are handled here
		:gsub('<i%f[%A][^>]*>([^<]+)</i>', '%1') --remove italics while keeping text inside
		:gsub('<[^>]+>[^<]+<[^>]+>', '') --strip out remaining tags and the text inside
		:gsub('%b<>', '') --remove any other tag markup
		:gsub('__[^_]+__', '') --remove __ markups
		:gsub('^=+[^=]+=+', ''):gsub('\n=+[^=]+=+', '') --remove section titles
		:gsub('%b[]',
			function(bracketed)
				--only a balanced [[prefix:...]] link is inspected; returning nil from the inner
				--callback keeps the link as it is
				return bracketed:gsub('^%[%[%s*(%a+):.-%]%]$',
					function(link_prefix)
						link_prefix = link_prefix:lower()
						if link_prefix == 'image' or link_prefix == 'file'
						or link_prefix == 'media' or link_prefix == 'category' then
							return ""
						end -- otherwise leave it alone
					end)
			end)
		:gsub('%[%[[^%]|]+|', '') --strip out piped link text
		:gsub('[%[%]]', '') --then strip out remaining [ and ]
		:gsub("'''''", "") --strip out bold italic markup
		:gsub("'''?", "") --not stripping out '''' gives correct output for bolded text in quotes
		--"-" is the lazy quantifier in Lua patterns: an unescaped '----' matches only the empty string
		:gsub('%-%-%-%-', '') --remove ---- lines
		:gsub('^%s+', ''):gsub('\n%s+', '\n') --strip leading
		:gsub('%s+$', ''):gsub('%s+\n', '\n') --and trailing spaces
		:gsub('(%s)%s+', '%1') --strip redundant spaces
	return text
end

--hand the module table (with main and _main) to whatever loads this module
return p