https://en.wikipedia.org/w/index.php?action=history&feed=atom&title=Module%3AScripts%2FcharToScriptModule:Scripts/charToScript - Revision history2025-05-28T11:15:36ZRevision history for this page on the wikiMediaWiki 1.45.0-wmf.2https://en.wikipedia.org/w/index.php?title=Module:Scripts/charToScript&diff=1185975722&oldid=prevAlexis Jazz: Imported page from https://en.wiktionary.org/wiki/Module:scripts/charToScript AJsImportTool]2023-11-20T03:58:13Z<p>Imported page from https://en.wiktionary.org/wiki/Module:scripts/charToScript [<a href="/w/index.php?title=User:Alexis_Jazz/AJsImportTool.js&action=edit&redlink=1" class="new" title="User:Alexis Jazz/AJsImportTool.js (page does not exist)">AJsImportTool</a>]</p>
<p><b>New page</b></p><div>local subexport = {}<br />
<br />
local cp = mw.ustring.codepoint<br />
local floor = math.floor<br />
local min = math.min<br />
local split = mw.text.split<br />
<br />
-- Copied from [[Module:Unicode data]].
-- Binary search over `ranges`, a list of tables sorted by codepoint whose
-- first two elements are the inclusive lower and upper bounds of a range.
-- The number of ranges is read from `ranges.length` when that field is set,
-- and otherwise obtained from the `length` function of [[Module:table]].
-- Returns the range containing `codepoint` together with its index. When no
-- range contains it, returns nil plus the last index that was probed (which
-- is itself nil if the list is empty).
local function binaryRangeSearch(codepoint, ranges)
	local lower = 1
	local upper = ranges.length or require "Module:table".length(ranges)
	local middle
	while lower <= upper do
		middle = floor((lower + upper) / 2)
		local candidate = ranges[middle]
		local first, last = candidate[1], candidate[2]
		if codepoint >= first and codepoint <= last then
			return candidate, middle
		elseif codepoint < first then
			-- The target, if present, lies in the lower half.
			upper = middle - 1
		else
			-- The target, if present, lies in the upper half.
			lower = middle + 1
		end
	end
	return nil, middle
end
<br />
-- Copied from [[Module:Unicode data]].
-- Walks `ranges` (a list sorted by lower bound, each entry holding its
-- inclusive bounds in positions 1 and 2) from the start and returns the first
-- entry that contains `codepoint`. Gives up as soon as an entry starts above
-- `codepoint`; returns nothing when no entry matches.
local function linearRangeSearch(codepoint, ranges)
	for _, candidate in ipairs(ranges) do
		if codepoint < candidate[1] then
			-- Every later range starts even higher, so there is no match.
			return
		end
		if codepoint <= candidate[2] then
			return candidate
		end
	end
end
<br />
-- Sort comparator: orders two ranges by their lower bound (element 1).
local function compareRanges(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end
<br />
-- Save previously used codepoint ranges in case another character is in the
-- same range. The list is re-sorted by lower bound after every insertion so
-- that linearRangeSearch can stop early.
local rangesCache = {}

-- Appends the script codes stored in `range` (every element after the two
-- codepoint bounds) to `ret`. Only the first code is appended unless
-- `all_scripts` is set.
local function collectScripts(ret, range, all_scripts)
	for i, script in ipairs(range) do
		if i > 2 then
			table.insert(ret, script)
			if not all_scripts then
				break
			end
		end
	end
end

--[=[
	Takes a codepoint or a character and finds the script code(s) (if any) that are appropriate for it based on the codepoint, using the data module [[Module:scripts/recognition data]]. The data module was generated from the patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].

	By default, it returns only the first script code if there are multiple matches (i.e. the code we take to be the default). If `all_scripts` is set, then a table of all matching codes is returned. If nothing matches, the code "None" is used.

	Raises an error if `char` is neither a string nor a number, or if it is a string that does not consist of exactly one character.
]=]

-- Loaded lazily on the first call and then reused.
local charToScriptData
function subexport.charToScript(char, all_scripts)
	charToScriptData = charToScriptData or mw.loadData("Module:scripts/recognition data")
	local t = type(char)
	local codepoint
	if t == "string" then
		local etc
		-- Ask for the first two codepoints: a second one means the string is
		-- too long, and no first one means there is no character at all.
		codepoint, etc = cp(char, 1, 2)
		if etc then
			error("bad argument #1 to 'charToScript' (expected a single character)")
		elseif not codepoint then
			-- Without this check the nil would only surface later as an
			-- unrelated arithmetic error.
			error("bad argument #1 to 'charToScript' (expected a single character, got an empty string)")
		end
	elseif t == "number" then
		codepoint = char
	else
		error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)")
			:format(t))
	end

	local ret = {}
	local individualMatch = charToScriptData.individual[codepoint]
	if individualMatch then
		-- Individually listed characters store their codes as a
		-- comma-separated string.
		ret = split(individualMatch, "%s*,%s*")
	else
		local range
		-- Try the ranges found by earlier calls first.
		if rangesCache[1] then
			range = linearRangeSearch(codepoint, rangesCache)
			if range then
				collectScripts(ret, range, all_scripts)
			end
		end
		if not ret[1] then
			-- The data is organised by 4096-codepoint block: `blocks` holds
			-- ranges of block indices, and the table stored under a block's
			-- index holds the codepoint ranges within that block.
			local index = floor(codepoint / 0x1000)
			range = linearRangeSearch(index, charToScriptData.blocks)
			if not range and charToScriptData[index] then
				range = binaryRangeSearch(codepoint, charToScriptData[index])
				if range then
					table.insert(rangesCache, range)
					table.sort(rangesCache, compareRanges)
				end
			end
			if range then
				collectScripts(ret, range, all_scripts)
			end
		end
	end
	if not ret[1] then
		table.insert(ret, "None")
	end
	if all_scripts then
		return ret
	else
		return ret[1]
	end
end
<br />
--[=[
	Finds the best script for a string in a language-agnostic way.

	Each character of the plain text is passed to charToScript with `all_scripts` set, and every script code that comes back has its counter incremented.

	A counter has two parts: primary matches (the script was listed first for the character) and secondary matches (it was listed later). Counters are compared by their combined total first; if the totals are equal, the number of primary matches and then the number of secondary matches break the tie. For example, this is used to ensure that `Grek` takes priority over `Polyt` if no characters which exclusively match `Polyt` are found, as `Grek` is a subset of `Polyt`.

	`Jpan` and `Kore` are special-cased as combinations of other scripts: their counters are the sums of their constituents' counters, and a combined script is dropped again when one constituent alone accounts for its whole total.

	Returns the result of getByCode from [[Module:scripts]] for the winning code, or for "None" if no script was counted.
]=]
function subexport.findBestScriptWithoutLang(text)
	text = require("Module:utilities").get_plaintext(text)

	-- Constituents of the scripts that are combinations of other scripts.
	local combined_scripts = {
		Jpan = {"Hani", "Hira", "Kana"},
		Kore = {"Hani", "Hang"}
	}

	-- Ordering for counters: total, then primary, then secondary matches.
	local weights_mt = {}
	function weights_mt.__lt(a, b)
		local total_a, total_b = a[1] + a[2], b[1] + b[2]
		if total_a ~= total_b then
			return total_a < total_b
		end
		if a[1] ~= b[1] then
			return a[1] < b[1]
		end
		return a[2] < b[2]
	end

	-- `scripts` contains counters for any scripts detected so far. Looking up
	-- a script that has no stored counter yields a fresh one: the sum of the
	-- constituents for an enabled combined script, zeroes for anything else.
	-- The lookup itself does not store the counter.
	local scripts_mt = {Jpan = true, Kore = true}
	function scripts_mt.__index(t, k)
		local counter = {0, 0}
		local parts = combined_scripts[k]
		if parts and scripts_mt[k] then
			for _, part in ipairs(parts) do
				local part_counter = t[part]
				counter[1] = counter[1] + part_counter[1]
				counter[2] = counter[2] + part_counter[2]
			end
		end
		return setmetatable(counter, weights_mt)
	end

	local scripts = setmetatable({}, scripts_mt)

	-- Walk the text one UTF-8 character at a time.
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local codes = subexport.charToScript(character, true)
		for position, code in ipairs(codes) do
			-- Fetch (or create) the counter and make sure it is stored.
			local counter = scripts[code]
			scripts[code] = counter
			-- Slot 1 counts primary matches, slot 2 everything else.
			local slot = min(position, 2)
			counter[slot] = counter[slot] + 1
		end
	end

	-- Check the combined script counts. If a single constituent has the same
	-- count (i.e. it's the only one), discard the combined script.
	for combined_script, parts in pairs(combined_scripts) do
		for _, script in ipairs(parts) do
			local combined = scripts[combined_script]
			scripts[combined_script] = combined
			local single = scripts[script]
			if single[1] + single[2] == combined[1] + combined[2] then
				scripts[combined_script] = nil
				break
			end
		end
	end

	-- Pick the script whose counter compares greatest.
	local bestScript, greatestCount
	for script, count in pairs(scripts) do
		if greatestCount == nil or greatestCount < count then
			bestScript, greatestCount = script, count
		end
	end

	return require("Module:scripts").getByCode(bestScript or "None")
end
<br />
return subexport</div>Alexis Jazz