-- Module:Scripts/charToScript
--
-- Imported from https://en.wiktionary.org/wiki/Module:scripts/charToScript
-- (English Wikipedia revision of 2023-11-20T03:58:13Z, imported by
-- Alexis Jazz using AJsImportTool).

local subexport = {}

local cp = mw.ustring.codepoint
local floor = math.floor
local min = math.min
local split = mw.text.split

-- Copied from [[Module:Unicode data]].
--
-- Binary search for the range containing `codepoint`. `ranges` is an array of
-- tables whose first two elements are the lower and upper bound of the range;
-- it is searched on the assumption that it is sorted by those bounds. The
-- array length is taken from `ranges.length` if present, otherwise computed
-- with [[Module:table]].
--
-- Returns the matching range and its index, or nil and the last index probed.
local function binaryRangeSearch(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require("Module:table").length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end

-- Copied from [[Module:Unicode data]].
--
-- Linear search for the range containing `codepoint`. Stops at the first range
-- whose lower bound is above `codepoint`, so `ranges` must be sorted by lower
-- bound. Returns the matching range, or nothing if there is none.
local function linearRangeSearch(codepoint, ranges)
	for _, range in ipairs(ranges) do
		if codepoint < range[1] then
			break
		elseif codepoint <= range[2] then
			return range
		end
	end
end

-- Sort comparator: orders ranges by their lower bound.
local function compareRanges(range1, range2)
	return range1[1] < range2[1]
end

-- Save previously used codepoint ranges in case another character is in the
-- same range. Kept sorted by lower bound so linearRangeSearch can be used.
local rangesCache = {}

-- Appends the script codes held in `range` to `ret`. Elements 1 and 2 of a
-- range are its bounds; every later element is a script code. Unless
-- `all_scripts` is set, only the first script code is appended.
local function appendScripts(ret, range, all_scripts)
	for i, script in ipairs(range) do
		if i > 2 then
			table.insert(ret, script)
			if not all_scripts then
				break
			end
		end
	end
end

--[=[
Takes a codepoint or a character and finds the script code(s) (if any) that are appropriate for it based on the codepoint, using the data module [[Module:scripts/recognition data]]. The data module was generated from the patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].

By default, it returns only the first script code if there are multiple matches (i.e. the code we take to be the default). If `all_scripts` is set, then a table of all matching codes is returned.

If nothing matches, the result is "None" (or a table containing only "None" if `all_scripts` is set).

Throws an error if `char` is neither a number nor a string of exactly one character.
]=]

local charToScriptData
function subexport.charToScript(char, all_scripts)
	charToScriptData = charToScriptData or mw.loadData("Module:scripts/recognition data")
	local t = type(char)
	local codepoint
	if t == "string" then
		local etc
		codepoint, etc = cp(char, 1, 2)
		-- `etc` is set when there is a second character; `codepoint` is nil
		-- when the string is empty. Reject both up front, otherwise an empty
		-- string would fail further down with an unrelated nil-arithmetic error.
		if etc or not codepoint then
			error("bad argument #1 to 'charToScript' (expected a single character)")
		end
	elseif t == "number" then
		codepoint = char
	else
		error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)")
			:format(t))
	end

	local ret = {}
	local individualMatch = charToScriptData.individual[codepoint]
	if individualMatch then
		-- Individually listed characters store their scripts as a
		-- comma-separated string.
		ret = split(individualMatch, "%s*,%s*")
	else
		local range
		-- First try the ranges that earlier calls have already resolved.
		if rangesCache[1] then
			range = linearRangeSearch(codepoint, rangesCache)
			if range then
				appendScripts(ret, range, all_scripts)
			end
		end
		if not ret[1] then
			-- The data is organised by 4096-codepoint block: `blocks` holds
			-- ranges of block indices, and the data table itself holds
			-- per-block lists of codepoint ranges keyed by block index.
			local index = floor(codepoint / 0x1000)
			range = linearRangeSearch(index, charToScriptData.blocks)
			if not range and charToScriptData[index] then
				range = binaryRangeSearch(codepoint, charToScriptData[index])
				if range then
					table.insert(rangesCache, range)
					table.sort(rangesCache, compareRanges)
				end
			end
			if range then
				appendScripts(ret, range, all_scripts)
			end
		end
	end
	if not ret[1] then
		table.insert(ret, "None")
	end
	if all_scripts then
		return ret
	else
		return ret[1]
	end
end

--[=[
Finds the best script for a string in a language-agnostic way.

Converts each character to a codepoint. Iterates the counter for the script code if the codepoint is in the list
of individual characters, or if it is in one of the defined ranges in the 4096-character block that it belongs to.

Each script has a two-part counter, for primary and secondary matches. Primary matches are when the script is the
first one listed; otherwise, it's a secondary match. When comparing scripts, first the total of both are compared
(i.e. the overall number of matches). If these are the same, the number of primary and then secondary matches are
used as tiebreakers. For example, this is used to ensure that `Grek` takes priority over `Polyt` if no characters
which exclusively match `Polyt` are found, as `Grek` is a subset of `Polyt`.

Returns the result of getByCode from [[Module:scripts]] for the winning script code ("None" if no script was counted).
]=]
function subexport.findBestScriptWithoutLang(text)
	-- `scripts` contains counters for any scripts detected so far. Jpan and Kore are handled as special-cases, as they are combinations of other scripts.
	local scripts_mt = {Jpan = true, Kore = true}

	-- Counters are compared with `<`: total first, then primary matches, then
	-- secondary matches.
	local weights_mt = {
		__lt = function(a, b)
			local total_a, total_b = a[1] + a[2], b[1] + b[2]
			if total_a ~= total_b then
				return total_a < total_b
			elseif a[1] ~= b[1] then
				return a[1] < b[1]
			end
			return a[2] < b[2]
		end
	}

	-- Missing keys yield a fresh counter. Note that the counter is NOT stored
	-- in `t`; callers that want to keep it must assign it back themselves.
	scripts_mt.__index = function(t, k)
		local ret = {}
		if k == "Jpan" and scripts_mt.Jpan then
			for i = 1, 2 do
				ret[i] = t["Hani"][i] + t["Hira"][i] + t["Kana"][i]
			end
		elseif k == "Kore" and scripts_mt.Kore then
			for i = 1, 2 do
				ret[i] = t["Hani"][i] + t["Hang"][i]
			end
		else
			ret[1], ret[2] = 0, 0
		end
		return setmetatable(ret, weights_mt)
	end

	local scripts = setmetatable({}, scripts_mt)

	text = require("Module:utilities").get_plaintext(text)

	local combined_scripts = {
		Jpan = {["Hani"] = true, ["Hira"] = true, ["Kana"] = true},
		Kore = {["Hani"] = true, ["Hang"] = true}
	}

	-- The pattern matches one UTF-8 encoded character at a time: a lead byte
	-- followed by any continuation bytes.
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		for i, script in ipairs(subexport.charToScript(character, true)) do
			-- Fetch the counter (created on demand by __index) and store it,
			-- so that the increment below is kept.
			local counter = scripts[script]
			scripts[script] = counter
			-- Slot 1 counts primary matches, slot 2 all secondary matches.
			local weight = min(i, 2)
			counter[weight] = counter[weight] + 1
		end
	end

	-- Check the combined script counts. If a single constituent has the same count (i.e. it's the only one), discard the combined script.
	for combined_script, set in pairs(combined_scripts) do
		-- Materialize the combined counter so that it takes part in the final
		-- comparison unless it is discarded below.
		local combined = scripts[combined_script]
		scripts[combined_script] = combined
		local combined_total = combined[1] + combined[2]
		for script in pairs(set) do
			local counter = scripts[script]
			if counter[1] + counter[2] == combined_total then
				scripts[combined_script] = nil
				break
			end
		end
	end

	-- Pick the script with the greatest counter. NOTE: when two scripts have
	-- identical counters, the winner depends on the (unspecified) iteration
	-- order of pairs().
	local bestScript
	local greatestCount
	for script, count in pairs(scripts) do
		if (not greatestCount) or greatestCount < count then
			bestScript = script
			greatestCount = count
		end
	end

	bestScript = bestScript or "None"

	return require("Module:scripts").getByCode(bestScript)
end

return subexport