Module:Sandbox/Erutuon/Unicode: Difference between revisions
Appearance
Content deleted Content added
copied from Module:Sandbox/Erutuon |
removed language tag stuff |
||
Line 354: | Line 354: | ||
return table.concat(m_table.keysToList(language_codes), ", ") |
return table.concat(m_table.keysToList(language_codes), ", ") |
||
end |
|||
local parsed_subtags_mt = { |
|||
__index = { |
|||
-- "error" is the error message. |
|||
-- "index" is the ordinal of the subtag in which the error was found. |
|||
throw = function (self, error, index) |
|||
self.error = self.error_messages[error] |
|||
self.invalid = table.concat(self.input, "-", index) |
|||
return self:remove_unnecessary_fields() |
|||
end, |
|||
remove_unnecessary_fields = function (self) |
|||
-- Only useful internally. |
|||
self.input = nil |
|||
self:pretty_print() |
|||
p.validate_lang_tag(self) |
|||
return self |
|||
end, |
|||
-- Regularize capitalization of language subtags: |
|||
-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA |
|||
pretty_print = function (self) |
|||
for key, func in pairs(self.print_funcs) do |
|||
if self[key] then |
|||
self[key] = func(self[key]) |
|||
end |
|||
end |
|||
return self |
|||
end, |
|||
-- Re-create the original tag from the parsed subtags. |
|||
get_tag = function (self) |
|||
if self.tag then return self.tag end |
|||
local tag = {} |
|||
for _, subtag_name in ipairs(self.subtag_order) do |
|||
if subtag_name == "private_use" then |
|||
table.insert(tag, "x") |
|||
end |
|||
if type(self[subtag_name]) == "table" then |
|||
for _, subtag in ipairs(self[subtag_name]) do |
|||
table.insert(tag, subtag) |
|||
end |
|||
else |
|||
table.insert(tag, self[subtag_name]) |
|||
end |
|||
end |
|||
tag = table.concat(tag, "-") |
|||
self.tag = tag -- Cache the result. |
|||
return tag |
|||
end, |
|||
subtag_order = { |
|||
"language", "script", "region", "variant", "private_use" |
|||
}, |
|||
error_messages = { |
|||
invalid_characters = "invalid characters", |
|||
no_language = "no language subtag", |
|||
invalid_subtag = "invalid subtag", |
|||
invalid_private_use = "length of private-use subtag out of range", |
|||
empty_private_use = "empty private-use subtag", |
|||
} |
|||
} |
|||
} |
|||
local function initial_caps_helper(initial, rest) |
|||
return string.upper(initial) .. string.lower(rest) |
|||
end |
|||
local function lower_or_map_lower(str) |
|||
if type(str) == "table" then |
|||
return fun.map(string.lower, str) |
|||
else |
|||
return string.lower(str) |
|||
end |
|||
end |
|||
parsed_subtags_mt.__index.print_funcs = { |
|||
language = string.lower, |
|||
script = function (script_code) |
|||
return (string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper)) |
|||
end, |
|||
region = string.upper, |
|||
variant = lower_or_map_lower, |
|||
private_use = lower_or_map_lower, |
|||
} |
|||
setmetatable(parsed_subtags_mt, { |
|||
__call = function (self, input) |
|||
return setmetatable({ input = input }, self) |
|||
end |
|||
}) |
|||
-- An array of patterns for each subtag, and a "type" field for the name |
|||
-- of the subtag. |
|||
-- The patterns are checked in order, and any of the subtags can be skipped. |
|||
-- So, for example, the "language" subtag must precede the "script" |
|||
-- subtag, but a tag may contain a "language" subtag, no "script" subtag |
|||
-- and then a "region" subtag. |
|||
-- If the full list of subtags has been iterated over, the remaining subtags |
|||
-- must match the pattern for a private-use subtag, or the tag is invalid. |
|||
local subtag_info = { -- can be put in data module |
|||
{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case |
|||
-- include extlang? |
|||
{ "%a%a%a%a", type = "script" }, -- Ssss |
|||
{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD |
|||
{ |
|||
"%d%d%d%d", -- 4 digits |
|||
"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters |
|||
type = "variant", |
|||
repeatable = true, -- There can be multiple variants. |
|||
} |
|||
} |
|||
-- A previous draft, in [[Module:Lang/sandbox]]: |
|||
-- https://en.wikipedia.org/w/index.php?oldid=812819217 |
|||
-- Based on https://www.w3.org/International/articles/language-tags/. |
|||
-- Parse a language tag. |
|||
-- Returns nil if tag is not a string or empty. |
|||
-- Else returns a table with a map of subtag type to subtag for all subtags that |
|||
-- were parsed. |
|||
-- If there was an error, returns an "error" field with a description of the |
|||
-- error, and an "invalid" field with the suffix of the tag starting at the |
|||
-- index where the error occurred. |
|||
-- Does not recognize "extension" tags, such as those introduced by "u", as they |
|||
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags. |
|||
-- Does not recognize extended language subtags, such as "zh-yue". |
|||
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47 |
|||
-- Only checks that the syntax is correct, not that the values are valid. For |
|||
-- instance, will accept non-existent language codes, like "zz". |
|||
function p.parse_IETF(tag) |
|||
if type(tag) ~= "string" or tag == "" then |
|||
return nil |
|||
end |
|||
-- This may contain the special fields "invalid", "error". |
|||
-- "error" indicates why the |
|||
-- tag is invalid (if applicable). |
|||
-- All other fields are subtags, and they appear in the tag in the following |
|||
-- order: |
|||
-- "language", "script", "region", "variant", "private_use", "invalid" |
|||
-- All these subtags can be strings or nil, while "variant" can also be an |
|||
-- array of strings if more than one variant subtag was found. |
|||
-- "invalid" is the portion of the tag after the last valid subtag (minus a |
|||
-- hyphen). |
|||
local segments = mw.text.split(tag, "-") |
|||
local parsed_subtags = parsed_subtags_mt(segments) |
|||
-- Language tags probably only contain ASCII alphabetic and numerical |
|||
-- characters and hyphen-minus. |
|||
if not tag:find "^[A-Za-z0-9-]+$" then |
|||
return parsed_subtags:throw( |
|||
"invalid_characters", |
|||
fun.indexOf( |
|||
function (tag) |
|||
return tag:find "[^A-Za-z0-9-]" |
|||
end, |
|||
segments)) |
|||
end |
|||
local subtag_i = 1 -- Index of current item in subtag_info. |
|||
local segment_i = 1 -- Index of current segment. |
|||
while segments[segment_i] and subtag_info[subtag_i] do |
|||
local segment = segments[segment_i] |
|||
local subtag_type |
|||
while not subtag_type and subtag_info[subtag_i] do |
|||
-- Check each pattern for the subtag type at "subtag_i" in "subtag_info". |
|||
local cur_subtag = subtag_info[subtag_i] |
|||
for _, pattern in ipairs(cur_subtag) do |
|||
if segment:find("^" .. pattern .. "$") then |
|||
subtag_type = cur_subtag.type |
|||
-- There can be multiple "variant" subtags (and "extension" |
|||
-- subtags, if those are added). |
|||
if not cur_subtag.repeatable then |
|||
subtag_i = subtag_i + 1 |
|||
end |
|||
break |
|||
end |
|||
end |
|||
if not subtag_type then -- No match; try next subtag. |
|||
subtag_i = subtag_i + 1 |
|||
end |
|||
end |
|||
-- If language subtag has not been found, or the current segment has not |
|||
-- been matched as a subtag, break the loop and check for |
|||
-- a private-use subtag. |
|||
if segment_i == 1 and subtag_type ~= "language" or not subtag_type then |
|||
break |
|||
else |
|||
if parsed_subtags[subtag_type] then -- Create an array. |
|||
if type(parsed_subtags[subtag_type]) == "string" then |
|||
parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] } |
|||
end -- else table |
|||
table.insert(parsed_subtags[subtag_type], segment) |
|||
else |
|||
parsed_subtags[subtag_type] = segment |
|||
end |
|||
last_matched_segment_i = segment_i |
|||
end |
|||
segment_i = segment_i + 1 |
|||
end |
|||
if segments[segment_i] then -- More segments to scan? |
|||
-- Not all potential subtags were matched. Check for private-use subtags. |
|||
-- https://tools.ietf.org/html/bcp47#section-2.2.7 |
|||
-- Private-use subtags consist of one or more sequences of 1 to 8 |
|||
-- alphanumeric characters preceded by "x-". |
|||
-- Alphanumericity has already been checked. |
|||
-- A tag must start with either a language subtag or a private-use subtag. |
|||
-- If next segment is not "x", introducing a private-use subtag, there |
|||
-- is no private-use subtag. |
|||
if segments[segment_i] and segments[segment_i]:lower() ~= "x" then |
|||
if not parsed_subtags.language then |
|||
return parsed_subtags:throw("no_language", 1) |
|||
else |
|||
return parsed_subtags:throw("invalid_subtag", |
|||
segment_i) |
|||
end |
|||
elseif not segments[segment_i + 1] then |
|||
return parsed_subtags:throw("empty_private_use", |
|||
segment_i) |
|||
end |
|||
-- Check length of all segments after "x". |
|||
for i = segment_i + 1, #segments do |
|||
local length = #segments[i] |
|||
if not (1 <= length and length <= 8) then |
|||
return parsed_subtags |
|||
:throw("invalid_private_use", segment_i) |
|||
end |
|||
end |
|||
if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag. |
|||
parsed_subtags.private_use = segments[segment_i + 1] |
|||
else |
|||
parsed_subtags.private_use = {} |
|||
for i = segment_i + 1, #segments do |
|||
table.insert(parsed_subtags.private_use, segments[i]) |
|||
end |
|||
end |
|||
end |
|||
return parsed_subtags:remove_unnecessary_fields() |
|||
end |
|||
local lang_name_table = mw.loadData "Module:Language/name/data" |
|||
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms" |
|||
local lang_data = mw.loadData "Module:Lang/data" |
|||
function p.validate_lang_tag(parsed_subtags) |
|||
-- Already checked that the tag starts with a language subtag or a private-use subtag. |
|||
-- Script code is initially capitalized, region code is uppercase, |
|||
-- everything else is lowercase. |
|||
-- Check existence of language tag. |
|||
if parsed_subtags.language and |
|||
not (lang_data.override[parsed_subtags.language] |
|||
or lang_name_table.lang[parsed_subtags.language]) then |
|||
mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag()) |
|||
end |
|||
-- Check existence of script tag. |
|||
if parsed_subtags.script then |
|||
local lower_script = parsed_subtags.script:lower() |
|||
if not lang_name_table.script[lower_script] then |
|||
mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag()) |
|||
end |
|||
-- Check that script tag is not marked as superfluous (because the |
|||
-- script is considered the default one for the language). |
|||
if lang_name_table.suppressed[lower_script] |
|||
and parsed_subtags.language |
|||
and m_table.inArray( |
|||
lang_name_table.suppressed[lower_script], |
|||
parsed_subtags.language:lower()) then |
|||
mw.log(parsed_subtags.script, "is suppressed with", |
|||
parsed_subtags.language, "in", parsed_subtags:get_tag()) |
|||
end |
|||
end |
|||
-- Check existence of region code. |
|||
if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then |
|||
mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag()) |
|||
end |
|||
-- Check that variant code is valid, and that it can validly be used with the |
|||
-- given combination of language, script, region, and variant. |
|||
-- Check for duplicate variant subtags? |
|||
if parsed_subtags.variant then |
|||
local lower_tag = parsed_subtags:get_tag():lower() |
|||
for _, variant in ipairs(type(parsed_subtags.variant) == "table" |
|||
and parsed_subtags.variant or { parsed_subtags.variant }) do |
|||
if not lang_name_table.variant[variant] then |
|||
mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag()) |
|||
else |
|||
local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant) |
|||
-- Check that at least one of the prefixes is found at the |
|||
-- beginning of lower_tag. |
|||
if not fun.some(function (prefix) |
|||
return lower_tag:find(prefix, 1, true) == 1 |
|||
end, |
|||
lang_name_table.variant[variant].prefixes) then |
|||
mw.log("Variant tag", variant, "does not belong with prefix", |
|||
prefix, "in", parsed_subtags:get_tag()) |
|||
end |
|||
end |
|||
end |
|||
end |
|||
-- Check that the private-use subtag is actually used by Wikipedia. |
|||
if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then |
|||
mw.log("Invalid private-use subtag in", parsed_subtags:get_tag()) |
|||
end |
|||
end |
end |
||
Revision as of 05:11, 8 July 2018
local p = {}
local Unicode_data = require "Module:Unicode data/sandbox"
-- Raises an error whose message is built with string.format.
-- Two call shapes are supported:
--   errorf(level, format, ...)  -- "level" is a number, as for error()
--   errorf(format, ...)         -- error is attributed to errorf's caller
-- The numeric level is increased by one so that it is counted from the
-- caller of errorf rather than from errorf itself.
local function errorf(level, ...)
	-- type() returns a string, so the comparison must be against "number".
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end
-- Formatted logging: the arguments are passed to string.format and the
-- resulting message is handed to mw.log.
-- Note that this stores a new field on the "mw" library table.
function mw.logf(...)
	local message = string.format(...)
	return mw.log(message)
end
-- Method table (and metatable) for a small append-only string buffer.
-- A buffer keeps its strings at the integer keys 1..n and the number of
-- stored strings in the field "n".
local output_mt = {}
output_mt.__index = output_mt

-- Appends "str" as the next item of the buffer.
function output_mt.insert(self, str)
	local position = self.n + 1
	self.n = position
	self[position] = str
end

-- also in [[Module:Unicode data/documentation functions]]
-- Appends the result of string.format applied to the arguments.
function output_mt.insert_format(self, ...)
	self:insert(string.format(...))
end

-- Joining is table.concat itself, so buffer:join(separator) concatenates
-- the stored strings.
output_mt.join = table.concat

-- Creates a new, empty buffer.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end
-- A bracketed character set, written as a string: a sequence of
-- "first-last" ranges and single characters, spelled out as UTF-8 byte
-- escapes or literal characters. p.show strips the brackets and expands
-- each hyphenated range into the characters it contains.
local Latn_pattern = "["
	.. "\n\32-\127"
	.. "\194\160-\194\172"
	.. "\195\128-\195\191"
	.. "\196\128-\197\191"
	.. "\198\128-\201\143"
	.. "\225\184\128-\225\187\191"
	.. "\226\177\160-\226\177\191"
	.. "\234\156\160-\234\159\191"
	.. "\234\172\176-\234\173\175"
	.. "\239\172\128-\239\172\134"
	.. "\239\188\129-\239\188\188"
	.. "–"
	.. "—"
	.. "«" .. "»"
	.. "]"
local get_codepoint = mw.ustring.codepoint

-- Returns a string made of every character whose codepoint lies between
-- the codepoint of "start" and the codepoint of "ending", inclusive, in
-- ascending order. Returns nil if the codepoint of "ending" is lower than
-- that of "start".
local function expand_range(start, ending)
	local first, last = get_codepoint(start), get_codepoint(ending)
	if last < first then
		return nil
	end

	local chars = {}
	for codepoint = first, last do
		-- Position in the array is the offset from the first codepoint.
		chars[codepoint - first + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(chars)
end
local fun = require "Module:Fun"
local m_table = require "Module:Table"
-- Metatable for counter tables: reading a missing key stores 0 under that
-- key and returns 0, so counts can be incremented without initialization.
-- The metatable is its own metatable, which makes it callable:
-- script_to_count_mt() returns a new empty counter table.
local script_to_count_mt = {}

function script_to_count_mt.__index(self, key)
	self[key] = 0
	return 0
end

function script_to_count_mt.__call(self, ...)
	return setmetatable({}, self)
end

setmetatable(script_to_count_mt, script_to_count_mt)
-- Counts how many of the generated codepoints belong to each script and
-- returns the result as a string such as "Latn (52), Zyyy (33)".
-- Takes a generic-for triplet: an iterator (such as mw.ustring.gcodepoint)
-- that generates a codepoint each time it is called, with an optional state
-- and initial value.
-- The entries are ordered with a comparator that puts higher counts first.
local function show_scripts(iterator, state, value)
	local script_to_count = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		script_to_count[script] = script_to_count[script] + 1
	end

	local function higher_count_first(script1, script2)
		return script_to_count[script1] > script_to_count[script2]
	end

	local function format_entry(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local entries = fun.mapIter(
		format_entry,
		m_table.sortedPairs(script_to_count, higher_count_first))
	return table.concat(entries, ", ")
end
-- Groups the generated codepoints by script.
-- Takes a generic-for triplet (iterator, state, initial value) that
-- generates codepoints, and returns a table mapping each script, as returned
-- by Unicode_data.lookup_script, to a set of codepoints
-- (codepoint -> true).
local function get_chars_in_scripts(iterator, state, value)
	local chars_by_script = {}
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		local char_set = chars_by_script[script]
		if not char_set then
			char_set = {}
			chars_by_script[script] = char_set
		end
		char_set[codepoint] = true
	end
	return chars_by_script
end
-- Renders a map from script to codepoint set (as produced by
-- get_chars_in_scripts) as text, one entry per script.
-- "format" is a string.format pattern that receives the script and the
-- nowiki-escaped characters of that script; it defaults to "%s: %s".
-- "separator" is placed between the entries; it defaults to a newline.
-- Both levels are traversed with m_table.sortedPairs.
local function print_char_set_map(script_to_char_set, format, separator)
	format = format or "%s: %s"
	separator = separator or "\n"

	local function codepoint_to_char(_, codepoint)
		return mw.ustring.char(codepoint)
	end

	local function format_script(char_set, script)
		local chars = fun.mapIter(
			codepoint_to_char,
			m_table.sortedPairs(char_set))
		return format:format(script, mw.text.nowiki(table.concat(chars)))
	end

	local entries = fun.mapIter(
		format_script,
		m_table.sortedPairs(script_to_char_set))
	return table.concat(entries, separator)
end
-- Expands Latn_pattern into the characters it covers and returns wikitext
-- containing those characters followed by the per-script counts from
-- show_scripts. "frame" is not used.
function p.show(frame)
	-- Drop the square brackets that enclose the character set.
	local without_brackets = Latn_pattern:gsub("%[(.-)%]", "%1")

	-- Find two UTF-8-encoded characters separated by hyphen-minus and
	-- replace them with the full range of characters between them.
	local expanded_pattern = without_brackets:gsub(
		"([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)",
		expand_range)

	-- Remove initial "\n " to avoid creating unwanted pre element.
	local displayed = expanded_pattern:gsub("^%s*", "")

	return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
		:format(
			displayed,
			show_scripts(mw.ustring.gcodepoint(expanded_pattern)))
end
-- Looks up the Unicode block named by the parameter "arg" of "args".
-- Returns whatever Unicode_data.get_block_info returns for that name.
-- Raises an error (via errorf) if the parameter is missing or if
-- get_block_info returns a false value for the name.
local function get_block_info_from_arg(args, arg)
	-- Read the parameter that was asked for, not always the first one.
	local block_name = args[arg]
		or errorf("Parameter %s is required", tostring(arg))
	local block_info = Unicode_data.get_block_info(block_name)
		or errorf("The block '%s' could not be found", block_name)
	return block_info
end
-- Interprets the parameter "arg" of "args" with Module:Yesno.
-- If the parameter is absent (nil or false), it is returned unchanged and
-- Module:Yesno is not loaded; otherwise the result of Module:Yesno for the
-- parameter's value is returned.
local function get_boolean_from_arg(args, arg)
	local value = args[arg]
	if not value then
		return value
	end
	local yesno = require "Module:Yesno"
	return (yesno(value))
end
-- Lists the scripts found in one Unicode block, with counts.
-- frame.args[1]: name of the block (required).
-- frame.args[2]: boolean-like value; if true, the result is prefixed with
-- the third field of the block info and a colon.
function p.scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	-- Fields 1 and 2 of the block info are the bounds passed to fun.range.
	local script_list = show_scripts(fun.range(block_info[1], block_info[2]))

	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end
-- Returns a wikilink for a block name. A name containing a space is linked
-- as is; any other name is linked to "<name> (Unicode block)" and displayed
-- as the bare name.
local function link_block_name(block_name)
	local link_format = "[[%s (Unicode block)|%s]]"
	if block_name:find " " then
		-- Only one placeholder here; string.format ignores the extra argument.
		link_format = "[[%s]]"
	end
	return link_format:format(block_name, block_name)
end
-- Builds a wikitable with one row per Unicode block, showing the block's
-- codepoint range and, for each script found in it, the number of
-- codepoints (with a sample of the characters in a tooltip).
-- frame.args[1]: optional hexadecimal codepoint (default 0); blocks that
-- start below it are skipped.
-- frame.args[2]: optional hexadecimal codepoint (default 0x4000); the
-- first block that starts above it ends the table.
function p.scripts_in_blocks(frame)
	local output = Output()
	
	local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
	local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000
	
	local script_data = mw.loadData "Module:Unicode data/scripts"
	local singles = script_data.singles
	local ranges = script_data.ranges
	
	-- Removes every key stored directly in "self", so that the same table
	-- (and its metatable) can be reused for the next block.
	local function clear (self)
		for _, key in ipairs(m_table.keysToList(self, false)) do
			self[key] = nil
		end
	end
	
	-- Map from script code to number of codepoints in the current block.
	local counts = {}
	setmetatable(counts, {
		__index = {
			increment = function(self, script_code, amount)
				self[script_code] = (self[script_code] or 0) + (amount or 1)
			end,
			clear = clear,
		}
	})
	
	-- Map from script code to an array of sample codepoints for the current
	-- block. Each array has an "n" field holding its length.
	local codepoints_per_script = {}
	setmetatable(codepoints_per_script, {
		__index = {
			add = function(self, script_code, codepoint)
				self[script_code] = self[script_code] or { n = 0 }
				-- Stop collecting after 0x20 entries, and never collect
				-- codepoints in U+0000-U+001F or U+0080-U+009F.
				if self[script_code].n <= 0x20
						and not (codepoint <= 0x9F and (codepoint >= 0x80
						or codepoint <= 0x1F)) then
					if self[script_code].n == 0x20 then
						-- The sample is full: end it with "...".
						local period = ("."):byte()
						for _ = 1, 3 do
							self[script_code].n = self[script_code].n + 1
							self[script_code][self[script_code].n] = period
						end
					else
						if script_code == "Zinh" then -- probably combining character
							-- Precede the character with U+25CC.
							self[script_code].n = self[script_code].n + 1
							self[script_code][self[script_code].n] = 0x25CC
						end
						self[script_code].n = self[script_code].n + 1
						self[script_code][self[script_code].n] = codepoint
					end
				end
			end,
			clear = clear,
		}
	})
	
	output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]
	
	-- ipairs, not pairs: the "break" below relies on the blocks being
	-- visited in array order, which pairs does not guarantee.
	for _, block in ipairs(mw.loadData "Module:Unicode data/blocks") do
		local codepoint = block[1]
		if codepoint > ending then break end
		if codepoint >= start then
			while codepoint <= block[2] do
				local script = singles[codepoint]
				local count
				if script then -- Codepoint is in "singles" map.
					counts:increment(script)
					codepoints_per_script:add(script, codepoint)
					codepoint = codepoint + 1
					count = 1 -- for potential future use
				else
					local range, index = Unicode_data.binary_range_search(codepoint, ranges)
					if range then -- Codepoint is in "ranges" array.
						count = 0
						script = range[3]
						while codepoint <= range[2] and codepoint <= block[2] do
							count = count + 1
							codepoints_per_script:add(script, codepoint)
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					else -- Codepoint doesn't have data; it's Zzzz.
						-- Get range immediately above codepoint. There is no
						-- such range when the codepoint lies above the last
						-- one, so stop at the end of the array instead of
						-- indexing past it.
						while ranges[index] and ranges[index][2] < codepoint do
							index = index + 1
						end
						local next_range = ranges[index]
						-- First codepoint that is no longer part of the gap.
						local gap_end = next_range and next_range[1] or math.huge
						count = 0
						script = "Zzzz"
						while codepoint < gap_end and codepoint <= block[2]
								and not singles[codepoint] do
							count = count + 1
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					end
				end
			end
			
			output:insert_format([[
|-
| %s
| U+%04X–U+%04X
| %s
]], link_block_name(block[3]), block[1], block[2],
				table.concat(
					-- mapIter, as in show_scripts: sortedPairs supplies an
					-- iterator, not a table.
					fun.mapIter(
						function (count, script)
							return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')
								:format(
									script_data.aliases[script], script,
									codepoints_per_script[script]
										and mw.text.nowiki(mw.ustring.char(
											unpack(codepoints_per_script[script])))
										or "",
									count)
						end,
						m_table.sortedPairs(
							counts,
							function (script1, script2)
								return counts[script1] > counts[script2]
							end)),
					", "))
		end
		
		-- mw.logObject(codepoints_per_script, block[3])
		
		counts:clear()
		codepoints_per_script:clear()
	end
	
	output:insert "|}"
	
	return output:join()
end
-- Lists the characters of one Unicode block, grouped by script.
-- frame.args[1]: name of the block (required).
-- frame.args[2]: boolean-like value; if true, the result is prefixed with
-- the third field of the block info and a colon.
function p.chars_in_scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local char_sets = get_chars_in_scripts(
		fun.range(block_info[1], block_info[2]))
	local script_char_set_map = print_char_set_map(char_sets)

	if not show_block_name then
		return script_char_set_map
	end
	return ("%s: %s"):format(block_info[3], script_char_set_map)
end
-- Collects the language codes used in {{lang|code|...}} and
-- {{lang-code|...}} templates in the wikitext of a page.
-- frame.args[1]: title of the page; defaults to "English language".
-- Returns the distinct codes joined by ", ". If a title object cannot be
-- made or the page has no content, a message is logged and nothing is
-- returned.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"
	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		mw.logf("Could not make title object for '%s'.", page_name)
		return
	end
	
	-- getContent returns nil when there is no page to read.
	local content = title_object:getContent()
	if not content then
		mw.logf("Could not get content of '%s'.", page_name)
		return
	end
	
	local language_codes = {}
	for lang_template in content:gmatch "{{lang[^}]+" do
		local template_name = lang_template:match("{{([^|}]+)")
		local language_code
		if template_name == "lang" then
			language_code = lang_template:match "{{lang|([^|}]+)"
		-- The hyphen must be escaped: a bare "-" after a character is a
		-- lazy repetition in Lua patterns, which made "g" optional and put
		-- "g-" into the captured code.
		elseif template_name:find "^lang%-" then
			language_code = lang_template:match "{{lang%-([^|}]+)"
		end
		if language_code then
			language_codes[language_code] = true
		end
	end
	
	return table.concat(m_table.keysToList(language_codes), ", ")
end
return p