Module:Unicode data/category/doc: Difference between revisions
Appearance
Content deleted Content added
note |
created local module to generate |
||
Line 1: | Line 1: | ||
<!-- Please place categories where indicated at the bottom of this page and interwikis at Wikidata (see [[Wikipedia:Wikidata]]) --> |
<!-- Please place categories where indicated at the bottom of this page and interwikis at Wikidata (see [[Wikipedia:Wikidata]]) --> |
||
Unicode [[General Category]] data |
Unicode [[General Category]] data generated by [[Module:Unicode data/category/make]] from [https://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt DerivedGeneralCategory.txt] in the Unicode Character Database. The category Cn (Unassigned) is omitted because it is the default for characters not assigned to another category. |
||
The data was generated by the two Lua 5.3 scripts below. [http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html LPeg] is required. If the two scripts are in the same folder as <code>DerivedGeneralCategory.txt</code>, open a command line in that folder and run <code>lua print_data.lua</code> to write the data to <code>data.lua</code>. |
|||
{{collapse top|title=Lua 5.3 scripts}} |
|||
; parse_data.lua |
|||
<source lang="lua"> |
|||
-- Read the whole of DerivedGeneralCategory.txt (expected in the current
-- working directory) into a single string.
-- The mode has to be passed inside io.open's own parentheses: written as
-- `assert(io.open 'file', 'r')`, the 'r' becomes assert's message argument,
-- so a failed open would report just "r" instead of the real reason.
local f = assert(io.open('DerivedGeneralCategory.txt', 'r'))
local Derived_General_Category = f:read 'a'
f:close()
|||
local lpeg = require 'lpeg'

-- Copy every lpeg entry whose name starts with a character that is unchanged
-- by upper-casing (P, V, C, Cmt, R, S, ...) into the environment, so the
-- grammar can use those names without the `lpeg.` prefix.
for name, value in pairs(lpeg) do
	local initial = type(name) == 'string' and name:sub(1, 1)
	if initial and initial == initial:upper() then
		_ENV[name] = value
	end
end
|||
-- Parsed result: `singles` maps a code point to its category, `ranges` is a
-- list of { low, high, category } triples.
local General_Category_data = { singles = {}, ranges = {} }

--- Match-time callback that records one parsed data line.
-- Receives the subject string, the current position, and the captures:
-- either (low, high, category) for "XXXX..XXXX ; gc" or
-- (codepoint, category) for "XXXX ; gc". Entries whose category is Cn are
-- dropped. Always returns `pos` unchanged.
local function process_match(str, pos, ...)
	local first, second, third = ...

	if third then
		-- Three captures: a range of code points.
		if third ~= 'Cn' then
			local ranges = General_Category_data.ranges
			ranges[#ranges + 1] = { tonumber(first, 16), tonumber(second, 16), third }
		end
	elseif second ~= 'Cn' then
		-- Two captures: a single code point.
		General_Category_data.singles[tonumber(first, 16)] = second
	end

	return pos
end
|||
-- Grammar for the data file, processed one line at a time.
-- A data line is only recognised at the very start of a line; every other
-- line (comments, blank lines) is skipped whole. Without this anchoring,
-- data-like text inside a comment (for example a "# @missing: 0000..10FFFF; ..."
-- directive) would be recorded as if it were real data.
local patt = P {
	(V 'line' + V 'skip')^0,

	-- "XXXX..XXXX ; gc ..." or "XXXX ; gc ...", followed by the line end.
	-- process_match receives the two or three captures.
	line = Cmt((V 'range' + C(V 'codepoint')) * V 'white' * P ';' * V 'white' * C(V 'gc') * V 'rest',
		process_match) * (V 'nl' + -1),

	-- Any non-data line: either a full line with its newline, or a final
	-- unterminated line (which must be non-empty so the loop terminates).
	skip = V 'rest' * V 'nl' + (1 - V 'nl')^1,

	range = C(V 'codepoint') * P '..' * C(V 'codepoint'),

	-- Four to six uppercase hexadecimal digits.
	codepoint = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-2,

	-- Two-letter General Category abbreviation such as Lu or Cn.
	gc = R 'AZ' * R 'az',

	hex = R("09", "AF"),

	white = S ' \t'^0,

	-- Everything up to (not including) the next newline.
	rest = (1 - V 'nl')^0,

	nl = P '\r'^-1 * P '\n',
}

-- Matching is done for its side effects: process_match fills in
-- General_Category_data as lines are recognised.
patt:match(Derived_General_Category)

return General_Category_data
|||
</source> |
|||
; print_data.lua |
|||
<source lang="lua"> |
|||
-- Run the parser script and keep the table it returns. The file name must
-- match the name the parser script is saved under (parse_data.lua).
local data_filename = [[parse_data.lua]]
local data = dofile(data_filename)

-- Open the file that will receive the generated Lua table; assert surfaces
-- the reason if it cannot be opened for writing.
local output_filename = [[data.lua]]
local output = assert(io.open(output_filename, 'w'))
|||
--- Format the arguments with string.format and append the result to the
-- output file.
local function writef(...)
	local formatted = string.format(...)
	output:write(formatted)
end
|||
writef [[
return {
	singles = {
]]

-- Check that maximum "singles" codepoint is less than 0x100000?
-- Emit the single code points in ascending order. The keys are collected and
-- sorted with the standard library so that no extra helper module is needed.
-- NOTE(review): this assumes the previously used `require 't'.spairs` was a
-- sorted-key iterator; confirm if a different order was intended.
local single_codepoints = {}
for codepoint in pairs(data.singles) do
	single_codepoints[#single_codepoints + 1] = codepoint
end
table.sort(single_codepoints)

for _, codepoint in ipairs(single_codepoints) do
	writef('\t\t[0x%05X] = "%s",\n', codepoint, data.singles[codepoint])
end

writef [[
	},
	ranges = {
]]

-- Order ranges by their first code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

table.sort(data.ranges, compare_ranges)

for _, range in ipairs(data.ranges) do
	-- Each range is { low, high, category }.
	writef('\t\t{ 0x%06X, 0x%06X, "%s" },\n', table.unpack(range))
end

writef [[
	},
}]]

-- Close explicitly so buffered data is flushed and any write error is reported.
assert(output:close())
|||
</source> |
|||
{{collapse bottom}} |
|||
<includeonly>{{#ifeq:{{SUBPAGENAME}}|sandbox | | |
<includeonly>{{#ifeq:{{SUBPAGENAME}}|sandbox | | |
Revision as of 19:19, 16 September 2019
Unicode General Category data generated by Module:Unicode data/category/make from DerivedGeneralCategory.txt in the Unicode Character Database. The category Cn (Unassigned) is omitted because it is the default for characters not assigned to another category.