Jump to content

Module:Unicode data/category/doc: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
Created page with '<!-- Please place categories where indicated at the bottom of this page and interwikis at Wikidata (see Wikipedia:Wikidata) --> == Usage == Unicode General...'
 
scripts
Line 1: Line 1:
<!-- Please place categories where indicated at the bottom of this page and interwikis at Wikidata (see [[Wikipedia:Wikidata]]) -->
<!-- Please place categories where indicated at the bottom of this page and interwikis at Wikidata (see [[Wikipedia:Wikidata]]) -->
== Usage ==
Unicode [[General Category]] data derived from [https://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt DerivedGeneralCategory.txt] in the Unicode Character Database.
Unicode [[General Category]] data derived from [https://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt DerivedGeneralCategory.txt] in the Unicode Character Database.

The data was generated by the two Lua 5.3 scripts below, which require [http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html LPeg]. Place both scripts in the same folder as <code>DerivedGeneralCategory.txt</code>, open a command line in that folder, and run <code>lua print_data.lua</code>; the generated data is written to <code>data.lua</code>.

{{collapse top|title=Lua 5.3 scripts}}
; parse_data.lua
<source lang="lua">
-- Read the whole of DerivedGeneralCategory.txt into memory.
-- The io.open call is written with explicit parentheses: with the
-- string-literal call form (io.open '...', 'r') the 'r' would become the
-- second argument of assert (its error message) rather than the open mode,
-- hiding the real reason when the file cannot be opened.
local f = assert(io.open('DerivedGeneralCategory.txt', 'r'))
local Derived_General_Category = f:read 'a'
f:close()

local lpeg = require 'lpeg'

-- Make every LPeg entry whose name begins with a capital letter available
-- as a bare name in this chunk's environment, so the grammar can be written
-- without an "lpeg." prefix.
for name, value in pairs(lpeg) do
	if type(name) == 'string' then
		local initial = name:sub(1, 1)
		if initial:upper() == initial then
			_ENV[name] = value
		end
	end
end

-- Accumulates the parsed data:
--   singles[codepoint] = category
--   ranges[i]          = { low, high, category }
local General_Category_data = { singles = {}, ranges = {} }

--- Match-time callback that records one data line.
-- Receives either three captures (low, high, category) for a line of the
-- form "XXXX..XXXX ; gc", or two captures (codepoint, category) for a line
-- of the form "XXXX ; gc". Entries whose category is 'Cn' are not stored.
-- @param str the subject string (unused)
-- @param pos the current position, returned unchanged
-- @param ... the captured strings described above
-- @return pos
local function process_match(str, pos, ...)
	local first, second, third = ...
	if third then
		-- Range line: first = low, second = high, third = category.
		if third ~= 'Cn' then
			local low, high = tonumber(first, 16), tonumber(second, 16)
			local ranges = General_Category_data.ranges
			ranges[#ranges + 1] = { low, high, third }
		end
	elseif second ~= 'Cn' then
		-- Single code point line: first = code point, second = category.
		General_Category_data.singles[tonumber(first, 16)] = second
	end
	return pos
end

-- LPeg grammar that scans the whole file. P, V, C, Cmt, R and S are the
-- LPeg functions exported into the environment above.
local patt = P {
-- Initial rule: at each position try to match a data line; if that fails,
-- skip a single character and try again.
(V 'line' + 1)^1,
-- A data line: a range or a single code point, ';', the category, then
-- everything up to the end of the line. The captures are handed to
-- process_match at match time.
line = Cmt((V 'range' + C(V 'codepoint')) * V 'white' * P ';' * V 'white' * C(V 'gc') * (1 - V 'nl')^0,
process_match),
-- "XXXX..XXXX": captures the low and high code points separately.
range = C(V 'codepoint') * P '..' * C(V 'codepoint'),
-- Four to six hexadecimal digits.
codepoint = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-2,
-- Category: an uppercase ASCII letter followed by any one character.
gc = R 'AZ' * P(1),
-- Uppercase hexadecimal digit.
hex = R("09", "AF"),
-- Zero or more spaces or tabs.
white = S ' \t'^0,
-- Line ending: LF, optionally preceded by CR.
nl = P '\r'^-1 * P '\n',
}

-- Run the grammar; the results are collected by process_match as a side
-- effect, so the return value of match is not used.
patt:match(Derived_General_Category)

return General_Category_data
</source>

; print_data.lua
<source lang="lua">
-- Name of the parser script that returns the table
-- { singles = {...}, ranges = {...} }. It must match the name under which
-- the parser is saved (parse_data.lua); the previous value, make_data.lua,
-- does not correspond to either of the two scripts.
local data_filename = [[parse_data.lua]]

-- Run the parser and keep the table it returns.
local data = dofile(data_filename)

-- File that receives the generated Lua table.
local output_filename = [[data.lua]]
local output = assert(io.open(output_filename, 'w'))

--- Write a formatted string to the output file.
-- @param ... a format string followed by its arguments, as for string.format
local function writef(...)
	output:write(string.format(...))
end

writef [[
return {
singles = {
]]

-- Emit the single code points in ascending order. pairs() visits keys in
-- an unspecified order, so the keys are collected and sorted first. This
-- uses only the standard library in place of the external helper module
-- 't' (its spairs is assumed to have iterated in ascending key order --
-- confirm if the exact output order matters).
local single_codepoints = {}
for codepoint in pairs(data.singles) do
	single_codepoints[#single_codepoints + 1] = codepoint
end
table.sort(single_codepoints)

-- %05X is a minimum width only: a code point of 0x100000 or above would
-- still be printed in full, just with six digits.
for _, codepoint in ipairs(single_codepoints) do
	writef('\t\t [0x%05X] = "%s",\n', codepoint, data.singles[codepoint])
end

writef [[
},
ranges = {
]]

--- Ordering function for table.sort: orders ranges by their first element.
-- @param range1 a { low, high, category } table
-- @param range2 a { low, high, category } table
-- @return true when range1 starts strictly before range2
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end

-- Emit the ranges sorted by their starting code point, one
-- { low, high, "category" } entry per line.
table.sort(data.ranges, compare_ranges)
for _, range in ipairs(data.ranges) do
	writef('\t\t{ 0x%06X, 0x%06X, "%s" },\n', table.unpack(range))
end

writef [[
},
}]]

-- Close the output file explicitly so that buffered data is flushed and any
-- error reported while closing is raised instead of being silently lost.
assert(output:close())
</source>
{{collapse bottom}}


<includeonly>{{#ifeq:{{SUBPAGENAME}}|sandbox | |
<includeonly>{{#ifeq:{{SUBPAGENAME}}|sandbox | |

Revision as of 19:47, 8 October 2018

Unicode General Category data derived from DerivedGeneralCategory.txt in the Unicode Character Database.

The data was generated by the two Lua 5.3 scripts below, which require LPeg. Place both scripts in the same folder as DerivedGeneralCategory.txt, open a command line in that folder, and run lua print_data.lua; the generated data is written to data.lua.

Lua 5.3 scripts
parse_data.lua
-- Read the whole of DerivedGeneralCategory.txt into memory.
-- The io.open call is written with explicit parentheses: with the
-- string-literal call form (io.open '...', 'r') the 'r' would become the
-- second argument of assert (its error message) rather than the open mode,
-- hiding the real reason when the file cannot be opened.
local f = assert(io.open('DerivedGeneralCategory.txt', 'r'))
local Derived_General_Category = f:read 'a'
f:close()

local lpeg = require 'lpeg'

-- Make every LPeg entry whose name begins with a capital letter available
-- as a bare name in this chunk's environment, so the grammar can be written
-- without an "lpeg." prefix.
for name, value in pairs(lpeg) do
	if type(name) == 'string' then
		local initial = name:sub(1, 1)
		if initial:upper() == initial then
			_ENV[name] = value
		end
	end
end

-- Accumulates the parsed data:
--   singles[codepoint] = category
--   ranges[i]          = { low, high, category }
local General_Category_data = { singles = {}, ranges = {} }

--- Match-time callback that records one data line.
-- Receives either three captures (low, high, category) for a line of the
-- form "XXXX..XXXX ; gc", or two captures (codepoint, category) for a line
-- of the form "XXXX ; gc". Entries whose category is 'Cn' are not stored.
-- @param str the subject string (unused)
-- @param pos the current position, returned unchanged
-- @param ... the captured strings described above
-- @return pos
local function process_match(str, pos, ...)
	local first, second, third = ...
	if third then
		-- Range line: first = low, second = high, third = category.
		if third ~= 'Cn' then
			local low, high = tonumber(first, 16), tonumber(second, 16)
			local ranges = General_Category_data.ranges
			ranges[#ranges + 1] = { low, high, third }
		end
	elseif second ~= 'Cn' then
		-- Single code point line: first = code point, second = category.
		General_Category_data.singles[tonumber(first, 16)] = second
	end
	return pos
end

-- LPeg grammar that scans the whole file. P, V, C, Cmt, R and S are the
-- LPeg functions exported into the environment above.
local patt = P {
	-- Initial rule: at each position try to match a data line; if that
	-- fails, skip a single character and try again.
	(V 'line' + 1)^1,
	-- A data line: a range or a single code point, ';', the category, then
	-- everything up to the end of the line. The captures are handed to
	-- process_match at match time.
	line = Cmt((V 'range' + C(V 'codepoint')) * V 'white' * P ';' * V 'white' * C(V 'gc') * (1 - V 'nl')^0,
		process_match),
	-- "XXXX..XXXX": captures the low and high code points separately.
	range = C(V 'codepoint') * P '..' * C(V 'codepoint'),
	-- Four to six hexadecimal digits.
	codepoint = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-2,
	-- Category: an uppercase ASCII letter followed by any one character.
	gc = R 'AZ' * P(1),
	-- Uppercase hexadecimal digit.
	hex = R("09", "AF"),
	-- Zero or more spaces or tabs.
	white = S ' \t'^0,
	-- Line ending: LF, optionally preceded by CR.
	nl = P '\r'^-1 * P '\n',
}

-- Run the grammar; the results are collected by process_match as a side
-- effect, so the return value of match is not used.
patt:match(Derived_General_Category)

return General_Category_data
print_data.lua
-- Name of the parser script that returns the table
-- { singles = {...}, ranges = {...} }. It must match the name under which
-- the parser is saved (parse_data.lua); the previous value, make_data.lua,
-- does not correspond to either of the two scripts.
local data_filename = [[parse_data.lua]]

-- Run the parser and keep the table it returns.
local data = dofile(data_filename)

-- File that receives the generated Lua table.
local output_filename = [[data.lua]]
local output = assert(io.open(output_filename, 'w'))

--- Write a formatted string to the output file.
-- @param ... a format string followed by its arguments, as for string.format
local function writef(...)
	output:write(string.format(...))
end

writef [[
return {
	singles = {
]]

-- Emit the single code points in ascending order. pairs() visits keys in
-- an unspecified order, so the keys are collected and sorted first. This
-- uses only the standard library in place of the external helper module
-- 't' (its spairs is assumed to have iterated in ascending key order --
-- confirm if the exact output order matters).
local single_codepoints = {}
for codepoint in pairs(data.singles) do
	single_codepoints[#single_codepoints + 1] = codepoint
end
table.sort(single_codepoints)

-- %05X is a minimum width only: a code point of 0x100000 or above would
-- still be printed in full, just with six digits.
for _, codepoint in ipairs(single_codepoints) do
	writef('\t\t [0x%05X] = "%s",\n', codepoint, data.singles[codepoint])
end

writef [[
	},
	ranges = {
]]

--- Ordering function for table.sort: orders ranges by their first element.
-- @param range1 a { low, high, category } table
-- @param range2 a { low, high, category } table
-- @return true when range1 starts strictly before range2
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end

-- Emit the ranges sorted by their starting code point, one
-- { low, high, "category" } entry per line.
table.sort(data.ranges, compare_ranges)
for _, range in ipairs(data.ranges) do
	writef('\t\t{ 0x%06X, 0x%06X, "%s" },\n', table.unpack(range))
end

writef [[
	},
}]]

-- Close the output file explicitly so that buffered data is flushed and any
-- error reported while closing is raised instead of being silently lost.
assert(output:close())