Module:Unicode data/testcases
Appearance
This is the test cases page for the module Module:Unicode data. See the results of the test cases.
local p = require 'Module:UnitTests'
local Unicode_data = require 'Module:Unicode data'
local U = mw.ustring.char
--- Build a display label for a code point.
-- Non-printable code points yield just "U+XXXX". Printable ones yield
-- "U+XXXX: <character>", where the character is replaced by a hexadecimal
-- character reference if NFC normalization would alter it, and is preceded
-- by a dotted circle if it is a combining character.
-- @param codepoint number  Unicode code point
-- @return string
local function show(codepoint)
	if not Unicode_data.is_printable(codepoint) then
		return ("U+%04X"):format(codepoint)
	end

	local glyph = U(codepoint)
	if glyph ~= mw.ustring.toNFC(glyph) then
		-- The raw character does not survive normalization; emit an entity.
		glyph = ("&#x%X;"):format(codepoint)
	end

	-- Give combining characters a dotted-circle base to attach to.
	local base = Unicode_data.is_combining(codepoint) and "◌" or ""
	return ("U+%04X: %s"):format(codepoint, base .. glyph)
end
--- Label a code point as "<show(codepoint)> (<name>)", with the name taken
-- from Unicode_data.lookup_name.
-- @param codepoint number  Unicode code point
-- @return string
local function show_codepoint_and_name(codepoint)
	local label = show(codepoint)
	local name = Unicode_data.lookup_name(codepoint)
	return string.format("%s (%s)", label, name)
end
--- Test cases for Unicode_data.lookup_name.
-- Each case is { codepoint, expected name }.
function p:test_lookup_name()
	local cases = {
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>" },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFDD1, "<noncharacter-FDD1>" },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}

	-- Compare the looked-up name with the expected one, labelled by code point.
	local function check(tester, codepoint, name)
		tester:equals(show(codepoint), Unicode_data.lookup_name(codepoint), name)
	end

	self:iterate(cases, check)
end
--- Test cases for Unicode_data.is_combining.
-- Each case is { codepoint, expected boolean }.
function p:test_is_combining()
	local cases = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		local actual = Unicode_data.is_combining(codepoint)
		tester:equals(label, actual, expected)
	end

	self:iterate(cases, check)
end
--- Test cases for Unicode_data.lookup_script.
-- Each case is { codepoint, expected script code }.
function p:test_lookup_script()
	local cases = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		local actual = Unicode_data.lookup_script(codepoint)
		tester:equals(label, actual, expected)
	end

	self:iterate(cases, check)
end
local fun = require "Module:Fun"
local m_table = require "Module:Table"
-- Metatable for tally tables that map a key to a count.
-- It is also its own metatable, so calling script_to_count_mt() produces a
-- fresh, empty tally table.
local script_to_count_mt = {}

-- A key that has not been seen yet reads as 0, and that 0 is stored in the
-- table so that later reads are plain lookups.
function script_to_count_mt.__index(tally, key)
	tally[key] = 0
	return 0
end

-- Calling the metatable creates a new table that uses it as metatable.
function script_to_count_mt.__call(mt)
	return setmetatable({}, mt)
end

setmetatable(script_to_count_mt, script_to_count_mt)
-- Memo of already rendered script summaries, keyed by the input string.
local script_counts = {}

--- Summarize which scripts the code points of a string belong to.
-- The result looks like "Latn (12), Zyyy (3)": one "<script> (<count>)" entry
-- per script returned by Unicode_data.lookup_script, ordered by descending
-- count and joined with ", ". Results are memoized per input string.
-- @param str string
-- @return string
local function show_scripts(str)
	local cached = script_counts[str]
	if cached then
		return cached
	end

	-- Tally one hit per code point under its script code.
	local tally = script_to_count_mt()
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = Unicode_data.lookup_script(codepoint)
		tally[script] = tally[script] + 1
	end

	-- Key comparator: scripts with more code points come first.
	local function by_descending_count(script1, script2)
		return tally[script1] > tally[script2]
	end

	-- Receives the count first and the script code second.
	local function format_entry(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local entries = fun.mapIter(
		format_entry,
		m_table.sortedPairs(tally, by_descending_count))
	local printed = table.concat(entries, ", ")

	script_counts[str] = printed
	return printed
end
-- Shared examples for the get_best_script and is_Latin test cases.
-- Table entries have the form { text, expected best script }, where nil means
-- that no script is expected; the is_Latin test also accepts an optional third
-- field that overrides its expected result (no entry currently uses it).
-- Bare string entries ("Halkomelem", "Quotes") are handed to iterate as they
-- are; presumably Module:UnitTests shows them as headings (TODO confirm).
local script_examples = {
-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
-- characters as Latn.
-- This particular example only has characters below U+0340, so
-- lookup_script doesn't have to be called.
{ "%!?́", nil },
{ "’ʼ“”†‡•‰′‽⁕", nil },
-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
"Halkomelem",
{ "lá:yelhp", "Latn" },
{ "xʷməθkʷəy̓əm", nil }, -- contains one Greek (Grek) character: θ
{ "hən̓q̓əmin̓əm̓", "Latn" },
"Quotes",
{ "col·legi", "Latn" },
-- Source: [[s:it:Divina Commedia/Inferno/Canto I]]
{
[[Tant’è amara che poco è più morte;
ma per trattar del ben ch’i’ vi trovai,
dirò de l’altre cose ch’i’ v’ ho scorte.]],
"Latn"
},
{ -- A blessing in Navajo:
-- Source: [[User talk:Stephen G. Brown/text8]]
[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł.
Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní
bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo
nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
"Latn"
},
{ -- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
-- breves added to mark the length of the monophthongs α, ι, υ:
[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος
οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε,
πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν
ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν
οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
"Grek"
},
{ -- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
[[Вот если вы не согласитесь с этим последним тезисом и
ответите: «Не так» или «не всегда так», то я, пожалуй, и
ободрюсь духом насчет значения героя моего Алексея
Федоровича. Ибо не только чудак «не всегда» частность и
обособление, а напротив, бывает так, что он-то, пожалуй,
и носит в себе иной раз сердцевину целого, а остальные
люди его эпохи — все, каким-нибудь наплывным ветром,
на время почему-то от него оторвались…]],
"Cyrl"
},
{ -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् ।
होतारं रत्नधातमम् ॥१॥
अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत ।
स देवाँ एह वक्षति ॥२॥
अग्निना रयिमश्नवत् पोषमेव दिवेदिवे ।
यशसं वीरवत्तमम् ॥३॥
अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि ।
स इद्देवेषु गच्छति ॥४॥
अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः ।
देवो देवेभिरा गमत् ॥५॥
यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि ।
तवेत् तत् सत्यमङ्गिरः ॥६॥
उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् ।
नमो भरन्त एमसि ॥७॥
राजन्तमध्वराणां गोपामृतस्य दीदिविम् ।
वर्धमानं स्वे दमे ॥८॥
स नः पितेव सूनवेऽग्ने सूपायनो भव ।
सचस्वा नः स्वस्तये ॥९॥]],
"Deva"
},
}
-- Memo: example string -> true/false, whether its last character is
-- punctuation. An explicit boolean is stored so that negative results are
-- cached as well (assigning nil would leave the entry absent and force the
-- check to be repeated on every call).
local ends_in_punctuation = {}

--- Build the label shown for a script example: the example text with
-- newlines turned into <br>, followed by a separator and the per-script
-- counts from show_scripts.
-- @param script_example string
-- @return string
local function show_script_example(script_example)
	local is_punctuated = ends_in_punctuation[script_example]
	if is_punctuated == nil then
		-- Could use Unicode_data.lookup_category, but that is more
		-- memory-intensive.
		local last_char = mw.ustring.sub(script_example, -1)
		is_punctuated = mw.ustring.match(last_char, "%p") ~= nil
		ends_in_punctuation[script_example] = is_punctuated
	end

	-- If the last character is punctuation, place the script counts on
	-- their own line.
	local separator = is_punctuated and "<br>• " or ": "

	-- gsub also returns a replacement count; only the string is wanted.
	local text = script_example:gsub('\n', '<br>')
	return text .. separator .. show_scripts(script_example)
end
--- Test cases for Unicode_data.get_best_script, run over script_examples.
function p:test_get_best_script()
	local function check(tester, str, expected)
		local label = show_script_example(str)
		local actual = Unicode_data.get_best_script(str)
		tester:equals(label, actual, expected)
	end

	self:iterate(script_examples, check)
end
--- Test cases for Unicode_data.is_Latin, run over script_examples.
-- The expected value is the example's explicit is_Latin field when that is
-- truthy; otherwise it is whether the example's best script is "Latn".
function p:test_is_Latin()
	local function check(tester, str, best_script, is_Latin)
		local label = show_script_example(str)
		local actual = Unicode_data.is_Latin(str)
		local expected = is_Latin or best_script == "Latn"
		tester:equals(label, actual, expected)
	end

	self:iterate(script_examples, check)
end
--- Test cases for Unicode_data.lookup_block.
-- Each case is { codepoint, expected block name }.
function p:test_lookup_block()
	local cases = {
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "No Block" },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	local function check(tester, codepoint, block_name)
		local label = show(codepoint)
		local actual = Unicode_data.lookup_block(codepoint)
		tester:equals(label, actual, block_name)
	end

	self:iterate(cases, check)
end
--- Test cases for Unicode_data.is_rtl.
-- Each case is { text, expected boolean }; the text itself is the label.
function p:test_is_rtl()
	local cases = {
		{ "ابج abc", false },
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true },
		{ "%$!^&", false },
	}

	local function check(tester, str, expected)
		local actual = Unicode_data.is_rtl(str)
		tester:equals(str, actual, expected)
	end

	self:iterate(cases, check)
end
-- Change function names into more readable headers for the testcases tables:
-- every string key of the form "test_<name>" is re-registered as
-- "testcases for <code><name></code>"; other keys are left alone.
-- The table helper already loaded above as m_table is reused here instead of
-- requiring the same module a second time under a differently capitalized
-- title.
-- NOTE(review): p is modified inside the loop; this relies on sortedPairs
-- collecting the keys before iteration starts — confirm against Module:Table.
for k, v in m_table.sortedPairs(p) do
	if type(k) == "string" then
		-- gsub also returns a match count; only the new key is needed.
		local new_k = k:gsub("^test_(.+)$", "testcases for <code>%1</code>")
		if new_k ~= k then
			p[k] = nil
			p[new_k] = v
		end
	end
end

return p