Module:Hepburner: Difference between revisions
DarmaniLink (talk | contribs) no long i, oops! |
DarmaniLink (talk | contribs) updated to add kana support |
||
Line 1: | Line 1: | ||
require('strict'); |
require('strict'); |
||
local utf8 = require("Module:Unicode data") |
|||
-- Converts romanji input to modified hepburn, I recommend subst:ing |
|||
-- Converts romanji kana to modified hepburn, I recommend subst:ing |
|||
-- standard long vowel patterns |
-- standard long vowel patterns |
||
Line 20: | Line 21: | ||
["OU"] = "Ō" |
["OU"] = "Ō" |
||
} |
} |
||
local function romanjiToHepburn(romanji) |
|||
for target, replacement in pairs(diacritics) do |
|||
local p = {} |
|||
romanji = romanji:gsub(target, replacement) |
|||
--input: 1:1 transliterated romanji |
|||
function p.toHepburn(frame) |
|||
local romanji = frame.args[1] |
|||
for target, replacement in pairs(diacritics) do |
|||
romanji = romanji:gsub(target, replacement) |
|||
end |
end |
||
return romanji |
return romanji |
||
end |
end |
||
--map is made local so it wont get cached every single time this is ran |
|||
local function kanaToHepburn(kana) |
|||
local romanji = "" |
|||
local kanaMap = { |
|||
["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o", |
|||
["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko", |
|||
["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so", |
|||
["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to", |
|||
["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no", |
|||
["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho", |
|||
["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo", |
|||
["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo", --["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo", |
|||
["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro", |
|||
["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo", |
|||
["ん"] = "n", |
|||
["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go", |
|||
["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo", |
|||
["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "zu", ["で"] = "de", ["ど"] = "do", |
|||
["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo", |
|||
["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po", |
|||
["ゔ"] = "vu" |
|||
} |
|||
local smallKanaMap = { |
|||
["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o", |
|||
["ヵ"] = "ka",["ヶ"] = "ke", |
|||
["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo" |
|||
} |
|||
-- iterate over each kana |
|||
for character in mw.ustring.gcodepoint(kana) do |
|||
local char = mw.ustring.char(character) |
|||
local romanization = kanaMap[char] |
|||
local smallRomanization |
|||
-- short circuit eval, rare scenario |
|||
if not romanization then |
|||
smallRomanization = smallKanaMap[char] |
|||
end |
|||
if romanization then |
|||
romanji = romanji .. romanization |
|||
elseif smallRomanization then |
|||
romanji = romanji:sub(1, -2) .. smallKanaMap[char] -- remove vowel, append small vowel or other letters |
|||
else -- this will cause issues if someone tries something like あぁぁぁぁぁ => a |
|||
romanji = romanji .. char -- special rule for vowels maybe? will make this more expensive |
|||
end |
|||
end |
|||
-- handle xtsu/っ |
|||
-- Replace "っ" with the next consonant |
|||
for i = 1, mw.ustring.len(romanji) do |
|||
local chr = mw.ustring.sub(romanji, i, i) |
|||
if chr == "っ" then |
|||
local nextChar = mw.ustring.sub(romanji, i + 1, i + 1) |
|||
if nextChar and not nextChar:match("[aeiou]") then |
|||
romanji = mw.ustring.sub(romanji, 1, i-1) .. nextChar .. mw.ustring.sub(romanji, i + 1) -- surely there's a better way, right? |
|||
end |
|||
end |
|||
end |
|||
return romanjiToHepburn(romanji) -- kana is converted to romanji, now change it to hepburn |
|||
end -- TODO: add a flag to disable this, and return the normal romanji without the diacritics |
|||
-- checking for kana will need to check these bounds regardless |
|||
-- might as well convert at the same time |
|||
-- would it be better to have the kana conversion in the above function? |
|||
local function checkForKanaPresentAndConvert(data) |
|||
local kanaFound = false |
|||
local convertedString = "" |
|||
local kanaDelta = (mw.ustring.codepoint("ァ") - mw.ustring.codepoint("ぁ")) -- difference in the unicode table |
|||
local hiraganaLowerBound = mw.ustring.codepoint("ぁ") -- I know magic numbers are bad but it almost seems more worth |
|||
local hiraganaUpperBound = mw.ustring.codepoint("ゖ") -- it to use them in this context |
|||
local katakanaLowerBound = mw.ustring.codepoint("ァ") -- this is really expensive |
|||
local katakanaUpperBound = mw.ustring.codepoint("ヶ") |
|||
for c in mw.ustring.gcodepoint(data) do |
|||
if c<=127 then |
|||
--short circuit eval for ascii |
|||
elseif (hiraganaLowerBound <= c and c <= hiraganaUpperBound) then |
|||
kanaFound = true |
|||
elseif (katakanaLowerBound <= c and c <= katakanaUpperBound) then |
|||
kanaFound = true |
|||
c = c - kanaDelta -- convert it to hiragana codewise so i dont have to remake the lookup table for katakana |
|||
end |
|||
convertedString = convertedString .. mw.ustring.char(c) |
|||
end |
|||
return {kanaFound, convertedString} |
|||
end |
|||
local function toHepburnKana(data) |
|||
local processedData |
|||
if not data then -- short circuit |
|||
return |
|||
end |
|||
processedData = checkForKanaPresentAndConvert(data) |
|||
if processedData[1] then |
|||
return kanaToHepburn(processedData[2]) |
|||
else |
|||
return romanjiToHepburn(data) |
|||
end |
|||
end |
|||
local p = {} |
|||
--TODO add a performant way to detect if there is kana in a string |
|||
--TODO differentiate accordingly |
|||
--katakana=>hiragana? Can I just iterate over each and add a magic number like you can with ascii? |
|||
--this could be expanded to use bopomofo too |
|||
function p.toHepburn(frame) |
|||
local data = frame.args[1] |
|||
return toHepburnKana(data) |
|||
end |
|||
function p.toHepburnTEST(data) |
|||
return toHepburnKana(data) |
|||
end |
|||
return p |
return p |
Revision as of 19:03, 7 March 2024
Romanizes double vowels per the standard outlined in Hepburn Romanization.
Implemented in Template:Hepburn — please use that template instead of invoking this module directly. The template enforces subst: for this *very* costly module.
Any double vowels will get converted to a long vowel, ou will get converted to ō.
{{#invoke:Hepburner|toHepburn|Kinou}} => Kinō
{{#invoke:Hepburner|toHepburn|Ooki}} => Ōki
{{#invoke:Hepburner|toHepburn|kara-age}} => kara-age
{{#invoke:Hepburner|toHepburn|sakkaa}} => sakkā
{{#invoke:Hepburner|toHepburn|raamen}} => rāmen
{{#invoke:Hepburner|toHepburn|ヴィデオ}} => video
{{#invoke:Hepburner|toHepburn|いこう}} => ikō
{{#invoke:Hepburner|toHepburn|やった}} => yatta
{{#invoke:Hepburner|toHepburn|いきましょう}} => ikimashyō
{{#invoke:Hepburner|toHepburn|ちゅうにびょう}} => chyūnibyō
{{#invoke:Hepburner|toHepburn|つづく}} => tsuzuku
note: mixed-case pairs such as oU or aA will break it. You should never, ever write these regardless, but note that this is a limitation. If you need such a pair, add it to the diacritics list, following the current pattern.
require('strict');
local utf8 = require("Module:Unicode data")
-- Converts romanji kana to modified hepburn, I recommend subst:ing
-- standard long vowel patterns
-- Long-vowel substitution table: each doubled-vowel sequence maps to its
-- macron form, in lowercase, Titlecase and UPPERCASE variants.
-- Note: there is intentionally no entry for a long i ("ii" stays "ii").
local diacritics = {
	["aa"] = "ā", ["Aa"] = "Ā", ["AA"] = "Ā",
	["uu"] = "ū", ["Uu"] = "Ū", ["UU"] = "Ū",
	["ee"] = "ē", ["Ee"] = "Ē", ["EE"] = "Ē",
	["oo"] = "ō", ["Oo"] = "Ō", ["OO"] = "Ō",
	["ou"] = "ō", ["Ou"] = "Ō", ["OU"] = "Ō"
}

-- Apply every long-vowel substitution to a romanji string.
-- Substitutions are applied in unspecified (pairs) order; inputs where two
-- patterns overlap (e.g. "oou") are therefore not guaranteed a fixed result.
-- @param romanji string: 1:1 transliterated romanji
-- @return string: romanji with Hepburn macron diacritics applied
local function romanjiToHepburn(romanji)
	local result = romanji
	for pattern, macron in pairs(diacritics) do
		result = result:gsub(pattern, macron)
	end
	return result
end
-- Converts a (pre-folded, hiragana-only) kana string to romanji and then
-- applies the long-vowel diacritic pass. The lookup maps are function
-- locals so they are not kept alive in module state between invocations.
-- @param kana string: hiragana input (katakana is folded to hiragana
--   upstream by checkForKanaPresentAndConvert)
-- @return string: Hepburn-romanized text
local function kanaToHepburn(kana)
local romanji = ""
-- base syllable table: one hiragana character => its romanji syllable
local kanaMap = {
["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o",
["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko",
["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so",
["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to",
["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no",
["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho",
["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo",
["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo", --["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo",
["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro",
["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo",
["ん"] = "n",
["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go",
["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo",
["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "zu", ["で"] = "de", ["ど"] = "do",
["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo",
["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po",
["ゔ"] = "vu"
}
-- small kana: these modify the previous syllable instead of adding one.
-- NOTE(review): the "ヵ"/"ヶ" keys here are katakana; input reaching this
-- function has already been folded to hiragana codepoints upstream, so
-- these two entries look unreachable — confirm against the caller.
local smallKanaMap = {
["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o",
["ヵ"] = "ka",["ヶ"] = "ke",
["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo"
}
-- iterate over each codepoint of the input
for character in mw.ustring.gcodepoint(kana) do
local char = mw.ustring.char(character)
local romanization = kanaMap[char]
local smallRomanization
-- only consult the small-kana table on a base-table miss (rare case)
if not romanization then
smallRomanization = smallKanaMap[char]
end
if romanization then
romanji = romanji .. romanization
elseif smallRomanization then
-- Drop the previous vowel, then append the small kana's reading,
-- e.g. "shi" + ゃ => "sh" + "ya" => "shya". sub() here is byte-based:
-- it strips exactly one byte, which is only correct when the previous
-- output character is ASCII — TODO confirm for pass-through input.
romanji = romanji:sub(1, -2) .. smallKanaMap[char] -- remove vowel, append small vowel or other letters
else
-- Unmapped characters pass through verbatim; this also keeps "っ" in
-- the buffer for the sokuon pass below. Degenerate inputs such as
-- あぁぁぁぁぁ collapse here (=> "a").
romanji = romanji .. char -- special rule for vowels maybe? will make this more expensive
end
end
-- Sokuon pass: replace "っ" by doubling the following consonant
-- (やった => yatta). A "っ" followed by a vowel is left in place (the
-- [aeiou] guard fails); a trailing "っ" is deleted, since the empty
-- lookahead passes the guard and the splice inserts nothing.
for i = 1, mw.ustring.len(romanji) do
local chr = mw.ustring.sub(romanji, i, i)
if chr == "っ" then
local nextChar = mw.ustring.sub(romanji, i + 1, i + 1)
if nextChar and not nextChar:match("[aeiou]") then
romanji = mw.ustring.sub(romanji, 1, i-1) .. nextChar .. mw.ustring.sub(romanji, i + 1) -- surely there's a better way, right?
end
end
end
return romanjiToHepburn(romanji) -- kana is now romanji; apply the long-vowel diacritic pass
end -- TODO: add a flag to disable the diacritic pass and return plain romanji
-- Scans a string for kana and, in the same pass, folds any katakana down
-- to the equivalent hiragana codepoint so that a single hiragana lookup
-- table suffices downstream. ASCII (codepoints <= 127) is skipped cheaply.
-- @param data string: mixed ASCII/kana input
-- @return table: {kanaFound (boolean), convertedString (string)}
local function checkForKanaPresentAndConvert(data)
	local kanaFound = false
	-- fixed offset between the katakana and hiragana Unicode blocks
	local kanaDelta = (mw.ustring.codepoint("ァ") - mw.ustring.codepoint("ぁ"))
	local hiraganaLowerBound = mw.ustring.codepoint("ぁ")
	local hiraganaUpperBound = mw.ustring.codepoint("ゖ")
	local katakanaLowerBound = mw.ustring.codepoint("ァ")
	local katakanaUpperBound = mw.ustring.codepoint("ヶ")
	-- Collect output characters in a buffer and join once at the end;
	-- repeated ".." concatenation inside the loop is O(n^2).
	local buffer = {}
	for c in mw.ustring.gcodepoint(data) do
		if c <= 127 then
			-- plain ASCII: pass through untouched (short-circuit)
		elseif hiraganaLowerBound <= c and c <= hiraganaUpperBound then
			kanaFound = true
		elseif katakanaLowerBound <= c and c <= katakanaUpperBound then
			kanaFound = true
			c = c - kanaDelta -- shift katakana down to its hiragana twin
		end
		buffer[#buffer + 1] = mw.ustring.char(c)
	end
	-- NOTE(review): the katakana long-vowel mark "ー" (U+30FC) lies outside
	-- the checked katakana range and passes through unconverted — confirm
	-- that is intended.
	return {kanaFound, table.concat(buffer)}
end
-- Dispatcher: detect whether the input contains kana and route it through
-- the kana converter, otherwise run the plain romanji diacritic pass.
-- @param data string|nil: raw input; nil yields nil (no work done)
-- @return string|nil: Hepburn-romanized text
local function toHepburnKana(data)
	if not data then
		-- nothing to convert
		return
	end
	local processed = checkForKanaPresentAndConvert(data)
	local hasKana, folded = processed[1], processed[2]
	if hasKana then
		return kanaToHepburn(folded)
	end
	return romanjiToHepburn(data)
end
local p = {}

-- TODO: a cheaper way to detect kana in a string, and differentiate
-- accordingly; the same approach could be extended to bopomofo.

-- Module entry point for {{#invoke:Hepburner|toHepburn|...}}.
-- Converts the first unnamed template argument.
function p.toHepburn(frame)
	return toHepburnKana(frame.args[1])
end

-- Test hook: identical conversion, callable with a plain string (no frame).
function p.toHepburnTEST(data)
	return toHepburnKana(data)
end

return p