Module:Hepburner: Difference between revisions
DarmaniLink (talk | contribs) no long i, oops! |
DarmaniLink (talk | contribs) updated to add kana support |
||
Line 1: | Line 1: | ||
require('strict'); |
require('strict'); |
||
local utf8 = require("Module:Unicode data") |
|||
-- Converts romanji input to modified hepburn, I recommend subst:ing |
|||
-- Converts romanji kana to modified hepburn, I recommend subst:ing |
|||
-- standard long vowel patterns |
-- standard long vowel patterns |
||
Line 20: | Line 21: | ||
["OU"] = "Ō" |
["OU"] = "Ō" |
||
} |
} |
||
local function romanjiToHepburn(romanji) |
|||
for target, replacement in pairs(diacritics) do |
|||
local p = {} |
|||
romanji = romanji:gsub(target, replacement) |
|||
--input: 1:1 transliterated romanji |
|||
function p.toHepburn(frame) |
|||
local romanji = frame.args[1] |
|||
for target, replacement in pairs(diacritics) do |
|||
romanji = romanji:gsub(target, replacement) |
|||
end |
end |
||
return romanji |
return romanji |
||
end |
end |
||
--map is made local so it wont get cached every single time this is ran |
|||
local function kanaToHepburn(kana) |
|||
local romanji = "" |
|||
local kanaMap = { |
|||
["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o", |
|||
["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko", |
|||
["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so", |
|||
["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to", |
|||
["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no", |
|||
["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho", |
|||
["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo", |
|||
["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo", --["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo", |
|||
["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro", |
|||
["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo", |
|||
["ん"] = "n", |
|||
["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go", |
|||
["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo", |
|||
["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "zu", ["で"] = "de", ["ど"] = "do", |
|||
["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo", |
|||
["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po", |
|||
["ゔ"] = "vu" |
|||
} |
|||
local smallKanaMap = { |
|||
["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o", |
|||
["ヵ"] = "ka",["ヶ"] = "ke", |
|||
["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo" |
|||
} |
|||
-- iterate over each kana |
|||
for character in mw.ustring.gcodepoint(kana) do |
|||
local char = mw.ustring.char(character) |
|||
local romanization = kanaMap[char] |
|||
local smallRomanization |
|||
-- short circuit eval, rare scenario |
|||
if not romanization then |
|||
smallRomanization = smallKanaMap[char] |
|||
end |
|||
if romanization then |
|||
romanji = romanji .. romanization |
|||
elseif smallRomanization then |
|||
romanji = romanji:sub(1, -2) .. smallKanaMap[char] -- remove vowel, append small vowel or other letters |
|||
else -- this will cause issues if someone tries something like あぁぁぁぁぁ => a |
|||
romanji = romanji .. char -- special rule for vowels maybe? will make this more expensive |
|||
end |
|||
end |
|||
-- handle xtsu/っ |
|||
-- Replace "っ" with the next consonant |
|||
for i = 1, mw.ustring.len(romanji) do |
|||
local chr = mw.ustring.sub(romanji, i, i) |
|||
if chr == "っ" then |
|||
local nextChar = mw.ustring.sub(romanji, i + 1, i + 1) |
|||
if nextChar and not nextChar:match("[aeiou]") then |
|||
romanji = mw.ustring.sub(romanji, 1, i-1) .. nextChar .. mw.ustring.sub(romanji, i + 1) -- surely there's a better way, right? |
|||
end |
|||
end |
|||
end |
|||
return romanjiToHepburn(romanji) -- kana is converted to romanji, now change it to hepburn |
|||
end -- TODO: add a flag to disable this, and return the normal romanji without the diacritics |
|||
-- checking for kana will need to check these bounds regardless |
|||
-- might as well convert at the same time |
|||
-- would it be better to have the kana conversion in the above function? |
|||
local function checkForKanaPresentAndConvert(data) |
|||
local kanaFound = false |
|||
local convertedString = "" |
|||
local kanaDelta = (mw.ustring.codepoint("ァ") - mw.ustring.codepoint("ぁ")) -- difference in the unicode table |
|||
local hiraganaLowerBound = mw.ustring.codepoint("ぁ") -- I know magic numbers are bad but it almost seems more worth |
|||
local hiraganaUpperBound = mw.ustring.codepoint("ゖ") -- it to use them in this context |
|||
local katakanaLowerBound = mw.ustring.codepoint("ァ") -- this is really expensive |
|||
local katakanaUpperBound = mw.ustring.codepoint("ヶ") |
|||
for c in mw.ustring.gcodepoint(data) do |
|||
if c<=127 then |
|||
--short circuit eval for ascii |
|||
elseif (hiraganaLowerBound <= c and c <= hiraganaUpperBound) then |
|||
kanaFound = true |
|||
elseif (katakanaLowerBound <= c and c <= katakanaUpperBound) then |
|||
kanaFound = true |
|||
c = c - kanaDelta -- convert it to hiragana codewise so i dont have to remake the lookup table for katakana |
|||
end |
|||
convertedString = convertedString .. mw.ustring.char(c) |
|||
end |
|||
return {kanaFound, convertedString} |
|||
end |
|||
local function toHepburnKana(data) |
|||
local processedData |
|||
if not data then -- short circuit |
|||
return |
|||
end |
|||
processedData = checkForKanaPresentAndConvert(data) |
|||
if processedData[1] then |
|||
return kanaToHepburn(processedData[2]) |
|||
else |
|||
return romanjiToHepburn(data) |
|||
end |
|||
end |
|||
local p = {} |
|||
--TODO add a performant way to detect if there is kana in a string |
|||
--TODO differentiate accordingly |
|||
--katakana=>hiragana? Can I just iterate over each and add a magic number like you can with ascii? |
|||
--this could be expanded to use bopomofo too |
|||
function p.toHepburn(frame) |
|||
local data = frame.args[1] |
|||
return toHepburnKana(data) |
|||
end |
|||
function p.toHepburnTEST(data) |
|||
return toHepburnKana(data) |
|||
end |
|||
return p |
return p |
Revision as of 19:03, 7 March 2024
Romanizes double vowels per the standard outlined in Hepburn Romanization.
Implemented in Template:Hepburn — please use that template instead of invoking this module directly. The template enforces subst: for this *very* costly module.
Any double vowels will get converted to a long vowel, ou will get converted to ō.
{{#invoke:Hepburner|toHepburn|Kinou}} => Kinō
{{#invoke:Hepburner|toHepburn|Ooki}} => Ōki
{{#invoke:Hepburner|toHepburn|kara-age}} => kara-age
{{#invoke:Hepburner|toHepburn|sakkaa}} => sakkā
{{#invoke:Hepburner|toHepburn|raamen}} => rāmen
{{#invoke:Hepburner|toHepburn|ヴィデオ}} => video
{{#invoke:Hepburner|toHepburn|いこう}} => ikō
{{#invoke:Hepburner|toHepburn|やった}} => yatta
{{#invoke:Hepburner|toHepburn|いきましょう}} => ikimashyō
{{#invoke:Hepburner|toHepburn|ちゅうにびょう}} => chyūnibyō
{{#invoke:Hepburner|toHepburn|つづく}} => tsuzuku
note: mixed-case pairs such as oU or aA will break it. You should never, ever write these regardless, but note that this is a limitation. If you need such a pair, add it to the diacritics list, following the current pattern.
require('strict');
local utf8 = require("Module:Unicode data")
-- Converts romanji kana to modified hepburn, I recommend subst:ing
-- standard long vowel patterns
-- Long-vowel substitution table: each doubled-vowel sequence maps to its
-- macron form, in lowercase, Titlecase and UPPERCASE variants.
-- Note: there is intentionally no entry for a long i ("ii" stays "ii").
local diacritics = {
	["aa"] = "ā", ["Aa"] = "Ā", ["AA"] = "Ā",
	["uu"] = "ū", ["Uu"] = "Ū", ["UU"] = "Ū",
	["ee"] = "ē", ["Ee"] = "Ē", ["EE"] = "Ē",
	["oo"] = "ō", ["Oo"] = "Ō", ["OO"] = "Ō",
	["ou"] = "ō", ["Ou"] = "Ō", ["OU"] = "Ō"
}

-- Apply every long-vowel substitution to a romanji string.
-- Substitutions are applied in unspecified (pairs) order; inputs where two
-- patterns overlap (e.g. "oou") are therefore not guaranteed a fixed result.
-- @param romanji string: 1:1 transliterated romanji
-- @return string: romanji with Hepburn macron diacritics applied
local function romanjiToHepburn(romanji)
	local result = romanji
	for pattern, macron in pairs(diacritics) do
		result = result:gsub(pattern, macron)
	end
	return result
end
-- Converts a (pre-folded, hiragana-only) kana string to romanji and then
-- applies the long-vowel diacritic pass. The lookup maps are function
-- locals so they are not kept alive in module state between invocations.
-- @param kana string: hiragana input (katakana is folded to hiragana
--   upstream by checkForKanaPresentAndConvert)
-- @return string: Hepburn-romanized text
local function kanaToHepburn(kana)
local romanji = ""
-- base syllable table: one hiragana character => its romanji syllable
local kanaMap = {
["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o",
["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko",
["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so",
["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to",
["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no",
["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho",
["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo",
["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo", --["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo",
["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro",
["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo",
["ん"] = "n",
["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go",
["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo",
["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "zu", ["で"] = "de", ["ど"] = "do",
["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo",
["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po",
["ゔ"] = "vu"
}
-- small kana: these modify the previous syllable instead of adding one.
-- NOTE(review): the "ヵ"/"ヶ" keys here are katakana; input reaching this
-- function has already been folded to hiragana codepoints upstream, so
-- these two entries look unreachable — confirm against the caller.
local smallKanaMap = {
["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o",
["ヵ"] = "ka",["ヶ"] = "ke",
["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo"
}
-- iterate over each codepoint of the input
for character in mw.ustring.gcodepoint(kana) do
local char = mw.ustring.char(character)
local romanization = kanaMap[char]
local smallRomanization
-- only consult the small-kana table on a base-table miss (rare case)
if not romanization then
smallRomanization = smallKanaMap[char]
end
if romanization then
romanji = romanji .. romanization
elseif smallRomanization then
-- Drop the previous vowel, then append the small kana's reading,
-- e.g. "shi" + ゃ => "sh" + "ya" => "shya". sub() here is byte-based:
-- it strips exactly one byte, which is only correct when the previous
-- output character is ASCII — TODO confirm for pass-through input.
romanji = romanji:sub(1, -2) .. smallKanaMap[char] -- remove vowel, append small vowel or other letters
else
-- Unmapped characters pass through verbatim; this also keeps "っ" in
-- the buffer for the sokuon pass below. Degenerate inputs such as
-- あぁぁぁぁぁ collapse here (=> "a").
romanji = romanji .. char -- special rule for vowels maybe? will make this more expensive
end
end
-- Sokuon pass: replace "っ" by doubling the following consonant
-- (やった => yatta). A "っ" followed by a vowel is left in place (the
-- [aeiou] guard fails); a trailing "っ" is deleted, since the empty
-- lookahead passes the guard and the splice inserts nothing.
for i = 1, mw.ustring.len(romanji) do
local chr = mw.ustring.sub(romanji, i, i)
if chr == "っ" then
local nextChar = mw.ustring.sub(romanji, i + 1, i + 1)
if nextChar and not nextChar:match("[aeiou]") then
romanji = mw.ustring.sub(romanji, 1, i-1) .. nextChar .. mw.ustring.sub(romanji, i + 1) -- surely there's a better way, right?
end
end
end
return romanjiToHepburn(romanji) -- kana is now romanji; apply the long-vowel diacritic pass
end -- TODO: add a flag to disable the diacritic pass and return plain romanji
-- Scans a string for kana and, in the same pass, folds any katakana down
-- to the equivalent hiragana codepoint so that a single hiragana lookup
-- table suffices downstream. ASCII (codepoints <= 127) is skipped cheaply.
-- @param data string: mixed ASCII/kana input
-- @return table: {kanaFound (boolean), convertedString (string)}
local function checkForKanaPresentAndConvert(data)
	local kanaFound = false
	-- fixed offset between the katakana and hiragana Unicode blocks
	local kanaDelta = (mw.ustring.codepoint("ァ") - mw.ustring.codepoint("ぁ"))
	local hiraganaLowerBound = mw.ustring.codepoint("ぁ")
	local hiraganaUpperBound = mw.ustring.codepoint("ゖ")
	local katakanaLowerBound = mw.ustring.codepoint("ァ")
	local katakanaUpperBound = mw.ustring.codepoint("ヶ")
	-- Collect output characters in a buffer and join once at the end;
	-- repeated ".." concatenation inside the loop is O(n^2).
	local buffer = {}
	for c in mw.ustring.gcodepoint(data) do
		if c <= 127 then
			-- plain ASCII: pass through untouched (short-circuit)
		elseif hiraganaLowerBound <= c and c <= hiraganaUpperBound then
			kanaFound = true
		elseif katakanaLowerBound <= c and c <= katakanaUpperBound then
			kanaFound = true
			c = c - kanaDelta -- shift katakana down to its hiragana twin
		end
		buffer[#buffer + 1] = mw.ustring.char(c)
	end
	-- NOTE(review): the katakana long-vowel mark "ー" (U+30FC) lies outside
	-- the checked katakana range and passes through unconverted — confirm
	-- that is intended.
	return {kanaFound, table.concat(buffer)}
end
-- Dispatcher: detect whether the input contains kana and route it through
-- the kana converter, otherwise run the plain romanji diacritic pass.
-- @param data string|nil: raw input; nil yields nil (no work done)
-- @return string|nil: Hepburn-romanized text
local function toHepburnKana(data)
	if not data then
		-- nothing to convert
		return
	end
	local processed = checkForKanaPresentAndConvert(data)
	local hasKana, folded = processed[1], processed[2]
	if hasKana then
		return kanaToHepburn(folded)
	end
	return romanjiToHepburn(data)
end
local p = {}

-- TODO: a cheaper way to detect kana in a string, and differentiate
-- accordingly; the same approach could be extended to bopomofo.

-- Module entry point for {{#invoke:Hepburner|toHepburn|...}}.
-- Converts the first unnamed template argument.
function p.toHepburn(frame)
	return toHepburnKana(frame.args[1])
end

-- Test hook: identical conversion, callable with a plain string (no frame).
function p.toHepburnTEST(data)
	return toHepburnKana(data)
end

return p