Jump to content

Module:Ko-translit: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
consistent
looks like it's good to update this now
Line 2: Line 2:
local find = mw.ustring.find
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')
local m_utils = require('Module:Ko-utils')
local get_args = require('Module:Arguments').getArgs


--[[
--[[
Line 14: Line 17:
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]
--]]

-- Apply an ordered list of pattern/replacement pairs to a string.
-- Entries are applied one after another in list order, so each entry operates
-- on the output of the entries before it.
--   text          string to transform
--   replacements  sequence of { pattern, replacement } pairs
-- Returns the transformed string.
local function gsub_iterate(text, replacements)
	-- the parameter is not called `table`, so the standard `table` library is not shadowed inside this function
	for _, entry in ipairs(replacements) do
		-- the assignment keeps only gsub's first result (the string) and drops the match count
		text = gsub(text, entry[1], entry[2])
	end
	return text
end


local function remove_links_and_markup(text)
local function remove_links_and_markup(text)
Line 29: Line 39:
text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1")
text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1")
text = gsub(gsub(text, "%[%[", ""), "%]%]", "")
text = gsub(gsub(text, "%[%[", ""), "%]%]", "")

-- remove refs
-- text = gsub(text, "<ref.-</ref>", "")
text = mw.text.killMarkers(text)
text = mw.text.killMarkers(text)
-- remove templates
text = gsub(text, "{{.-}}", "")


return text
return text
Line 43: Line 50:


-- input must contain Hangul
-- input must contain Hangul
if not m_utils.contains_hangul(text) then
if text == nil or text == "" or find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") == nil then
error("Input must contain Hangul")
error("Input must contain Hangul")
end
end


-- no direct insertion of reference or footnote
-- no direct insertion of reference or footnote
if m_utils.contains_reference(text) then
if find(text, "'\"`UNIQ--") or find(text, "-QINU`\"'") then
error("Input cannot contain references")
error("Input cannot contain references")
end
end
Line 58: Line 65:
end
end


text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders
-- process escape chars first
text = gsub(text, "\\%$", "&#36;")
text = gsub(text, "\\%%", "&#37;")
text = gsub(text, "\\%*", "&#42;")
text = gsub(text, "\\@", "&#64;")
text = gsub(text, "\\%^", "&#94;")
text = gsub(text, "\\_", "&#95;")
text = gsub(text, "\\`", "&#96;")


-- various validations of input
if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then
if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then
error("Do not input conjoining Hangul jamo directly")
error("Do not input conjoining Hangul jamo directly")
Line 92: Line 93:
end
end



-- verify that hangul input is valid
local function check_invalid_seq(text)
-- checked right after removing links and markup (before decomposing Hangul)
-- validity check after removing links and markup (before decomposing Hangul)
-- Hangul status: precomposed (한)
-- Hangul status: precomposed (한)
local function check_invalid_input(text)

if find(text, "[ _][ _]") then
if find(text, "[ _][ _]") then
error("No two or more consecutive space characters")
error("No two or more consecutive space characters")
Line 105: Line 107:
end
end


local function check_invalid_seq_decomposed_hangul(text)
-- verify that hangul was correctly decomposed
-- checked after decomposing Hangul
-- validity check after decomposing Hangul
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)

local function verify_decomposed_consonsants(text)
if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*?﷐?@﷐?[ᄀᄃᄇᄉᄌ]") or find(text, "ᆰ%*?﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆲ﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆺ%*@[ᄀᄇ]") or find(text, "ᆺ%*?﷐?@﷐?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]") or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]﷐?@﷐?ᄅ") or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]﷐?@﷐?ᄋ") or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]﷐?@﷐?ᄒ") then
if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*?﷐?@﷐?[ᄀᄃᄇᄉᄌ]") or find(text, "ᆰ%*?﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆲ﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆺ%*@[ᄀᄇ]") or find(text, "ᆺ%*?﷐?@﷐?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]") or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]﷐?@﷐?ᄅ") or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]﷐?@﷐?ᄋ") or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]﷐?@﷐?ᄒ") then
error("Found invalid sequence containing @")
error("Found invalid sequence containing @")
Line 118: Line 120:
end
end


-- Turn the numeric HTML entities &#36; &#37; &#42; &#64; &#94; &#95; &#96;
-- back into the ASCII characters they stand for; takes and returns a string
local function html_encoding_to_ascii(text)
	-- { entity, replacement } pairs, applied top to bottom
	-- ("%%" is how a gsub replacement string spells a literal percent sign)
	local entity_to_symbol = {
		{ "&#36;", "$" },
		{ "&#37;", "%%" },
		{ "&#42;", "*" },
		{ "&#64;", "@" },
		{ "&#94;", "^" },
		{ "&#95;", "_" },
		{ "&#96;", "`" },
	}

	for i = 1, #entity_to_symbol do
		local pair = entity_to_symbol[i]
		-- assignment keeps only the resulting string, not gsub's match count
		text = gsub(text, pair[1], pair[2])
	end

	return text
end


-- processing people names
local function parse_name(text)
local function parse_name(text)
-- processing people names
-- Hangul status: precomposed (한)
-- Hangul status: precomposed (한)


Line 177: Line 167:
-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
for i = 1, mw.ustring.len(text) do
for i = 1, mw.ustring.len(text) do
text = gsub(text, "﷑([^﷒]*)달달([^﷒]*)﷒", "﷑%1달﷐달%2﷒")
text = gsub(text, "﷑([^﷒]*)([달돌살설솔술슬실절졸줄즐질])%2([^﷒]*)﷒", "﷑%1%2﷐%2%3﷒")
text = gsub(text, "﷑([^﷒]*)돌돌([^﷒]*)﷒", "﷑%1돌﷐돌%2﷒")
text = gsub(text, "﷑([^﷒]*)살살([^﷒]*)﷒", "﷑%1살﷐살%2﷒")
text = gsub(text, "﷑([^﷒]*)설설([^﷒]*)﷒", "﷑%1설﷐설%2﷒")
text = gsub(text, "﷑([^﷒]*)솔솔([^﷒]*)﷒", "﷑%1솔﷐솔%2﷒")
text = gsub(text, "﷑([^﷒]*)술술([^﷒]*)﷒", "﷑%1술﷐술%2﷒")
text = gsub(text, "﷑([^﷒]*)슬슬([^﷒]*)﷒", "﷑%1슬﷐슬%2﷒")
text = gsub(text, "﷑([^﷒]*)실실([^﷒]*)﷒", "﷑%1실﷐실%2﷒")
text = gsub(text, "﷑([^﷒]*)절절([^﷒]*)﷒", "﷑%1절﷐절%2﷒")
text = gsub(text, "﷑([^﷒]*)졸졸([^﷒]*)﷒", "﷑%1졸﷐졸%2﷒")
text = gsub(text, "﷑([^﷒]*)줄줄([^﷒]*)﷒", "﷑%1줄﷐줄%2﷒")
text = gsub(text, "﷑([^﷒]*)즐즐([^﷒]*)﷒", "﷑%1즐﷐즐%2﷒")
text = gsub(text, "﷑([^﷒]*)질질([^﷒]*)﷒", "﷑%1질﷐질%2﷒")
end
end
-- now apply tensification
-- now apply tensification
Line 208: Line 186:
end
end


local function final_processing(text)
--[[
pre-processing exceptions that apply to both RR and MR
-- final processing for RR and MR
IMPORTANT: Before adding an exception, be sure to check if it can ALWAYS be applied in ALL contexts.
Good example: 싫증 → 실@증
Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜]))
Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
--]]
local function parse_exceptions(text)
-- for linguistic contexts
text = gsub(text, "ㄴ([ᄀ-ᄒ])", "ᆫ%1") -- -ㄴ다
text = gsub(text, "ㄹ([ᄀ-ᄒ])", "ᆯ%1") -- -ㄹ까, -ㄹ래
text = gsub(text, "ㄹ@([ᄀᄃᄇᄉᄌ])", "ᆯ@%1") -- -ㄹ지
text = gsub(text, "ㅁ([ᄀ-ᄒ])", "ᆷ%1")
text = gsub(text, "ㅂ([ᄀ-ᄒ])", "ᆸ%1") -- -ㅂ니다, -ㅂ시다
-- ㄴ-addition always occurs before 윷 and 잎
text = gsub(text, "([ᆨ-ᇂ])ᄋ(ᅲᆾ)", "%1ᄂ%2")
text = gsub(text, "([ᆨ-ᇂ])ᄋ(ᅵᇁ)", "%1ᄂ%2")
-- 곧이어 [고디어]
text = gsub(text, "(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])", "%1ᄃ%2")
text = gsub(text, "(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$", "%1ᄃ%2")
-- 싫증 [실쯩]
text = gsub(text, "(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)", "%1ᆯ@%2")
-- cases where ㄺㄱ is pronounced [ㄱㄲ]
-- not including very rarely used words such as 삼시욹, 안찱, 우줅거리다, etc.
text = gsub(text, "([ᄃᄉᄐ]ᅡ)ᆰᄀ", "%1ᆨᄀ") -- 닭, 삵, 수탉/암탉
text = gsub(text, "([ᄉᄒ]ᅳ)ᆰᄀ", "%1ᆨᄀ") -- 기슭, 흙
text = gsub(text, "(ᄎ[ᅵ])ᆰᄀ", "%1ᆨᄀ") -- 칡
-- palatalization and ㅈ + -히-
text = gsub(text, "ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄌ%1") -- 해돋이 [해도지]
text = gsub(text, "ᆮᄋ(ᅵ[^ᆨ-ᇂ])", "ᄌ%1")
text = gsub(text, "ᆮᄋ(ᅵ)$", "ᄌ%1")
text = gsub(text, "[ᆮᆽ]ᄒ(ᅧᆻ)", "ᄎ%1") -- 굳히다 [구치다], 꽂히다 [꼬치다]
text = gsub(text, "[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1")
text = gsub(text, "[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1")
text = gsub(text, "[ᆮᆽ]ᄒ([ᅧᅵ])$", "ᄎ%1")
text = gsub(text, "ᆴᄋ(ᅧᆻ)", "ᆯᄎ%1") -- 훑이다 [훌치다]
text = gsub(text, "ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄎ%1")
text = gsub(text, "ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄎ%1")
text = gsub(text, "ᆴᄋ([ᅧᅵ])$", "ᆯᄎ%1")
text = gsub(text, "ᇀᄋ(ᅧᆻ)", "ᄎ%1") -- 붙이다 [부치다]
text = gsub(text, "ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1")
text = gsub(text, "ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1")
text = gsub(text, "ᇀᄋ([ᅧᅵ])$", "ᄎ%1")
-- {ㄵ, ㄺ, ㄼ} + -히-
text = gsub(text, "ᆬᄒ(ᅧᆻ)", "ᆫᄎ%1") -- 앉히다 [안치다]
text = gsub(text, "ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆫᄎ%1")
text = gsub(text, "ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆫᄎ%1")
text = gsub(text, "ᆬᄒ([ᅧᅵ])$", "ᆫᄎ%1")
text = gsub(text, "ᆰᄒ(ᅧᆻ)", "ᆯᄏ%1") -- 밝히다 [발키다]
text = gsub(text, "ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄏ%1")
text = gsub(text, "ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄏ%1")
text = gsub(text, "ᆰᄒ([ᅧᅵ])$", "ᆯᄏ%1")
text = gsub(text, "ᆲᄒ(ᅧᆻ)", "ᆯᄑ%1") -- 넓히다 [널피다], 밟히다 [발피다]
text = gsub(text, "ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄑ%1")
text = gsub(text, "ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄑ%1")
text = gsub(text, "ᆲᄒ([ᅧᅵ])$", "ᆯᄑ%1")
-- cases where 넓- is pronounced [넙] before consonant
text = gsub(text, "(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])", "%1ᆸ%2")
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)", "%1ᆸ%2") -- 넓다듬이
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)", "%1ᆸ%2") -- 넓둥글다
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)", "%1ᆸ%2") -- 넓살문
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)", "%1ᆸ%2") -- 넓적-, 넓죽-
-- 밟- is [밥] before consonant (except null-init consonant ㅇ)
text = gsub(text, "(ᄇ[ᅡ])ᆲ([^ᄋ])", "%1ᆸ%2")
text = gsub(text, "(ᄇ[ᅡ])ᆲ$", "%1ᆸ")
-- automatic 절음 법칙
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])", "%1ᆸᄉ%2") -- except 없애다 [업쌔다]
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])", "%1ᆸᄉ%2")
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ)$", "%1ᆸᄉ%2")
text = gsub(text, "(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)", "%1ᄉ%2") -- except 맛있다 and 멋있다 which are usually pronounced [마싣따] and [머싣따] respectively
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])", "%1@%2") -- except 아, 았, 어, 었, 여, 였
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])", "%1@%2") -- except 에, 엔, 엘
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])", "%1@%2") -- except 요, 의 (w/o final consonant)
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])", "%1@%2") -- except 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])", "%1@%2")
-- _ for additional space in romanization only
text = gsub(text, "_", " ")


-- result should not contain Hangul
return text
if m_utils.contains_hangul(text) then
end
error("Result contains Hangul; debugging required")
end


text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII
-- processing misc characters that contain hangul

-- Hangul status: precomposed (한)
-- if result is nothing (e.g. when input is just ㅇ)
local function parse_enclosed_hangul(text)
if text == "" then
-- actually not very necessary, but these are also classified as Hangul chars in Unicode
text = "—"
-- no distinction is made between parenthesized and circled chars
end
text = gsub(text, "[㈀㉠]", "(기역)")
text = gsub(text, "[㈁㉡]", "(니은)")
text = gsub(text, "[㈂㉢]", "(디귿)")
text = gsub(text, "[㈃㉣]", "(리을)")
text = gsub(text, "[㈄㉤]", "(미음)")
text = gsub(text, "[㈅㉥]", "(비읍)")
text = gsub(text, "[㈆㉦]", "(시옷)")
text = gsub(text, "[㈇㉧]", "(이응)")
text = gsub(text, "[㈈㉨]", "(지읒)")
text = gsub(text, "[㈉㉩]", "(치읓)")
text = gsub(text, "[㈊㉪]", "(키읔)")
text = gsub(text, "[㈋㉫]", "(티읕)")
text = gsub(text, "[㈌㉬]", "(피읖)")
text = gsub(text, "[㈍㉭]", "(히읗)")
text = gsub(text, "[㈎㉮]", "(가)")
text = gsub(text, "[㈏㉯]", "(나)")
text = gsub(text, "[㈐㉰]", "(다)")
text = gsub(text, "[㈑㉱]", "(라)")
text = gsub(text, "[㈒㉲]", "(마)")
text = gsub(text, "[㈓㉳]", "(바)")
text = gsub(text, "[㈔㉴]", "(사)")
text = gsub(text, "[㈕㉵]", "(아)")
text = gsub(text, "[㈖㉶]", "(자)")
text = gsub(text, "[㈗㉷]", "(차)")
text = gsub(text, "[㈘㉸]", "(카)")
text = gsub(text, "[㈙㉹]", "(타)")
text = gsub(text, "[㈚㉺]", "(파)")
text = gsub(text, "[㈛㉻]", "(하)")
text = gsub(text, "㈜", "(주)")
text = gsub(text, "㈝", "(오전)")
text = gsub(text, "㈞", "(오후)")
text = gsub(text, "㉼", "(참고)")
text = gsub(text, "㉽", "(주의)")
text = gsub(text, "㉾", "(우)")


return text
return text
Line 334: Line 206:
-- Convert to Revised Romanization
-- Convert to Revised Romanization
function p.rr(frame)
function p.rr(frame)
return p._rr(get_args(frame))
local get_args = require('Module:Arguments').getArgs
local args = get_args(frame)
return p._rr(args)
end
end


Line 347: Line 217:
text = parse_name(text)
text = parse_name(text)
text = remove_links_and_markup(text)
text = remove_links_and_markup(text)
text = check_invalid_input(text)
text = check_invalid_seq(text)
text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
text = parse_enclosed_hangul(text)
text = gsub_iterate(text, m_data.enclosed_hangul)
text = gsub(text, "[가-힣]", mw.ustring.toNFD)
text = m_utils.decompose_hangul(text) -- decompose Hangul
text = verify_decomposed_consonsants(text)
text = check_invalid_seq_decomposed_hangul(text)
text = parse_exceptions(text)
text = gsub_iterate(text, m_data.exceptions)


text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
-- $ for ㄴ-addition
text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)
text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫]

text = gsub(text, "%$", "")
-- convert ㅎ combinations
-- for null-init consonant ㅇ (연음)
text = gsub(text, "ᆨᄋ", "ᄀ")
text = gsub_iterate(text, m_data.process_hieut)
text = gsub(text, "ᆩᄋ", "ᄁ")
text = gsub(text, "ᆪᄋ", "ᆨᄉ")
text = gsub(text, "ᆬᄋ", "ᆫᄌ")
text = gsub(text, "ᆮᄋ", "ᄃ")
text = gsub(text, "[ᆯᆶ]ᄋ", "ᄅ")
text = gsub(text, "ᆰᄋ", "ᆯᄀ")
text = gsub(text, "ᆱᄋ", "ᆯᄆ")
text = gsub(text, "ᆲᄋ", "ᆯᄇ")
text = gsub(text, "ᆳᄋ", "ᆯᄉ")
text = gsub(text, "ᆴᄋ", "ᆯᄐ")
text = gsub(text, "ᆵᄋ", "ᆯᄑ")
text = gsub(text, "ᆸᄋ", "ᄇ")
text = gsub(text, "ᆹᄋ", "ᆸᄉ")
text = gsub(text, "ᆺᄋ", "ᄉ")
text = gsub(text, "ᆻᄋ", "ᄊ")
text = gsub(text, "ᆽᄋ", "ᄌ")
text = gsub(text, "ᆾᄋ", "ᄎ")
text = gsub(text, "ᆿᄋ", "ᄏ")
text = gsub(text, "ᇀᄋ", "ᄐ")
text = gsub(text, "ᇁᄋ", "ᄑ")
text = gsub(text, "ᇂᄋ", "ᄋ") -- silent; 좋아 [조아]
-- for ㅎ
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway)
text = gsub(text, "ᆭᄀ", "ᆫᄏ")
text = gsub(text, "ᆭᄃ", "ᆫᄐ")
text = gsub(text, "ᆭᄇ", "ᆫᄑ")
text = gsub(text, "ᆭᄌ", "ᆫᄎ")
text = gsub(text, "ᆶᄀ", "ᆯᄏ")
text = gsub(text, "ᆶᄃ", "ᆯᄐ")
text = gsub(text, "ᆶᄇ", "ᆯᄑ")
text = gsub(text, "ᆶᄌ", "ᆯᄎ")
text = gsub(text, "ᇂᄀ", "ᄏ")
text = gsub(text, "ᇂᄃ", "ᄐ")
text = gsub(text, "ᇂᄇ", "ᄑ")
text = gsub(text, "ᇂᄉ", "ᄉ")
text = gsub(text, "ᇂᄉ", "ᄉ")

text = gsub(text, "ᇂᄌ", "ᄎ")
-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
text = gsub(text, "ᆰᄀ", "ᆯᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
text = gsub(text, "ᆰᄀ", "ᆯᄀ")
text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
-- neutralization of syl-final consonants
text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p
text = gsub(text, "[ᆩᆪᆰᆿ]", "ᆨ")
text = gsub(text, "[ᆬᆭ]", "ᆫ")
text = gsub(text, "[ᆺᆻᆽᆾᇀᇂ]", "ᆮ")
text = gsub(text, "[ᆲᆳᆴᆶ]", "ᆯ")
text = gsub(text, "ᆱ", "ᆷ")
text = gsub(text, "[ᆵᆹᇁ]", "ᆸ")
-- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
-- other irregularities documented are automatically handled
text = gsub(text, "ᆨ@ᄒ", "ᄏ")
text = gsub(text, "ᆮ@ᄒ", "ᄐ")
text = gsub(text, "ᆸ@ᄒ", "ᄑ")
text = gsub(text, "ᆨ@ᄋ", "ᄀ")
text = gsub(text, "ᆮ@ᄋ", "ᄃ") -- 웃어른 [우더른]
text = gsub(text, "ᆯ@ᄋ", "ᄅ")
text = gsub(text, "ᆸ@ᄋ", "ᄇ")
text = gsub(text, "ᆫ@ᄅ", "ᆫᄂ") -- 음운론 [으문논]
text = gsub(text, "@", "")
text = gsub(text, "@", "")
-- consonant assimilations
text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations
text = gsub(text, "[ᆨᆼ][ᄂᄅ]", "ᆼᄂ")
text = gsub(text, "ᆯᄅ", "ᆯl") -- ㄹㄹ is ll
text = gsub(text, "ᆨᄆ", "ᆼᄆ")
text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text
text = gsub(text, "ᆫᄅ", "ᆯᄅ")
text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
text = gsub(text, "ᆮ[ᄂᄅ]", "ᆫᄂ")
text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text
text = gsub(text, "ᆮᄆ", "ᆫᄆ")
text = gsub(text, "﷐", "") -- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)
text = gsub(text, "ᆯᄂ", "ᆯᄅ")

text = gsub(text, "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ")
text = gsub(text, "ᆸᄆ", "ᆷᄆ")
text = gsub(text, "ᆯᄅ", "ᆯl")
-- drop y after {ㅈ, ㅉ, ㅊ}
text = gsub(text, "([ᄌ-ᄎ])ᅣ", "%1ᅡ")
text = gsub(text, "([ᄌ-ᄎ])ᅤ", "%1ᅢ")
text = gsub(text, "([ᄌ-ᄎ])ᅧ", "%1ᅥ")
text = gsub(text, "([ᄌ-ᄎ])ᅨ", "%1ᅦ")
text = gsub(text, "([ᄌ-ᄎ])ᅭ", "%1ᅩ")
text = gsub(text, "([ᄌ-ᄎ])ᅲ", "%1ᅮ")
-- vowels
text = gsub(text, "[ᅡㅏ]", "a")
text = gsub(text, "[ᅢㅐ]", "ae")
text = gsub(text, "[ᅣㅑ]", "ya")
text = gsub(text, "[ᅤㅒ]", "yae")
text = gsub(text, "[ᅥㅓ]", "eo")
text = gsub(text, "[ᅦㅔ]", "e")
text = gsub(text, "[ᅧㅕ]", "yeo")
text = gsub(text, "[ᅨㅖ]", "ye")
text = gsub(text, "[ᅩㅗ]", "o")
text = gsub(text, "[ᅪㅘ]", "wa")
text = gsub(text, "[ᅫㅙ]", "wae")
text = gsub(text, "[ᅬㅚ]", "oe")
text = gsub(text, "[ᅭㅛ]", "yo")
text = gsub(text, "[ᅮㅜ]", "u")
text = gsub(text, "[ᅯㅝ]", "wo")
text = gsub(text, "[ᅰㅞ]", "we")
text = gsub(text, "[ᅱㅟ]", "wi")
text = gsub(text, "[ᅲㅠ]", "yu")
text = gsub(text, "[ᅳㅡ]", "eu")
text = gsub(text, "[ᅴㅢ]", "ui")
text = gsub(text, "[ᅵㅣ]", "i")
-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
-- single consonants
text = gsub(text, "[ᄀㄱ]", "g")
text = gsub(text, "[ᄁㄲ]", "kk")
text = gsub(text, "ㄳ", "ks")
text = gsub(text, "[ᄂᆫㄴ]", "n")
text = gsub(text, "ㄵ", "nj")
text = gsub(text, "ㄶ", "nh")
text = gsub(text, "[ᄃㄷ]", "d")
text = gsub(text, "[ᄄㄸ]", "tt")
text = gsub(text, "[ᄅㄹ]", "r")
text = gsub(text, "ᆯ", "l")
text = gsub(text, "ㄺ", "lg")
text = gsub(text, "ㄻ", "lm")
text = gsub(text, "ㄼ", "lb")
text = gsub(text, "ㄽ", "ls")
text = gsub(text, "ㄾ", "lt")
text = gsub(text, "ㄿ", "lp")
text = gsub(text, "ㅀ", "lh")
text = gsub(text, "[ᄆᆷㅁ]", "m")
text = gsub(text, "[ᄇㅂ]", "b")
text = gsub(text, "[ᄈㅃ]", "pp")
text = gsub(text, "ㅄ", "ps")
text = gsub(text, "[ᄉㅅ]", "s")
text = gsub(text, "[ᄊㅆ]", "ss")
text = gsub(text, "[ᄋㅇ]", "")
text = gsub(text, "ᆼ", "ng")
text = gsub(text, "[ᄌㅈ]", "j")
text = gsub(text, "[ᄍㅉ]", "jj")
text = gsub(text, "[ᄎㅊ]", "ch")
text = gsub(text, "[ᄏᆨㅋ]", "k")
text = gsub(text, "[ᄐᆮㅌ]", "t")
text = gsub(text, "[ᄑᆸㅍ]", "p")
text = gsub(text, "[ᄒㅎ]", "h")
-- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)
text = gsub(text, "﷐", "")
-- ^ for capitalization
-- ^ for capitalization
text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
text = gsub(text, "%^", "")
text = gsub(text, "%^", "")

-- final error checking
text = final_processing(text)
if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") then
error("Result contains Hangul; debugging required")
end
-- return orig chars
text = html_encoding_to_ascii(text)
-- if result is nothing (e.g. when input is just ㅇ)
if text == "" then
text = "—"
end


return text
return text
Line 511: Line 256:
-- Convert to McCune–Reischauer
-- Convert to McCune–Reischauer
function p.mr(frame)
function p.mr(frame)
return p._mr(get_args(frame))
local get_args = require('Module:Arguments').getArgs
local args = get_args(frame)
return p._mr(args)
end
end


Line 525: Line 268:
text = gsub(text, "﷐", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
text = gsub(text, "﷐", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
text = remove_links_and_markup(text)
text = remove_links_and_markup(text)
text = check_invalid_input(text)
text = check_invalid_seq(text)
text = parse_enclosed_hangul(text)
text = gsub_iterate(text, m_data.enclosed_hangul)
text = gsub(text, "[가-힣]", mw.ustring.toNFD)
text = m_utils.decompose_hangul(text) -- decompose Hangul
text = verify_decomposed_consonsants(text)
text = check_invalid_seq_decomposed_hangul(text)
text = parse_exceptions(text)
text = gsub_iterate(text, m_data.exceptions)


text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur)
text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur)
-- $ for ㄴ-addition
text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)
text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫]

text = gsub(text, "%$", "")
-- convert ㅎ combinations
-- for null-init consonant ㅇ (연음)
text = gsub(text, "ᆨᄋ", "ᄀ")
text = gsub_iterate(text, m_data.process_hieut)
text = gsub(text, "ᆩᄋ", "ᄁ")
text = gsub_iterate(text, m_data.process_hieut_additional_mr)

text = gsub(text, "ᆪᄋ", "ᆨᄉ")
text = gsub(text, "ᆬᄋ", "ᆫᄌ")
text = gsub(text, "ᆮᄋ", "ᄃ")
text = gsub(text, "[ᆯᆶ]ᄋ", "ᄅ")
text = gsub(text, "ᆰᄋ", "ᆯᄀ")
text = gsub(text, "ᆱᄋ", "ᆯᄆ")
text = gsub(text, "ᆲᄋ", "ᆯᄇ")
text = gsub(text, "ᆳᄋ", "ᆯᄉ")
text = gsub(text, "ᆴᄋ", "ᆯᄐ")
text = gsub(text, "ᆵᄋ", "ᆯᄑ")
text = gsub(text, "ᆸᄋ", "ᄇ")
text = gsub(text, "ᆹᄋ", "ᆸᄉ")
text = gsub(text, "ᆺᄋ", "ᄉ")
text = gsub(text, "ᆻᄋ", "ᄊ")
text = gsub(text, "ᆽᄋ", "ᄌ")
text = gsub(text, "ᆾᄋ", "ᄎ")
text = gsub(text, "ᆿᄋ", "ᄏ")
text = gsub(text, "ᇀᄋ", "ᄐ")
text = gsub(text, "ᇁᄋ", "ᄑ")
text = gsub(text, "ᇂᄋ", "ᄋ") -- silent; 좋아 [조아]
-- for ㅎ
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway)
text = gsub(text, "ᆭᄀ", "ᆫᄏ")
text = gsub(text, "ᆭᄃ", "ᆫᄐ")
text = gsub(text, "ᆭᄇ", "ᆫᄑ")
text = gsub(text, "[ᆬᆭ]ᄉ", "ᆫᄊ")
text = gsub(text, "ᆭᄌ", "ᆫᄎ")
text = gsub(text, "ᆶᄀ", "ᆯᄏ")
text = gsub(text, "ᆶᄃ", "ᆯᄐ")
text = gsub(text, "ᆶᄇ", "ᆯᄑ")
text = gsub(text, "[ᆲᆴᆶ]ᄉ", "ᆯᄊ")
text = gsub(text, "ᆶᄌ", "ᆯᄎ")
text = gsub(text, "ᇂᄀ", "ᄏ")
text = gsub(text, "ᇂᄃ", "ᄐ")
text = gsub(text, "ᇂᄇ", "ᄑ")
text = gsub(text, "ᇂᄉ", "ᄊ")
text = gsub(text, "ᇂᄌ", "ᄎ")
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2")
text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2")

-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ")
text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))

-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants)
-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants)
text = gsub(text, "ᆺ@ᄀ", "ᄁ")
text = gsub(text, "ᆺ@ᄀ", "ᄁ")
text = gsub(text, "ᆺ@ᄇ", "ᄈ")
text = gsub(text, "ᆺ@ᄇ", "ᄈ")

-- neutralization of syl-final consonants
text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
text = gsub(text, "[ᆩᆪᆰᆿ]", "ᆨ")
text = gsub(text, "[ᆬᆭ]", "")
text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") -- @ for tensification
text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
text = gsub(text, "[ᆺᆻᆽᆾᇀᇂ]", "ᆮ")

text = gsub(text, "[ᆲᆳᆴᆶ]", "ᆯ")
text = gsub(text, "ᆱ", "ᆷ")
text = gsub(text, "[ᆵᆹᇁ]", "ᆸ")
-- @ for tensification, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
-- other irregularities documented are automatically handled
text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ")
text = gsub(text, "ᆨ@ᄋ", "ᄀ")
text = gsub(text, "ᆮ@ᄋ", "ᄃ") -- 웃어른 [우더른]
text = gsub(text, "ᆯ@ᄋ", "ᄅ")
text = gsub(text, "ᆸ@ᄋ", "ᄇ")
text = gsub(text, "ᆫ@ᄅ", "ᆫᄂ") -- 음운론 [으문논]
-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
Line 607: Line 305:
text = gsub(text, "%*", "-")
text = gsub(text, "%*", "-")
text = gsub(text, "@", "")
text = gsub(text, "@", "")

-- consonant assimilations
-- consonant assimilations
text = gsub(text, "[ᆨᆼ][ᄂᄅ]", "ᆼᄂ")
text = gsub_iterate(text, m_data.consonant_assimilations)
text = gsub(text, "ᆨᄆ", "ᆼᄆ")
text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr)

text = gsub(text, "ᆫᄅ", "ᆯᄅ")
text = gsub(text, "ᆮ[ᄂᄅ]", "ᆫᄂ")
text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text
text = gsub(text, "ᆮᄆ", "ᆫᄆ")
text = gsub(text, "ᆯᄂ", "ᆯᄅ")
text = gsub(text, "([ao])ᄋe", "%1ë") -- ㅏ에 (aë) and ㅗ에 (oë)
text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
text = gsub(text, "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ")
text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text
text = gsub(text, "ᆸᄆ", "ᆷᄆ")

-- no {kkk, ttt, ppp, sss/ts/tss, ttch}
text = gsub(text, "ᆨᄁ", "ᄁ")
text = gsub(text, "ᆮᄄ", "ᄄ")
text = gsub(text, "ᆸᄈ", "ᄈ")
text = gsub(text, "ᆮ[ᄉᄊ]", "ᄊ")
text = gsub(text, "ᆮᄍ", "ᄍ")
-- other misc conversions
text = gsub(text, "ᆯᄅ", "ᆯl")
text = gsub(text, "ᆯᄒ", "rᄒ")
text = gsub(text, "ᄉ[ᅱ]", "shᅱ")
-- drop y after {ㅈ, ㅉ, ㅊ}
text = gsub(text, "([ᄌ-ᄎ])ᅣ", "%1ᅡ")
text = gsub(text, "([ᄌ-ᄎ])ᅤ", "%1ᅢ")
text = gsub(text, "([ᄌ-ᄎ])ᅧ", "%1ᅥ")
text = gsub(text, "([ᄌ-ᄎ])ᅨ", "%1ᅦ")
text = gsub(text, "([ᄌ-ᄎ])ᅭ", "%1ᅩ")
text = gsub(text, "([ᄌ-ᄎ])ᅲ", "%1ᅮ")
-- vowels
text = gsub(text, "[ᅡㅏ]", "a")
text = gsub(text, "[ᅢㅐ]", "ae")
text = gsub(text, "[ᅣㅑ]", "ya")
text = gsub(text, "[ᅤㅒ]", "yae")
text = gsub(text, "[ᅥㅓ]", "ŏ")
text = gsub(text, "[ᅦㅔ]", "e")
text = gsub(text, "[ᅧㅕ]", "yŏ")
text = gsub(text, "[ᅨㅖ]", "ye")
text = gsub(text, "[ᅩㅗ]", "o")
text = gsub(text, "[ᅪㅘ]", "wa")
text = gsub(text, "[ᅫㅙ]", "wae")
text = gsub(text, "[ᅬㅚ]", "oe")
text = gsub(text, "[ᅭㅛ]", "yo")
text = gsub(text, "[ᅮㅜ]", "u")
text = gsub(text, "[ᅯㅝ]", "wŏ")
text = gsub(text, "[ᅰㅞ]", "we")
text = gsub(text, "[ᅱㅟ]", "wi")
text = gsub(text, "[ᅲㅠ]", "yu")
text = gsub(text, "[ᅳㅡ]", "ŭ")
text = gsub(text, "[ᅴㅢ]", "ŭi")
text = gsub(text, "[ᅵㅣ]", "i")
-- ㅏ에 (aë) and ㅗ에 (oë)
text = gsub(text, "([ao])ᄋe", "%1ë")
-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
-- single consonants
text = gsub(text, "`ᄀ", "g")
text = gsub(text, "`ᄃ", "d")
text = gsub(text, "`ᄇ", "b")
text = gsub(text, "`ᄌ", "j")
text = gsub(text, "[ᄀᆨㄱ]", "k")
text = gsub(text, "[ᄁㄲ]", "kk")
text = gsub(text, "ㄳ", "ks")
text = gsub(text, "[ᄂᆫㄴ]", "n")
text = gsub(text, "ㄵ", "nj")
text = gsub(text, "ㄶ", "nh")
text = gsub(text, "[ᄃᆮㄷ]", "t")
text = gsub(text, "[ᄄㄸ]", "tt")
text = gsub(text, "[ᄅㄹ]", "r")
text = gsub(text, "ᆯ", "l")
text = gsub(text, "ㄺ", "lg")
text = gsub(text, "ㄻ", "lm")
text = gsub(text, "ㄼ", "lb")
text = gsub(text, "ㄽ", "ls")
text = gsub(text, "ㄾ", "lt'")
text = gsub(text, "ㄿ", "lp'")
text = gsub(text, "ㅀ", "rh")
text = gsub(text, "[ᄆᆷㅁ]", "m")
text = gsub(text, "[ᄇᆸㅂ]", "p")
text = gsub(text, "[ᄈㅃ]", "pp")
text = gsub(text, "ㅄ", "ps")
text = gsub(text, "[ᄉㅅ]", "s")
text = gsub(text, "[ᄊㅆ]", "ss")
text = gsub(text, "[ᄋㅇ]", "")
text = gsub(text, "ᆼ", "ng")
text = gsub(text, "[ᄌㅈ]", "ch")
text = gsub(text, "[ᄍㅉ]", "tch")
text = gsub(text, "[ᄎㅊ]", "ch'")
text = gsub(text, "[ᄏㅋ]", "k'")
text = gsub(text, "[ᄐㅌ]", "t'")
text = gsub(text, "[ᄑㅍ]", "p'")
text = gsub(text, "[ᄒㅎ]", "h")
text = gsub(text, "`", "")
-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
text = gsub(text, "([hkpt])''", "%1&#39;'")
text = gsub(text, "([hkpt])''", "%1&#39;'")
text = gsub(text, "([hkpt])'$", "%1&#39;")
text = gsub(text, "([hkpt])'$", "%1&#39;")

-- ^ for capitalization
-- ^ for capitalization
text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
text = gsub(text, "%^", "")
text = gsub(text, "%^", "")

-- final error checking
text = final_processing(text)
if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") then
error("Result contains Hangul; debugging required")
end
-- return orig chars
text = html_encoding_to_ascii(text)
-- if result is nothing (e.g. when input is just ㅇ)
if text == "" then
text = "—"
end


return text
return text
Line 719: Line 331:
-- Removing special chars (except for escaped ones)
-- Removing special chars (except for escaped ones)
function p.clean_hangul(frame)
function p.clean_hangul(frame)
return p._clean_hangul(get_args(frame))
local get_args = require('Module:Arguments').getArgs
local args = get_args(frame)
return p._clean_hangul(args)
end
end


function p._clean_hangul(args)
function p._clean_hangul(args)
local hangul = args[1]
local text = args[1]


-- input must contain Hangul
-- input must contain Hangul
if not m_utils.contains_hangul(text) then
if hangul == nil or hangul == "" or find(hangul, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") == nil then
error("Input must contain Hangul")
error("Input must contain Hangul")
end
end


-- no direct insertion of reference or footnote
-- no direct insertion of reference or footnote
if m_utils.contains_reference(text) then
if find(hangul, "'\"`UNIQ--") or find(hangul, "-QINU`\"'") then
error("Input cannot contain references")
error("Input cannot contain references")
end
end
-- Replacing escaped special chars with placeholders
local cleaned = gsub(hangul, "\\%$", "&#36;")
cleaned = gsub(cleaned, "\\%%", "&#37;")
cleaned = gsub(cleaned, "\\%*", "&#42;")
cleaned = gsub(cleaned, "\\@", "&#64;")
cleaned = gsub(cleaned, "\\%^", "&#94;")
cleaned = gsub(cleaned, "\\_", "&#95;")
cleaned = gsub(cleaned, "\\`", "&#96;")
-- Removing non-escaped special chars
cleaned = gsub(cleaned, "[%$%%%*@%^_`]", "")
-- Returning orig chars
cleaned = html_encoding_to_ascii(cleaned)


-- symbol should not appear within single syllabic block
-- Unstripping test
if find(text, "[ᄀ-ᅟꥠ-ꥼ][%$%%%*@%^_`][ᅠ-ᆧힰ-ퟆ]") or find(text, "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ][%$%%%*@%^_`][ᆨ-ᇿퟋ-ퟻ]") then
cleaned = mw.text.unstrip(cleaned)
error("Do not insert symbol within single syllabic block")
end


text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders
return cleaned
text = gsub(text, "[%$%%%*@%^_`]", "") -- removing non-escaped special chars
text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII
text = mw.text.unstrip(text) -- unstripping test

return text
end
end



Revision as of 10:58, 20 April 2025

local p = {}
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')
local m_utils = require('Module:Ko-utils')
local get_args = require('Module:Arguments').getArgs

--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
	ᄀ (U+1100)
	ᆨ (U+11A8)
	ㄱ (U+3131)
2. When dealing with decomposed Hangul,
	a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
	b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
		For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]

-- Apply a list of substitutions to text, one after another.
-- replacements is a sequence of two-element entries { pattern, replacement };
-- each entry is handed to gsub in order, so a later entry operates on the
-- output of the earlier ones. Returns the resulting string.
-- The parameter is intentionally not named "table", so that Lua's standard
-- table library is not shadowed inside this function.
local function gsub_iterate(text, replacements)
	for _, entry in ipairs(replacements) do
		text = gsub(text, entry[1], entry[2])
	end
	return text
end

-- Strip wiki markup from text and return the cleaned string.
-- Removed, in this order: bold/italic quote runs, HTML tags (br is kept and
-- normalized to "<br>"), wikilink brackets (a piped link keeps only its label),
-- and strip markers (via mw.text.killMarkers).
local function remove_links_and_markup(text)
	-- these either are unnecessary or interfere with assimilation

	-- remove bold/italic
	-- it is not impossible to allow bold/italic when it does not interfere with assimilation, but determining when to allow or disallow that adds complication for little practical gain
	text = gsub(text, "'''", "")
	text = gsub(text, "''", "")
	-- remove HTML tags (except br)
	-- br is temporarily turned into &#10; so that the generic tag removal below does not delete it
	text = gsub(text, "<[Bb][Rr] */?>", "&#10;")
	text = gsub(text, "</?[A-Za-z][^>]->", "")
	text = gsub(text, "&#10;", "<br>")
	-- remove wikilinks
	-- piped links: keep the label only
	-- the target part must not contain [ or ]; otherwise, in input such as "[[A]] x [[B|C]]", the target would
	-- run from the first [[ up to the | of the second link and everything before C would be lost
	text = gsub(text, "%[%[[^%[%]%|]+%|(..-)%]%]", "%1")
	-- unpiped links (and any leftover brackets): drop the brackets
	text = gsub(text, "%[%[", "")
	text = gsub(text, "%]%]", "")

	text = mw.text.killMarkers(text)

	return text
end

local function disallow_invalid_input(text)
	-- very first step of the conversion
	-- Hangul status: precomposed (한)
	-- Raises an error for input that is rejected, returns "N/A" for Hangul not supported by RR and MR,
	-- and otherwise returns the input with escaped special chars replaced by placeholders.

	-- tells whether the current value of text matches pattern
	local function has(pattern)
		return find(text, pattern) ~= nil
	end

	-- input must contain Hangul
	if not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- Hangul not supported by RR and MR: result is "N/A" and everything else is skipped
	if has("[ᄓ-ᅠᅶ-ᆧᇃ-ᇿ〮〯ㅤ-ㆎꥠ-꥿ힰ-퟿]") then
		return "N/A"
	end

	-- replacing escaped special chars with placeholders
	text = gsub_iterate(text, m_data.escaped_to_html_enc)

	-- various validations of input; the checks run in this order and the first one that matches raises its error
	if has("[ᄀ-ᄒ]") or has("[ᅡ-ᅵᆨ-ᇂ]") then
		error("Do not input conjoining Hangul jamo directly")
	end
	if has("`%*") then
		error("Use *` instead of `*")
	end
	if has("@%*") then
		error("Use *@ instead of @*")
	end
	if has("%^[^가-힣]") then
		error("^ must be immediately followed by Hangul syllabic block")
	end
	if has("[^%*0-9A-Za-z]`") or has("[^0-9A-Za-z]%*`") or has("`[^가-깋다-딯바-빟자-짛]") then
		error("Found invalid sequence containing `")
	end
	if has("[^%*ㄹ가-힣]@") or has("[^가-힣]%*@") or has("%*@[^가-깋다-딯바-빟자-짛]") or has("ㄹ@[^가-깋다-딯바-빟사-싷자-짛]") or has("@[^가-깋다-딯라-맇바-빟사-싷아어에엔엘여요으은을음읍의이인일임입자-짛하-힣]") then
		error("Found invalid sequence containing @")
	end
	if has("[^가-힣]%$") or has("%$[^야-얳여-옣요-욯유-윶윸-윻이-잍잏]") then
		error("Found invalid sequence containing $")
	end
	if has("%%$") then
		error("Remove final %")
	end
	if has("[ _][ _]") then
		error("No two or more consecutive space characters")
	end
	if has("^[%$%*@_`]") or has("^%%[^_가-힣]") or has("[ _]%*") or has("%*[ %*%-_]") or has("%-%*") or has("[﷐-﷒]") or has("[%$%*@%^`]$") then
		error("Invalid input")
	end

	return text
end


local function check_invalid_seq(text)
	-- second validity check, run after links and markups are removed and before Hangul is decomposed
	-- Hangul status: precomposed (한)
	-- returns text unchanged when nothing is wrong

	if find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	end

	-- any one of these sequences makes the input invalid
	local invalid_patterns = {
		"^[%$%*@_`]",
		"[ _]%*",
		"%*[ %*%-_]",
		"%-%*",
		"[%$%*@%^_`]$",
	}
	for _, pattern in ipairs(invalid_patterns) do
		if find(text, pattern) then
			error("Invalid input")
		end
	end

	return text
end

local function check_invalid_seq_decomposed_hangul(text)
	-- validity check that can only be done once Hangul has been decomposed
	-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
	-- returns text unchanged when nothing is wrong

	-- sequences in which @ is not accepted
	local invalid_at_patterns = {
		"[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*?﷐?@﷐?[ᄀᄃᄇᄉᄌ]",
		"ᆰ%*?﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]",
		"ᆲ﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]",
		"ᆺ%*@[ᄀᄇ]",
		"ᆺ%*?﷐?@﷐?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]",
		"[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]﷐?@﷐?ᄅ",
		"[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]﷐?@﷐?ᄋ",
		"[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]﷐?@﷐?ᄒ",
	}
	for _, pattern in ipairs(invalid_at_patterns) do
		if find(text, pattern) then
			error("Found invalid sequence containing @")
		end
	end

	-- $ directly after a vowel (optionally with U+FDD0 in between)
	if find(text, "[ᅡ-ᅵ]﷐?%$") then
		error("Found invalid sequence containing $")
	end

	return text
end


-- Run gsub(text, pattern, repl) repeatedly: at most max_passes times, stopping early as soon as a pass makes no replacement.
-- Stopping early cannot change the result, because a pass with zero replacements leaves text as it is and every later pass would do the same.
local function gsub_until_stable(text, pattern, repl, max_passes)
	for _ = 1, max_passes do
		local count
		text, count = gsub(text, pattern, repl)
		if count == 0 then
			break
		end
	end
	return text
end

-- Process the parts of text marked as people names (between % and %, or from % to end of string).
-- Raises an error when a marked name is not acceptable; otherwise returns text with
--   ^ inserted before surname and given name,
--   _ between surname and given name,
--   @ inserted for tensification of ㄹ + {ㄷ, ㅅ, ㅈ},
--   U+FDD0 inserted between syllabic blocks of given name,
-- and with the % markers removed.
local function parse_name(text)
	-- processing people names
	-- Hangul status: precomposed (한)

	local hanja_readings_final_L = "갈걸결골괄굴궐귤글길날녈놜눌닐달돌랄렬률말멸몰물밀발벌별불살설솔술슬실알얼열올왈울월율을일절졸줄즐질찰철촬출칠탈팔필할헐혈홀활훌휼흘힐"
	local hanja_readings_init_DSJ = "다단달담답당대댁덕도독돈돌동두둔득등사삭산살삼삽상새색생서석선설섬섭성세소속손솔송쇄쇠수숙순술숭쉬슬습승시식신실심십자작잔잠잡장재쟁저적전절점접정제조족존졸종좌죄주죽준줄중즉즐즙증지직진질짐집징"

	-- note: internally uses 3 noncharacters
	-- ﷐ (U+FDD0): mostly for given name in RR
	-- ﷑ (U+FDD1): marks beginning of name
	-- ﷒ (U+FDD2): marks end of name

	-- change % to U+FDD1 and U+FDD2 (end of string also terminates name mode)
	text = gsub(text, "%%([^%%]*)%%", "﷑%1﷒")
	text = gsub(text, "%%([^%%]*)$", "﷑%1﷒")
	-- disallow invalid input for name
	if find(text, "﷑﷒") then
		error("Name cannot be empty")
	elseif find(text, "﷑[^﷑﷒]*[^가-힣_ ][^﷑﷒]*﷒") then
		error("Invalid character in name")
	elseif find(text, "﷑ ") then
		error("Name cannot begin with space")
	elseif find(text, " ﷒") then
		error("Name cannot end with space")
	elseif find(text, "﷑[^﷒]*[ _][^﷒]*[ _][^﷒]*﷒") then
		error("No more than two components in name")
	elseif find(text, "﷑[가-힣]_") then
		error("No _ after one-syllable surname")
	elseif find(text, "﷑[^﷒]*[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "][^﷒]*﷒") then
		error("Contains unnecessary @ in name") -- see below
	end
	-- separate surname and given name
	-- if input contains _ or space, separate there
	text = gsub(text, "﷑([가-힣%$@]+)_﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑_([가-힣%$@]+)﷒", "﷑_^%1﷒") -- for mononym
	text = gsub(text, "﷑([가-힣%$@]+)[ _]([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- otherwise, separate after first syllabic block
	text = gsub(text, "﷑([가-힣])﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑([가-힣])([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- check invalid input after separating surname and given name
	if find(text, "﷑[^﷒]*_%^[%$@][^﷒]*﷒") then
		error("No @ or $ between surname and given name")
	end

	-- each of the three substitutions below handles at most one position per name per pass (the leading [^﷒]* is greedy),
	-- so it is repeated; the number of passes is capped by the length of text at that point, and repetition stops as soon as a pass changes nothing

	-- tensification of ㄹ + {ㄷ, ㅅ, ㅈ} (needed for MR; e.g. 홍길동 [홍길똥], 을지문덕 [을찌문덕])
	-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
	text = gsub_until_stable(text, "﷑([^﷒]*)([달돌살설솔술슬실절졸줄즐질])%2([^﷒]*)﷒", "﷑%1%2﷐%2%3﷒", mw.ustring.len(text))
	-- now apply tensification
	text = gsub_until_stable(text, "﷑([^﷒]*)([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])([^﷒]*)﷒", "﷑%1%2@%3%4﷒", mw.ustring.len(text))
	-- insert U+FDD0 in given name (needed for RR; e.g. 한복남 Han Boknam, not Han Bongnam)
	text = gsub_until_stable(text, "﷑([^﷒]*)_%^([^﷒]*)([가-힣%$@])([가-힣%$@])([^﷒]*)﷒", "﷑%1_^%2%3﷐%4%5﷒", mw.ustring.len(text))

	-- remove _ which was needed for surname-only string and mononym
	text = gsub(text, "_﷒", "﷒")
	text = gsub(text, "﷑_%^", "﷑^")
	-- remove U+FDD1 and U+FDD2
	text = gsub(text, "[﷑﷒]", "")

	return text
end

local function final_processing(text)
	-- last step shared by RR and MR

	-- no Hangul may be left in the result
	local hangul_left = m_utils.contains_hangul(text)
	if hangul_left then
		error("Result contains Hangul; debugging required")
	end

	-- convert HTML encodings back to ASCII
	local result = gsub_iterate(text, m_data.html_enc_to_ascii)

	-- an empty result (e.g. when input is just ㅇ) is shown as a dash
	if result ~= "" then
		return result
	end
	return "—"
end

-- Convert to Revised Romanization
-- Wrapper taking a frame: obtains the arguments with get_args and passes them on to p._rr
function p.rr(frame)
	local args = get_args(frame)
	return p._rr(args)
end

-- Revised Romanization of args[1].
-- Returns the romanized string, or "N/A" when disallow_invalid_input returns "N/A".
-- Errors raised by the validation helpers are not caught here.
-- The substitutions below are applied strictly in sequence; each one operates on the output of the previous one.
function p._rr(args)
	local text = args[1]
	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end
	text = parse_name(text)
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)
	text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)
	text = gsub_iterate(text, m_data.exceptions)

	text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
	text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
	text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)

	-- convert ㅎ combinations
	text = gsub_iterate(text, m_data.process_hieut)
	text = gsub(text, "ᇂᄉ", "ᄉ")

	text = gsub(text, "ᆰᄀ", "ᆯᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
	text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p
	text = gsub(text, "@", "")
	text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations
	text = gsub(text, "ᆯᄅ", "ᆯl") -- ㄹㄹ is ll
	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text
	text = gsub(text, "﷐", "") -- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)

	-- ^ for capitalization
	-- the pattern has no capture, so mw.ustring.upper receives the whole match (^ plus the letter); the ^ itself is removed by the next line
	text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)

	return text
end

-- Convert to McCune–Reischauer
-- Wrapper taking a frame: obtains the arguments with get_args and passes them on to p._mr
function p.mr(frame)
	local args = get_args(frame)
	return p._mr(args)
end

-- McCune–Reischauer romanization of args[1].
-- Returns the romanized string, or "N/A" when disallow_invalid_input returns "N/A".
-- Errors raised by the validation helpers are not caught here.
-- The substitutions below are applied strictly in sequence; each one operates on the output of the previous one.
function p._mr(args)
	local text = args[1]
	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end
	text = parse_name(text)
	text = gsub(text, "﷐", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)
	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)
	text = gsub_iterate(text, m_data.exceptions)

	text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
	text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur)
	text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
	text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)

	-- convert ㅎ combinations
	text = gsub_iterate(text, m_data.process_hieut)
	text = gsub_iterate(text, m_data.process_hieut_additional_mr)

	-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
	-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
	text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2")

	text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))

	-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants)
	text = gsub(text, "ᆺ@ᄀ", "ᄁ")
	text = gsub(text, "ᆺ@ᄇ", "ᄈ")

	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") -- @ for tensification
	text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]

	-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
	-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
	-- the ` inserted here marks the following consonant; any ` still present is expected to be consumed by the later data-driven replacements (final_processing does not remove it) — TODO confirm against Module:Ko-translit/data
	text = gsub(text, "ᆫᄀ", "ᆫ'`ᄀ") -- n'g
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2")
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2")
	text = gsub(text, "ᆯ%*ᄅ", "ᆯ-l") -- ㄹ-ㄹ should probably be l-l rather than l-r
	text = gsub(text, "%*", "-")
	text = gsub(text, "@", "")

	-- consonant assimilations
	text = gsub_iterate(text, m_data.consonant_assimilations)
	text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr)

	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text
	text = gsub(text, "([ao])ᄋe", "%1ë") -- ㅏ에 (aë) and ㅗ에 (oë)
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text

	-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
	text = gsub(text, "([hkpt])''", "%1&#39;'")
	text = gsub(text, "([hkpt])'$", "%1&#39;")

	-- ^ for capitalization
	-- the pattern has no capture, so mw.ustring.upper receives the whole match (^ plus the letter); the ^ itself is removed by the next line
	text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)

	return text
end

-- Removing special chars (except for escaped ones)
-- Wrapper taking a frame: obtains the arguments with get_args and passes them on to p._clean_hangul
function p.clean_hangul(frame)
	local args = get_args(frame)
	return p._clean_hangul(args)
end

-- Returns args[1] with the special chars $ % * @ ^ _ ` removed, except for escaped ones.
-- Raises an error when the input contains no Hangul, contains a reference, or has one of those symbols inside a single syllabic block.
-- What counts as "escaped" is defined by m_data.escaped_to_html_enc, which is not visible here.
function p._clean_hangul(args)
	local text = args[1]

	-- input must contain Hangul
	if not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- symbol should not appear within single syllabic block
	-- first pattern: symbol between the ranges ᄀ-ᅟ / ꥠ-ꥼ and ᅠ-ᆧ / ힰ-ퟆ
	-- second pattern: symbol between (ᅠ-ᆧ, one of the listed precomposed syllables, or ힰ-ퟆ) and the ranges ᆨ-ᇿ / ퟋ-ퟻ
	if find(text, "[ᄀ-ᅟꥠ-ꥼ][%$%%%*@%^_`][ᅠ-ᆧힰ-ퟆ]") or find(text, "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ][%$%%%*@%^_`][ᆨ-ᇿퟋ-ퟻ]") then
		error("Do not insert symbol within single syllabic block")
	end

	-- the three steps below must stay in this order: escaped chars are hidden behind placeholders first, so that the removal step only hits non-escaped ones, and are restored afterwards
	text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders
	text = gsub(text, "[%$%%%*@%^_`]", "") -- removing non-escaped special chars
	text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII
	text = mw.text.unstrip(text) -- unstripping test

	return text
end

return p