Module:Ko-translit: Difference between revisions
Appearance
Content deleted Content added
Grapesurgeon (talk | contribs) consistent |
looks like it's good to update this now |
||
Line 2: | Line 2: | ||
local find = mw.ustring.find |
local find = mw.ustring.find |
||
local gsub = mw.ustring.gsub |
local gsub = mw.ustring.gsub |
||
local m_data = require('Module:Ko-translit/data') |
|||
local m_utils = require('Module:Ko-utils') |
|||
local get_args = require('Module:Arguments').getArgs |
|||
--[[ |
--[[ |
||
Line 14: | Line 17: | ||
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$ |
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$ |
||
--]] |
--]] |
||
--- Apply an ordered list of pattern substitutions to a string.
-- Each entry is an array-style pair: entry[1] is the pattern and entry[2] is
-- the replacement; both are handed to `gsub` unchanged. Entries are applied in
-- list order, each one operating on the result of the previous one, so the
-- order of the list is significant.
-- @param text          string to transform
-- @param replacements  sequence of { pattern, replacement } pairs
-- @return the text after every substitution has been applied
local function gsub_iterate(text, replacements)
	-- The parameter used to be called `table`, which shadowed Lua's standard
	-- `table` library for the whole function body.
	for _, entry in ipairs(replacements) do
		text = gsub(text, entry[1], entry[2])
	end
	return text
end
|||
local function remove_links_and_markup(text) |
local function remove_links_and_markup(text) |
||
Line 29: | Line 39: | ||
text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1") |
text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1") |
||
text = gsub(gsub(text, "%[%[", ""), "%]%]", "") |
text = gsub(gsub(text, "%[%[", ""), "%]%]", "") |
||
-- remove refs |
|||
-- text = gsub(text, "<ref.-</ref>", "") |
|||
text = mw.text.killMarkers(text) |
text = mw.text.killMarkers(text) |
||
-- remove templates |
|||
text = gsub(text, "{{.-}}", "") |
|||
return text |
return text |
||
Line 43: | Line 50: | ||
-- input must contain Hangul |
-- input must contain Hangul |
||
if not m_utils.contains_hangul(text) then |
|||
if text == nil or text == "" or find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-가-힣ힰ-]") == nil then |
|||
error("Input must contain Hangul") |
error("Input must contain Hangul") |
||
end |
end |
||
-- no direct insertion of reference or footnote |
-- no direct insertion of reference or footnote |
||
if m_utils.contains_reference(text) then |
|||
if find(text, "'\"`UNIQ--") or find(text, "-QINU`\"'") then |
|||
error("Input cannot contain references") |
error("Input cannot contain references") |
||
end |
end |
||
Line 58: | Line 65: | ||
end |
end |
||
text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders |
|||
-- process escape chars first |
|||
text = gsub(text, "\\%$", "$") |
|||
text = gsub(text, "\\%%", "%") |
|||
text = gsub(text, "\\%*", "*") |
|||
text = gsub(text, "\\@", "@") |
|||
text = gsub(text, "\\%^", "^") |
|||
text = gsub(text, "\\_", "_") |
|||
text = gsub(text, "\\`", "`") |
|||
-- various validations of input |
|||
if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then |
if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then |
||
error("Do not input conjoining Hangul jamo directly") |
error("Do not input conjoining Hangul jamo directly") |
||
Line 92: | Line 93: | ||
end |
end |
||
-- verify that hangul input is valid |
|||
local function check_invalid_seq(text) |
|||
-- checked right after removing links and markups (before decomposing Hangul) |
|||
-- validity check after removing links and markups (before decomposing Hangul) |
|||
-- Hangul status: precomposed (한) |
|||
-- Hangul status: precomposed (한) |
|||
local function check_invalid_input(text) |
|||
if find(text, "[ _][ _]") then |
if find(text, "[ _][ _]") then |
||
error("No two or more consecutive space characters") |
error("No two or more consecutive space characters") |
||
Line 105: | Line 107: | ||
end |
end |
||
local function check_invalid_seq_decomposed_hangul(text) |
|||
-- verify that hangul was correctly decomposed |
|||
-- |
-- validity check after decomposing Hangul |
||
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ) |
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ) |
||
local function verify_decomposed_consonsants(text) |
|||
if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*??@?[ᄀᄃᄇᄉᄌ]") or find(text, "ᆰ%*??@?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆲ?@?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆺ%*@[ᄀᄇ]") or find(text, "ᆺ%*??@?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]") or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]?@?ᄅ") or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]?@?ᄋ") or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]?@?ᄒ") then |
if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*??@?[ᄀᄃᄇᄉᄌ]") or find(text, "ᆰ%*??@?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆲ?@?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆺ%*@[ᄀᄇ]") or find(text, "ᆺ%*??@?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]") or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]?@?ᄅ") or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]?@?ᄋ") or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]?@?ᄒ") then |
||
error("Found invalid sequence containing @") |
error("Found invalid sequence containing @") |
||
Line 118: | Line 120: | ||
end |
end |
||
-- Convert HTML numeric character references back to ASCII.
-- The patterns must be the literal "&#NN;" references. With the bare symbols,
-- "$" and "^" act as pattern anchors (appending/prepending a character) and a
-- lone "%" is a malformed pattern.
-- NOTE(review): presumably these are the placeholders that the escape
-- sequences \$ \% \* \@ \^ \_ \` are turned into earlier in processing --
-- confirm against the escaping step.
-- @param text  string possibly containing &#36; &#37; &#42; &#64; &#94; &#95; &#96;
-- @return the text with those references replaced by $ % * @ ^ _ `
local function html_encoding_to_ascii(text)
	text = gsub(text, "&#36;", "$")
	text = gsub(text, "&#37;", "%%") -- "%%" is a literal % in a gsub replacement string
	text = gsub(text, "&#42;", "*")
	text = gsub(text, "&#64;", "@")
	text = gsub(text, "&#94;", "^")
	text = gsub(text, "&#95;", "_")
	text = gsub(text, "&#96;", "`")
	return text
end
|||
-- processing people names |
|||
local function parse_name(text) |
local function parse_name(text) |
||
-- processing people names |
|||
-- Hangul status: precomposed (한) |
-- Hangul status: precomposed (한) |
||
Line 177: | Line 167: | ||
-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too |
-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too |
||
for i = 1, mw.ustring.len(text) do |
for i = 1, mw.ustring.len(text) do |
||
text = gsub(text, "([^]*) |
text = gsub(text, "([^]*)([달돌살설솔술슬실절졸줄즐질])%2([^]*)", "%1%2%2%3") |
||
text = gsub(text, "([^]*)돌돌([^]*)", "%1돌돌%2") |
|||
text = gsub(text, "([^]*)살살([^]*)", "%1살살%2") |
|||
text = gsub(text, "([^]*)설설([^]*)", "%1설설%2") |
|||
text = gsub(text, "([^]*)솔솔([^]*)", "%1솔솔%2") |
|||
text = gsub(text, "([^]*)술술([^]*)", "%1술술%2") |
|||
text = gsub(text, "([^]*)슬슬([^]*)", "%1슬슬%2") |
|||
text = gsub(text, "([^]*)실실([^]*)", "%1실실%2") |
|||
text = gsub(text, "([^]*)절절([^]*)", "%1절절%2") |
|||
text = gsub(text, "([^]*)졸졸([^]*)", "%1졸졸%2") |
|||
text = gsub(text, "([^]*)줄줄([^]*)", "%1줄줄%2") |
|||
text = gsub(text, "([^]*)즐즐([^]*)", "%1즐즐%2") |
|||
text = gsub(text, "([^]*)질질([^]*)", "%1질질%2") |
|||
end |
end |
||
-- now apply tensification |
-- now apply tensification |
||
Line 208: | Line 186: | ||
end |
end |
||
local function final_processing(text) |
|||
--[[ |
|||
-- final processing for RR and MR |
|||
IMPORTANT: Before adding an exception, be sure to check if it can ALWAYS be applied in ALL contexts. |
|||
Good example: 싫증 → 실@증 |
|||
Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜])) |
|||
Hangul status: decomposed (ᄒ+ᅡ+ᆫ) |
|||
--]] |
|||
local function parse_exceptions(text) |
|||
-- for linguistic contexts |
|||
text = gsub(text, "ㄴ([ᄀ-ᄒ])", "ᆫ%1") -- -ㄴ다 |
|||
text = gsub(text, "ㄹ([ᄀ-ᄒ])", "ᆯ%1") -- -ㄹ까, -ㄹ래 |
|||
text = gsub(text, "ㄹ@([ᄀᄃᄇᄉᄌ])", "ᆯ@%1") -- -ㄹ지 |
|||
text = gsub(text, "ㅁ([ᄀ-ᄒ])", "ᆷ%1") |
|||
text = gsub(text, "ㅂ([ᄀ-ᄒ])", "ᆸ%1") -- -ㅂ니다, -ㅂ시다 |
|||
-- ㄴ-addition always occurs before 윷 and 잎 |
|||
text = gsub(text, "([ᆨ-ᇂ])ᄋ(ᅲᆾ)", "%1ᄂ%2") |
|||
text = gsub(text, "([ᆨ-ᇂ])ᄋ(ᅵᇁ)", "%1ᄂ%2") |
|||
-- 곧이어 [고디어] |
|||
text = gsub(text, "(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])", "%1ᄃ%2") |
|||
text = gsub(text, "(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$", "%1ᄃ%2") |
|||
-- 싫증 [실쯩] |
|||
text = gsub(text, "(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)", "%1ᆯ@%2") |
|||
-- cases where ㄺㄱ is pronounced [ㄱㄲ] |
|||
-- not including very rarely used words such as 삼시욹, 안찱, 우줅거리다, etc. |
|||
text = gsub(text, "([ᄃᄉᄐ]ᅡ)ᆰᄀ", "%1ᆨᄀ") -- 닭, 삵, 수탉/암탉 |
|||
text = gsub(text, "([ᄉᄒ]ᅳ)ᆰᄀ", "%1ᆨᄀ") -- 기슭, 흙 |
|||
text = gsub(text, "(ᄎ[ᅵ])ᆰᄀ", "%1ᆨᄀ") -- 칡 |
|||
-- palatalization and ㅈ + -히- |
|||
text = gsub(text, "ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄌ%1") -- 해돋이 [해도지] |
|||
text = gsub(text, "ᆮᄋ(ᅵ[^ᆨ-ᇂ])", "ᄌ%1") |
|||
text = gsub(text, "ᆮᄋ(ᅵ)$", "ᄌ%1") |
|||
text = gsub(text, "[ᆮᆽ]ᄒ(ᅧᆻ)", "ᄎ%1") -- 굳히다 [구치다], 꽂히다 [꼬치다] |
|||
text = gsub(text, "[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1") |
|||
text = gsub(text, "[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1") |
|||
text = gsub(text, "[ᆮᆽ]ᄒ([ᅧᅵ])$", "ᄎ%1") |
|||
text = gsub(text, "ᆴᄋ(ᅧᆻ)", "ᆯᄎ%1") -- 훑이다 [훌치다] |
|||
text = gsub(text, "ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄎ%1") |
|||
text = gsub(text, "ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄎ%1") |
|||
text = gsub(text, "ᆴᄋ([ᅧᅵ])$", "ᆯᄎ%1") |
|||
text = gsub(text, "ᇀᄋ(ᅧᆻ)", "ᄎ%1") -- 붙이다 [부치다] |
|||
text = gsub(text, "ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1") |
|||
text = gsub(text, "ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1") |
|||
text = gsub(text, "ᇀᄋ([ᅧᅵ])$", "ᄎ%1") |
|||
-- {ㄵ, ㄺ, ㄼ} + -히- |
|||
text = gsub(text, "ᆬᄒ(ᅧᆻ)", "ᆫᄎ%1") -- 앉히다 [안치다] |
|||
text = gsub(text, "ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆫᄎ%1") |
|||
text = gsub(text, "ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆫᄎ%1") |
|||
text = gsub(text, "ᆬᄒ([ᅧᅵ])$", "ᆫᄎ%1") |
|||
text = gsub(text, "ᆰᄒ(ᅧᆻ)", "ᆯᄏ%1") -- 밝히다 [발키다] |
|||
text = gsub(text, "ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄏ%1") |
|||
text = gsub(text, "ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄏ%1") |
|||
text = gsub(text, "ᆰᄒ([ᅧᅵ])$", "ᆯᄏ%1") |
|||
text = gsub(text, "ᆲᄒ(ᅧᆻ)", "ᆯᄑ%1") -- 넓히다 [널피다], 밟히다 [발피다] |
|||
text = gsub(text, "ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄑ%1") |
|||
text = gsub(text, "ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄑ%1") |
|||
text = gsub(text, "ᆲᄒ([ᅧᅵ])$", "ᆯᄑ%1") |
|||
-- cases where 넓- is pronounced [넙] before consonant |
|||
text = gsub(text, "(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])", "%1ᆸ%2") |
|||
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)", "%1ᆸ%2") -- 넓다듬이 |
|||
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)", "%1ᆸ%2") -- 넓둥글다 |
|||
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)", "%1ᆸ%2") -- 넓살문 |
|||
text = gsub(text, "(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)", "%1ᆸ%2") -- 넓적-, 넓죽- |
|||
-- 밟- is [밥] before consonant (except null-init consonant ㅇ) |
|||
text = gsub(text, "(ᄇ[ᅡ])ᆲ([^ᄋ])", "%1ᆸ%2") |
|||
text = gsub(text, "(ᄇ[ᅡ])ᆲ$", "%1ᆸ") |
|||
-- automatic 절음 법칙 |
|||
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])", "%1ᆸᄉ%2") -- except 없애다 [업쌔다] |
|||
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])", "%1ᆸᄉ%2") |
|||
text = gsub(text, "(ᄋ[ᅥ])ᆹᄋ(ᅢ)$", "%1ᆸᄉ%2") |
|||
text = gsub(text, "(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)", "%1ᄉ%2") -- except 맛있다 and 멋있다 which are usually pronounced [마싣따] and [머싣따] respectively |
|||
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])", "%1@%2") -- except 아, 았, 어, 었, 여, 였 |
|||
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])", "%1@%2") -- except 에, 엔, 엘 |
|||
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])", "%1@%2") -- except 요, 의 (w/o final consonant) |
|||
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])", "%1@%2") -- except 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입 |
|||
text = gsub(text, "([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])", "%1@%2") |
|||
-- _ for additional space in romanization only |
|||
text = gsub(text, "_", " ") |
|||
-- result should not contain Hangul |
|||
return text |
|||
if m_utils.contains_hangul(text) then |
|||
end |
|||
error("Result contains Hangul; debugging required") |
|||
end |
|||
text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII |
|||
-- processing misc characters that contain hangul |
|||
-- Hangul status: precomposed (한) |
|||
-- if result is nothing (e.g. when input is just ㅇ) |
|||
local function parse_enclosed_hangul(text) |
|||
if text == "" then |
|||
-- actually not very necessary, but these are also classified as Hangul chars in Unicode |
|||
text = "—" |
|||
-- no distinction is made between parenthesized and circled chars |
|||
end |
|||
text = gsub(text, "[㈀㉠]", "(기역)") |
|||
text = gsub(text, "[㈁㉡]", "(니은)") |
|||
text = gsub(text, "[㈂㉢]", "(디귿)") |
|||
text = gsub(text, "[㈃㉣]", "(리을)") |
|||
text = gsub(text, "[㈄㉤]", "(미음)") |
|||
text = gsub(text, "[㈅㉥]", "(비읍)") |
|||
text = gsub(text, "[㈆㉦]", "(시옷)") |
|||
text = gsub(text, "[㈇㉧]", "(이응)") |
|||
text = gsub(text, "[㈈㉨]", "(지읒)") |
|||
text = gsub(text, "[㈉㉩]", "(치읓)") |
|||
text = gsub(text, "[㈊㉪]", "(키읔)") |
|||
text = gsub(text, "[㈋㉫]", "(티읕)") |
|||
text = gsub(text, "[㈌㉬]", "(피읖)") |
|||
text = gsub(text, "[㈍㉭]", "(히읗)") |
|||
text = gsub(text, "[㈎㉮]", "(가)") |
|||
text = gsub(text, "[㈏㉯]", "(나)") |
|||
text = gsub(text, "[㈐㉰]", "(다)") |
|||
text = gsub(text, "[㈑㉱]", "(라)") |
|||
text = gsub(text, "[㈒㉲]", "(마)") |
|||
text = gsub(text, "[㈓㉳]", "(바)") |
|||
text = gsub(text, "[㈔㉴]", "(사)") |
|||
text = gsub(text, "[㈕㉵]", "(아)") |
|||
text = gsub(text, "[㈖㉶]", "(자)") |
|||
text = gsub(text, "[㈗㉷]", "(차)") |
|||
text = gsub(text, "[㈘㉸]", "(카)") |
|||
text = gsub(text, "[㈙㉹]", "(타)") |
|||
text = gsub(text, "[㈚㉺]", "(파)") |
|||
text = gsub(text, "[㈛㉻]", "(하)") |
|||
text = gsub(text, "㈜", "(주)") |
|||
text = gsub(text, "㈝", "(오전)") |
|||
text = gsub(text, "㈞", "(오후)") |
|||
text = gsub(text, "㉼", "(참고)") |
|||
text = gsub(text, "㉽", "(주의)") |
|||
text = gsub(text, "㉾", "(우)") |
|||
return text |
return text |
||
Line 334: | Line 206: | ||
-- Convert to Revised Romanization |
-- Convert to Revised Romanization |
||
-- Template-facing entry point for Revised Romanization.
-- Obtains the argument table from the invocation frame via Module:Arguments
-- and passes it to p._rr, whose result is returned unchanged.
-- @param frame  value handed to getArgs (the #invoke frame when called from wikitext)
-- @return whatever p._rr returns for those arguments
function p.rr(frame)
	-- Resolved locally so this function does not depend on a file-level
	-- `get_args` upvalue being declared.
	local get_args = require('Module:Arguments').getArgs
	return p._rr(get_args(frame))
end
||
Line 347: | Line 217: | ||
text = parse_name(text) |
text = parse_name(text) |
||
text = remove_links_and_markup(text) |
text = remove_links_and_markup(text) |
||
text = |
text = check_invalid_seq(text) |
||
text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR) |
text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR) |
||
text = |
text = gsub_iterate(text, m_data.enclosed_hangul) |
||
text = |
text = m_utils.decompose_hangul(text) -- decompose Hangul |
||
text = |
text = check_invalid_seq_decomposed_hangul(text) |
||
text = |
text = gsub_iterate(text, m_data.exceptions) |
||
text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only |
text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only |
||
-- $ for ㄴ-addition |
text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition |
||
text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음) |
|||
text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫] |
|||
text = gsub(text, "%$", "") |
|||
-- convert ㅎ combinations |
|||
-- for null-init consonant ㅇ (연음) |
|||
text = |
text = gsub_iterate(text, m_data.process_hieut) |
||
text = gsub(text, "ᆩᄋ", "ᄁ") |
|||
text = gsub(text, "ᆪᄋ", "ᆨᄉ") |
|||
text = gsub(text, "ᆬᄋ", "ᆫᄌ") |
|||
text = gsub(text, "ᆮᄋ", "ᄃ") |
|||
text = gsub(text, "[ᆯᆶ]ᄋ", "ᄅ") |
|||
text = gsub(text, "ᆰᄋ", "ᆯᄀ") |
|||
text = gsub(text, "ᆱᄋ", "ᆯᄆ") |
|||
text = gsub(text, "ᆲᄋ", "ᆯᄇ") |
|||
text = gsub(text, "ᆳᄋ", "ᆯᄉ") |
|||
text = gsub(text, "ᆴᄋ", "ᆯᄐ") |
|||
text = gsub(text, "ᆵᄋ", "ᆯᄑ") |
|||
text = gsub(text, "ᆸᄋ", "ᄇ") |
|||
text = gsub(text, "ᆹᄋ", "ᆸᄉ") |
|||
text = gsub(text, "ᆺᄋ", "ᄉ") |
|||
text = gsub(text, "ᆻᄋ", "ᄊ") |
|||
text = gsub(text, "ᆽᄋ", "ᄌ") |
|||
text = gsub(text, "ᆾᄋ", "ᄎ") |
|||
text = gsub(text, "ᆿᄋ", "ᄏ") |
|||
text = gsub(text, "ᇀᄋ", "ᄐ") |
|||
text = gsub(text, "ᇁᄋ", "ᄑ") |
|||
text = gsub(text, "ᇂᄋ", "ᄋ") -- silent; 좋아 [조아] |
|||
-- for ㅎ |
|||
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway) |
|||
text = gsub(text, "ᆭᄀ", "ᆫᄏ") |
|||
text = gsub(text, "ᆭᄃ", "ᆫᄐ") |
|||
text = gsub(text, "ᆭᄇ", "ᆫᄑ") |
|||
text = gsub(text, "ᆭᄌ", "ᆫᄎ") |
|||
text = gsub(text, "ᆶᄀ", "ᆯᄏ") |
|||
text = gsub(text, "ᆶᄃ", "ᆯᄐ") |
|||
text = gsub(text, "ᆶᄇ", "ᆯᄑ") |
|||
text = gsub(text, "ᆶᄌ", "ᆯᄎ") |
|||
text = gsub(text, "ᇂᄀ", "ᄏ") |
|||
text = gsub(text, "ᇂᄃ", "ᄐ") |
|||
text = gsub(text, "ᇂᄇ", "ᄑ") |
|||
text = gsub(text, "ᇂᄉ", "ᄉ") |
text = gsub(text, "ᇂᄉ", "ᄉ") |
||
text = gsub(text, "ᇂᄌ", "ᄎ") |
|||
-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨])) |
text = gsub(text, "ᆰᄀ", "ᆯᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨])) |
||
text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants |
|||
text = gsub(text, "ᆰᄀ", "ᆯᄀ") |
|||
text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ] |
|||
-- neutralization of syl-final consonants |
|||
text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p |
|||
text = gsub(text, "[ᆩᆪᆰᆿ]", "ᆨ") |
|||
text = gsub(text, "[ᆬᆭ]", "ᆫ") |
|||
text = gsub(text, "[ᆺᆻᆽᆾᇀᇂ]", "ᆮ") |
|||
text = gsub(text, "[ᆲᆳᆴᆶ]", "ᆯ") |
|||
text = gsub(text, "ᆱ", "ᆷ") |
|||
text = gsub(text, "[ᆵᆹᇁ]", "ᆸ") |
|||
-- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ] |
|||
-- other irregularities documented are automatically handled |
|||
text = gsub(text, "ᆨ@ᄒ", "ᄏ") |
|||
text = gsub(text, "ᆮ@ᄒ", "ᄐ") |
|||
text = gsub(text, "ᆸ@ᄒ", "ᄑ") |
|||
text = gsub(text, "ᆨ@ᄋ", "ᄀ") |
|||
text = gsub(text, "ᆮ@ᄋ", "ᄃ") -- 웃어른 [우더른] |
|||
text = gsub(text, "ᆯ@ᄋ", "ᄅ") |
|||
text = gsub(text, "ᆸ@ᄋ", "ᄇ") |
|||
text = gsub(text, "ᆫ@ᄅ", "ᆫᄂ") -- 음운론 [으문논] |
|||
text = gsub(text, "@", "") |
text = gsub(text, "@", "") |
||
-- consonant assimilations |
text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations |
||
text = gsub(text, " |
text = gsub(text, "ᆯᄅ", "ᆯl") -- ㄹㄹ is ll |
||
text = |
text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ} |
||
text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text |
|||
text = gsub(text, "ᆫᄅ", "ᆯᄅ") |
|||
text = gsub(text, "'([ᄋㅇ]+)'", "'%1'") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later) |
|||
text = gsub(text, "ᆮ[ᄂᄅ]", "ᆫᄂ") |
|||
text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text |
|||
text = gsub(text, "ᆮᄆ", "ᆫᄆ") |
|||
text = gsub(text, "", "") -- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam) |
|||
text = gsub(text, "ᆯᄂ", "ᆯᄅ") |
|||
text = gsub(text, "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ") |
|||
text = gsub(text, "ᆸᄆ", "ᆷᄆ") |
|||
text = gsub(text, "ᆯᄅ", "ᆯl") |
|||
-- drop y after {ㅈ, ㅉ, ㅊ} |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅣ", "%1ᅡ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅤ", "%1ᅢ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅧ", "%1ᅥ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅨ", "%1ᅦ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅭ", "%1ᅩ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅲ", "%1ᅮ") |
|||
-- vowels |
|||
text = gsub(text, "[ᅡㅏ]", "a") |
|||
text = gsub(text, "[ᅢㅐ]", "ae") |
|||
text = gsub(text, "[ᅣㅑ]", "ya") |
|||
text = gsub(text, "[ᅤㅒ]", "yae") |
|||
text = gsub(text, "[ᅥㅓ]", "eo") |
|||
text = gsub(text, "[ᅦㅔ]", "e") |
|||
text = gsub(text, "[ᅧㅕ]", "yeo") |
|||
text = gsub(text, "[ᅨㅖ]", "ye") |
|||
text = gsub(text, "[ᅩㅗ]", "o") |
|||
text = gsub(text, "[ᅪㅘ]", "wa") |
|||
text = gsub(text, "[ᅫㅙ]", "wae") |
|||
text = gsub(text, "[ᅬㅚ]", "oe") |
|||
text = gsub(text, "[ᅭㅛ]", "yo") |
|||
text = gsub(text, "[ᅮㅜ]", "u") |
|||
text = gsub(text, "[ᅯㅝ]", "wo") |
|||
text = gsub(text, "[ᅰㅞ]", "we") |
|||
text = gsub(text, "[ᅱㅟ]", "wi") |
|||
text = gsub(text, "[ᅲㅠ]", "yu") |
|||
text = gsub(text, "[ᅳㅡ]", "eu") |
|||
text = gsub(text, "[ᅴㅢ]", "ui") |
|||
text = gsub(text, "[ᅵㅣ]", "i") |
|||
-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later) |
|||
text = gsub(text, "'([ᄋㅇ]+)'", "'%1'") |
|||
-- single consonants |
|||
text = gsub(text, "[ᄀㄱ]", "g") |
|||
text = gsub(text, "[ᄁㄲ]", "kk") |
|||
text = gsub(text, "ㄳ", "ks") |
|||
text = gsub(text, "[ᄂᆫㄴ]", "n") |
|||
text = gsub(text, "ㄵ", "nj") |
|||
text = gsub(text, "ㄶ", "nh") |
|||
text = gsub(text, "[ᄃㄷ]", "d") |
|||
text = gsub(text, "[ᄄㄸ]", "tt") |
|||
text = gsub(text, "[ᄅㄹ]", "r") |
|||
text = gsub(text, "ᆯ", "l") |
|||
text = gsub(text, "ㄺ", "lg") |
|||
text = gsub(text, "ㄻ", "lm") |
|||
text = gsub(text, "ㄼ", "lb") |
|||
text = gsub(text, "ㄽ", "ls") |
|||
text = gsub(text, "ㄾ", "lt") |
|||
text = gsub(text, "ㄿ", "lp") |
|||
text = gsub(text, "ㅀ", "lh") |
|||
text = gsub(text, "[ᄆᆷㅁ]", "m") |
|||
text = gsub(text, "[ᄇㅂ]", "b") |
|||
text = gsub(text, "[ᄈㅃ]", "pp") |
|||
text = gsub(text, "ㅄ", "ps") |
|||
text = gsub(text, "[ᄉㅅ]", "s") |
|||
text = gsub(text, "[ᄊㅆ]", "ss") |
|||
text = gsub(text, "[ᄋㅇ]", "") |
|||
text = gsub(text, "ᆼ", "ng") |
|||
text = gsub(text, "[ᄌㅈ]", "j") |
|||
text = gsub(text, "[ᄍㅉ]", "jj") |
|||
text = gsub(text, "[ᄎㅊ]", "ch") |
|||
text = gsub(text, "[ᄏᆨㅋ]", "k") |
|||
text = gsub(text, "[ᄐᆮㅌ]", "t") |
|||
text = gsub(text, "[ᄑᆸㅍ]", "p") |
|||
text = gsub(text, "[ᄒㅎ]", "h") |
|||
-- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam) |
|||
text = gsub(text, "", "") |
|||
-- ^ for capitalization |
-- ^ for capitalization |
||
text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper) |
text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper) |
||
text = gsub(text, "%^", "") |
text = gsub(text, "%^", "") |
||
-- final error checking |
|||
text = final_processing(text) |
|||
if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-가-힣ힰ-]") then |
|||
error("Result contains Hangul; debugging required") |
|||
end |
|||
-- return orig chars |
|||
text = html_encoding_to_ascii(text) |
|||
-- if result is nothing (e.g. when input is just ㅇ) |
|||
if text == "" then |
|||
text = "—" |
|||
end |
|||
return text |
return text |
||
Line 511: | Line 256: | ||
-- Convert to McCune–Reischauer |
-- Convert to McCune–Reischauer |
||
-- Template-facing entry point for McCune–Reischauer romanization.
-- Obtains the argument table from the invocation frame via Module:Arguments
-- and passes it to p._mr, whose result is returned unchanged.
-- @param frame  value handed to getArgs (the #invoke frame when called from wikitext)
-- @return whatever p._mr returns for those arguments
function p.mr(frame)
	-- Resolved locally so this function does not depend on a file-level
	-- `get_args` upvalue being declared.
	local get_args = require('Module:Arguments').getArgs
	return p._mr(get_args(frame))
end
||
Line 525: | Line 268: | ||
text = gsub(text, "", "") -- remove U+FDD0 (only needed for RR; not needed for MR) |
text = gsub(text, "", "") -- remove U+FDD0 (only needed for RR; not needed for MR) |
||
text = remove_links_and_markup(text) |
text = remove_links_and_markup(text) |
||
text = |
text = check_invalid_seq(text) |
||
text = |
text = gsub_iterate(text, m_data.enclosed_hangul) |
||
text = |
text = m_utils.decompose_hangul(text) -- decompose Hangul |
||
text = |
text = check_invalid_seq_decomposed_hangul(text) |
||
text = |
text = gsub_iterate(text, m_data.exceptions) |
||
text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희) |
text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희) |
||
text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur) |
text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur) |
||
-- $ for ㄴ-addition |
text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition |
||
text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음) |
|||
text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫] |
|||
text = gsub(text, "%$", "") |
|||
-- convert ㅎ combinations |
|||
-- for null-init consonant ㅇ (연음) |
|||
text = |
text = gsub_iterate(text, m_data.process_hieut) |
||
text = |
text = gsub_iterate(text, m_data.process_hieut_additional_mr) |
||
text = gsub(text, "ᆪᄋ", "ᆨᄉ") |
|||
text = gsub(text, "ᆬᄋ", "ᆫᄌ") |
|||
text = gsub(text, "ᆮᄋ", "ᄃ") |
|||
text = gsub(text, "[ᆯᆶ]ᄋ", "ᄅ") |
|||
text = gsub(text, "ᆰᄋ", "ᆯᄀ") |
|||
text = gsub(text, "ᆱᄋ", "ᆯᄆ") |
|||
text = gsub(text, "ᆲᄋ", "ᆯᄇ") |
|||
text = gsub(text, "ᆳᄋ", "ᆯᄉ") |
|||
text = gsub(text, "ᆴᄋ", "ᆯᄐ") |
|||
text = gsub(text, "ᆵᄋ", "ᆯᄑ") |
|||
text = gsub(text, "ᆸᄋ", "ᄇ") |
|||
text = gsub(text, "ᆹᄋ", "ᆸᄉ") |
|||
text = gsub(text, "ᆺᄋ", "ᄉ") |
|||
text = gsub(text, "ᆻᄋ", "ᄊ") |
|||
text = gsub(text, "ᆽᄋ", "ᄌ") |
|||
text = gsub(text, "ᆾᄋ", "ᄎ") |
|||
text = gsub(text, "ᆿᄋ", "ᄏ") |
|||
text = gsub(text, "ᇀᄋ", "ᄐ") |
|||
text = gsub(text, "ᇁᄋ", "ᄑ") |
|||
text = gsub(text, "ᇂᄋ", "ᄋ") -- silent; 좋아 [조아] |
|||
-- for ㅎ |
|||
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway) |
|||
text = gsub(text, "ᆭᄀ", "ᆫᄏ") |
|||
text = gsub(text, "ᆭᄃ", "ᆫᄐ") |
|||
text = gsub(text, "ᆭᄇ", "ᆫᄑ") |
|||
text = gsub(text, "[ᆬᆭ]ᄉ", "ᆫᄊ") |
|||
text = gsub(text, "ᆭᄌ", "ᆫᄎ") |
|||
text = gsub(text, "ᆶᄀ", "ᆯᄏ") |
|||
text = gsub(text, "ᆶᄃ", "ᆯᄐ") |
|||
text = gsub(text, "ᆶᄇ", "ᆯᄑ") |
|||
text = gsub(text, "[ᆲᆴᆶ]ᄉ", "ᆯᄊ") |
|||
text = gsub(text, "ᆶᄌ", "ᆯᄎ") |
|||
text = gsub(text, "ᇂᄀ", "ᄏ") |
|||
text = gsub(text, "ᇂᄃ", "ᄐ") |
|||
text = gsub(text, "ᇂᄇ", "ᄑ") |
|||
text = gsub(text, "ᇂᄉ", "ᄊ") |
|||
text = gsub(text, "ᇂᄌ", "ᄎ") |
|||
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant |
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant |
||
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차]) |
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차]) |
||
text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2") |
text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2") |
||
-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨])) |
|||
text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ") |
text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨])) |
||
-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants) |
-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants) |
||
text = gsub(text, "ᆺ@ᄀ", "ᄁ") |
text = gsub(text, "ᆺ@ᄀ", "ᄁ") |
||
text = gsub(text, "ᆺ@ᄇ", "ᄈ") |
text = gsub(text, "ᆺ@ᄇ", "ᄈ") |
||
-- neutralization of syl-final consonants |
|||
text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants |
|||
text = gsub(text, "[ᆩᆪᆰᆿ]", "ᆨ") |
|||
text = gsub(text, "[ |
text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") -- @ for tensification |
||
text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ] |
|||
text = gsub(text, "[ᆺᆻᆽᆾᇀᇂ]", "ᆮ") |
|||
text = gsub(text, "[ᆲᆳᆴᆶ]", "ᆯ") |
|||
text = gsub(text, "ᆱ", "ᆷ") |
|||
text = gsub(text, "[ᆵᆹᇁ]", "ᆸ") |
|||
-- @ for tensification, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ] |
|||
-- other irregularities documented are automatically handled |
|||
text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") |
|||
text = gsub(text, "ᆨ@ᄋ", "ᄀ") |
|||
text = gsub(text, "ᆮ@ᄋ", "ᄃ") -- 웃어른 [우더른] |
|||
text = gsub(text, "ᆯ@ᄋ", "ᄅ") |
|||
text = gsub(text, "ᆸ@ᄋ", "ᄇ") |
|||
text = gsub(text, "ᆫ@ᄅ", "ᆫᄂ") -- 음운론 [으문논] |
|||
-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants |
-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants |
||
-- * is for additional hyphen in romanization only (voicing is retained after hyphen) |
-- * is for additional hyphen in romanization only (voicing is retained after hyphen) |
||
Line 607: | Line 305: | ||
text = gsub(text, "%*", "-") |
text = gsub(text, "%*", "-") |
||
text = gsub(text, "@", "") |
text = gsub(text, "@", "") |
||
-- consonant assimilations |
-- consonant assimilations |
||
text = |
text = gsub_iterate(text, m_data.consonant_assimilations) |
||
text = |
text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr) |
||
text = gsub(text, "ᆫᄅ", "ᆯᄅ") |
|||
text = |
text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ} |
||
text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text |
|||
text = gsub(text, "ᆮᄆ", "ᆫᄆ") |
|||
text = gsub(text, " |
text = gsub(text, "([ao])ᄋe", "%1ë") -- ㅏ에 (aë) and ㅗ에 (oë) |
||
text = gsub(text, "'([ᄋㅇ]+)'", "'%1'") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later) |
|||
text = gsub(text, "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ") |
|||
text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text |
|||
text = gsub(text, "ᆸᄆ", "ᆷᄆ") |
|||
-- no {kkk, ttt, ppp, sss/ts/tss, ttch} |
|||
text = gsub(text, "ᆨᄁ", "ᄁ") |
|||
text = gsub(text, "ᆮᄄ", "ᄄ") |
|||
text = gsub(text, "ᆸᄈ", "ᄈ") |
|||
text = gsub(text, "ᆮ[ᄉᄊ]", "ᄊ") |
|||
text = gsub(text, "ᆮᄍ", "ᄍ") |
|||
-- other misc conversions |
|||
text = gsub(text, "ᆯᄅ", "ᆯl") |
|||
text = gsub(text, "ᆯᄒ", "rᄒ") |
|||
text = gsub(text, "ᄉ[ᅱ]", "shᅱ") |
|||
-- drop y after {ㅈ, ㅉ, ㅊ} |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅣ", "%1ᅡ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅤ", "%1ᅢ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅧ", "%1ᅥ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅨ", "%1ᅦ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅭ", "%1ᅩ") |
|||
text = gsub(text, "([ᄌ-ᄎ])ᅲ", "%1ᅮ") |
|||
-- vowels |
|||
text = gsub(text, "[ᅡㅏ]", "a") |
|||
text = gsub(text, "[ᅢㅐ]", "ae") |
|||
text = gsub(text, "[ᅣㅑ]", "ya") |
|||
text = gsub(text, "[ᅤㅒ]", "yae") |
|||
text = gsub(text, "[ᅥㅓ]", "ŏ") |
|||
text = gsub(text, "[ᅦㅔ]", "e") |
|||
text = gsub(text, "[ᅧㅕ]", "yŏ") |
|||
text = gsub(text, "[ᅨㅖ]", "ye") |
|||
text = gsub(text, "[ᅩㅗ]", "o") |
|||
text = gsub(text, "[ᅪㅘ]", "wa") |
|||
text = gsub(text, "[ᅫㅙ]", "wae") |
|||
text = gsub(text, "[ᅬㅚ]", "oe") |
|||
text = gsub(text, "[ᅭㅛ]", "yo") |
|||
text = gsub(text, "[ᅮㅜ]", "u") |
|||
text = gsub(text, "[ᅯㅝ]", "wŏ") |
|||
text = gsub(text, "[ᅰㅞ]", "we") |
|||
text = gsub(text, "[ᅱㅟ]", "wi") |
|||
text = gsub(text, "[ᅲㅠ]", "yu") |
|||
text = gsub(text, "[ᅳㅡ]", "ŭ") |
|||
text = gsub(text, "[ᅴㅢ]", "ŭi") |
|||
text = gsub(text, "[ᅵㅣ]", "i") |
|||
-- ㅏ에 (aë) and ㅗ에 (oë) |
|||
text = gsub(text, "([ao])ᄋe", "%1ë") |
|||
-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later) |
|||
text = gsub(text, "'([ᄋㅇ]+)'", "'%1'") |
|||
-- single consonants |
|||
text = gsub(text, "`ᄀ", "g") |
|||
text = gsub(text, "`ᄃ", "d") |
|||
text = gsub(text, "`ᄇ", "b") |
|||
text = gsub(text, "`ᄌ", "j") |
|||
text = gsub(text, "[ᄀᆨㄱ]", "k") |
|||
text = gsub(text, "[ᄁㄲ]", "kk") |
|||
text = gsub(text, "ㄳ", "ks") |
|||
text = gsub(text, "[ᄂᆫㄴ]", "n") |
|||
text = gsub(text, "ㄵ", "nj") |
|||
text = gsub(text, "ㄶ", "nh") |
|||
text = gsub(text, "[ᄃᆮㄷ]", "t") |
|||
text = gsub(text, "[ᄄㄸ]", "tt") |
|||
text = gsub(text, "[ᄅㄹ]", "r") |
|||
text = gsub(text, "ᆯ", "l") |
|||
text = gsub(text, "ㄺ", "lg") |
|||
text = gsub(text, "ㄻ", "lm") |
|||
text = gsub(text, "ㄼ", "lb") |
|||
text = gsub(text, "ㄽ", "ls") |
|||
text = gsub(text, "ㄾ", "lt'") |
|||
text = gsub(text, "ㄿ", "lp'") |
|||
text = gsub(text, "ㅀ", "rh") |
|||
text = gsub(text, "[ᄆᆷㅁ]", "m") |
|||
text = gsub(text, "[ᄇᆸㅂ]", "p") |
|||
text = gsub(text, "[ᄈㅃ]", "pp") |
|||
text = gsub(text, "ㅄ", "ps") |
|||
text = gsub(text, "[ᄉㅅ]", "s") |
|||
text = gsub(text, "[ᄊㅆ]", "ss") |
|||
text = gsub(text, "[ᄋㅇ]", "") |
|||
text = gsub(text, "ᆼ", "ng") |
|||
text = gsub(text, "[ᄌㅈ]", "ch") |
|||
text = gsub(text, "[ᄍㅉ]", "tch") |
|||
text = gsub(text, "[ᄎㅊ]", "ch'") |
|||
text = gsub(text, "[ᄏㅋ]", "k'") |
|||
text = gsub(text, "[ᄐㅌ]", "t'") |
|||
text = gsub(text, "[ᄑㅍ]", "p'") |
|||
text = gsub(text, "[ᄒㅎ]", "h") |
|||
text = gsub(text, "`", "") |
|||
-- replace ' with ' when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup) |
-- replace ' with ' when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup) |
||
text = gsub(text, "([hkpt])''", "%1''") |
text = gsub(text, "([hkpt])''", "%1''") |
||
text = gsub(text, "([hkpt])'$", "%1'") |
text = gsub(text, "([hkpt])'$", "%1'") |
||
-- ^ for capitalization |
-- ^ for capitalization |
||
text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper) |
text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper) |
||
text = gsub(text, "%^", "") |
text = gsub(text, "%^", "") |
||
-- final error checking |
|||
text = final_processing(text) |
|||
if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-가-힣ힰ-]") then |
|||
error("Result contains Hangul; debugging required") |
|||
end |
|||
-- return orig chars |
|||
text = html_encoding_to_ascii(text) |
|||
-- if result is nothing (e.g. when input is just ㅇ) |
|||
if text == "" then |
|||
text = "—" |
|||
end |
|||
return text |
return text |
||
Line 719: | Line 331: | ||
-- Removing special chars (except for escaped ones) |
-- Removing special chars (except for escaped ones) |
||
-- Template-facing entry point for stripping the special marker characters
-- from Hangul input.
-- Obtains the argument table from the invocation frame via Module:Arguments
-- and passes it to p._clean_hangul, whose result is returned unchanged.
-- @param frame  value handed to getArgs (the #invoke frame when called from wikitext)
-- @return whatever p._clean_hangul returns for those arguments
function p.clean_hangul(frame)
	-- Resolved locally so this function does not depend on a file-level
	-- `get_args` upvalue being declared.
	local get_args = require('Module:Arguments').getArgs
	return p._clean_hangul(get_args(frame))
end
||
function p._clean_hangul(args) |
function p._clean_hangul(args) |
||
local |
local text = args[1] |
||
-- input must contain Hangul |
-- input must contain Hangul |
||
if not m_utils.contains_hangul(text) then |
|||
if hangul == nil or hangul == "" or find(hangul, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-가-힣ힰ-]") == nil then |
|||
error("Input must contain Hangul") |
error("Input must contain Hangul") |
||
end |
end |
||
-- no direct insertion of reference or footnote |
-- no direct insertion of reference or footnote |
||
if m_utils.contains_reference(text) then |
|||
if find(hangul, "'\"`UNIQ--") or find(hangul, "-QINU`\"'") then |
|||
error("Input cannot contain references") |
error("Input cannot contain references") |
||
end |
end |
||
-- Replacing escaped special chars with placeholders |
|||
local cleaned = gsub(hangul, "\\%$", "$") |
|||
cleaned = gsub(cleaned, "\\%%", "%") |
|||
cleaned = gsub(cleaned, "\\%*", "*") |
|||
cleaned = gsub(cleaned, "\\@", "@") |
|||
cleaned = gsub(cleaned, "\\%^", "^") |
|||
cleaned = gsub(cleaned, "\\_", "_") |
|||
cleaned = gsub(cleaned, "\\`", "`") |
|||
-- Removing non-escaped special chars |
|||
cleaned = gsub(cleaned, "[%$%%%*@%^_`]", "") |
|||
-- Returning orig chars |
|||
cleaned = html_encoding_to_ascii(cleaned) |
|||
-- symbol should not appear within single syllabic block |
|||
-- Unstripping test |
|||
if find(text, "[ᄀ-ᅟꥠ-ꥼ][%$%%%*@%^_`][ᅠ-ᆧힰ-ퟆ]") or find(text, "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ][%$%%%*@%^_`][ᆨ-ᇿퟋ-ퟻ]") then |
|||
cleaned = mw.text.unstrip(cleaned) |
|||
error("Do not insert symbol within single syllabic block") |
|||
end |
|||
text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders |
|||
return cleaned |
|||
text = gsub(text, "[%$%%%*@%^_`]", "") -- removing non-escaped special chars |
|||
text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII |
|||
text = mw.text.unstrip(text) -- unstripping test |
|||
return text |
|||
end |
end |
||
Revision as of 10:58, 20 April 2025
![]() | This module is rated as beta, and is ready for widespread use. It is still new and should be used with some caution to ensure the results are as expected. |
![]() | This module is subject to page protection. It is a highly visible module in use by a very large number of pages, or is substituted very frequently. Because vandalism or mistakes would affect many pages, and even trivial editing might cause substantial load on the servers, it is protected from editing. |
![]() | This module depends on the following other modules: |
This module automatically romanizes Korean text (Hangul). It supports the Revised Romanization (RR) and McCune–Reischauer (MR) romanization systems.
- Example: 독립 (pronounced [동닙]) → dongnip (RR), tongnip (MR)
Due to the nature of both systems, users will sometimes need to input symbols to instruct the system on how to correctly handle various edge cases. These input symbols do not appear in the output; they are only used to output the correct romanization per the rules of each system.
Quick guide
- Personal names
- For names with one-character surnames, prepend
%
to the full name (e.g.%홍길동
).- If the surname is longer than one syllable, insert
_
or space between the surname and the given name (e.g.%선우_진
or%선우 진
).
- If the surname is longer than one syllable, insert
- If there is no surname (e.g. a mononym), add
%_
to the start of the name (e.g.%_복남
).
- For names with one-character surnames, prepend
- Capitalization for proper nouns
- Prepend
^
to the proper noun (e.g.^압구정
).
- Prepend
- Mandatory hyphens in RR (for separating an administrative unit)
- Place
*
where the hyphen should go (e.g.^양주*군
).
- Place
- Unpredictable pronunciation (cannot be predicted from the Hangul spelling and regular pronunciation rules)
- Use
$
for ㄴ-addition (e.g.색$연필
[생년필]). - Otherwise, use
@
(e.g.손@등
[손뜽],웃@어른
[우더른], etc.).
- Use
Full guide
- Note: When adding
@
,$
or`
, it is recommended that you state why the symbol is needed.- Example:
물@고기<!--pronounced [물꼬기]-->
- Example:
Symbol | RR | MR | Without symbol | Note |
---|---|---|---|---|
@ |
N/A | unpredictable tensification (e.g. 손@등 [손뜽] sontŭng) |
전등 [전등] chŏndŭng | 싫증 [실쯩] siljeung/silchŭng is automatically handled (i.e. does not need @ ).
|
written 사이시옷 + ㄱ/ㅂ (e.g. 바닷@가 [바다까~바닫까] padakka) |
옷걸이 [옫꺼리] otkŏri | |||
절음 법칙 (e.g. 웃@어른 [우더른] udeoreun/udŏrŭn) |
웃어라 [우서라] useora/usŏra | Most 절음 법칙 is automatically applied. For example, words like 맛없다 [마덥따] madeopda/madŏpta do not need @ . Manual 절음 법칙 is needed only before the syllabic blocks 아, 어, 에, 엔, 엘, 여, 요, 으, 은, 을, 음, 읍, 의, 이, 인, 일, 임, and 입.
| ||
ㄴㄹ pronounced [ㄴㄴ] (e.g. 음운@론 [으문논] eumunnon/ŭmunnon) |
난리 [날리] nalli | |||
k/t/p for ㄱㅎ/ㄷㅎ/ㅂㅎ in words that are not nouns, pronouns, and numerals (e.g. 잡@혀 [자펴] japyeo) |
N/A | 낙하산 nakhasan | ||
$ |
ㄴ-addition (e.g. 색$연필 [생년필] saengnyeonpil/saengnyŏnp'il, 물$엿 [물렫] mullyeot/mullyŏt) |
입양 [이뱡] ibyang | ㄴ-addition before 윷 and 잎 is automatically applied. For example, words like 가락윷 [가랑뉻] garangnyut/karangnyut and 깻잎 [깬닙] kkaennip do not need $ .
| |
` |
N/A | voicing of syllable-initial ㄱ/ㄷ/ㅂ/ㅈ after a non-Hangul character (e.g. 1`번 [일번/한번] 1bŏn) |
6번 [육뻔/여섣뻔] 6pŏn | only between a non-Hangul character and a Hangul syllabic block beginning with ㄱ/ㄷ/ㅂ/ㅈ Currently, the "non-Hangul character" is limited to ASCII digits and alphabet letters ( [0-9A-Za-z] ). This can be broadened later if needed.
|
_ |
1. adds an additional space not in Hangul text 2. in the personal name mode (see % below), used when the length of the surname is not one syllable |
|||
* |
adds an additional hyphen not in Hangul text (e.g. ^양주*군 Yangju-gun) |
needed for a mandatory hyphen (for separating an administrative unit) in RR | ||
^ |
capitalizes the following letter | must be immediately followed by a Hangul syllabic block | ||
% |
personal name mode | 1. By default, this mode assumes that the first syllabic block is the surname and the remaining syllabic blocks are the given name. (e.g. %홍길동 → surname 홍, given name 길동)2. To specify a different segmentation, insert _ or space between the components.(e.g. surname 선우 + given name 진: %선우_진 or %선우 진 )3. For a mononym (personal name that does not have a surname, or any case where only the given name is needed), precede the name with %_ .(e.g. %_복남 )4. For a surname-only name (personal name that only consists of a surname, or any case where only the surname is needed), if it is 4.1. two or more syllables long, _ is needed after it.(e.g. %남궁_ )4.2. just one syllable long, do not add _ after it.(e.g. %최 )5. If a personal name is followed by non–personal name text, put the personal name between two % .(e.g. 국립 %홍길동% 기념관 )Note: This mode does not support 1. the "given name + surname" order (e.g. 제임스 홍). 2. a name consisting of three or more components. | ||
\ |
escape character (e.g. \@ → @) |
for outputting literal @, $, etc. |
local p = {}
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')
local m_utils = require('Module:Ko-utils')
local get_args = require('Module:Arguments').getArgs
--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
ᄀ (U+1100)
ᆨ (U+11A8)
ㄱ (U+3131)
2. When dealing with decomposed Hangul,
a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]
--- Apply an ordered list of pattern replacements to a string.
-- @param text          string to transform
-- @param replacements  sequence of { pattern, replacement } pairs; they are applied
--                      in order, so each pair sees the output of the previous ones
-- @return the transformed string
local function gsub_iterate(text, replacements)
	-- the parameter is intentionally not named "table": that name would shadow the standard table library inside this function
	for _, entry in ipairs(replacements) do
		-- the assignment keeps only the first result of gsub (the string), dropping the match count
		text = gsub(text, entry[1], entry[2])
	end
	return text
end
--- Strip markup that is unnecessary for romanization or interferes with assimilation.
-- Bold/italic quotes, HTML tags other than <br>, wikilink syntax and strip markers are removed;
-- every <br> variant (<br>, <BR/>, <br />, ...) is normalized to "<br>".
-- @param text  input text (Hangul still precomposed)
-- @return the text without the markup
local function remove_links_and_markup(text)
	-- these either are unnecessary or interfere with assimilation
	-- remove bold/italic
	-- it is not impossible to allow bold/italic when it does not interfere with assimilation, but determining when to allow or disallow that adds complication for little practical gain
	text = gsub(text, "'''", "")
	text = gsub(text, "''", "")

	-- remove HTML tags (except br)
	-- This is done in a single pass with a replacement function instead of parking <br> in a placeholder
	-- character and restoring it afterwards: the placeholder literal was a whitespace character, so any
	-- such whitespace already present in the input would have been turned into <br> as well.
	text = gsub(text, "</?[A-Za-z][^>]->", function(tag)
		if find(tag, "^<[Bb][Rr] */?>$") then
			return "<br>"
		end
		return ""
	end)

	-- remove wikilinks: [[target|label]] keeps only the label, then any leftover brackets are dropped
	text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1")
	text = gsub(gsub(text, "%[%[", ""), "%]%]", "")

	-- remove strip markers left by extension tags
	text = mw.text.killMarkers(text)
	return text
end
--- Validate raw user input before any processing (very first step).
-- Hangul status: precomposed (한)
-- @param text  raw input string
-- @return "N/A" when the input contains Hangul that RR and MR do not cover; otherwise the input with
--         backslash-escaped special characters replaced by their HTML-encoded placeholders
-- Raises an error (with a message naming the problem) for every other kind of invalid input.
local function disallow_invalid_input(text)
	-- Code points that are easily lost in copy/paste (unassigned characters and noncharacters) are built
	-- from their numeric values instead of being typed literally.
	local char = mw.ustring.char
	-- old/extended jamo and tone marks; the two trailing ranges run to the end of the
	-- Hangul Jamo Extended-A (U+A960-U+A97F) and Extended-B (U+D7B0-U+D7FF) blocks
	local unsupported_hangul = "[ᄓ-ᅠᅶ-ᆧᇃ-ᇿ〮〯ㅤ-ㆎꥠ-" .. char(0xA97F) .. "ힰ-" .. char(0xD7FF) .. "]"
	-- noncharacters U+FDD0-U+FDD2 are used internally as markers by parse_name and must not come from the user
	local internal_markers = "[" .. char(0xFDD0) .. "-" .. char(0xFDD2) .. "]"

	-- input must contain Hangul
	if not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- if input contains Hangul not supported by RR and MR, change text to "N/A" and skip everything
	if find(text, unsupported_hangul) then
		text = "N/A"
		return text
	end

	text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders

	-- various validations of input
	if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then
		error("Do not input conjoining Hangul jamo directly")
	elseif find(text, "`%*") then
		error("Use *` instead of `*")
	elseif find(text, "@%*") then
		error("Use *@ instead of @*")
	elseif find(text, "%^[^가-힣]") then
		error("^ must be immediately followed by Hangul syllabic block")
	elseif find(text, "[^%*0-9A-Za-z]`") or find(text, "[^0-9A-Za-z]%*`") or find(text, "`[^가-깋다-딯바-빟자-짛]") then
		error("Found invalid sequence containing `")
	elseif find(text, "[^%*ㄹ가-힣]@") or find(text, "[^가-힣]%*@") or find(text, "%*@[^가-깋다-딯바-빟자-짛]") or find(text, "ㄹ@[^가-깋다-딯바-빟사-싷자-짛]") or find(text, "@[^가-깋다-딯라-맇바-빟사-싷아어에엔엘여요으은을음읍의이인일임입자-짛하-힣]") then
		error("Found invalid sequence containing @")
	elseif find(text, "[^가-힣]%$") or find(text, "%$[^야-얳여-옣요-욯유-윶윸-윻이-잍잏]") then
		error("Found invalid sequence containing $")
	elseif find(text, "%%$") then
		error("Remove final %")
	elseif find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	elseif find(text, "^[%$%*@_`]") or find(text, "^%%[^_가-힣]") or find(text, "[ _]%*") or find(text, "%*[ %*%-_]") or find(text, "%-%*") or find(text, internal_markers) or find(text, "[%$%*@%^`]$") then
		error("Invalid input")
	end

	return text
end
--- Re-validate the text once links and markup have been stripped (Hangul is still precomposed).
-- @param text  text to check
-- @return the unchanged text when no forbidden sequence is present; raises an error otherwise
local function check_invalid_seq(text)
	-- a run of two "space" characters (real space or _) is never allowed
	if find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	end

	-- each entry describes one misplaced control symbol; the first hit aborts
	local forbidden = {
		"^[%$%*@_`]", -- symbol at the very beginning
		"[ _]%*", -- * right after a space
		"%*[ %*%-_]", -- * followed by space, *, - or _
		"%-%*", -- * right after a hyphen
		"[%$%*@%^_`]$", -- symbol at the very end
	}
	for _, pattern in ipairs(forbidden) do
		if find(text, pattern) then
			error("Invalid input")
		end
	end

	return text
end
--- Validate the use of @ and $ once Hangul has been decomposed into conjoining jamo.
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
-- @param text  decomposed text, possibly still carrying the internal U+FDD0 marker
-- @return the unchanged text; raises an error when @ or $ sits between jamo where this check forbids it
local function check_invalid_seq_decomposed_hangul(text)
	-- The internal marker U+FDD0 may still sit on either side of a symbol at this point, so every pattern
	-- tolerates an optional marker there. The marker is built from its code point: typed literally, the
	-- noncharacter is easily lost, which leaves a dangling "?" that makes the pattern match nearly anything.
	local opt_mark = mw.ustring.char(0xFDD0) .. "?"
	local at = opt_mark .. "@" .. opt_mark -- @ with an optional marker on each side
	local star_at = "%*?" .. at -- the same, optionally preceded by *

	if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]" .. star_at .. "[ᄀᄃᄇᄉᄌ]")
		or find(text, "ᆰ" .. star_at .. "[ᄀ-ᄊᄌ-ᄑ]")
		or find(text, "ᆲ" .. at .. "[ᄀ-ᄊᄌ-ᄑ]")
		or find(text, "ᆺ%*@[ᄀᄇ]")
		or find(text, "ᆺ" .. star_at .. "[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]")
		or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]" .. at .. "ᄅ")
		or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]" .. at .. "ᄋ")
		or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]" .. at .. "ᄒ") then
		error("Found invalid sequence containing @")
	elseif find(text, "[ᅡ-ᅵ]" .. opt_mark .. "%$") then
		-- $ directly after a vowel, i.e. after a syllabic block without a final consonant
		error("Found invalid sequence containing $")
	end

	return text
end
--- Expand personal-name mode (text introduced by %) into the ordinary control symbols.
-- Hangul status: precomposed (한)
-- Inside each name the surname and the given name are separated with _, both get ^ for capitalization,
-- @ is inserted where ㄹ + {ㄷ, ㅅ, ㅈ} tensification applies, and the given name's syllabic blocks are
-- separated with the internal marker U+FDD0.
-- @param text  validated input; % opens a name, a second % or the end of the string closes it
-- @return the text with every name rewritten; raises an error for a malformed name
local function parse_name(text)
	-- processing people names
	local hanja_readings_final_L = "갈걸결골괄굴궐귤글길날녈놜눌닐달돌랄렬률말멸몰물밀발벌별불살설솔술슬실알얼열올왈울월율을일절졸줄즐질찰철촬출칠탈팔필할헐혈홀활훌휼흘힐"
	local hanja_readings_init_DSJ = "다단달담답당대댁덕도독돈돌동두둔득등사삭산살삼삽상새색생서석선설섬섭성세소속손솔송쇄쇠수숙순술숭쉬슬습승시식신실심십자작잔잠잡장재쟁저적전절점접정제조족존졸종좌죄주죽준줄중즉즐즙증지직진질짐집징"

	-- note: internally uses 3 noncharacters
	-- They are created from their code points rather than typed literally, because literal noncharacters
	-- are invisible in most editors and are easily stripped, which silently empties the patterns below.
	local char = mw.ustring.char
	local given_mark = char(0xFDD0) -- U+FDD0: mostly for given name in RR
	local name_start = char(0xFDD1) -- U+FDD1: marks beginning of name
	local name_end = char(0xFDD2) -- U+FDD2: marks end of name

	local in_name = "[^" .. name_end .. "]*" -- any run of characters that stays inside the current name
	local syl = "[가-힣%$@]" -- one syllabic block (or a $/@ symbol) of a name

	-- repeat a substitution until it no longer changes anything; bounded by the text length like a plain
	-- counted loop, but stops as soon as a pass makes no replacement
	local function gsub_until_stable(pattern, repl)
		for _ = 1, mw.ustring.len(text) do
			local count
			text, count = gsub(text, pattern, repl)
			if count == 0 then
				break
			end
		end
	end

	-- change % to U+FDD1 and U+FDD2 (end of string also terminates name mode)
	text = gsub(text, "%%([^%%]*)%%", name_start .. "%1" .. name_end)
	text = gsub(text, "%%([^%%]*)$", name_start .. "%1" .. name_end)

	-- disallow invalid input for name
	if find(text, name_start .. name_end) then
		error("Name cannot be empty")
	elseif find(text, name_start .. in_name .. "[^가-힣_ " .. name_end .. "]" .. in_name .. name_end) then
		-- the end marker is excluded from the offending-character class so that the match cannot run from
		-- one name into the next
		error("Invalid character in name")
	elseif find(text, name_start .. " ") then
		error("Name cannot begin with space")
	elseif find(text, " " .. name_end) then
		error("Name cannot end with space")
	elseif find(text, name_start .. in_name .. "[ _]" .. in_name .. "[ _]" .. in_name .. name_end) then
		error("No more than two components in name")
	elseif find(text, name_start .. "[가-힣]_" .. name_end) then
		error("No _ after one-syllable surname")
	elseif find(text, name_start .. in_name .. "[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "]" .. in_name .. name_end) then
		error("Contains unnecessary @ in name") -- see below
	end

	-- separate surname and given name
	-- if input contains _ or space, separate there
	text = gsub(text, name_start .. "(" .. syl .. "+)_" .. name_end, name_start .. "^%1_" .. name_end) -- for surname-only string
	text = gsub(text, name_start .. "_(" .. syl .. "+)" .. name_end, name_start .. "_^%1" .. name_end) -- for mononym
	text = gsub(text, name_start .. "(" .. syl .. "+)[ _](" .. syl .. "+)" .. name_end, name_start .. "^%1_^%2" .. name_end)
	-- otherwise, separate after first syllabic block
	text = gsub(text, name_start .. "([가-힣])" .. name_end, name_start .. "^%1_" .. name_end) -- for surname-only string
	text = gsub(text, name_start .. "([가-힣])(" .. syl .. "+)" .. name_end, name_start .. "^%1_^%2" .. name_end)

	-- check invalid input after separating surname and given name
	if find(text, name_start .. in_name .. "_%^[%$@]" .. in_name .. name_end) then
		error("No @ or $ between surname and given name")
	end

	-- tensification of ㄹ + {ㄷ, ㅅ, ㅈ} (needed for MR; e.g. 홍길동 [홍길똥], 을지문덕 [을찌문덕])
	-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
	gsub_until_stable(
		"(" .. name_start .. in_name .. ")([달돌살설솔술슬실절졸줄즐질])%2(" .. in_name .. name_end .. ")",
		"%1%2" .. given_mark .. "%2%3"
	)

	-- now apply tensification
	gsub_until_stable(
		"(" .. name_start .. in_name .. ")([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])(" .. in_name .. name_end .. ")",
		"%1%2@%3%4"
	)

	-- insert U+FDD0 in given name (needed for RR; e.g. 한복남 Han Boknam, not Han Bongnam)
	gsub_until_stable(
		"(" .. name_start .. in_name .. ")_%^(" .. in_name .. ")(" .. syl .. ")(" .. syl .. ")(" .. in_name .. name_end .. ")",
		"%1_^%2%3" .. given_mark .. "%4%5"
	)

	-- remove _ which was needed for surname-only string and mononym
	-- (only the _ touching a name boundary; the _ between surname and given name must survive)
	text = gsub(text, "_" .. name_end, name_end)
	text = gsub(text, name_start .. "_%^", name_start .. "^")

	-- remove U+FDD1 and U+FDD2
	text = gsub(text, "[" .. name_start .. name_end .. "]", "")

	return text
end
--- Last step shared by the RR and MR conversions.
-- @param text  fully romanized text
-- @return the text with HTML-encoded placeholders turned back into ASCII, or "—" when nothing is left
-- Raises an error if any Hangul survived the conversion.
local function final_processing(text)
	-- any Hangul left at this point means a conversion rule is missing
	local hangul_left = m_utils.contains_hangul(text)
	if hangul_left then
		error("Result contains Hangul; debugging required")
	end

	-- convert HTML encodings back to ASCII
	local result = gsub_iterate(text, m_data.html_enc_to_ascii)

	-- an empty result (e.g. when input is just ㅇ) is shown as a dash
	if result == "" then
		return "—"
	end
	return result
end
-- Revised Romanization, template entry point:
-- reads the arguments from the frame and hands them to p._rr
function p.rr(frame)
	local args = get_args(frame)
	return p._rr(args)
end
--- Convert Hangul text to Revised Romanization.
-- @param args  argument table; args[1] is the Hangul text, optionally containing the control symbols
-- @return the romanized text, or "N/A" when the input contains Hangul that RR does not cover
-- Raises an error for invalid input (see disallow_invalid_input and the later checks).
function p._rr(args)
	local text = args[1]

	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end

	text = parse_name(text)
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)
	text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)
	text = gsub_iterate(text, m_data.exceptions)
	text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
	text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
	text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)

	-- convert ㅎ combinations
	text = gsub_iterate(text, m_data.process_hieut)
	text = gsub(text, "ᇂᄉ", "ᄉ")

	text = gsub(text, "ᆰᄀ", "ᆯᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
	text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p
	text = gsub(text, "@", "")
	text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations
	text = gsub(text, "ᆯᄅ", "ᆯl") -- ㄹㄹ is ll
	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text

	-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later),
	-- the surrounding apostrophes are written as the character reference &#39;
	-- (replacing them with plain ' again would leave the text unchanged)
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
	text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text

	-- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)
	-- the marker is produced from its code point; an empty pattern here would remove nothing
	text = gsub(text, mw.ustring.char(0xFDD0), "")

	-- ^ for capitalization
	text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)
	return text
end
-- McCune–Reischauer, template entry point:
-- reads the arguments from the frame and hands them to p._mr
function p.mr(frame)
	local args = get_args(frame)
	return p._mr(args)
end
--- Convert Hangul text to McCune–Reischauer romanization.
-- @param args  argument table; args[1] is the Hangul text, optionally containing the control symbols
-- @return the romanized text, or "N/A" when the input contains Hangul that MR does not cover
-- Raises an error for invalid input (see disallow_invalid_input and the later checks).
function p._mr(args)
	local text = args[1]

	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end

	text = parse_name(text)
	-- remove U+FDD0 (only needed for RR; not needed for MR)
	-- the marker is produced from its code point; an empty pattern here would remove nothing
	text = gsub(text, mw.ustring.char(0xFDD0), "")
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)
	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)
	text = gsub_iterate(text, m_data.exceptions)
	text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
	text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur)
	text = gsub_iterate(text, m_data.n_addition) -- $ for ㄴ-addition
	text = gsub_iterate(text, m_data.null_init_ieung) -- for null-init consonant ㅇ (연음)

	-- convert ㅎ combinations
	text = gsub_iterate(text, m_data.process_hieut)
	text = gsub_iterate(text, m_data.process_hieut_additional_mr)

	-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
	-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
	text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2")

	text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ") -- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))

	-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants)
	text = gsub(text, "ᆺ@ᄀ", "ᄁ")
	text = gsub(text, "ᆺ@ᄇ", "ᄈ")

	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") -- @ for tensification
	text = gsub_iterate(text, m_data.at_irregularities) -- @ for 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]

	-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
	-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
	text = gsub(text, "ᆫᄀ", "ᆫ'`ᄀ") -- n'g
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2")
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2")
	text = gsub(text, "ᆯ%*ᄅ", "ᆯ-l") -- ㄹ-ㄹ should probably be l-l rather than l-r
	text = gsub(text, "%*", "-")

	text = gsub(text, "@", "")

	-- consonant assimilations
	text = gsub_iterate(text, m_data.consonant_assimilations)
	text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr)

	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text
	text = gsub(text, "([ao])ᄋe", "%1ë") -- ㅏ에 (aë) and ㅗ에 (oë)

	-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later),
	-- the surrounding apostrophes are written as the character reference &#39;
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
	text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text

	-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
	-- (replacing it with a plain ' again would leave the text unchanged)
	text = gsub(text, "([hkpt])''", "%1&#39;'")
	text = gsub(text, "([hkpt])'$", "%1&#39;")

	-- ^ for capitalization
	text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)
	return text
end
-- Template entry point for stripping the control symbols (escaped ones are kept):
-- reads the arguments from the frame and hands them to p._clean_hangul
function p.clean_hangul(frame)
	local args = get_args(frame)
	return p._clean_hangul(args)
end
--- Remove the romanization control symbols from Hangul text, keeping backslash-escaped ones as literals.
-- @param args  argument table; args[1] is the Hangul text
-- @return the text without unescaped $ % * @ ^ _ ` symbols, passed through mw.text.unstrip
-- Raises an error when the input has no Hangul, contains a reference, or has a symbol inside a syllabic block.
function p._clean_hangul(args)
	local text = args[1]

	-- input must contain Hangul
	if not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- symbol should not appear within single syllabic block:
	-- first look for a symbol between an initial consonant and a vowel ...
	local inside_block = find(text, "[ᄀ-ᅟꥠ-ꥼ][%$%%%*@%^_`][ᅠ-ᆧힰ-ퟆ]") ~= nil
	if not inside_block then
		-- ... then for a symbol between a vowel (or a syllabic block without a final consonant) and a final consonant
		inside_block = find(text, "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ][%$%%%*@%^_`][ᆨ-ᇿퟋ-ퟻ]") ~= nil
	end
	if inside_block then
		error("Do not insert symbol within single syllabic block")
	end

	-- escaped special chars are parked as placeholders so that the removal below does not touch them
	local escaped = gsub_iterate(text, m_data.escaped_to_html_enc)
	-- removing non-escaped special chars
	local stripped = gsub(escaped, "[%$%%%*@%^_`]", "")
	-- convert HTML encodings back to ASCII
	local restored = gsub_iterate(stripped, m_data.html_enc_to_ascii)
	-- unstripping test
	local cleaned = mw.text.unstrip(restored)

	return cleaned
end
return p