Module:Ko-translit/data
Appearance
This module is rated as pre-alpha. It is unfinished, and may or may not be in active development. It should not be used from article namespace pages. Modules remain pre-alpha until the original editor (or someone who takes one over if it is abandoned for some time) is satisfied with the basic structure.
This module is subject to page protection. It is a highly visible module in use by a very large number of pages, or is substituted very frequently. Because vandalism or mistakes would affect many pages, and even trivial editing might cause substantial load on the servers, it is protected from editing.
Data module for Module:Ko-translit.
-- Export table of this data module. Every field assigned below is an ordered
-- list of {pattern, replacement} string pairs; the table is returned at the end of the file.
local p = {}
--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
ᄀ (U+1100)
ᆨ (U+11A8)
ㄱ (U+3131)
2. When dealing with decomposed Hangul,
a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]
--[[
pre-processing that applies to both RR and MR
IMPORTANT: Before adding a replacement, be sure to check if it can ALWAYS be applied in ALL contexts.
Good example: 싫증 → 실@증
Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜]))
--]]
-- Each entry is a {pattern, replacement} pair written with decomposed (conjoining) jamo.
-- The list is order-sensitive: several rules rely on earlier ones having already run
-- (the word-specific ㄺㄱ rules precede the generic ㄺㄱ rule, the $ marker is consumed
-- by the ㄴ-addition rule before leftover $ is stripped, and the special-case rules
-- precede the generic 연음 and ㅎ-combination rules at the end).
-- NOTE(review): presumably applied one after another with gsub by Module:Ko-translit — confirm against the caller.
p.preprocessing = {
-- _ for additional space in romanization only
{"_", " "},
-- for linguistic contexts
-- (a compatibility jamo written directly before an initial consonant is rewritten as the matching final consonant)
{"ㄴ([ᄀ-ᄒ])", "ᆫ%1"}, -- -ㄴ다
{"ㄹ([ᄀ-ᄒ])", "ᆯ%1"}, -- -ㄹ까, -ㄹ래
{"ㄹ@([ᄀᄃᄇᄉᄌ])", "ᆯ@%1"}, -- -ㄹ지
{"ㅁ([ᄀ-ᄒ])", "ᆷ%1"}, -- ㅁ before an initial consonant
{"ㅂ([ᄀ-ᄒ])", "ᆸ%1"}, -- -ㅂ니다, -ㅂ시다
-- ㄴ-addition always occurs before 윷 and 잎
{"([ᆨ-ᇂ])ᄋ(ᅲᆾ)", "%1ᄂ%2"},
{"([ᆨ-ᇂ])ᄋ(ᅵᇁ)", "%1ᄂ%2"},
-- 곧이어 [고디어]
-- (two entries: one for a following non-final character, one for end of string)
{"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])", "%1ᄃ%2"},
{"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$", "%1ᄃ%2"},
-- 싫증 [실쯩]
{"(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)", "%1ᆯ@%2"},
-- 여덟 + particle (tensification does not occur)
{"(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2"},
-- cases where ㄺㄱ is pronounced [ㄱㄲ]
-- not including very rarely used words such as 삼시욹, 안찱, 우줅거리다, etc.
{"([ᄃᄉᄐ]ᅡ)ᆰᄀ", "%1ᆨᄀ"}, -- 닭, 삵, 수탉/암탉
{"([ᄉᄒ]ᅳ)ᆰᄀ", "%1ᆨᄀ"}, -- 기슭, 흙
{"(ᄎ[ᅵ])ᆰᄀ", "%1ᆨᄀ"}, -- 칡
-- otherwise, ㄺㄱ is pronounced [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
{"ᆰᄀ", "ᆯ@ᄀ"},
-- palatalization and ㅈ + -히-
{"ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄌ%1"}, -- 해돋이 [해도지]
{"ᆮᄋ(ᅵ[^ᆨ-ᇂ])", "ᄌ%1"},
{"ᆮᄋ(ᅵ)$", "ᄌ%1"},
{"[ᆮᆽ]ᄒ(ᅧᆻ)", "ᄎ%1"}, -- 굳히다 [구치다], 꽂히다 [꼬치다]
{"[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
{"[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
{"[ᆮᆽ]ᄒ([ᅧᅵ])$", "ᄎ%1"},
{"ᆴᄋ(ᅧᆻ)", "ᆯᄎ%1"}, -- 훑이다 [훌치다]
{"ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄎ%1"},
{"ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄎ%1"},
{"ᆴᄋ([ᅧᅵ])$", "ᆯᄎ%1"},
{"ᇀᄋ(ᅧᆻ)", "ᄎ%1"}, -- 붙이다 [부치다]
{"ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
{"ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
{"ᇀᄋ([ᅧᅵ])$", "ᄎ%1"},
-- {ㄵ, ㄺ, ㄼ} + -히-
{"ᆬᄒ(ᅧᆻ)", "ᆫᄎ%1"}, -- 앉히다 [안치다]
{"ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆫᄎ%1"},
{"ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆫᄎ%1"},
{"ᆬᄒ([ᅧᅵ])$", "ᆫᄎ%1"},
{"ᆰᄒ(ᅧᆻ)", "ᆯᄏ%1"}, -- 밝히다 [발키다]
{"ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄏ%1"},
{"ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄏ%1"},
{"ᆰᄒ([ᅧᅵ])$", "ᆯᄏ%1"},
{"ᆲᄒ(ᅧᆻ)", "ᆯᄑ%1"}, -- 넓히다 [널피다], 밟히다 [발피다]
{"ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄑ%1"},
{"ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄑ%1"},
{"ᆲᄒ([ᅧᅵ])$", "ᆯᄑ%1"},
-- cases where 넓- is pronounced [넙] before consonant
{"(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])", "%1ᆸ%2"},
{"(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)", "%1ᆸ%2"}, -- 넓다듬이
{"(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)", "%1ᆸ%2"}, -- 넓둥글다
{"(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)", "%1ᆸ%2"}, -- 넓살문
{"(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)", "%1ᆸ%2"}, -- 넓적-, 넓죽-
-- 밟- is [밥] before consonant (except null-init consonant ㅇ)
{"(ᄇ[ᅡ])ᆲ([^ᄋ])", "%1ᆸ%2"},
{"(ᄇ[ᅡ])ᆲ$", "%1ᆸ"},
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
{"([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2"},
-- automatic 절음 법칙
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])", "%1ᆸᄉ%2"}, -- except 없애다 [업쌔다]
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])", "%1ᆸᄉ%2"},
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ)$", "%1ᆸᄉ%2"},
{"(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)", "%1ᄉ%2"}, -- except 맛있다 and 멋있다 which are usually pronounced [마싣따] and [머싣따] respectively
-- (the entries below insert @ between the final consonant and the following ᄋ-initial syllable)
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])", "%1@%2"}, -- except 아, 았, 어, 었, 여, 였
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])", "%1@%2"}, -- except 에, 엔, 엘
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])", "%1@%2"}, -- except 요, 의 (w/o final consonant)
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])", "%1@%2"}, -- except 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])", "%1@%2"},
-- $ for ㄴ-addition
{"([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2"}, -- 색연필 [생년필], 물엿 [물렫]
{"%$", ""}, -- strip any $ that did not trigger ㄴ-addition
-- for null-init consonant ㅇ (연음)
-- (the final consonant, or the last member of a final cluster, becomes the initial consonant of the next syllable)
{"ᆨᄋ", "ᄀ"},
{"ᆩᄋ", "ᄁ"},
{"ᆪᄋ", "ᆨᄉ"},
{"ᆬᄋ", "ᆫᄌ"},
{"ᆮᄋ", "ᄃ"},
{"[ᆯᆶ]ᄋ", "ᄅ"},
{"ᆰᄋ", "ᆯᄀ"},
{"ᆱᄋ", "ᆯᄆ"},
{"ᆲᄋ", "ᆯᄇ"},
{"ᆳᄋ", "ᆯᄉ"},
{"ᆴᄋ", "ᆯᄐ"},
{"ᆵᄋ", "ᆯᄑ"},
{"ᆸᄋ", "ᄇ"},
{"ᆹᄋ", "ᆸᄉ"},
{"ᆺᄋ", "ᄉ"},
{"ᆻᄋ", "ᄊ"},
{"ᆽᄋ", "ᄌ"},
{"ᆾᄋ", "ᄎ"},
{"ᆿᄋ", "ᄏ"},
{"ᇀᄋ", "ᄐ"},
{"ᇁᄋ", "ᄑ"},
{"ᇂᄋ", "ᄋ"}, -- silent; 좋아 [조아]
-- convert ㅎ combinations
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway)
{"ᆭᄀ", "ᆫᄏ"},
{"ᆭᄃ", "ᆫᄐ"},
{"ᆭᄇ", "ᆫᄑ"},
{"ᆭᄌ", "ᆫᄎ"},
{"ᆶᄀ", "ᆯᄏ"},
{"ᆶᄃ", "ᆯᄐ"},
{"ᆶᄇ", "ᆯᄑ"},
{"ᆶᄌ", "ᆯᄎ"},
{"ᇂᄀ", "ᄏ"},
{"ᇂᄃ", "ᄐ"},
{"ᇂᄇ", "ᄑ"},
{"ᇂᄌ", "ᄎ"}
}
-- MR only: these rules have to run before syllable-final consonants are neutralized
p.before_neutralizing_syl_final_consonants_mr = {
  -- remaining ㅎ-type combinations: a following ㅅ becomes tense ㅆ
  {"[ᆬᆭ]ᄉ", "ᆫᄊ"},
  {"[ᆲᆴᆶ]ᄉ", "ᆯᄊ"},
  {"ᇂᄉ", "ᄊ"},
  -- written 사이시옷 marked with @ and followed by ㄱ/ㅂ
  {"ᆺ@ᄀ", "ᄁ"},
  {"ᆺ@ᄇ", "ᄈ"},
}
-- syllable-final neutralization: every final consonant or cluster listed here
-- collapses to one representative final
p.neutralize_syl_final_consonants = {
  {"[ᆩᆪᆰᆿ]", "ᆨ"},       -- → final ㄱ
  {"[ᆬᆭ]", "ᆫ"},           -- → final ㄴ
  {"[ᆺᆻᆽᆾᇀᇂ]", "ᆮ"},   -- → final ㄷ
  {"[ᆲᆳᆴᆶ]", "ᆯ"},       -- → final ㄹ
  {"ᆱ", "ᆷ"},               -- → final ㅁ
  {"[ᆵᆹᇁ]", "ᆸ"},         -- → final ㅂ
}
-- @ marker: 절음 법칙 (final + @ + null initial ㅇ) and ㄴㄹ pronounced [ㄴㄴ]
-- the other documented irregularities need no entry here; they are handled automatically
p.at_irregularities = {
  -- final consonant carried over to the following ㅇ-initial syllable
  {"ᆨ@ᄋ", "ᄀ"},
  {"ᆮ@ᄋ", "ᄃ"}, -- 웃어른 [우더른]
  {"ᆯ@ᄋ", "ᄅ"},
  {"ᆸ@ᄋ", "ᄇ"},
  -- ㄴ + @ + ㄹ
  {"ᆫ@ᄅ", "ᆫᄂ"}, -- 음운론 [으문논]
}
-- RR only: @ between final ㄱ/ㄷ/ㅂ and ㅎ yields k/t/p
p.at_irregularities_additional_rr = {
  {"ᆨ@ᄒ", "ᄏ"},
  {"ᆮ@ᄒ", "ᄐ"},
  {"ᆸ@ᄒ", "ᄑ"},
  -- every @ still left is removed
  {"@", ""},
}
-- MR only: positions in which ㄱ, ㄷ, ㅂ, ㅈ are voiced
-- a backtick (`) is placed in front of the consonant as a marker
p.gdbj_mr = {
  {"ᆫᄀ", "ᆫ'`ᄀ"}, -- n'g
  -- after a vowel or a final ㄴ/ㄹ/ㅁ/ㅇ
  {"([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2"},
  -- * is for additional hyphen in romanization only (voicing is retained after hyphen)
  {"([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2"},
}
-- assimilation between a syllable-final consonant and the following initial consonant
p.consonant_assimilations = {
  {"[ᆨᆼ][ᄂᄅ]", "ᆼᄂ"},   -- ㄱ/ㅇ + ㄴ/ㄹ → ㅇㄴ
  {"ᆨᄆ", "ᆼᄆ"},           -- ㄱ + ㅁ → ㅇㅁ
  {"ᆫᄅ", "ᆯᄅ"},           -- ㄴ + ㄹ → ㄹㄹ
  {"ᆮ[ᄂᄅ]", "ᆫᄂ"},       -- ㄷ + ㄴ/ㄹ → ㄴㄴ
  {"ᆮᄆ", "ᆫᄆ"},           -- ㄷ + ㅁ → ㄴㅁ
  {"ᆯᄂ", "ᆯᄅ"},           -- ㄹ + ㄴ → ㄹㄹ
  {"[ᆷᆸ][ᄂᄅ]", "ᆷᄂ"},   -- ㅁ/ㅂ + ㄴ/ㄹ → ㅁㄴ
  {"ᆸᄆ", "ᆷᄆ"},           -- ㅂ + ㅁ → ㅁㅁ
}
-- MR only: further consonant assimilations
p.consonant_assimilations_additional_mr = {
  -- avoid {kkk, ttt, ppp, sss/ts/tss, ttch}: the final is dropped before the tense initial
  {"ᆨᄁ", "ᄁ"},
  {"ᆮᄄ", "ᄄ"},
  {"ᆸᄈ", "ᄈ"},
  {"ᆮ[ᄉᄊ]", "ᄊ"},
  {"ᆮᄍ", "ᄍ"},
  -- miscellaneous conversions that already emit Latin letters
  {"ᆯᄅ", "ᆯl"},
  {"ᆯᄒ", "rᄒ"},
  {"ᄉ[ᅱ]", "shᅱ"},
}
-- after {ㅈ, ㅉ, ㅊ} the y-glide is dropped: each y-vowel is replaced by its plain counterpart
do
  -- capture for the initial consonant (ㅈ, ㅉ or ㅊ); it is restored as %1 in the replacement
  local jjch = "([ᄌ-ᄎ])"
  p.drop_y = {
    {jjch .. "ᅣ", "%1ᅡ"},
    {jjch .. "ᅤ", "%1ᅢ"},
    {jjch .. "ᅧ", "%1ᅥ"},
    {jjch .. "ᅨ", "%1ᅦ"},
    {jjch .. "ᅭ", "%1ᅩ"},
    {jjch .. "ᅲ", "%1ᅮ"},
  }
end
-- RR vowel table
-- each character class holds the conjoining (medial) vowel and its compatibility-jamo form
p.vowels_rr = {
  {"[ᅡㅏ]", "a"},
  {"[ᅢㅐ]", "ae"},
  {"[ᅣㅑ]", "ya"},
  {"[ᅤㅒ]", "yae"},
  {"[ᅥㅓ]", "eo"},
  {"[ᅦㅔ]", "e"},
  {"[ᅧㅕ]", "yeo"},
  {"[ᅨㅖ]", "ye"},
  {"[ᅩㅗ]", "o"},
  {"[ᅪㅘ]", "wa"},
  {"[ᅫㅙ]", "wae"},
  {"[ᅬㅚ]", "oe"},
  {"[ᅭㅛ]", "yo"},
  {"[ᅮㅜ]", "u"},
  {"[ᅯㅝ]", "wo"},
  {"[ᅰㅞ]", "we"},
  {"[ᅱㅟ]", "wi"},
  {"[ᅲㅠ]", "yu"},
  {"[ᅳㅡ]", "eu"},
  {"[ᅴㅢ]", "ui"},
  {"[ᅵㅣ]", "i"},
}
-- MR vowel table
-- each character class holds the conjoining (medial) vowel and its compatibility-jamo form
p.vowels_mr = {
  {"[ᅡㅏ]", "a"},
  {"[ᅢㅐ]", "ae"},
  {"[ᅣㅑ]", "ya"},
  {"[ᅤㅒ]", "yae"},
  {"[ᅥㅓ]", "ŏ"},
  {"[ᅦㅔ]", "e"},
  {"[ᅧㅕ]", "yŏ"},
  {"[ᅨㅖ]", "ye"},
  {"[ᅩㅗ]", "o"},
  {"[ᅪㅘ]", "wa"},
  {"[ᅫㅙ]", "wae"},
  {"[ᅬㅚ]", "oe"},
  {"[ᅭㅛ]", "yo"},
  {"[ᅮㅜ]", "u"},
  {"[ᅯㅝ]", "wŏ"},
  {"[ᅰㅞ]", "we"},
  {"[ᅱㅟ]", "wi"},
  {"[ᅲㅠ]", "yu"},
  {"[ᅳㅡ]", "ŭ"},
  {"[ᅴㅢ]", "ŭi"},
  {"[ᅵㅣ]", "i"},
}
-- RR consonant table
-- initial (conjoining) and compatibility jamo share an entry; finals ᆨ/ᆮ/ᆸ are k/t/p,
-- final ᆯ is l while initial ᄅ is r; compatibility-jamo clusters are spelled letter by letter
p.single_consonants_rr = {
  {"[ᄀㄱ]", "g"},
  {"[ᄁㄲ]", "kk"},
  {"ㄳ", "ks"},
  {"[ᄂᆫㄴ]", "n"},
  {"ㄵ", "nj"},
  {"ㄶ", "nh"},
  {"[ᄃㄷ]", "d"},
  {"[ᄄㄸ]", "tt"},
  {"[ᄅㄹ]", "r"},
  {"ᆯ", "l"},
  {"ㄺ", "lg"},
  {"ㄻ", "lm"},
  {"ㄼ", "lb"},
  {"ㄽ", "ls"},
  {"ㄾ", "lt"},
  {"ㄿ", "lp"},
  {"ㅀ", "lh"},
  {"[ᄆᆷㅁ]", "m"},
  {"[ᄇㅂ]", "b"},
  {"[ᄈㅃ]", "pp"},
  {"ㅄ", "ps"},
  {"[ᄉㅅ]", "s"},
  {"[ᄊㅆ]", "ss"},
  {"[ᄋㅇ]", ""}, -- null initial: no output
  {"ᆼ", "ng"},
  {"[ᄌㅈ]", "j"},
  {"[ᄍㅉ]", "jj"},
  {"[ᄎㅊ]", "ch"},
  {"[ᄏᆨㅋ]", "k"},
  {"[ᄐᆮㅌ]", "t"},
  {"[ᄑᆸㅍ]", "p"},
  {"[ᄒㅎ]", "h"},
}
-- MR consonant table
p.single_consonants_mr = {
  -- consonants carrying the backtick voicing marker come first: g/d/b/j
  {"`ᄀ", "g"},
  {"`ᄃ", "d"},
  {"`ᄇ", "b"},
  {"`ᄌ", "j"},
  {"`", ""}, -- any backtick still left is removed
  -- unmarked consonants
  {"[ᄀᆨㄱ]", "k"},
  {"[ᄁㄲ]", "kk"},
  {"ㄳ", "ks"},
  {"[ᄂᆫㄴ]", "n"},
  {"ㄵ", "nj"},
  {"ㄶ", "nh"},
  {"[ᄃᆮㄷ]", "t"},
  {"[ᄄㄸ]", "tt"},
  {"[ᄅㄹ]", "r"},
  {"ᆯ", "l"},
  {"ㄺ", "lg"},
  {"ㄻ", "lm"},
  {"ㄼ", "lb"},
  {"ㄽ", "ls"},
  {"ㄾ", "lt'"},
  {"ㄿ", "lp'"},
  {"ㅀ", "rh"},
  {"[ᄆᆷㅁ]", "m"},
  {"[ᄇᆸㅂ]", "p"},
  {"[ᄈㅃ]", "pp"},
  {"ㅄ", "ps"},
  {"[ᄉㅅ]", "s"},
  {"[ᄊㅆ]", "ss"},
  {"[ᄋㅇ]", ""}, -- null initial: no output
  {"ᆼ", "ng"},
  {"[ᄌㅈ]", "ch"},
  {"[ᄍㅉ]", "tch"},
  {"[ᄎㅊ]", "ch'"},
  {"[ᄏㅋ]", "k'"},
  {"[ᄐㅌ]", "t'"},
  {"[ᄑㅍ]", "p'"},
  {"[ᄒㅎ]", "h"},
}
-- enclosed Hangul characters are expanded to plain Hangul wrapped in ( )
-- not strictly needed, but Unicode classifies these as Hangul characters too
-- parenthesized and circled variants are treated alike
-- must run before Hangul is decomposed
p.enclosed_hangul = {
  -- enclosed consonant letters → letter name
  {"[㈀㉠]", "(기역)"},
  {"[㈁㉡]", "(니은)"},
  {"[㈂㉢]", "(디귿)"},
  {"[㈃㉣]", "(리을)"},
  {"[㈄㉤]", "(미음)"},
  {"[㈅㉥]", "(비읍)"},
  {"[㈆㉦]", "(시옷)"},
  {"[㈇㉧]", "(이응)"},
  {"[㈈㉨]", "(지읒)"},
  {"[㈉㉩]", "(치읓)"},
  {"[㈊㉪]", "(키읔)"},
  {"[㈋㉫]", "(티읕)"},
  {"[㈌㉬]", "(피읖)"},
  {"[㈍㉭]", "(히읗)"},
  -- enclosed syllables
  {"[㈎㉮]", "(가)"},
  {"[㈏㉯]", "(나)"},
  {"[㈐㉰]", "(다)"},
  {"[㈑㉱]", "(라)"},
  {"[㈒㉲]", "(마)"},
  {"[㈓㉳]", "(바)"},
  {"[㈔㉴]", "(사)"},
  {"[㈕㉵]", "(아)"},
  {"[㈖㉶]", "(자)"},
  {"[㈗㉷]", "(차)"},
  {"[㈘㉸]", "(카)"},
  {"[㈙㉹]", "(타)"},
  {"[㈚㉺]", "(파)"},
  {"[㈛㉻]", "(하)"},
  -- enclosed characters that stand for more than one syllable, or for a single other syllable
  {"㈜", "(주)"},
  {"㈝", "(오전)"},
  {"㈞", "(오후)"},
  {"㉼", "(참고)"},
  {"㉽", "(주의)"},
  {"㉾", "(우)"},
}
-- Escaping round trip for the characters $ % * @ ^ _ `
--
-- p.escaped_to_html_enc: a backslash-escaped character (\$, \%, \*, \@, \^, \_, \`)
-- is replaced by its HTML numeric character reference (&#NN;), so that the literal
-- character is preserved and not consumed as markup by the other tables in this
-- module ($, *, @, _ and ` all appear as markers in them).
-- p.html_enc_to_ascii: the inverse mapping, turning each reference back into the bare character.
--
-- The right-hand side of the first table and the left-hand side of the second MUST be
-- the numeric references, not the characters themselves:
--   * a bare "%" is not a valid replacement string or pattern,
--   * bare "$" and "^" patterns are anchors that match the empty string,
--   * and an identity mapping would not protect the character at all.
-- In replacement strings only % is special, hence "%%" for a literal percent sign.
p.escaped_to_html_enc = {
  {"\\%$", "&#36;"},
  {"\\%%", "&#37;"},
  {"\\%*", "&#42;"},
  {"\\@", "&#64;"},
  {"\\%^", "&#94;"},
  {"\\_", "&#95;"},
  {"\\`", "&#96;"}
}
-- converting HTML numeric character references back to unescaped chars
p.html_enc_to_ascii = {
  {"&#36;", "$"},
  {"&#37;", "%%"},
  {"&#42;", "*"},
  {"&#64;", "@"},
  {"&#94;", "^"},
  {"&#95;", "_"},
  {"&#96;", "`"}
}
-- hand the table of replacement lists to whoever loads this data module
return p