Jump to content
Main menu
Main menu
move to sidebar
hide
Navigation
Main page
Recent changes
Random page
Help about MediaWiki
Special pages
Cultopedia
Search
Search
Appearance
Create account
Log in
Personal tools
Create account
Log in
Pages for logged out editors
learn more
Contributions
Talk
Editing
Module:Ko-translit
Module
Discussion
English
Read
Edit
View history
Tools
Tools
move to sidebar
hide
Actions
Read
Edit
View history
General
What links here
Related changes
Page information
Appearance
move to sidebar
hide
Warning:
You are not logged in. Your IP address will be publicly visible if you make any edits. If you
log in
or
create an account
, your edits will be attributed to your username, along with other benefits.
Anti-spam check. Do
not
fill this in!
-- Ko-translit: romanizes Korean (Hangul) text.
--   p.rr / p._rr                     Revised Romanization
--   p.mr / p._mr                     McCune-Reischauer
--   p.clean_hangul / p._clean_hangul strip this module's control symbols
--
-- NOTE(review): this copy of the module appears to have gone through a lossy
-- character-set conversion. Most non-ASCII literals (Hangul syllables, jamo,
-- the U+FDD0..U+FDD2 marker characters, typographic punctuation) look damaged,
-- and several distinct characters are no longer distinguishable. Every string
-- literal below is reproduced exactly as found; verify all non-ASCII literals
-- against the canonical module before deploying.
--
-- IMPORTANT NOTE before editing this module:
-- 1. Make sure that you use a font that displays the following characters
--    differently, and that you know the differences between them:
--    U+1100, U+11A8, U+3131.
-- 2. When dealing with decomposed Hangul,
--    a. a leading-consonant jamo should not be directly followed by a vowel
--       jamo in a pattern, because MediaWiki uses Unicode Normalization Form C
--       (NFC), which converts any such sequence into a precomposed character;
--       wrap the vowel in brackets or parentheses instead.
--    b. leading consonant + bracketed vowel at the end of a pattern is
--       equivalent not just to the bare syllable but to the whole range of
--       precomposed syllables sharing that onset and vowel. To match a
--       syllabic block without a final consonant at the end of a pattern, use
--       both "vowel + [^final consonants]" and "vowel + $".

local p = {}

local find = mw.ustring.find
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')
local m_utils = require('Module:Ko-utils')
local get_args = require('Module:Arguments').getArgs

--- Apply a list of substitutions to text, in order.
-- @param text         string to transform
-- @param replacements sequence of { pattern, replacement } pairs
-- @return the transformed string
local function gsub_iterate(text, replacements)
    -- the parameter used to be named `table`, which shadowed the standard library
    for _, entry in ipairs(replacements) do
        text = gsub(text, entry[1], entry[2])
    end
    return text
end

--- Strip wiki markup that is unnecessary or interferes with assimilation.
-- @param text string
-- @return text without bold/italic quotes, HTML tags (br is kept),
--         wikilink brackets and strip markers
local function remove_links_and_markup(text)
    -- remove bold/italic
    -- it is not impossible to allow bold/italic when it does not interfere
    -- with assimilation, but determining when to allow or disallow that adds
    -- complication for little practical gain
    text = gsub(text, "'''", "")
    text = gsub(text, "''", "")

    -- remove HTML tags (except br): park br on a placeholder, drop all other
    -- tags, then restore br
    -- NOTE(review): the placeholder literal reads as a plain space in this
    -- copy; if that is accurate, every space in the text becomes <br> in the
    -- third substitution. Presumably the original used a dedicated placeholder
    -- character -- confirm against the canonical module.
    text = gsub(text, "<[Bb][Rr] */?>", " ")
    text = gsub(text, "</?[A-Za-z][^>]->", "")
    text = gsub(text, " ", "<br>")

    -- remove wikilinks: keep the label of piped links, then drop stray brackets
    text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1")
    text = gsub(gsub(text, "%[%[", ""), "%]%]", "")

    text = mw.text.killMarkers(text)

    return text
end

--- Very first validation step. Hangul status: precomposed.
-- Raises an error for malformed input, returns the literal "N/A" when the
-- input contains Hangul matched by the "unsupported" character class, and
-- otherwise returns the text with escaped special characters replaced via
-- m_data.escaped_to_html_enc.
-- @param text raw input string
-- @return "N/A" or the pre-processed text
local function disallow_invalid_input(text)
    -- input must contain Hangul
    if not m_utils.contains_hangul(text) then
        error("Input must contain Hangul")
    end

    -- no direct insertion of reference or footnote
    if m_utils.contains_reference(text) then
        error("Input cannot contain references")
    end

    -- if input contains Hangul not supported by RR and MR, change text to
    -- "N/A" and skip everything
    if find(text, "[แ-แ แ ถ-แงแ-แฟใฎใฏใ ค-ใ๊ฅ -๊ฅฟํฐ-ํฟ]") then
        text = "N/A"
        return text
    end

    -- replacing escaped special chars with placeholders
    text = gsub_iterate(text, m_data.escaped_to_html_enc)

    -- various validations of input
    if find(text, "[แ-แ]") or find(text, "[แ ก-แ ตแจ-แ]") then
        error("Do not input conjoining Hangul jamo directly")
    elseif find(text, "`%*") then
        error("Use *` instead of `*")
    elseif find(text, "@%*") then
        error("Use *@ instead of @*")
    elseif find(text, "%^[^๊ฐ-ํฃ]") then
        error("^ must be immediately followed by Hangul syllabic block")
    elseif find(text, "[^%*0-9A-Za-z]`")
        or find(text, "[^0-9A-Za-z]%*`")
        or find(text, "`[^๊ฐ-๊น๋ค-๋ฏ๋ฐ-๋น์-์ง]") then
        error("Found invalid sequence containing `")
    elseif find(text, "[^%*ใน๊ฐ-ํฃ]@")
        or find(text, "[^๊ฐ-ํฃ]%*@")
        or find(text, "%*@[^๊ฐ-๊น๋ค-๋ฏ๋ฐ-๋น์-์ง]")
        or find(text, "ใน@[^๊ฐ-๊น๋ค-๋ฏ๋ฐ-๋น์ฌ-์ท์-์ง]")
        or find(text, "@[^๊ฐ-๊น๋ค-๋ฏ๋ผ-๋ง๋ฐ-๋น์ฌ-์ท์ผ-์ณ์ฌ-์ฃ์-์ฏ์ -์ถ์ธ-์ป์ด-์์์-์งํ-ํฃ]") then
        error("Found invalid sequence containing @")
    elseif find(text, "[^๊ฐ-ํฃ]%$")
        or find(text, "%$[^์์ด์์์์ฌ์์ผ์์์์์์ด์ธ์ผ์์ ]") then
        error("Found invalid sequence containing $")
    elseif find(text, "%%$") then
        error("Remove final %")
    elseif find(text, "[ _][ _]") then
        error("No two or more consecutive space characters")
    elseif find(text, "^[%$%*@_`]")
        or find(text, "^%%[^_๊ฐ-ํฃ]")
        or find(text, "[ _]%*")
        or find(text, "%*[ %*%-_]")
        or find(text, "%-%*")
        or find(text, "[๏ท-๏ท]")
        or find(text, "%%_$")
        or find(text, "[%$%*@%^`]$") then
        error("Invalid input")
    end

    return text
end

--- Validity check after removing links and markup, before decomposing Hangul.
-- Hangul status: precomposed.
-- @param text string
-- @return text unchanged; raises an error on an invalid sequence
local function check_invalid_seq(text)
    if find(text, "[ _][ _]") then
        error("No two or more consecutive space characters")
    elseif find(text, "^[%$%*@_`]")
        or find(text, "[ _]%*")
        or find(text, "%*[ %*%-_]")
        or find(text, "%-%*")
        or find(text, "[%$%*@%^_`]$") then
        error("Invalid input")
    end

    return text
end

--- Validity check after decomposing Hangul.
-- Hangul status: decomposed (leading consonant + vowel + final consonant).
-- @param text string
-- @return text unchanged; raises an error on an invalid @ or $ sequence
local function check_invalid_seq_decomposed_hangul(text)
    if find(text, "[แจ-แชแฌ-แฎแด-แถแธแนแปแฝ-แ]%*?๏ท?@๏ท?[แแแแแ]")
        or find(text, "แฐ%*?๏ท?@๏ท?[แ-แแ-แ]")
        or find(text, "แฒ๏ท?@๏ท?[แ-แแ-แ]")
        or find(text, "แบ%*@[แแ]")
        or find(text, "แบ%*?๏ท?@๏ท?[แ-แแ-แแ-แ]")
        or find(text, "[แ ก-แ ตแจ-แชแฌ-แ]๏ท?@๏ท?แ ")
        or find(text, "[แ ก-แ ต]๏ท?@๏ท?แ")
        or find(text, "[แ ก-แ ตแซ-แญแฏแฑ-แทแผ]๏ท?@๏ท?แ") then
        error("Found invalid sequence containing @")
    elseif find(text, "[แ ก-แ ตแจแซแญ-แฏแถ-แธแผ]๏ท?%$") then
        error("Found invalid sequence containing $")
    end

    return text
end

--- Process personal names delimited by % in the input.
-- Hangul status: precomposed.
-- Validates each name, separates surname and given name, inserts ^
-- (capitalization) markers, inserts @ between a syllable from
-- hanja_readings_final_L and one from hanja_readings_init_DSJ, and marks
-- syllable boundaries inside the given name.
-- @param text string
-- @return text with the name delimiters removed and the markers inserted
local function parse_name(text)
    local hanja_readings_final_L = "๊ฐ๊ฑธ๊ฒฐ๊ณจ๊ด๊ตด๊ถ๊ทค๊ธ๊ธธ๋ ๋ ๋๋๋๋ฌ๋๋๋ ฌ๋ฅ ๋ง๋ฉธ๋ชฐ๋ฌผ๋ฐ๋ฐ๋ฒ๋ณ๋ถ์ด์ค์์ ์ฌ์ค์์ผ์ด์ฌ์์ธ์์จ์์ผ์ ์กธ์ค์ฆ์ง์ฐฐ์ฒ ์ดฌ์ถ์น ํํํํ ํํํํํํผํํ"
    local hanja_readings_init_DSJ = "๋ค๋จ๋ฌ๋ด๋ต๋น๋๋๋๋๋ ๋๋๋๋๋๋๋ฑ์ฌ์ญ์ฐ์ด์ผ์ฝ์์์์์์์ ์ค์ฌ์ญ์ฑ์ธ์์์์์ก์์ ์์์์ ์ญ์ฌ์ฌ์ต์น์์์ ์ค์ฌ์ญ์์์์ ์ก์ฅ์ฌ์์ ์ ์ ์ ์ ์ ์ ์ ์กฐ์กฑ์กด์กธ์ข ์ข์ฃ์ฃผ์ฃฝ์ค์ค์ค์ฆ์ฆ์ฆ์ฆ์ง์ง์ง์ง์ง์ง์ง"

    -- note: internally uses 3 noncharacters
    --   U+FDD0: mostly for given name in RR
    --   U+FDD1: marks beginning of name
    --   U+FDD2: marks end of name
    -- NOTE(review): in this copy the three markers render identically in the
    -- literals below, so which marker each occurrence stands for cannot be
    -- read off the text; restore them from the canonical module.

    -- change % to U+FDD1 and U+FDD2 (end of string also terminates name mode)
    text = gsub(text, "%%([^%%]*)%%", "๏ท%1๏ท")
    text = gsub(text, "%%([^%%]*)$", "๏ท%1๏ท")

    -- disallow invalid input for name
    if find(text, "๏ท_?๏ท") then
        error("Name cannot be empty")
    elseif find(text, "๏ท[^๏ท๏ท]*[^๊ฐ-ํฃ_ ][^๏ท๏ท]*๏ท") then
        error("Invalid character in name")
    elseif find(text, "๏ท ") then
        error("Name cannot begin with space")
    elseif find(text, " ๏ท") then
        error("Name cannot end with space")
    elseif find(text, "๏ท[^๏ท]*[ _][^๏ท]*[ _][^๏ท]*๏ท") then
        error("No more than two components in name")
    elseif find(text, "๏ท[๊ฐ-ํฃ]_") then
        error("No _ after one-syllable surname")
    elseif find(text, "๏ท[^๏ท]*[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "][^๏ท]*๏ท") then
        -- the @ is inserted automatically by the tensification step below
        error("Contains unnecessary @ in name")
    end

    -- separate surname and given name
    -- if input contains _ or space, separate there
    text = gsub(text, "๏ท([๊ฐ-ํฃ%$@]+)_๏ท", "๏ท^%1_๏ท") -- for surname-only string
    text = gsub(text, "๏ท_([๊ฐ-ํฃ%$@]+)๏ท", "๏ท_^%1๏ท") -- for mononym
    text = gsub(text, "๏ท([๊ฐ-ํฃ%$@]+)[ _]([๊ฐ-ํฃ%$@]+)๏ท", "๏ท^%1_^%2๏ท")
    -- otherwise, separate after first syllabic block
    text = gsub(text, "๏ท([๊ฐ-ํฃ])๏ท", "๏ท^%1_๏ท") -- for surname-only string
    text = gsub(text, "๏ท([๊ฐ-ํฃ])([๊ฐ-ํฃ%$@]+)๏ท", "๏ท^%1_^%2๏ท")

    -- check invalid input after separating surname and given name
    if find(text, "๏ท[^๏ท]*_%^[%$@][^๏ท]*๏ท") then
        error("No @ or $ between surname and given name")
    end

    -- tensification of final L + initial {D, S, J} (needed for MR)
    -- does not occur when the same syllable is repeated; just using U+FDD0
    -- here too to keep the repeated pair apart
    -- (each gsub only rewrites non-overlapping matches, so it is repeated up
    -- to len(text) times to reach every occurrence)
    for _ = 1, mw.ustring.len(text) do
        text = gsub(text, "๏ท([^๏ท]*)([๋ฌ๋์ด์ค์์ ์ฌ์ค์ ์กธ์ค์ฆ์ง])%2([^๏ท]*)๏ท", "๏ท%1%2๏ท%2%3๏ท")
    end
    -- now apply tensification
    for _ = 1, mw.ustring.len(text) do
        text = gsub(text, "๏ท([^๏ท]*)([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])([^๏ท]*)๏ท", "๏ท%1%2@%3%4๏ท")
    end

    -- insert U+FDD0 in given name (needed for RR, so that each syllabic block
    -- of the given name is converted separately; e.g. "Han Boknam", not
    -- "Han Bongnam")
    for _ = 1, mw.ustring.len(text) do
        text = gsub(text, "๏ท([^๏ท]*)_%^([^๏ท]*)([๊ฐ-ํฃ%$@])([๊ฐ-ํฃ%$@])([^๏ท]*)๏ท", "๏ท%1_^%2%3๏ท%4%5๏ท")
    end

    -- remove _ which was needed for surname-only string and mononym
    text = gsub(text, "_๏ท", "๏ท")
    text = gsub(text, "๏ท_%^", "๏ท^")

    text = gsub(text, "[๏ท๏ท]", "") -- remove U+FDD1 and U+FDD2

    return text
end

--- Final processing shared by RR and MR.
-- @param text romanized string
-- @return text with HTML encodings converted back to ASCII, or a placeholder
--         string when the result is empty; raises an error if any Hangul is left
local function final_processing(text)
    -- result should not contain Hangul
    if m_utils.contains_hangul(text) then
        error("Result contains Hangul; debugging required")
    end

    text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII

    -- if result is nothing (e.g. when the input is just a silent consonant)
    if text == "" then
        text = "โ"
    end

    return text
end

--- Convert to Revised Romanization (template entry point).
-- @param frame Scribunto frame; its arguments are read via Module:Arguments
function p.rr(frame)
    return p._rr(get_args(frame))
end

--- Convert to Revised Romanization.
-- @param args table whose [1] is the Hangul input string
-- @return the romanized string, or "N/A" for unsupported Hangul
function p._rr(args)
    local text = args[1]

    text = disallow_invalid_input(text)
    if text == "N/A" then
        return text
    end

    text = parse_name(text)
    text = remove_links_and_markup(text)
    text = check_invalid_seq(text)

    text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
    text = gsub_iterate(text, m_data.enclosed_hangul)
    text = m_utils.decompose_hangul(text) -- decompose Hangul
    text = check_invalid_seq_decomposed_hangul(text)

    text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
    text = gsub_iterate(text, m_data.preprocessing)
    text = gsub(text, "แแ", "แ") -- convert remaining jamo combination
    text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
    text = gsub_iterate(text, m_data.at_dollar_irregularities) -- @ and $ irregularities
    text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- additional @ irregularities for RR (results in k/t/p)
    text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations
    text = gsub(text, "แฏแ ", "แฏl") -- doubled L is romanized "ll"
    text = gsub_iterate(text, m_data.drop_y) -- drop y after certain consonants
    text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text

    -- to prevent input like a quoted silent consonant from becoming italic
    -- markup (as the bracketed characters are simply removed later)
    -- NOTE(review): pattern and replacement use the same quote character in
    -- this copy, so this substitution changes nothing; presumably the
    -- replacement used an escaped apostrophe -- confirm.
    text = gsub(text, "'([แใ ]+)'", "'%1'")

    text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text

    -- now remove U+FDD0 that was needed for converting each syllabic block in
    -- the given name separately
    text = gsub(text, "๏ท", "")

    -- ^ for capitalization
    text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
    text = gsub(text, "%^", "")

    text = final_processing(text)

    return text
end

--- Convert to McCune-Reischauer (template entry point).
-- @param frame Scribunto frame; its arguments are read via Module:Arguments
function p.mr(frame)
    return p._mr(get_args(frame))
end

--- Convert to McCune-Reischauer.
-- @param args table whose [1] is the Hangul input string
-- @return the romanized string, or "N/A" for unsupported Hangul
function p._mr(args)
    local text = args[1]

    text = disallow_invalid_input(text)
    if text == "N/A" then
        return text
    end

    text = parse_name(text)
    text = gsub(text, "๏ท", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
    text = remove_links_and_markup(text)
    text = check_invalid_seq(text)

    text = gsub_iterate(text, m_data.enclosed_hangul)
    text = m_utils.decompose_hangul(text) -- decompose Hangul
    text = check_invalid_seq_decomposed_hangul(text)

    -- syl-init consonant + one vowel jamo -> syl-init consonant + another
    -- (with exceptions, per the character class)
    text = gsub(text, "([แแแ-แแ-แ])แ ด", "%1แ ต")
    text = gsub_iterate(text, m_data.preprocessing)
    text = gsub_iterate(text, m_data.before_neutralizing_syl_final_consonants_mr) -- should be done before neutralization of syl-final consonants
    text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
    text = gsub(text, "([แ ก-แ ตแซแทแผ])@แ", "%1แ") -- @ for tensification
    text = gsub_iterate(text, m_data.at_dollar_irregularities) -- @ and $ irregularities
    text = gsub_iterate(text, m_data.gdbj_mr) -- cases where G, D, B, J become voiced consonants
    text = gsub(text, "แฏ%*แ ", "แฏ-l") -- L-L across a hyphen should probably be l-l rather than l-r
    text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
    text = gsub(text, "@", "")

    -- consonant assimilations
    text = gsub_iterate(text, m_data.consonant_assimilations)
    text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr)

    text = gsub_iterate(text, m_data.drop_y) -- drop y after certain consonants
    text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text
    text = gsub(text, "([ao])แe", "%1รซ") -- a/o followed by a separate e syllable (diaeresis form)

    -- to prevent input like a quoted silent consonant from becoming italic
    -- markup (as the bracketed characters are simply removed later)
    -- NOTE(review): no-op as written in this copy; see the matching note in p._rr.
    text = gsub(text, "'([แใ ]+)'", "'%1'")

    text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text

    -- replace the apostrophe when followed by another apostrophe or at end of
    -- string (to avoid possible clashes with bold/italic markup)
    -- NOTE(review): pattern and replacement read identically in this copy, so
    -- both substitutions are no-ops; presumably the replacement was a
    -- different apostrophe character -- confirm.
    text = gsub(text, "([hkpt])''", "%1''")
    text = gsub(text, "([hkpt])'$", "%1'")

    -- ^ for capitalization
    text = gsub(text, "%^[acehikm-pr-uwyลลญ]", mw.ustring.upper)
    text = gsub(text, "%^", "")

    text = final_processing(text)

    return text
end

--- Remove special chars, except for escaped ones (template entry point).
-- @param frame Scribunto frame; its arguments are read via Module:Arguments
function p.clean_hangul(frame)
    return p._clean_hangul(get_args(frame))
end

--- Remove this module's control symbols ($ % * @ ^ _ `) from Hangul text,
-- keeping escaped ones.
-- @param args table whose [1] is the Hangul input string
-- @return the cleaned, unstripped string
function p._clean_hangul(args)
    local text = args[1]

    -- input must contain Hangul
    if not m_utils.contains_hangul(text) then
        error("Input must contain Hangul")
    end

    -- no direct insertion of reference or footnote
    if m_utils.contains_reference(text) then
        error("Input cannot contain references")
    end

    -- symbol should not appear within single syllabic block
    -- (the long class is split in two purely for line length; the pieces are
    -- concatenated into a single pattern)
    if find(text, "[แ-แ ๊ฅ -๊ฅผ][%$%%%*@%^_`][แ -แงํฐ-ํ]")
        or find(text, "[แ -แง๊ฐ๊ฐ๊ฐธ๊ฑ๊ฑฐ๊ฒ๊ฒจ๊ณ๊ณ ๊ณผ๊ด๊ดด๊ต๊ตฌ๊ถ๊ถค๊ท๊ท๊ทธ๊ธ๊ธฐ๊น๊นจ๊บ๊บ ๊บผ๊ป๊ปด๊ผ๊ผฌ๊ฝ๊ฝค๊พ๊พ๊พธ๊ฟ๊ฟฐ๋๋จ๋๋ ๋ผ๋๋ด๋๋ฌ๋๋ค๋ ๋ ๋ ธ๋๋ฐ๋๋จ๋๋ ๋ผ๋๋ด๋๋ฌ๋๋ค๋๋๋ธ๋๋ฐ๋๋จ๋๋ ๋ผ๋๋ด๋๋ฌ๋๋ค๋๋๋ธ๋๋ฐ๋๋จ๋๋ ๋ผ๋๋ด๋๋ฌ๋๋ค๋๋๋ธ๋๋ฐ๋๋จ๋๋ ๋ผ๋๋ด๋๋ฌ๋ ๋ ค๋ก๋ก๋กธ๋ข๋ขฐ๋ฃ๋ฃจ๋ค๋ค ๋คผ๋ฅ๋ฅด๋ฆ๋ฆฌ๋ง๋งค๋จ๋จ๋จธ๋ฉ๋ฉฐ๋ช๋ชจ๋ซ๋ซ ๋ซผ๋ฌ๋ฌด๋ญ๋ญฌ๋ฎ๋ฎค๋ฏ๋ฏ๋ฏธ๋ฐ๋ฐฐ๋ฑ๋ฑจ๋ฒ๋ฒ ๋ฒผ๋ณ๋ณด๋ด๋ดฌ๋ต๋ตค๋ถ๋ถ๋ถธ๋ท๋ทฐ๋ธ๋ธจ๋น๋น "
            .. "๋นผ๋บ๋บด๋ป๋ปฌ๋ผ๋ผค๋ฝ๋ฝ๋ฝธ๋พ๋พฐ๋ฟ๋ฟจ์์ ์ผ์์ด์์ฌ์์ค์์์ธ์ ์ ฐ์์จ์์ ์ผ์์ด์์ฌ์์ค์์์ธ์์ฐ์์จ์์ ์ผ์์ด์์ฌ์์ค์์์ธ์์ฐ์์จ์์ ์ผ์์ด์์ฌ์์ค์์์ธ์์ฐ์์จ์์ ์ผ์์ด์์ฌ์์ค์ ์ ์ ธ์ก์กฐ์ข์ขจ์ฃ์ฃ ์ฃผ์ค์คด์ฅ์ฅฌ์ฆ์ฆค์ง์ง์งธ์จ์จฐ์ฉ์ฉจ์ช์ช ์ชผ์ซ์ซด์ฌ์ฌฌ์ญ์ญค์ฎ์ฎ์ฎธ์ฏ์ฏฐ์ฐ์ฐจ์ฑ์ฑ ์ฑผ์ฒ์ฒด์ณ์ณฌ์ด์ดค์ต์ต์ตธ์ถ์ถฐ์ท์ทจ์ธ์ธ ์ธผ์น์นด์บ์บฌ์ป์ปค์ผ์ผ์ผธ์ฝ์ฝฐ์พ์พจ์ฟ์ฟ ์ฟผํํดํํฌํํคํํํธํํฐํ ํ จํํ ํผํํดํํฌํํคํํํธํํฐํํจํํ ํผํํดํํฌํํคํํํธํํฐํํจํํ ํผํํดํํฌํํคํํํธํํฐํํจํํ ํผํํดํํฌํํฐ-ํ][%$%%%*@%^_`][แจ-แฟํ-ํป]") then
        error("Do not insert symbol within single syllabic block")
    end

    text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders
    text = gsub(text, "[%$%%%*@%^_`]", "") -- removing non-escaped special chars
    text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII

    text = mw.text.unstrip(text) -- unstripping test

    return text
end

return p
Summary:
Please note that all contributions to Cultopedia may be edited, altered, or removed by other contributors. If you do not want your writing to be edited mercilessly, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see
Cultopedia:Copyrights
for details).
Do not submit copyrighted work without permission!
Cancel
Editing help
(opens in new window)
Template used on this page:
Module:Ko-translit/doc
(
edit
)
Search
Search
Editing
Module:Ko-translit
Add topic