-- Módulo:translit/ru
-- (wiki page chrome: "Apariencia")
-- The documentation for this module can be created at Módulo:translit/ru/doc
local export = {}
local concat = table.concat
local insert = table.insert
local ipairs = ipairs
local remove = table.remove
local select = select
local m_str = require("Módulo:String")
local strexplode = m_str.explode_utf8
local strfind = m_str.find
local strsubn = m_str.gsub
local strsplit = m_str.split
local strnfc = m_str.toNFC
local strnfd = m_str.toNFD
local u = m_str.char
-- ru.common: combining marks, private-use placeholders and character classes.
-- NOTE(review): several of these (the placeholders and most of the character
-- class strings) are not referenced in the visible part of this file;
-- presumably they mirror a shared "ru.common" module -- confirm before
-- removing any of them.
local AC = u(0x0301) -- acute = ́
local GR = u(0x0300) -- grave = ̀
local CFLEX = u(0x0302) -- circumflex = ̂
local BREVE = u(0x0306) -- breve ̆
local DIA = u(0x0308) -- diaeresis = ̈
local CARON = u(0x030C) -- caron ̌
local OGONEK = u(0x0328) -- ogonek ̨
local DUBGR = u(0x030F) -- double grave = ̏
local DOTABOVE = u(0x0307) -- dot above = ̇
local DOTBELOW = u(0x0323) -- dot below = ̣
local PSEUDOVOWEL = u(0xFFF1) -- pseudovowel placeholder
local PSEUDOCONS = u(0xFFF2) -- pseudoconsonant placeholder
local TEMPCFLEX = u(0xFFF3) -- placeholder to be converted to a circumflex
local TEMPSUB = u(0xFFF4) -- miscellaneous temporary placeholder
-- the combining marks treated as accents: acute, grave, diaeresis, breve,
-- caron and ogonek
local accent = AC .. GR .. DIA .. BREVE .. CARON .. OGONEK
-- Lua pattern matching zero or more of those accents
local opt_accent = "[" .. accent .. "]*"
-- any composed Cyrillic vowel with grave accent
local composed_grave_vowel = "ѐЀѝЍ"
-- any Cyrillic vowel except ёЁ (also includes the pseudovowel placeholder)
local vowel_no_jo = "аеиоуяэыюіѣѵАЕИОУЯЭЫЮІѢѴ" .. PSEUDOVOWEL .. composed_grave_vowel
-- any Cyrillic vowel, including ёЁ
local vowel = vowel_no_jo .. "ёЁ"
-- any vowel in transliteration (also includes the pseudovowel placeholder)
local tr_vowel = "aeěɛiouyAEĚƐIOUY" .. PSEUDOVOWEL
-- any consonant in transliteration, omitting soft/hard sign (also includes
-- the pseudoconsonant placeholder)
local tr_cons_no_sign = "bcčdfghjklmnpqrsštvwxzžBCČDFGHJKLMNPQRSŠTVWXZŽ" .. PSEUDOCONS
-- any consonant in transliteration, including soft/hard sign
local tr_cons = tr_cons_no_sign .. "ʹʺ"
-- Lua pattern for any consonant in transliteration, including soft/hard sign,
-- optionally followed by any accent(s)
local tr_cons_acc_re = "[" .. tr_cons .. "]" .. opt_accent
-- any Cyrillic consonant except sibilants and ц (the soft and hard signs and
-- the pseudoconsonant placeholder are included)
local cons_except_sib_c = "бдфгйклмнпрствхзьъБДФГЙКЛМНПРСТВХЗЬЪ" .. PSEUDOCONS
-- Cyrillic sibilant consonants
local sib = "шщчжШЩЧЖ"
-- Cyrillic sibilant consonants and ц
local sib_c = sib .. "цЦ"
-- any Cyrillic consonant
local cons = cons_except_sib_c .. sib_c
-- Cyrillic velar consonants
local velar = "кгхКГХ"
-- uppercase Cyrillic letters (the string lists vowels as well as consonants)
local uppercase = "АЕИОУЯЭЫЁЮІѢѴБДФГЙКЛМНПРСТВХЗЬЪШЩЧЖЦ"
-- Lookup table mapping a base letter followed by one combining mark to the
-- corresponding precomposed character. It is passed as the replacement table
-- to the substitution in decompose(), so that letters which should be treated
-- as units are put back together after NFD normalization.
-- NOTE(review): the pattern used in decompose() only matches a character
-- followed by BREVE, DIA, CARON or OGONEK, so the two CFLEX entries below are
-- never looked up from there -- confirm whether that is intended.
local recomposer = {
-- Cyrillic letters
["е" .. DIA] = "ё",
["Е" .. DIA] = "Ё",
["и" .. BREVE] = "й",
["И" .. BREVE] = "Й",
["і" .. DIA] = "ї",
["І" .. DIA] = "Ї",
-- Latin letters
["c" .. CARON] = "č",
["C" .. CARON] = "Č",
["e" .. CARON] = "ě",
["E" .. CARON] = "Ě",
["o" .. CARON] = "ǒ",
["O" .. CARON] = "Ǒ",
["o" .. OGONEK] = "ǫ",
["O" .. OGONEK] = "Ǫ",
["s" .. CARON] = "š",
["S" .. CARON] = "Š",
["z" .. CARON] = "ž",
["Z" .. CARON] = "Ž",
-- the remaining entries are used in ru-pron (per the original note):
["ж" .. BREVE] = "ӂ",
["Ж" .. BREVE] = "Ӂ",
["j" .. CFLEX] = "ĵ",
["J" .. CFLEX] = "Ĵ",
["j" .. CARON] = "ǰ",
-- no composed uppercase equivalent of J-caron
["ʒ" .. CARON] = "ǯ",
["Ʒ" .. CARON] = "Ǯ",
}
-- Split accented letters (esp. Latin ones) into an individual base character
-- plus combining accent by NFD-normalizing the text, then glue back together
-- those Cyrillic and Latin characters that we want to treat as units and that
-- got caught in the crossfire (see `recomposer`). We mostly want acute and
-- grave decomposed; perhaps we should just explicitly decompose those and no
-- others.
--   text  the string to process
-- Returns the processed string.
local function decompose(text)
	-- any character followed by a breve, diaeresis, caron or ogonek is a
	-- candidate for recomposition; pairs missing from `recomposer` are kept
	local recomposable = ".[" .. BREVE .. DIA .. CARON .. OGONEK .. "]"
	local result = strsubn(strnfd(text), recomposable, recomposer)
	return result
end
-- Byte-level Lua pattern matching one UTF-8 encoded character: a lead byte
-- followed by any number of continuation bytes.
local UTF8 = "[%z\1-\127\194-\244][\128-\191]*"
-- Short aliases for the breve and the diaeresis.
local BR = BREVE
local DI = DIA
-- All combining diacritics that may occur inside a word.
local DIACRITICS = concat({
	AC,
	GR,
	BR,
	DI,
	CFLEX, -- circumflex (U+0302)
	u(0x0304), -- macron
	DOTABOVE, -- dot above (U+0307)
	u(0x030A), -- ring above
	CARON, -- caron (U+030C)
	DUBGR, -- double grave (U+030F)
	DOTBELOW, -- dot below (U+0323)
	OGONEK, -- ogonek (U+0328)
})
-- substitute to prevent g from changing to v
local TEMP_G = u(0xFFF1)
-- Contents of the character class describing what belongs to a word:
-- letters, the apostrophe ’, round and square brackets, and the diacritics.
local word_chars = "%a’%(%)%[%]" .. DIACRITICS
-- "If not empty": returns `x` unchanged, except that the empty string (and any
-- falsy value) is turned into nil.
local function ine(x)
	if x == "" or not x then
		return nil
	end
	return x
end
-- Main letter conversion table: maps one Cyrillic character (or Russian-style
-- quotation mark) to its default transliteration. Characters without an entry
-- are copied to the output unchanged by do_iteration().
local letters = {
["а"] = "a", ["б"] = "b", ["в"] = "v", ["г"] = "g", ["д"] = "d", ["е"] = "je", ["ж"] = "ž", ["з"] = "z", ["и"] = "i", ["й"] = "j", ["к"] = "k", ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "o", ["п"] = "p", ["р"] = "r", ["с"] = "s", ["т"] = "t", ["у"] = "u", ["ф"] = "f", ["х"] = "x", ["ц"] = "c", ["ч"] = "č", ["ш"] = "š", ["щ"] = "šč", ["ъ"] = "ʺ", ["ы"] = "y", ["ь"] = "ʹ", ["э"] = "e", ["ю"] = "ju", ["я"] = "ja",
["А"] = "A", ["Б"] = "B", ["В"] = "V", ["Г"] = "G", ["Д"] = "D", ["Е"] = "Je", ["Ж"] = "Ž", ["З"] = "Z", ["И"] = "I", ["Й"] = "J", ["К"] = "K", ["Л"] = "L", ["М"] = "M", ["Н"] = "N", ["О"] = "O", ["П"] = "P", ["Р"] = "R", ["С"] = "S", ["Т"] = "T", ["У"] = "U", ["Ф"] = "F", ["Х"] = "X", ["Ц"] = "C", ["Ч"] = "Č", ["Ш"] = "Š", ["Щ"] = "Šč", ["Ъ"] = "ʺ", ["Ы"] = "Y", ["Ь"] = "ʹ", ["Э"] = "E", ["Ю"] = "Ju", ["Я"] = "Ja",
-- Russian style quotes
["«"] = "“", ["»"] = "”",
-- archaic, pre-1918 letters
["і"] = "i", ["ѳ"] = "f", ["ѣ"] = "jě", ["ѵ"] = "i",
["І"] = "I", ["Ѳ"] = "F", ["Ѣ"] = "Jě", ["Ѵ"] = "I",
-- archaic, pre-1708 letters (most of these are covered by aliases below)
["ѥ"] = "je", ["ѯ"] = "ks", ["ѱ"] = "ps",
["Ѥ"] = "Je", ["Ѯ"] = "Ks", ["Ѱ"] = "Ps",
}
-- Treat most archaic letters as aliases of modern ones; they are replaced
-- before any other processing. Exceptions:
-- ѥ is not the same as е, because it doesn't lose iotation after a consonant.
-- ѯ and ѱ can't be treated as aliases, because mapping 1 character to 2 can
-- cause the logic which checks the capitalization of adjacent letters to
-- become unreliable. This only affects the uppercase forms, but the lowercase
-- forms are also excepted for consistency.
-- The table also maps the straight apostrophe ' to the typographic one ’.
local aliases = {
["є"] = "е", ["ꙁ"] = "з", ["ꙃ"] = "з", ["ѕ"] = "з", ["ї"] = "і", ["ꙋ"] = "у", ["ѡ"] = "о", ["ѿ"] = "о", ["ꙑ"] = "ы", ["ꙗ"] = "я", ["ѧ"] = "я", ["ѫ"] = "у", ["ѩ"] = "я", ["ѭ"] = "ю",
["Є"] = "Е", ["Ꙁ"] = "З", ["Ꙃ"] = "З", ["Ѕ"] = "З", ["Ї"] = "І", ["Ꙋ"] = "У", ["Ѡ"] = "О", ["Ѿ"] = "О", ["Ꙑ"] = "Ы", ["Ꙗ"] = "Я", ["Ѧ"] = "Я", ["Ѫ"] = "У", ["Ѩ"] = "Я", ["Ѭ"] = "Ю", ["'"] = "’"
}
-- "Plain" transliterations of the e-like vowels; do_iteration() uses these
-- instead of the `letters` entries when check_plain() says the vowel is plain
-- or when the vowel starts a suffix.
local plain_e = {
["е"] = "e", ["ѣ"] = "ě", ["э"] = "ɛ",
["Е"] = "E", ["Ѣ"] = "Ě", ["Э"] = "Ɛ"
}
-- Transliterations of the "jo" letters (a base letter carrying a diaeresis);
-- looked up by is_jo_letter().
local jo_letters = {
["ё"] = "jo", ["ѣ̈"] = "jǒ", ["я̈"] = "jǫ",
["Ё"] = "Jo", ["Ѣ̈"] = "Jǒ", ["Я̈"] = "Jǫ"
}
-- Characters counted as vowels (Cyrillic plus a number of Latin/IPA letters);
-- used to count the vowels of a word and as part of the "plain e" check.
local vowels = "аеиіоуыѣэюяѥѵaæɐeəɛiɪɨoɵuyʊʉАЕИІОУЫѢЭЮЯѤѴAEƐIOUY"
-- Apply transformations to the Cyrillic to more closely match pronunciation.
-- Return two arguments: the "original" text (after alias normalization and
-- decomposition of accents), and the transformed text. If the two are
-- different, {{ru-IPA}} should display a "phonetic respelling" notation.
--   text      the Cyrillic text to transform
--   noadj     if truthy, disables special-casing for adjectives in -го
--   noshto    if truthy, disables special-casing for что and related words
--   forceadj  if truthy, forces special-casing for adjectives, including
--             those in -аго (pre-reform spelling), and disables checking
--             for exceptions (e.g. много, ого)
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
-- normalize any aliases (byte-level gsub: every UTF-8 character is looked up
-- in the `aliases` table and replaced if it has an entry)
text = text:gsub(UTF8, aliases) -- strsubn does not work here (original note)
-- decompose stress accents without decomposing letters we want to treat
-- as units (e.g. й or ё)
text = decompose(text)
local origtext = text
-- the text:find() test is only an optimization: all the -го handling is
-- skipped when the text contains no lowercase "го" at all.
if not noadj and text:find("го") then
-- replacement callback shared by the -го -> -во substitutions: keeps the
-- captured vowel (e), swaps г/Г for в/В, keeps the о (o) and, when present,
-- the reflexive ending (sja)
local v = {["г"] = "в", ["Г"] = "В"}
local repl = function(e, g, o, sja) return e .. v[g] .. o .. (sja or "") end
-- Handle какого-нибудь/-либо/-то; must be done first because of an exception
-- made for бого-, снего-, etc.
text = strsubn(text, "([кКтТ][аА][кК][оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО]%-)", repl)
if not forceadj then
-- Protect one exceptional word from the -го -> -во change. `case` is the
-- accented spelling of the word (it may contain a [..] character set); a
-- pattern is built from it in which the first letter matches in either
-- case, every acute/grave accent becomes optional, and the г of the final
-- "го" sits between two captures. A `case` ending in "-" must be followed
-- by a literal hyphen; otherwise the match must end at a word boundary.
-- The г is then replaced by TEMP_G, which the later patterns do not match.
local function go(text, case)
local pattern = strsubn(case, "^(.)(.*)(го[" .. AC .. GR .. "]?)(%-?)$", function(m1, m2, m3, m4)
-- m1: first letter, matched at the start of a word in either case
m1 = "%f[%a" .. AC .. GR .. "]([" .. m1:uupper() .. m1 .. "]"
-- m2: middle of the word; \204\128 and \204\129 are the UTF-8 bytes of the
-- combining grave and acute accents, which are made optional
m2 = m2:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?") .. ")"
-- m3: the final "го" (plus optional accent); a new capture is opened right
-- after the г so that the г itself falls between capture 1 and capture 2
m3 = m3:gsub("\204[\128\129]", "[" .. AC .. GR .. "]?")
:gsub("^г(.*)", "г(%1")
-- m4: either a literal hyphen inside the capture, or an end-of-word frontier
m4 = m4 == "-" and "%-)" or ")%f[^%a" .. AC .. GR .. "]"
return m1 .. m2 .. m3 .. m4
end)
return strsubn(text, pattern, "%1" .. TEMP_G .. "%2")
end
-- adverbs and short forms in -го that keep their г
for _, case in ipairs{"мно́го", "н[еа]мно́го", "до́рого", "недо́рого", "стро́го", "нестро́го", "на́строго", "убо́го", "пол[ао]́го"} do
text = go(text, case)
end
-- check for neuter short forms of compound adjectives in -но́гий
if strfind(text, "но[" .. AC .. GR .. "]?го%f[^%a" .. AC .. GR .. "]") then
for _, case in ipairs{"безно́го", "босоно́го", "веслоно́го", "длинноно́го", "двуно́го", "коротконо́го", "кривоно́го", "одноно́го", "пятино́го", "трёхно́го", "трехно́го", "хромоно́го", "четвероно́го", "шестино́го"} do
text = go(text, case)
end
end
-- further exceptional words, plus the combining forms аго-, его-, ого-
for _, case in ipairs{"ого́", "го́го", "ваго́го", "ло́го", "п[ео]́го", "со́го", "То́го", "ле́го", "игого́", "огого́", "альбиньязего", "д[иі]е́го", "бо́лого", "гр[иі]е́го", "манче́го", "пичис[иі]е́го", "тенкодого", "хио́го", "аго-", "его-", "ого-"} do
text = go(text, case)
end
end
--handle genitive/accusative endings, which are spelled -ого/-его/-аго
-- (-ogo/-ego/-ago) but transliterated -ovo/-evo/-avo; only for adjectives
-- and pronouns, excluding words like много, ого (-аго occurs in
-- pre-reform spelling); \204\129 is an acute accent, \204\128 is a grave accent
local pattern = "([оеОЕ" .. (forceadj and "аА" or "") .. "][" .. AC .. GR .. "]?)([гГ])([оО][" .. AC .. GR .. "]?)"
local reflexive = "([сС][яЯ][" .. AC .. GR .. "]?)"
-- word-final ending first, then the same ending followed by reflexive -ся
text = strsubn(text, pattern .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
text = strsubn(text, pattern .. reflexive .. "%f[^%a" .. AC .. GR .. TEMP_G .. "]", repl)
-- handle сегодня
text = strsubn(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дня)%f[^%a" .. AC .. GR .. "]", "%1в%2")
-- handle сегодняшн-
text = strsubn(text, "%f[%a" .. AC .. GR .. "]([Сс]е)г(о[" .. AC .. GR .. "]?дняшн)", "%1в%2")
-- replace TEMP_G with g; must be done after the -go -> -vo changes
text = strsubn(text, TEMP_G, "г")
end
-- the text:find() test is only an optimization: the что handling is skipped
-- when the text contains no lowercase "то" at all.
if not noshto and text:find("то") then
local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
-- Handle что
text = strsubn(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]",
function(ch, to) return ch2sh[ch] .. to end)
-- Handle чтобы, чтоб
text = strsubn(text, "%f[%a" .. AC .. GR .. "]([Чч])(то[" .. AC .. GR .. "]?бы?)%f[^%a" .. AC .. GR .. "]",
function(ch, to) return ch2sh[ch] .. to end)
-- Handle ничто
text = strsubn(text, "%f[%a" .. AC .. GR .. "]([Нн]и)ч(то[" .. AC .. GR .. "]?)%f[^%a" .. AC .. GR .. "]", "%1ш%2")
end
-- Handle мягкий, лёгкий, легчать, etc. (г becomes х before к or ч)
text = strsubn(text, "([МмЛл][яеё][" .. AC .. GR .. "]?)г([кч])", "%1х%2")
return origtext, text
end
do
-- Return the closest character before position `i` in the character array
-- `word`, skipping diacritics, round brackets and the apostrophe ’.
-- Returns nil when there is no such character.
local function get_prev_char(word, i)
	local skipped = DIACRITICS .. "()’"
	local pos = i - 1
	local ch = word[pos]
	while ch and skipped:find(ch, 1, true) do
		pos = pos - 1
		ch = word[pos]
	end
	return ch
end
-- Return the closest character after position `i` in the character array
-- `word`, skipping round brackets. Returns nil when there is none.
-- Side effect: if that character is и/И directly followed by a combining
-- breve, the two array entries are merged in place into the composed й/Й
-- (which is also what gets returned), so that it is not wrongly treated as
-- a vowel.
local function get_next_char(word, i)
	local pos = i + 1
	local ch = word[pos]
	while ch == "(" or ch == ")" do
		pos = pos + 1
		ch = word[pos]
	end
	local is_i = ch == "и" or ch == "И"
	if is_i and word[pos + 1] == BR then
		remove(word, pos + 1)
		ch = strnfc(ch .. BR)
		word[pos] = ch
	end
	return ch
end
-- Check if a vowel should be made "plain" (usually by removing the "j"
-- in the transliteration), based on the character that precedes it.
--   this      the current character
--   prev      the previous character (nil at the start of a word)
--   check     a string listing the characters `prev` is tested against
--   in_check  the value to return when `prev` occurs in `check`
-- Returns `in_check` if `prev` occurs in `check` and `not in_check` if it
-- does not. Returns nothing (nil) if there is no `prev`, or if `this` and
-- `prev` are both uppercase (on the assumption the term is an initialism).
-- Note: We check both because of terms like Романо-д’Эццелино and
-- Комон-л’Эванте, where an uppercase `this` follows a lowercase `prev`,
-- (since the apostrophe is ignored).
local function check_plain(this, prev, check, in_check)
	if not prev then
		return
	end
	-- both uppercase: treat as an initialism and make no decision
	if this ~= this:ulower() and prev ~= prev:ulower() then
		return
	end
	-- `prev` must be searched for literally: it can be "[" or "]" (both are
	-- word characters), and string.match has no "plain" flag, so the previous
	-- check:match(prev, 1, true) interpreted it as a pattern and raised a
	-- "malformed pattern" error for "[".
	if check:find(prev, 1, true) then
		return in_check
	end
	return not in_check
end
-- Special-case conversion of the jo letters (ё, ѣ̈, я̈).
--   this    the current (composed) character
--   prev    the previous character, or nil
--   output  array of output pieces; the transliteration is appended to it
--   word    character array of the current word
--   d       per-word state; `d.i` is read and `d.final_jo` may be set
-- Returns true if `this` was a jo letter and has been handled; otherwise
-- returns nothing and leaves `output` untouched.
local function is_jo_letter(this, prev, output, word, d)
	local translit = jo_letters[this]
	if translit then
		-- Drop the "j" if preceded by a hushing consonant (ж ч ш щ); the
		-- remainder is uppercased when the letter itself is uppercase.
		if check_plain(this, prev, "жчшщЖЧШЩ", true) then
			translit = translit:sub(2)
			if this == this:uupper() then
				translit = translit:uupper()
			end
		end
		output[#output + 1] = translit
		-- Remember where the jo went, so it can be given an implicit primary
		-- stress later if necessary -- unless it already carries secondary
		-- stress (a grave). It shouldn't ever come after primary stress, but
		-- just in case it does we must not override it or give the jo two
		-- stress marks.
		if word[d.i + 1] ~= GR then
			d.final_jo = #output
		end
		return true
	end
end
-- Transliterate the character at `word[d.i]` and append the result (if any)
-- to `output`.
--   output  array of output pieces being accumulated
--   word    character array of the current word (may be modified in place)
--   d       per-word state: `i` (current index, advanced here when a
--           combining mark is consumed), `vowels` (vowel count), `primary`
--           (explicit primary stress seen), `dash_before`, `final_jo`
local function do_iteration(output, word, d)
	-- Current, previous and next characters. Brackets are skipped over, and
	-- diacritics are ignored for the previous character (which simplifies
	-- the checks below).
	local cur = word[d.i]
	local before = get_prev_char(word, d.i)
	local after = get_next_char(word, d.i)

	-- Count the vowels: a word is monosyllabic if it has only one.
	if vowels:find(cur, 1, true) then
		d.vowels = d.vowels + 1
	end

	local is_ju = cur == "ю" or cur == "Ю"
	local is_hard_sign = cur == "ъ" or cur == "Ъ"

	if after == DI or after == BR then
		-- Consume the combining diaeresis/breve and compose it with the
		-- current letter; only a diaeresis can produce a jo letter.
		d.i = d.i + 1
		cur = strnfc(cur .. after)
		if after == DI and is_jo_letter(cur, before, output, word, d) then
			return
		end
	elseif cur == AC then
		-- Note that explicit stress has been found, which prevents any
		-- implicit stress from being added for jos.
		d.primary = true
	elseif plain_e[cur] and (
		check_plain(cur, before, vowels .. "ъьЪЬʹʺ", false) or
		not before and d.dash_before
	) then
		-- After a lowercase consonant or at the start of a suffix, е becomes
		-- e, ѣ becomes ě and э becomes ɛ.
		insert(output, plain_e[cur])
		return
	elseif is_ju and check_plain(cur, before, "жшЖШ", true) then
		-- ю becomes u if preceded by ж or ш.
		insert(output, cur == "ю" and "u" or "U")
		return
	elseif (
		cur == "ѵ" and
		before and ("аеиіѣэяѥaæɐeəɛiɪɨАЕИІѢЭЯѤAEƐI"):find(before, 1, true)
	) then
		-- Make lowercase izhitsa display as -v- after /a/, /e/ and /i/
		-- (matching the equivalent Greek digraphs αυ, ευ and ηυ).
		cur = "в"
		word[d.i] = "в"
	elseif is_hard_sign and d.i == #word then
		-- Ignore word-final hard signs.
		return
	end

	-- Default: table lookup, falling back to the character itself.
	insert(output, letters[cur] or cur)
end
-- Transliterate after the pronunciation-related transformations of
-- export.apply_tr_fixes() have been applied. Called from {{ru-IPA}}.
--   text       the Cyrillic text
--   jo_accent  as in export.tr(): "none" never adds an implicit stress to a
--              jo letter, "mono" also adds it in monosyllabic words
-- Returns the transliteration, NFC-normalized.
function export.tr_after_fixes(text, jo_accent)
	-- normalize any aliases, then compose
	text = strnfc((text:gsub(UTF8, aliases)))

	local output = {}

	-- Handles one (non-word run, word) pair: the non-word characters are
	-- copied to the output as they are, the word is transliterated.
	local function transliterate_word(before, word)
		for _, ch in ipairs(strexplode(before)) do
			output[#output + 1] = ch
		end

		-- FIXME: Do this in one loop instead of splitting by word.
		local chars = strexplode(strnfd(word))
		local d = { i = 0, vowels = 0 }

		-- The word counts as a suffix if it is directly preceded by a hyphen
		-- which is itself at the very start or follows whitespace
		-- ("^-" or " -").
		local last = #output
		if output[last] == "-" then
			local before_dash = output[last - 1]
			if not before_dash or strfind(before_dash, "%s") then
				d.dash_before = true
			end
		end

		-- #chars is re-read on every pass because do_iteration() can both
		-- advance d.i and shorten the array.
		while d.i < #chars do
			d.i = d.i + 1
			do_iteration(output, chars, d)
		end

		-- Add an implicit primary stress to a jo (if applicable).
		-- Jos do not implicitly take stress accents if an explicit primary
		-- stress is given. Otherwise, the final jo which doesn't have
		-- secondary stress takes primary stress.
		-- Prefixes do not take implicit primary stress.
		-- Primary stress will be shown on monosyllables if either they
		-- are a suffix or `jo_accent` is "mono".
		-- NOTE(review): word_chars contains no hyphen, so the comparison of
		-- the last character with "-" looks like it can never be true here;
		-- the rule about prefixes may therefore not take effect -- confirm.
		local has_jo = jo_accent ~= "none" and d.final_jo
		local suppressed = d.primary or chars[#chars] == "-"
		local eligible = jo_accent == "mono" or d.vowels > 1 or d.dash_before
		if has_jo and not suppressed and eligible then
			output[d.final_jo] = output[d.final_jo] .. AC
		end
	end

	-- Note: We use ustring gsub because ustring gmatch is bugged, and
	-- it's easy to make gsub do the same thing. The result of the gsub
	-- itself is discarded; only the callback's side effects matter.
	strsubn(text, "([^" .. word_chars .. "]*)([" .. word_chars .. "]*)", transliterate_word)

	return strnfc(concat(output))
end
end
-- Transliterates `text`, which should be a single word or phrase. It should
-- include stress marks, which are then preserved in the transliteration.
--   text       the Cyrillic text
--   lang       language code; empty or nil counts as "ru"
--   sc         script code (not used by this function)
--   jo_accent  controls the accent on ё, which is a special case: it is
--              rendered (j)ó in multisyllabic words and monosyllabic words in
--              multi-word phrases, but (j)o without an accent in isolated
--              monosyllabic words. If set to "mono", monosyllabic words will
--              also be given as (j)ó (this is used in conjugation and
--              declension tables); if set to "none", it will always be
--              rendered (j)o.
--   noadj      disables special-casing for adjectives in -го
--   noshto     disables special-casing for что and related words
--   forceadj   forces special-casing for adjectives and disables checking
--              for exceptions (e.g. много)
-- As a special case, if `lang` is a language other than "ru", then none of
-- the special transformations are applied, and `jo_accent` is set to "none".
-- This is for situations which require Russian transcriptions of Cyrillic,
-- but where the special cases don't make sense (e.g. the Cyrillization of
-- Mandarin, or pidgins such as Russenorsk).
function export.tr(text, lang, sc, jo_accent, noadj, noshto, forceadj)
	local code = ine(lang) or "ru"
	if code == "ru" then
		-- only the second result (the transformed text) is wanted
		local _, fixed = export.apply_tr_fixes(text, noadj, noshto, forceadj)
		return export.tr_after_fixes(fixed, jo_accent)
	end
	return export.tr_after_fixes(text, "none")
end
-- Transliterate with various special-case substitutions.
--   text       the Cyrillic text, or a frame-like table when called directly
--              from a template (the values are then read from `text.args`:
--              [1], jo_accent, noadj, noshto and sub; empty values count as
--              not given)
--   jo_accent  as in export.tr()
--   noadj      disables special-casing for adjectives in -го
--   noshto     disables special-casing for что and related words
--   sub        arbitrary substitutions applied to the Cyrillic text before
--              other transformations and before translit, of the form
--              FROM/TO,FROM/TO,...
--   forceadj   forces special-casing for adjectives and disables checking
--              for exceptions (e.g. много)
function export.tr_sub(text, jo_accent, noadj, noshto, sub,
	forceadj)
	if type(text) == "table" then -- called directly from a template
		local args = text.args
		jo_accent = ine(args.jo_accent)
		noadj = ine(args.noadj)
		noshto = ine(args.noshto)
		sub = ine(args.sub)
		text = args[1]
	end

	if sub then
		-- apply the FROM/TO pairs one after the other, in the order given
		for _, pair in ipairs(strsplit(sub, ",")) do
			local from_to = strsplit(pair, "/")
			text = strsubn(text, from_to[1], from_to[2])
		end
	end

	return export.tr(text, nil, nil, jo_accent, noadj, noshto, forceadj)
end
-- Transliterate adjectives and pronouns.
--   text       the Cyrillic text, or a frame-like table when called directly
--              from a template (the text is then `text.args[1]` and
--              `jo_accent` is read from `text.args.jo_accent`)
--   jo_accent  as in export.tr()
function export.tr_adj(text, jo_accent)
	local from_template = type(text) == "table"
	if from_template then
		local args = text.args
		jo_accent, text = ine(args.jo_accent), args[1]
	end
	-- "forceadj" has to be passed because typically when tr_adj() is called
	-- from the noun or adjective modules, it's called with suffix ого, which
	-- would otherwise trigger the exceptional case and be transliterated as
	-- ogo. The что special-casing is switched off ("noshto").
	return export.tr(text, nil, nil, jo_accent, false, "noshto", "forceadj")
end
return export