Jump to content

Module:wuu-pron

Wiktionary වෙතින්
09:59, 3 අප්‍රේල් 2024 වන විට Lee (සාකච්ඡාව | දායකත්ව) (en:Module:wuu-pron වෙතින් එක් සංශෝධනයක්) විසින් සිදු කර ඇති සංශෝධන

Generates IPA based on romanisation for Wu Chinese. See {{zh-pron}}. Rimes and tones found in Module:wuu-pron/data.

Lua දෝෂය 519 පේලියේදී: Wugniu: prefix "cs" is not recognized.


local export = {}
local data = mw.loadData("Module:wuu-pron/data")

--[=[
TODO:
- do IPA for glottalised nasal initials (currently the glottal stop is dropped)
- FIND DATA FOR 3+ SYLLABLE SANDHI AND RPS!!! (we can settle for trisyllabics right)
- modules for taizhou, wenzhou
]=]--

-- Display names of the supported lects, keyed by the location prefix used in the input.
-- The columns on the right are a coverage matrix (no legend is given here;
-- presumably "+" = available, "-" = missing, "?" = unsure).
local loc_names = {   -- Basic 聲韻調    Disyll LPS    Trisyll LPS    Quad+ LPS    RPS    Logic
	sh = "Shanghai",  --      +               +             +             +         +       +
	sj = "Songjiang", --      +               +             +             -         -       +
	cm = "Chongming", --      +               +             +             -         +       +
	sz = "Suzhou",    --      +               +             +             +         +       +
--  ks = "Kunshan",           +               +             -             -         -       -    Wugniu "新派"
	cz = "Changzhou", --      +               +             +             -         -       ?
	jx = "Jiaxing",   --      +               +             +             -         -       +
	tx = "Tongxiang", --      +               +             +             -         -       +
	hn = "Haining",   --      +               +             +             -         +       +    Xiashi locality
	hy = "Haiyan",    --      +               +             +             -         -       +
	hz = "Hangzhou",  --      +               +             +             +         +       +
	sx = "Shaoxing",  --      +               +             +             -         +       +
	nb = "Ningbo",    --      +               +             +             -         +       +
}

-- Wikipedia article title for each lect, keyed by location prefix.
-- A lect without an entry here falls back to "<loc_name> dialect".
local wiki_names = {
	["sh"] = "Shanghainese",
	["sj"] = "Taihu Wu",
	["jx"] = "Taihu Wu",
	["tx"] = "Taihu Wu",
	["hn"] = "Taihu Wu",
	["hy"] = "Taihu Wu",
}

-- Set of location prefixes for which a MiniDict romanisation line is generated.
local minidict = {
	sh = true,
	cm = true,
	sz = true,
	cz = true,
	jx = true,
	hz = true,
	sx = true,
	nb = true,
}

-- Fixed order in which lects are listed in the generated output.
local order = {
	"sh", "sj", "cm", "sz", "cz", "jx",
	"tx", "hn", "hy", "hz", "sx", "nb",
}

-- Default IPA value of each Wugniu initial; per-lect exceptions live in
-- ipa_initial_override. The empty string stands for a zero initial.
local ipa_initial = {
	-- labials
	p = "p", ph = "pʰ", b = "b", m = "m", f = "f", v = "v",
	-- alveolars
	t = "t", th = "tʰ", d = "d", n = "n", l = "l",
	-- sibilants and palatals
	ts = "t͡s", tsh = "t͡sʰ", s = "s", z = "z", c = "t͡ɕ", ch = "t͡ɕʰ",
	dz = "d͡z", j = "d͡ʑ", gn = "n̠ʲ", sh = "ɕ", zh = "ʑ",
	-- velars and glottals
	k = "k", kh = "kʰ", g = "ɡ", ng = "ŋ", h = "h", gh = "ɦ",
	-- zero initial
	[""] = "",
}

-- Per-lect IPA values for initials.
--  * entries here always take priority over ipa_initial
--  * initials unique to a lect can also be defined here
--  * a question mark means the initial does not exist in that lect
--  * every location needs a table, even an empty one
local ipa_initial_override = {
	sh = { dz = "?" },
	sj = {
		p = "ɓ", t = "ɗ",
		f = "ɸ", v = "β",
		ch = "cʰ", c = "c", j = "ɟ", sh = "ç",
		zh = "?", dz = "?",
	},
	cm = { v = "fv", z = "sz", zh = "ɕʑ", gh = "hɦ" },
	sz = { dz = "?", zh = "?" },
	--[[ks = { zh = "?", h = "x" },]]
	cz = {},
	jx = { dz = "?", vh = "ʔv" },
	tx = {},
	hn = {},
	hy = { zh = "?" },
	hz = { zh = "?" },
	sx = {},
	nb = {},
}

-- Looks up the IPA for a Wugniu initial in the given lect.
-- The lect-specific override table is consulted first, then the shared table;
-- an initial found in neither raises an error.
local function get_initial(initial, loc)
	local lect_specific = ipa_initial_override[loc][initial]
	if lect_specific then
		return lect_specific
	end
	local shared = ipa_initial[initial]
	if shared then
		return shared
	end
	return error('Invalid initial: "' .. initial .. '"')
end

-- Looks up the IPA for a Wugniu final in the given lect's table of the data
-- module; an unknown final raises an error.
local function get_final(final, loc)
	local ipa = data.ipa_final[loc][final]
	if not ipa then
		return error('Invalid final: "' .. final .. '"')
	end
	return ipa
end

-- IPA for syllables that consist solely of a syllabic nasal.
local ipa_syllabic = {
	m = "m̩",
	n = "n̩",
	ng = "ŋ̍",
}

-- Diagnoses a tone notation that could not be resolved and raises the most
-- specific error available. This function never returns normally.
--   word_length        number of syllables in the word
--   loc                location prefix (e.g. "sh")
--   text               the word being converted (used in messages only)
--   tone, tone2, tone3 tone specifiers of the first three syllables
--                      ('' when a syllable carries none)
local function diagnose_tones(word_length, loc, text, tone, tone2, tone3)
	-- the cap on number of syllables
	local syl_cap = ({sh=5,sj=3,cm=3,sz=4,cz=3,jx=3,tx=3,hn=3,hy=3,hz=5,sx=3,nb=3})[loc]
	if syl_cap and word_length > syl_cap then
		error(("Maximum %d syllables supported for %s."):format(syl_cap, loc))
	end
	-- the cap on number of specified tones
	local tone_cap = ({sj=3,ks=2,cz=3,jx=3,tx=3,hn=3,hy=2})[loc]
	if tone_cap then
		local expected = math.min(tone_cap, word_length)
		local received = 1 + (tone2 ~= '' and 1 or 0) + (tone3 ~= '' and 1 or 0)
		if received ~= expected then
			error(('Expected %d tones, but received %d: "%s:%s".'):format(expected, received, loc, text))
		end
	elseif (loc == 'sz' or loc == 'sx') and word_length > 1 and tone2 == '' then
		-- Only blame a missing second tone when it really is missing; if one
		-- was supplied (or the word is monosyllabic) the generic message below
		-- describes the problem correctly.
		error("For " .. loc .. ", second tone must be specified.")
	end
	error(('Incorrect tone notation "%s" for %s. See [[WT:AZH/Wu]].'):format(tone..tone2..tone3, loc))
end

-- Replaces the tone-letter digits 1-5 in `text` with their superscript forms.
-- Returns only the converted string: the outer parentheses drop the
-- substitution count that string.gsub returns as a second value.
local function tone_superscript(text)
	return (text:gsub('[1-5]',{['1']='¹',['2']='²',['3']='³',['4']='⁴',['5']='⁵'}))
end

-- Resolves the tone contour string for one word.
--   text  the word, with tone specifiers in front of the syllables they
--         belong to (e.g. "1a b")
--   loc   location prefix
-- Returns the contour with digits converted to superscripts. Raises an error
-- when the notation cannot be parsed or has no entry in data.tone_contours.
local function get_tone(text, loc)
	-- number of syllables = number of spaces + 1
	local word_length = text:gsub("[^ ]+", ""):len() + 1
	local tone, tone2, tone3 = text:match("^(.%u*)%w+ ?(%d?%u?)%w* ?(%d?%u?)")
	if not tone then
		-- Without this guard an unparseable word (e.g. an empty string) would
		-- fail further down with "attempt to concatenate a nil value".
		error(('Incorrect tone notation "%s" for %s. See [[WT:AZH/Wu]].'):format(text, loc))
	end
	if loc == "jx" and tone == "3" then
		tone = text:find("^3[ptkc]s?h") and "3B" or "3A"
	elseif loc == "cm" then
		local result = nil
		if tone:find("[MP]") then -- Verb + Motion / Verb + Pronoun
			if word_length ~= 2 then error("cm: Unsupported word length.") end
			result = data.tone_contours[loc][tone] or error("cm: Wrong motion/pronoun format.")
		elseif tone:find("R",1,true) then -- Reduplication
			local main_tone, redup_type, word, sub_tone = text:match("^(%d)R([VCN])(%l+) (%d)%3$")
			if not main_tone then
				-- malformed input previously crashed on a nil concatenation below
				error("cm: Wrong reduplication format.")
			end
			main_tone, sub_tone = tonumber(main_tone), tonumber(sub_tone)
			local conv_tone = (redup_type == "N" and main_tone%2==0 and word:find("^g?[mnl]") and main_tone-1) or main_tone
			if sub_tone ~= conv_tone then error("cm: Wrong reduplication format.") end
			result = data.tone_contours[loc]["R"..redup_type..main_tone]
		end
		if result then
			return (tone_superscript(result))
		end
	elseif loc == "sx" and tone:find("^%dA$") then
		local contour = data.tone_contours[loc][tone]
		if contour then
			return (tone_superscript(contour))
		end
		-- no such contour: fall through so the error is diagnosed below
		-- instead of crashing inside tone_superscript
	end
	-- try the most specific key first, then progressively shorter ones
	local result = data.tone_contours[loc][word_length..tone..tone2..tone3]
		or data.tone_contours[loc][word_length..tone..tone2]
		or data.tone_contours[loc][word_length..tone]
	return result and tone_superscript(result) or diagnose_tones(word_length, loc, text, tone, tone2, tone3)
end

-- Picks the contour used for the last syllable of a word that is followed by
-- another independent word (the "RPS" case in wugniu_to_ipa).
-- A one-syllable word uses the entry keyed by its tone plus "s", falling back
-- to the plain "s" entry; longer words use the "multiple" entry.
local function RPS_tone_determ(word_length, tone, loc)
	local contours = data.tone_contours[loc]
	if word_length ~= 1 then
		return tone_superscript(contours["multiple"])
	end
	return tone_superscript(contours[tone .. "s"] or contours['s'])
end

-- Sanity-checks a Wugniu string and raises an error on the first problem
-- found. `locs` is the raw, comma-separated location prefix string.
-- Returns nil when nothing objectionable was found.
local function rom_check(text, locs) --this checks wugniu
	local cm_only = (locs == 'cm')

	-- spellings that only occur in other romanisations
	local foreign = text:find("%f[%l']['qx]")
		or text:find('ny', 1, true)
		or text:find('hh', 1, true)
		or text:find("h$")
	if foreign then
		error('Invalid syllable: ' .. text ..'. Wugniu expected, but another romanisation is supplied.')
	end

	-- "ghi"/"ghu" are accepted only when cm is the sole location
	if not cm_only then
		if text:find('ghi', 1, true) then
			error('Invalid initial "ghi". Use "yi" instead.')
		end
		if text:find('ghu', 1, true) then
			error('Invalid initial "ghu". Use "wu" instead.')
		end
	end

	-- a bare "y" is not a syllable
	if text:find('%f[%l]y%f[%L]') then
		error('Invalid syllable "y"')
	end
	if text:find('gn[aeou]') then
		error('Palatalization expected. Insert an "i" after the "gn".')
	end
	if text:find('uw', 1, true) then
		error(('Invalid syllable in "%s".'):format(text))
	end

	if locs:find('cm') then
		local mutation_only = text:find('ueu') or text:find('uon') or text:find('ui')
		if mutation_only then
			error('cm: Mutation-only final found.')
		end
	end
	return nil
end

-- Converts one toneless Wugniu syllable to IPA.
--   text       the syllable
--   loc        location prefix
--   initials   function(initial, loc) returning the IPA of an initial
--   finals     function(final, loc) returning the IPA of a final
--   syllabics  table mapping syllabic-consonant syllables to IPA
--   i          position of the syllable inside its word (1-based)
--   main_tone  first character of the word (its leading tone digit)
--   tone       contour of this syllable; "0" suppresses the cm glottal prefix
function export.ipa_syl_conv(text, loc, initials, finals, syllabics, i, main_tone, tone)
	-- split into initial and final using the tables' key shapes
	local onset, rime = text:match("^([td]?[pbmfvtdnlszcjghk][hng]?)(.+)$")
	local syllabic_ipa = syllabics[text]
	if loc == 'sx' and text == 'gn' then
		syllabic_ipa = "ɲ̩"
	end
	-- syllabic consonants and vowel-initial syllables have no initial
	if syllabic_ipa or not onset then
		onset, rime = '', text
	end

	local prefix = ""
	local onset_ipa = nil
	if loc == 'cm' then -- mutation
		-- a non-initial "z" is emitted as a literal "z", bypassing `initials`
		local keeps_z = i > 1 and onset == "z"
		if tone ~= "0" and (keeps_z or onset:find("^g?[mnl]") or onset == "") then
			if i > 1 or main_tone:find("^[1357]$") then
				prefix = "ʔ"
			else
				prefix = "ɦ"
			end
		end
		if keeps_z then
			onset_ipa = "z"
		end
	end

	onset_ipa = onset_ipa or initials(onset, loc)
	return prefix .. onset_ipa .. (syllabic_ipa or finals(rime, loc))
end

-- Converts a Wugniu string to IPA for one lect.
--   original_text  Wugniu input; "," separates readings, "&" separates
--                  components and "+" separates independent words
--   loc            location prefix
--   initials, finals, syllabics  passed through to export.ipa_syl_conv
--   tones          function(word, loc) returning the space-separated contours
-- Returns the readings joined with "/, /".
function export.wugniu_to_ipa(original_text, loc, initials, finals, syllabics, tones)
	-- move a tone written after a non-initial syllable in front of it
	original_text = original_text:gsub(" (%l+)(%d%u?)", ' %2%1')
	-- rewrite syllable-initial y/w glides
	if loc == 'cm' then
		original_text = original_text:gsub("%f[%l]yi?","i"):gsub("%f[%l]wu?","u")
	else
		original_text = original_text:gsub("%f[%l]yi?","ghi"):gsub("%f[%l]wu?","ghu")
	end
	local reading = mw.text.split(original_text, ",", true)
	for reading_index = 1, #reading do
		local components = mw.text.split(reading[reading_index], "&", true)
		for component_index = 1, #components do
			local indep_words = mw.text.split(components[component_index], "+", true)
			for indep_index = 1, #indep_words do
				local text = indep_words[indep_index]
				local tone_number = text:sub(1, 1)
				local tone = tones(text, loc)
				-- keep only the letters and the syllable-separating spaces
				text = text:gsub("[^ %l]+", "")
				local syllable = mw.text.split(text, " ", true)
				local syl_tone = mw.text.split(tone, " ", true)
				for i = 1, #syllable do
					--RPS
					if i == #syllable and indep_words[indep_index + 1] and tone ~= "³³" then
						syl_tone[i] = RPS_tone_determ(#syllable, tone_number, loc)
					end
					syllable[i] = (syllable[i] ~= "" and export.ipa_syl_conv(syllable[i], loc, initials, finals, syllabics, i, tone_number, syl_tone[i]) or "")
						.. (syl_tone[i] == "0" and "" or syl_tone[i])
				end
				indep_words[indep_index] = table.concat(syllable, " ")
			end
			components[component_index] = table.concat(indep_words, " &nbsp;")
		end
		reading[reading_index] = table.concat(components, " ")
	end
	return table.concat(reading, "/, /")
end

-- Converts the old Wiktionary romanisation to Wugniu.
-- `text` is either the string itself or a frame-like table whose first
-- argument (`text.args[1]`) holds it.
-- Returns exactly one value, the converted string. Returning the gsub chain
-- directly would also return the last substitution count, which Scribunto
-- appends to the output when the function is called through #invoke.
function export.wikt_to_wugniu(text)
	if type(text) == "table" then text = text.args[1] end
	local converted = text
	--initials
		:gsub("'+", {["'"]=""})
		-- NOTE(review): the pattern below cannot match "jy", so that key looks unused — confirm
		:gsub("%f[%l][jqx][jx]?", {j="c", jy="cy", jj="j", q="ch", x="sh", xx="zh"})
		:gsub("%f[%l]ny", "gn")
		:gsub("%f[%l]hh", "gh")

	--vowels
		:gsub("un", "uen")
		:gsub("yoe", "ioe")
		:gsub("y", "iu")
		:gsub("aan", "aon")
		:gsub("%f[er]r", "y")

	--syllabics
		:gsub("g?h?mm", "m")
		:gsub("g?h?ngg", "ng")

	--tones
		:gsub("[2-5]", {['2']='5', ['3']='6', ['4']='7', ['5']='8'})

	--gh rules
		:gsub("ghi", "yi")
		:gsub("yi%f[aeou]", "y")
		:gsub("ghu", "wu")
		:gsub("wu%f[aeo]", "w")
	return converted
end

-- Converts Wugniu to the old Wiktionary romanisation and formats the result
-- with export.wugniu_format.
-- `text` is either the string itself or a frame-like table whose first
-- argument (`text.args[1]`) holds it.
-- The conversion is stored in a local first so that only the string reaches
-- export.wugniu_format; passing the gsub chain directly also forwarded the
-- substitution count as that function's `loc` argument.
local function wugniu_to_wikt(text)
	if type(text) == "table" then text = text.args[1] end
	local converted = text
	--initials
		:gsub("%f[%l][cjszg][nh]?", {c="j", ch="q", j="jj", sh="x", zh="xx", gn="ny", gh="hh"})
		:gsub("%f[%l]yi?", "hhi")
		:gsub("wu?", "hhu")

	--vowels
		:gsub("y%f[%L]", "r")
		:gsub("uen", "un")
		:gsub("ioe", "yoe")
		:gsub("iu", "y")
		:gsub("aon", "aan")

	--syllabics
		:gsub("%f[%l][mn]g?%f[%L]", {m="mm",n="nn",ng="ngg"})

	--initial hh and '
		:gsub("([157])([mnl])", "%1'%2")
		:gsub("([68])([mn][mng]g?)%f[%L]", "%1hh%2")

	--tones
		:gsub("[5-8]", {['5']='2', ['6']='3', ['7']='4', ['8']='5'})
	return export.wugniu_format(converted)
end

-- Formats a Wugniu string for display: separators are normalised and tone
-- specifiers become <sup>/<sub> markup.
--   text  the Wugniu string
--   loc   accepted for interface compatibility; not used by this function
-- Returns exactly one value, the formatted string (the substitution count
-- that string.gsub also returns is deliberately not passed on).
function export.wugniu_format(text, loc)
	-- 1a a 1a 1a3 a1 -> ^1a-a-a_1-^1a_3-a_1
	-- 1a3-3a5 -> ^1a_3-^3a_5
	local separators = {["-"]="", [" "]="-", ["&"]=" ", ["+"]=" ", [","]="; "}
	local formatted = text
		:gsub("[%- &+,]", separators)
		:gsub("(%-?)(%d?%u?)('?%l+)(%d?%u?)", function(dash, tone1, main, tone2)
			-- a lone tone on a non-initial syllable is rendered as a subscript
			if dash == '-' and tone2 == '' then
				tone1, tone2 = tone2, tone1
			end
			if tone1 ~= '' then
				tone1 = '<sup>' .. tone1 .. '</sup>'
			end
			if tone2 ~= '' then
				tone2 = '<sub>' .. tone2 .. '</sub>'
			end
			return dash .. tone1 .. main .. tone2
		end)
	return formatted
end

-- Formats a string in the Wiktionary romanisation for display.
-- Thin wrapper: it delegates to export.wugniu_format without a location.
local function wikt_format(text)
	return export.wugniu_format(text)
end

-- Formats a MiniDict string for display.
-- 1A3 3B5 3C D3 E -> A^3 B^5 C^3 D^3 E
-- 1A B -> A^1 B
-- Every remaining digit 1-8 (including those just wrapped in <sup>) is finally
-- replaced by its tone-category character.
local function minidict_format(text)
	local tone_category = {
		["1"]="平",["2"]="平",
		["3"]="上",["4"]="上",
		["5"]="去",["6"]="去",
		["7"]="入",["8"]="入",
	}
	-- separators
	local s = text:gsub("-", "")
	s = s:gsub("[&+]", " ")
	s = s:gsub(",", "; ")
	-- tone digits become superscripts after the syllable
	s = s:gsub("[1-8]?(%l+)([1-8])", '%1<sup>%2</sup>')
	s = s:gsub("([1-8])(%l+)", '%2<sup>%1</sup>')
	-- apostrophe before m/n/l/r-initial syllables carrying tone 1, 3, 5 or 7
	s = s:gsub("%f[%l]([mnlr]%l*)(<sup>[1357]</sup>)", "'%1%2")
	return s:gsub("[1-8]", tone_category)
end

-- Converts Wugniu to the MiniDict romanisation for one lect and formats it
-- for display.
--   text  the Wugniu string, or a frame-like table whose first argument
--         (`text.args[1]`) holds it
--   loc   location prefix selecting the lect-specific spelling rules
-- Returns exactly one value, the formatted string (the substitution count
-- from the underlying gsub calls is not passed on).
function export.wugniu_to_minidict(text, loc)
	if type(text) == "table" then text = text.args[1] end
	if loc == 'sx' then
		text = text:gsub("[aeiou]+[nq]",{een="en",en="eon",iq="ieq"})
	elseif loc == 'hz' then -- are we dealing with mergers?
		-- NOTE(review): every match of this pattern ends in n or q, so the
		-- "eu" and "ieu" entries can never be selected — confirm the intent.
		text = text:gsub("[aeiou]+[nq]",{eu="ei",ieu="iu",aq="eq",iaq="ieq",iq="ieq",uaq="ueq"})
	elseif loc == 'sz' or loc == 'cz' then
		text = text:gsub("%f[%l]ye%f[%L]", "yie") -- minidict why?
	elseif loc == 'nb' then
		-- keep the letter in front of "yu": the capture was previously
		-- dropped from the replacement, truncating the initial
		text = text:gsub("(%l)yu%f[nq]", "%1oe")
	elseif loc == 'sh' then
		text = text:gsub("([iy])e%f[%L]", "%1ae")
	elseif loc == 'cm' then
		text = text:gsub("<sup>→%l+</sup>", "")
	end
	local converted = text
		--initials
		--Glottal stops? text = text:gsub("", "'")
		:gsub("gn", "ny")
		:gsub("nyi%f[aeou]", "ny")

		--finals & syllabic
		:gsub("([iy])u([nq])", "%1ui%2")
		:gsub("gher", "r")
		:gsub("er", "r")
		:gsub("q", "h")
	return (minidict_format(converted))
end

-- various boilerplates
-- Builds an italicised Wikipedia link for a lect.
--   name  text shown for the link
--   wiki  article title; when absent, "<name> dialect" is used
function export.name_boilerplate(name, wiki)
	local article = wiki
	if not article then
		article = name..' dialect'
	end
	return '<i>[[w:' .. article .. '|' .. name.. ']]</i>'
end

-- Wraps `text` in a span that requests the Consolas / monospace font.
function export.consolas(text)
	local opening = '<span style="font-family: Consolas, monospace;">'
	local closing = '</span>'
	return opening .. text .. closing
end

-- Builds the third-level list line that shows a Wugniu romanisation.
function export.wugniu_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|Wugniu]]</i></small>: '
	return label .. export.consolas(text)
end

-- Builds the third-level list line that shows a MiniDict romanisation.
function export.minidict_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|MiniDict]]</i></small>: '
	return label .. export.consolas(text)
end

-- Builds the third-level list line that shows the Wiktionary romanisation
-- (labelled as Shanghai).
function export.wikt_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|Wiktionary Romanisation]] (Shanghai)</i></small>: '
	return label .. export.consolas(text)
end

-- Builds the third-level list line that shows an IPA transcription.
--   text  the IPA string
--   name  lect name, passed to export.name_boilerplate
--   wiki  optional Wikipedia article title, passed to export.name_boilerplate
function export.IPA_boilerplate(text, name, wiki)
	-- wrap each slash-delimited stretch in a span that forbids line breaks
	local unbreakable = text:gsub("(/?[^ /,]*/[^ /,]*/?)", '<span style="white-space: nowrap;">%1</span>')
	local label = '\n*** <small>Sinological [[Wiktionary:International Phonetic Alphabet|IPA]]'
		.. ' (' .. export.name_boilerplate(name, wiki) .. ')</small>: '
	return label .. '<span class="IPA">/' .. unbreakable .. '/</span>'
end

-- backwards compatibility for old usage "|w=<text>"
-- Handles the old "|w=<text>" usage, where the input is in the Wiktionary
-- romanisation and always describes Shanghainese.
-- Returns two strings: the always-visible line and the collapsible details.
-- `mode` is accepted but not used here.
function export.legacy(text, mode)
	require("Module:debug").track("wuu-pron/legacy")
	local wugniu_text = export.wikt_to_wugniu(text)

	local show = '\n**<small>(<i>[[w:Taihu Wu|Northern]]</i>)</small>: '
		.. export.consolas(export.wugniu_format(wugniu_text))

	local heading = '\n**<small>(' .. export.name_boilerplate('Shanghai') .. ')</small>: '
	local wugniu_line = export.wugniu_boilerplate(export.wugniu_format(wugniu_text))
	local minidict_line = export.minidict_boilerplate(export.wugniu_to_minidict(wugniu_text, 'sh'))
	local wikt_line = export.wikt_boilerplate(wikt_format(text))
	local ipa = export.wugniu_to_ipa(wugniu_text, 'sh', get_initial, get_final, ipa_syllabic, get_tone)
	local ipa_line = export.IPA_boilerplate(ipa, 'Shanghai')

	local hide = heading .. wugniu_line .. minidict_line .. wikt_line .. ipa_line
	return show, hide
end

-- Lect-specific rewriting of the input before it is converted to IPA.
--   hz: inserts "w" after certain sibilant + "u" sequences
--   sx: rewrites "+" chains and "#<digit>" markers
--   cm: replaces an initial followed by "<...>" with the bracketed text
-- Other lects get the text back unchanged.
local function preprocess_IPA(text, loc)
	if loc == 'cm' then
		return text:gsub("%f[%l]%l+<(%l*)>","%1")
	end
	if loc == 'hz' then
		return text:gsub("%f[%l]([td]?[szcj]h?i?u)%f[aeonq]", "%1w")
	end
	if loc ~= 'sx' then
		return text
	end

	-- sx: "<tone><mode?><word>+<tone><word>"
	local function rewrite_chain(chain)
		local tone1, mode, word1, tone2, word2 = chain:match("^(%d)([AP]?)(%l+)%+(%d)(%l+)$")
		if not tone1 then error("sx: Wrong chain format.") end
		if mode == 'A' then
			return tone1..'A'..word1..'&'..tone2..word2
		end
		if mode == '' then mode = 'O' end
		return tone1..word1..' '..tone2..mode..word2
	end
	return text:gsub("[^,&]+%+[^,&]+", rewrite_chain):gsub("#(%d)","%1N")
end

-- Lect-specific clean-up of the input before it is shown as Wugniu.
--   jx: "3A"/"3B" are reduced to "3"
--   cm: grammatical markers are removed and "<...>" mutations are turned
--       into a "<sup>→...</sup>" annotation after the syllable
--   sx: the characters #, C, A and P are removed
-- Other lects get the text back unchanged.
local function preprocess_wugniu(text, loc)
	if loc == 'cm' then
		local without_markers = text:gsub("[CMPR][VCN]?","")
		return without_markers:gsub("%f[%l](%l*)<(%l*)>(%l*)(%d?)","%1%3%4<sup>→%2%3</sup>")
	end
	if loc == 'jx' then
		return text:gsub("3[AB]","3")
	end
	if loc == 'sx' then
		return text:gsub("[#CAP]","")
	end
	return text
end

-- Annotates initial mutation on non-initial syllables when cm is among the
-- locations: a mutating initial gets "<replacement>" inserted after it.
-- Syllables that already carry a "<...>" annotation are left alone.
-- `locs` is the raw, comma-separated location prefix string. Raises an error
-- if the text contains a mutation annotation while cm is not the only location.
local function preprocess_mutation(text, locs)
	if locs:find('cm') then
		local lenited = {v="u",zh="",gh=""}
		text = text:gsub(" (%d?C?)([vzgd]h?)([%w<>]+)", function(tone, initial, final)
			local untouched = " "..tone..initial..final
			local replacement = lenited[initial]
			if replacement == "u" then
				-- no extra "u" in front of a final that already starts with one
				if final:find("^u") then
					replacement = ""
				end
			elseif initial == "d" and final:find("^i") then
				replacement = "l"
			end
			if not replacement or final:find("<") then
				return untouched
			end
			return " "..tone..initial.."<"..replacement..">"..final
		end)
	end
	if locs ~= "cm" and text:find("<") then
		error("cm: Mutation is incompatible with collapsing.")
	end
	return text
end

-- Main entry point: builds the Wu pronunciation lines.
--   text  "<locs>:<wugniu>" segments separated by ";", where <locs> is a
--         comma-separated list of location prefixes; text without any ":" is
--         handed to export.legacy
--   mode  only forwarded to export.legacy
-- Returns two strings: the always-visible line and the collapsible details.
-- Raises an error for a segment without exactly one ":"-separated prefix or
-- with an unknown location prefix.
function export.make(text, mode) --mode: textShow > true / textHide > false
	text = text:gsub("勿", "6veq8") -- for backwards compatibility
	if not text:match(':') then -- for backwards compatibility
		return export.legacy(text, mode)
	end
	local show = ""
	local hide = ""
	local roms = {}
	local input_seen, duplicated = {}, false
	text = mw.text.split(text, ';', true)
	for i = 1,#text,1 do
		local s = mw.text.split(text[i], ':', true)
		-- Validate before s[2] is used: for a segment without ":" it is nil,
		-- and using nil as a table key below would raise "table index is nil"
		-- instead of this message.
		if #s ~= 2 or #s[1] == 0 then
			error("Wugniu: prefix is required or too many prefixes")
		end
		-- remember whether the same romanisation was given more than once
		if not duplicated then
			if input_seen[s[2]] then duplicated = true end
			input_seen[s[2]] = true
		end
		local locs, t = mw.text.split(s[1], ',', true), s[2]
		t = preprocess_mutation(t, s[1])
		local list = {}
		local format_text = t
		for _, loc in ipairs(locs) do
			if loc_names[loc] then
				list[loc] = true
			else
				error('Wugniu: prefix "' .. loc .. '" is not recognized')
			end
			format_text = preprocess_wugniu(format_text, loc)
		end
		rom_check(t, s[1])
		local wugniu_text = export.wugniu_format(format_text, locs[1])
		table.insert(roms,wugniu_text)
		local names = {}
		local minidicts = {}
		local minidicts_seen = {}
		local IPAs = {}
		-- walk the fixed lect order so the output is stable
		for _, loc in ipairs(order) do if list[loc] then
			table.insert(names, export.name_boilerplate(loc_names[loc], wiki_names[loc]))
			if minidict[loc] then
				local minidict_result = export.wugniu_to_minidict(format_text, loc)
				-- identical MiniDict spellings are listed only once
				if not minidicts_seen[minidict_result] then
					table.insert(minidicts, minidict_result)
					minidicts_seen[minidict_result] = true
				end
			end
			local ipa_text = preprocess_IPA(t, loc)
			ipa_text = export.wugniu_to_ipa(ipa_text, loc, get_initial, get_final, ipa_syllabic, get_tone)
			table.insert(IPAs,export.IPA_boilerplate(ipa_text, loc_names[loc], wiki_names[loc]))
		end end
		hide = hide .. '\n** <small>(' .. table.concat(names,', ') .. ')</small>'
		hide = hide .. export.wugniu_boilerplate(wugniu_text)
		for _,minidict_text in ipairs(minidicts) do
			hide = hide .. export.minidict_boilerplate(minidict_text)
		end
		if list.sh then
			hide = hide .. export.wikt_boilerplate(wugniu_to_wikt(format_text))
		end
		hide = hide .. table.concat(IPAs, '')
	end
	show = '\n** <small>(<i>[[w:Taihu Wu|Northern]]</i>)</small>: ' .. export.consolas(table.concat(roms, ' / '))
	if duplicated then 
		require("Module:debug").track("wuu-pron/duplicated")
	end
	return show, hide
end

return export
"https://si.wiktionary.org/w/index.php?title=Module:wuu-pron&oldid=163886" වෙතින් සම්ප්‍රවේශනය කෙරිණි