「モジュール:headword/data」の版間の差分

削除された内容追加された内容
インライン
2024年5月21日 (火) 10:08時点における最新版

このモジュールについての説明文ページをモジュール:headword/data/doc に作成できます
-- Module table: filled with the lookup tables defined below and returned at the end of the file.
local data = {}

-- Part-of-speech (entry type) names placed in the "invariable" list.
-- NOTE(review): presumably the consuming module treats these as having no inflected forms — confirm against the caller.
-- Like every other list assigned to `data` before the conversion loop near the bottom of the file,
-- this list is passed through list_to_set there.
data.invariable = {
	"cmavo",
	"cmene",
	"fu'ivla",
	"gismu",
	"Han tu",
	"hanzi",
	"hanja",
	"jyutping",
	"kanji",
	"lujvo",
	"phrasebook",
	"pinyin",
	"rafsi",
	"romaji",
}

-- Part-of-speech names classified as lemmas. The list mixes English names (first section)
-- with Japanese names (second section).
-- NOTE(review): presumably consulted by the headword module to decide lemma categorization — confirm against the caller.
data.lemmas = {
	-- TODO: the English entries are not needed.
	"acronym",
	"adnominal",
	"adposition",
	"ambiposition",
	"circumposition",
	"cmavo",
	"cmavo cluster",
	"cmene",
	"combining form",
	"counter",
	"diacritical mark",
	"equative adjective", -- NOTE(review): also listed in data.nonlemmas; confirm which classification is intended
	"fu'ivla",
	"gismu",
	"Han tu",
	"ideophone",
	"letter",
	"ligature",
	"lujvo",
	"morpheme",
	"non-constituent",
	"number",
	"numeral symbol",
	"postpositional phrase",
	"prepositional phrase",
	"preverb",
	"pronominal adverb",
	"punctuation mark",
	"relative",
	-- Japanese part-of-speech names follow.
	"依存名詞",
	"オノマトペ",
	"音節",
	"冠形詞",
	"冠詞",
	"漢字",
	"漢字音",
	"間投詞",
	"感動詞",
	"間投助詞",
	"慣用句",
	"記号",
	"基数",
	"擬声語",
	"疑問詞",
	"疑問代名詞",
	"敬称",
	"形態素",
	"形容詞",
	"形容動詞",
	"原型副詞",
	"限定詞",
	"後置詞",
	"コード",
	"語根",
	"ことわざ",
	"語尾",
	"固有名詞",
	"ジェスチャー",
	"指示形容詞",
	"指示詞",
	"指示代名詞",
	"集合名詞",
	"修飾詞",
	"縮約形",
	"述語",
	"述詞",
	"小詞",
	"小辞",
	"助詞",
	"助数詞",
	"助動詞",
	"所有形容詞",
	"人名",
	"数詞",
	"数量詞",
	"成句",
	"成句 形容詞句",
	"接語",
	"接辞",
	"接合辞",
	"接周辞",
	"接続詞",
	"接中辞",
	"接頭辞",
	"接尾辞",
	"前置詞",
	"造語成分",
	"代名詞",
	"動詞",
	"自動詞",
	"他動詞",
	"頭字語",
	"日本語人名",
	"人称接辞",
	"複合助辞",
	"副詞",
	"副分詞過去",
	"副分詞現在",
	"不変化詞",
	"文化語",
	"文末詞",
	"補助動詞",
	"名詞",
	"屋号",
	"指文字",
	"略語",
	"量詞",
	"類別詞",
	"連句",
	"連語",
	"連体詞",
}

-- Part-of-speech names classified as non-lemma forms. The list mixes English names
-- (first section) with Japanese names (second section).
-- NOTE(review): presumably consulted by the headword module to decide non-lemma categorization — confirm against the caller.
data.nonlemmas = {
	-- TODO: the English entries are not needed.
	"active participle",
	"adjectival participle",
	"adverbial participle",
	"agent participle",
	"combined form",
	"comparative adjective form",
	"comparative adverb form",
	"contraction",
	"converb",
	"determiner comparative form",
	"determiner superlative form",
	"equative adjective form",
	"equative adjective", -- NOTE(review): also listed in data.lemmas; confirm which classification is intended
	"future participle",
	"gerund",
	"jyutping",
	"kanji reading",
	"misspelling",
	"negative participle",
	"nominal participle",
	"noun case form",
	"noun paucal form",
	"noun possessive form",
	"noun singulative form",
	"passive participle",
	"past active participle",
	"past passive participle",
	"perfect active participle",
	"perfect participle",
	"perfect passive participle",
	"postposition form",
	"preposition contraction",
	"preposition form",
	"prepositional pronoun",
	"present active participle",
	"present passive participle",
	"pronoun possessive form",
	"rafsi",
	"singulative",
	"superlative adjective form",
	"superlative adverb form",
	-- Japanese part-of-speech names follow.
	"過去分詞",
	"過去分詞 定形",
	"カタカナ表記",
	"漢字表記語の仮名表記",
	"冠詞 定形",
	"間投詞 定形",
	"楔形文字表記",
	"形容詞 活用形",
	"形容詞 最上級",
	"形容詞 女性形",
	"形容詞 定形",
	"形容詞 比較級",
	"形容詞 複数形",
	"現在分詞",
	"限定詞 定形",
	"語根 定形",
	"固有名詞 定形",
	"固有名詞 複数形",
	"助詞 終助詞",
	"助動詞 活用形",
	"数詞 定形",
	"接周辞 定形",
	"接中辞 定形",
	"接頭辞 定形",
	"接尾辞 定形",
	"代名詞 定形",
	"代名詞 複数形",
	"同音異義",
	"動詞 活用形",
	"動詞 定形",
	"動名詞",
	"ピンイン",
	"副詞 最上級",
	"副詞 定形",
	"副詞 比較級",
	"不定詞",
	"不定詞 定形",
	"不変化詞 定形",
	"分詞",
	"分詞 定形",
	"分離符",
	"名詞 指小辞",
	"名詞 定形",
	"名詞 双数形",
	"名詞 複数形",
	"名詞 複数形 不規則変化",
	"ラテン文字表記",
	"和語の漢字表記",
}

-- These languages will not have "LANG multiword terms" categories added.
-- Entries are language codes, grouped below by the reason for exclusion.
data.no_multiword_cat = {
	-------- Languages without spaces between words (sometimes spaces between phrases) --------
	"aho", -- Ahom
	"blt", -- Tai Dam
	"ja", -- Japanese
	"khb", -- Lü
	"km", -- Khmer
	"lo", -- Lao
	"mnw", -- Mon
	"my", -- Burmese
	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
	"nod", -- Northern Thai
	"ojp", -- Old Japanese
	"tdd", -- Tai Nüa
	"th", -- Thai
	"tts", -- Isan
	"twh", -- Tai Dón
	"shn", -- Shan
	"sou", -- Southern Thai
	"zh", -- Chinese (all varieties with Chinese characters)

	-------- Languages with spaces between syllables --------
	"ahk", -- Akha
	"aou", -- A'ou
	"atb", -- Zaiwa
	"byk", -- Biao
	--"duu", -- Drung; not sure
	--"hmx-pro", -- Proto-Hmong-Mien
	--"hnj", -- Green Hmong; not sure
	"huq", -- Tsat
	"ium", -- Iu Mien
	--"lis", -- Lisu; not sure
	"mtq", -- Muong
	--"mww", -- White Hmong; not sure
	--"sit-gkh", -- Gokhy; not sure
	--"swi", -- Sui; not sure
	"tbq-lol-pro", -- Proto-Loloish
	"tdh", -- Thulung
	"ukk", -- Muak Sa-aak
	"vi", -- Vietnamese
	"yig", -- Wusa Nasu
	"zng", -- Mang

	-------- Languages with ~ with surrounding spaces used to separate variants --------
	"mkh-ban-pro", -- Proto-Bahnaric
	-- NOTE(review): this comment previously said "listed above", but "sit-pro" does not
	-- appear anywhere earlier in this table; this is its only occurrence.
	"sit-pro", -- Proto-Sino-Tibetan
	
	-------- Other weirdnesses --------
	"mul", -- Translingual; gestures, Morse code, etc.
	"aot", -- Atong (India); bullet is a letter

	-------- All sign languages --------
	"ads",
	"aed",
	"aen",
	"afg",
	"ase",
	"asf",
	"asp",
	"asq",
	"asw",
	"bfi",
	"bfk",
	"bog",
	"bqn",
	"bqy",
	"bvl",
	"bzs",
	"cds",
	"csc",
	"csd",
	"cse",
	"csf",
	"csg",
	"csl",
	"csn",
	"csq",
	"csr",
	"doq",
	"dse",
	"dsl",
	"ecs",
	"esl",
	"esn",
	"eso",
	"eth",
	"fcs",
	"fse",
	"fsl",
	"fss",
	"gds",
	"gse",
	"gsg",
	"gsm",
	"gss",
	"gus",
	"hab",
	"haf",
	"hds",
	"hks",
	"hos",
	"hps",
	"hsh",
	"hsl",
	"icl",
	"iks",
	"ils",
	"inl",
	"ins",
	"ise",
	"isg",
	"isr",
	"jcs",
	"jhs",
	"jls",
	"jos",
	"jsl",
	"jus",
	"kgi",
	"kvk",
	"lbs",
	"lls",
	"lsl",
	"lso",
	"lsp",
	"lst",
	"lsy",
	"lws",
	"mdl",
	"mfs",
	"mre",
	"msd",
	"msr",
	"mzc",
	"mzg",
	"mzy",
	"nbs",
	"ncs",
	"nsi",
	"nsl",
	"nsp",
	"nsr",
	"nzs",
	"okl",
	"pgz",
	"pks",
	"prl",
	"prz",
	"psc",
	"psd",
	"psg",
	"psl",
	"pso",
	"psp",
	"psr",
	"pys",
	"rms",
	"rsl",
	"rsm",
	"sdl",
	"sfb",
	"sfs",
	"sgg",
	"sgx",
	"slf",
	"sls",
	"sqk",
	"sqs",
	"ssp",
	"ssr",
	"svk",
	"swl",
	"syy",
	"tse",
	"tsm",
	"tsq",
	"tss",
	"tsy",
	"tza",
	"ugn",
	"ugy",
	"ukl",
	"uks",
	"vgt",
	"vsi",
	"vsl",
	"vsv",
	"xki",
	"xml",
	"xms",
	"ygs",
	"ysl",
	"zib",
	"zsl",
}

-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
-- Entries are language codes; each trailing comment gives the language name and/or the reason.
data.hyphen_not_multiword_sep = {
	"akk", -- Akkadian; hyphens between syllables
	"akl", -- Aklanon; hyphens for mid-word glottal stops
	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
	"ceb", -- Cebuano; hyphens for mid-word glottal stops
	"cnk", -- Khumi Chin; hyphens used in single words
	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
	"de", -- too many false positives (NOTE(review): presumably German; language name missing from the original comment)
	"esx-esk-pro", -- hyphen used to separate morphemes (NOTE(review): presumably Proto-Eskimo; language name missing from the original comment)
	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
	"ilo", -- Ilocano; hyphens for mid-word glottal stops
	"lcp", -- Western Lawa; dash as syllable joiner
	"lwl", -- Eastern Lawa; dash as syllable joiner
	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
	"msb", -- Masbatenyo; too many false positives
	"tl", -- Tagalog; too many false positives
	"war", -- Waray-Waray; too many false positives
}

-- These languages will not have "LANG masculine nouns" and similar categories added.
data.no_gender_cat = {
	-- Languages without gender but which use the gender field for other purposes
	"ja", -- Japanese
	"th", -- Thai
}

-- Codes placed in the "notranslit" list. Several of them ("ja", "th", "vi", "zh", "mul", "nan",
-- "ojp") are glossed as language codes in data.no_multiword_cat in this file.
-- NOTE(review): presumably languages for which the headword module does not generate or
-- display a transliteration — the consumer is not visible here; confirm against the caller.
data.notranslit = {
	"ams",
	"az",
	"bbc",
	"bug",
	"cia",
	"cjm",
	"cmn",
	"hak",
	"ja",
	"kzg",
	"lad",
	"lzh",
	"ms",
	"mul",
	"mvi",
	"nan",
	"oj",
	"ojp",
	"okn",
	"pi",
	"ro",
	"ryn",
	"rys",
	"ryu",
	"sh",
	"tgt",
	"th",
	"tkn",
	"tly",
	"und",
	"vi",
	"xug",
	"yue",
	"yoi",
	"yox",
	"za",
	"zh",
}

-- Script codes for which a script-tagged display title will be added.
-- NOTE(review): "polytonic" is the only entry that is not a four-letter capitalized code;
-- presumably a project-specific script code — confirm it is intentional.
data.toBeTagged = {
	"Ahom",
	"Arab",
	"Avst",
	"Bali",
	"Cham",
	"Copt",
	"Kali",
	"Hani",
	"Hebr",
	"Lana",
	"Linb",
	"Mand",
	"Mong",
	"polytonic",
	"Rjng",
	"Samr",
	"Sund",
	"Sylo",
	"Tang",
	"Tavt",
	"Xsux",
}

-- Replace every list currently stored in `data` with the result of list_to_set from
-- モジュール:utils (presumably a lookup table keyed by entry — verify in that module).
-- Only existing fields are reassigned during the traversal, which `pairs` permits.
do
	local to_set = require("モジュール:utils").list_to_set
	for field_name, entries in pairs(data) do
		data[field_name] = to_set(entries)
	end
end

-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
-- will be generated if the headword is of the appropriate gender/number. We put this at the bottom
-- (after the list-to-set conversion loop) because it's a map, not a list.
-- Each key is a part-of-speech name and each value is the part-of-speech name it maps to.
data.pos_for_gender_number_cat = {
	["名詞"] = "名詞", -- noun -> noun
	["固有名詞"] = "名詞", -- proper noun -> noun
	["接尾辞"] = "接尾辞", -- suffix -> suffix
	-- We include verbs because impf and pf are valid "genders".
	["動詞"] = "動詞", -- verb -> verb
}

-- Export the populated module table.
return data
「モジュール:headword/data」の版間の差分

2024年5月21日 (火) 10:08時点における最新版

案内メニュー

検索