မေႃႇၵျူး:Hani-sortkey
This module will sort text in the Han script. It is used to sort Ai-Cham, Central Bai, Northern Bai, Southern Bai, Biao-Jiao Mien, Biyo, Min Dong, Jin, Mandarin, Northern Pinghua, Chinese Pidgin English, Puxian, Macau Pidgin Portuguese, Southern Pinghua, Huizhou, Min Zhong, Dungan, Daur, Gan, Hakka, Xiang, Japanese, Lama Bai, ၶႄႇ ပွတ်းၵၢင်, Literary Chinese, Middle Vietnamese, Caolan, Min Bei, Translingual, Min Nan, Nung, ၶႄႇပၢၼ်ၵဝ်ႇ, Bouyei, Baekje, Tuyuhun, Tuoba, Wuhuan, Xianbei, Sui, Pai-lang, Kyakala, Tày, Vietnamese, Wu, Waxianghua, Classical Tibetan, Rouran, Middle Mongol, Buyeo, Cantonese, တႆးၸွင်ႈ, Zauzou, ၶႄႇ, Shaozhou Tuhua, Taishanese, Teochew, Goguryeo, Zakhring, and Khitan.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{sortkey}}.
Within a module, use Module:languages#Language:makeSortKey.
For testcases, see Module:Hani-sortkey/testcases.
Functions
makeSortKey(text, lang, sc)
- Generates a sortkey for a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the sort fails, returns nil.
The demonstration functions that generated the content shown below are housed in Module:Hani-sortkey/templates. Modifications to the module can be tested in Module:Hani-sortkey/sandbox. Sortkeys for individual characters are retrieved from one of 178 data modules. Module:Hani-sortkey/data creates documentation for these modules.
Show sortkeys
[မႄးထတ်း]Lua error at line 149: attempt to call upvalue 'pe' (a nil value). Lua error at line 149: attempt to call upvalue 'pe' (a nil value). Lua error at line 149: attempt to call upvalue 'pe' (a nil value).
Ideographic description sequences
[မႄးထတ်း]Lua error at line 149: attempt to call upvalue 'pe' (a nil value).
Lua error in မေႃႇၵျူး:module_categorization at line 173: This template should only be used in the Module namespace, not on page 'မေႃႇၵျူး:Hani-sortkey'..
local export = {}
local cp = mw.ustring.codepoint
local concat = table.concat
local explode_utf8 = require("Module:string utilities").explode_utf8
local insert = table.insert
local namespace = mw.title.getCurrentTitle().nsText
local m_data = require("Module:Hani-sortkey/data/serialized")
local radicals_start = m_data:find("\254")
local preconvert_start = m_data:find("\255", radicals_start)
local pe = require("Module:utilities").pattern_escape
-- Forwards all of its arguments to mw.log, but only when the current page's
-- namespace text is "Module"; in any other namespace it does nothing.
local function log(...)
	if namespace ~= "Module" then
		return
	end
	mw.log(...)
end
--[[
The number of characters or ideographic sequences that must follow each
ideographic description character (IDC).

The last five keys (added to Unicode in version 15.1) are written as decimal
byte escapes of their UTF-8 encoding, so that they survive editors and
pipelines that silently drop characters they do not know. Each of them used
to be the empty string, which made all five entries collapse into a single
useless [""] key.
]]
local IDchars = {
	["⿰"] = 2, -- left-to-right
	["⿱"] = 2, -- above-to-below
	["⿲"] = 3, -- left-to-middle and right
	["⿳"] = 3, -- above-to-middle and below
	["⿴"] = 2, -- full surround
	["⿵"] = 2, -- surround from above
	["⿶"] = 2, -- surround from below
	["⿷"] = 2, -- surround from left
	["⿸"] = 2, -- surround from upper left
	["⿹"] = 2, -- surround from upper right
	["⿺"] = 2, -- surround from lower left
	["⿻"] = 2, -- overlaid
	["\226\191\188"] = 2, -- U+2FFC surround from right
	["\226\191\189"] = 2, -- U+2FFD surround from lower right
	["\226\191\190"] = 1, -- U+2FFE horizontal reflection
	["\226\191\191"] = 1, -- U+2FFF rotation
	-- Unicode lists U+31EF as a binary operator, so it takes two operands.
	["\227\135\175"] = 2 -- U+31EF subtraction
}
--[[
Returns the index in `text` at which the ideographic description sequence
(IDS) that starts at index `i` ends, or nil if the sequence is incomplete.

text:   array of single characters.
IDchar: the ideographic description character (IDC) found at text[i].
i:      index of that IDC in `text`.

Recurses whenever a component is itself introduced by another IDC. Returns
nil if any argument is missing, if `IDchar` is not a known IDC, if the text
runs out before every expected component has been read, or if a nested
sequence is itself incomplete.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end
	-- Number of components expected after the current IDC.
	local components = IDchars[IDchar]
	if not components then
		return nil
	end
	local j = i
	for _ = 1, components do
		j = j + 1
		local char = text[j]
		if not char then
			-- Ran off the end of the text before the IDS was complete.
			return nil
		elseif IDchars[char] then
			j = findEndOfIDS(text, char, j)
			--[[
			A truncated nested sequence invalidates the enclosing one.
			Without this check the nil would reach `j + 1` on the next
			iteration and raise an arithmetic error.
			]]
			if not j then
				return nil
			end
		end
	end
	return j
end
-- NOTE(review): module_cache is not referenced anywhere else in the visible
-- code; kept in case something outside this view relies on it.
local module_cache = {}

-- Returns the three bytes of m_data that end at offset
-- radicals_start + 3 * (byte value of the first byte of `b`).
local function radical_lookup(b)
	local last = radicals_start + 3 * b:byte()
	local first = last - 2
	return m_data:sub(first, last)
end
-- The data is stored in [[Module:Hani-sortkey/data]]. This data is not accessed directly (due to the large amount of memory this would consume), but is instead stored in a serialized form as [[Module:Hani-sortkey/data/serialized]]. If the data is changed, the new serialized data can be generated with [[Module:Hani-sortkey/data/serializer]].

-- Inclusive codepoint ranges covered by the serialized data, in ascending
-- order. Each covered codepoint owns one three-byte record in m_data, and the
-- records are laid out in this order with no gaps between ranges.
local data_sections = {
	{0x3007, 0x3007},
	{0x3400, 0x4DBF},
	{0x4E00, 0x9FFF},
	{0xF900, 0xFA6D},
	{0xFA70, 0xFAD9},
	{0x20000, 0x2A6DF},
	{0x2A700, 0x2B739},
	{0x2B740, 0x2B81D},
	{0x2B820, 0x2CEA1},
	{0x2CEB0, 0x2EBE0},
	{0x2EBF0, 0x2EE5D},
	{0x2F800, 0x2FA1D},
	{0x30000, 0x3134A},
	{0x31350, 0x323AF}
}

--[[
Returns the sortkey data for a single character.

char:         a string (its first codepoint is used) or a numeric codepoint.
returnModule: accepted for interface compatibility; not used by this function.

If the codepoint lies inside one of the ranges in data_sections, returns the
character's three-byte record from m_data with its first byte expanded
through radical_lookup. Otherwise returns the character itself as a string.
Exactly one value is returned.

Raises an error if `char` is neither a string nor a number.
]]
function export.getData(char, returnModule)
	local char_type = type(char)
	if char_type == "string" then
		char = cp(char)
	elseif char_type ~= "number" then
		-- This used to compare against the undefined global `number` (nil),
		-- so every numeric codepoint was rejected.
		error("getData must operate on a single character or codepoint.")
	end
	-- Count of covered codepoints in the ranges that lie wholly below `char`.
	local section_offset = 0
	for _, range in ipairs(data_sections) do
		local first, last = range[1], range[2]
		if char > last then
			section_offset = section_offset + last - first + 1
		elseif char >= first then
			local lookup = 3 * (section_offset + char - first + 1)
			-- The parentheses discard gsub's substitution count.
			return (m_data:sub(lookup - 2, lookup):gsub("^.", radical_lookup))
		else
			-- Ranges are ascending, so no later range can contain `char`.
			break
		end
	end
	return mw.ustring.char(char)
end
-- Lazily loaded contents of [[Module:Hani-sortkey/data/unsupported]].
local unsupported_data

-- Script codes whose text is given Han sortkeys by makeSortKey.
local han_scripts = {
	Hani = true,
	Hans = true,
	Hant = true,
	Jpan = true,
	Kore = true
}

--[[
Escapes every Lua pattern magic character in `str` (and NUL, which Lua 5.1
patterns cannot contain literally) so that it can be embedded in a pattern
and matched literally. Defined locally because the file-level `pe` upvalue,
taken from Module:utilities, is nil at runtime and calling it raises
"attempt to call upvalue 'pe' (a nil value)".
]]
local function escape_pattern(str)
	return (str:gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0"):gsub("%z", "%%z"))
end

--[[
Generates a sortkey for `text`.

text: the string to sort.
lang: not used by this function.
sc:   script code. If given and not one of Hani, Hans, Hant, Jpan or Kore,
      the uppercased text is returned unchanged otherwise.

Returns the concatenation of the per-character sort data.
]]
function export.makeSortKey(text, lang, sc)
	if sc and not han_scripts[sc] then
		return text:uupper()
	end
	local sort = {}
	text = explode_utf8(text)
	local text_len = #text
	local i, char = 0
	while i < text_len do
		i = i + 1
		char = text[i]
		-- Replace the character with the value recorded for it in the part
		-- of m_data that starts at preconvert_start, if there is one.
		char = (m_data:match("%f[^\2\255]" .. escape_pattern(char) .. "\1([^\2]+)\2", preconvert_start)) or char
		--[=[
		If we encounter an ideographic description character (IDC),
		find out if it begins a valid ideographic description sequence (IDS).
		If the IDS is valid and a sortkey for it is listed in
		[[Module:Hani-sortkey/data/unsupported]], then return
		the sortkey, and move to the next character after the
		IDS.
		Otherwise, insert the IDC into the sortkey and move to the next
		character after the IDC.
		If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if IDchars[char] then
			local j = findEndOfIDS(text, char, i)
			local IDS, data
			if j then
				IDS = concat(text, nil, i, j)
				unsupported_data = unsupported_data or mw.loadData("Module:Hani-sortkey/data/unsupported")
				data = unsupported_data[IDS]
			end
			if not data then
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					-- Log the remaining text from the IDC onwards; text[i]
					-- alone would only ever print the IDC itself.
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. concat(text, "", i) .. "'")
				end
			end
			if IDS and data then
				insert(sort, data)
				-- Skip past the whole sequence.
				i = j
			else
				insert(sort, char)
			end
		else
			insert(sort, export.getData(char) or char)
		end
	end
	return concat(sort)
end
return export