Skip to content

Commit b8f9a2a

Browse files
committed
Add support for collation attributes on older ICU versions
Starting in ICU 54, collation customization attributes can be specified in the locale string, for example "@colStrength=primary;colCaseLevel=yes". Add support for this for older ICU versions as well, by adding some minimal parsing of the attributes in the locale string and calling ucol_setAttribute() on them. This is essentially what newer ICU versions do internally in ucol_open(). This way we can offer this functionality in a consistent way in all ICU versions supported by PostgreSQL. Also add some tests for ICU collation customization. Reported-by: Daniel Verite <[email protected]> Discussion: https://www.postgresql.org/message-id/[email protected]
1 parent 042162d commit b8f9a2a

File tree

3 files changed

+164
-0
lines changed

3 files changed

+164
-0
lines changed

src/backend/utils/adt/pg_locale.c

+104
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "catalog/pg_control.h"
5959
#include "mb/pg_wchar.h"
6060
#include "utils/builtins.h"
61+
#include "utils/formatting.h"
6162
#include "utils/hsearch.h"
6263
#include "utils/lsyscache.h"
6364
#include "utils/memutils.h"
@@ -132,6 +133,9 @@ static HTAB *collation_cache = NULL;
132133
static char *IsoLocaleName(const char *); /* MSVC specific */
133134
#endif
134135

136+
#ifdef USE_ICU
137+
static void icu_set_collation_attributes(UCollator *collator, const char *loc);
138+
#endif
135139

136140
/*
137141
* pg_perm_setlocale
@@ -1380,6 +1384,9 @@ pg_newlocale_from_collation(Oid collid)
13801384
(errmsg("could not open collator for locale \"%s\": %s",
13811385
collcollate, u_errorName(status))));
13821386

1387+
if (U_ICU_VERSION_MAJOR_NUM < 54)
1388+
icu_set_collation_attributes(collator, collcollate);
1389+
13831390
/* We will leak this string if we get an error below :-( */
13841391
result.info.icu.locale = MemoryContextStrdup(TopMemoryContext,
13851392
collcollate);
@@ -1588,6 +1595,103 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
15881595
return len_result;
15891596
}
15901597

1598+
/*
1599+
* Parse collation attributes and apply them to the open collator. This takes
1600+
* a string like "und@colStrength=primary;colCaseLevel=yes" and parses and
1601+
* applies the key-value arguments.
1602+
*
1603+
* Starting with ICU version 54, the attributes are processed automatically by
1604+
* ucol_open(), so this is only necessary for emulating this behavior on older
1605+
* versions.
1606+
*/
1607+
pg_attribute_unused()
1608+
static void
1609+
icu_set_collation_attributes(UCollator *collator, const char *loc)
1610+
{
1611+
char *str = asc_tolower(loc, strlen(loc));
1612+
1613+
str = strchr(str, '@');
1614+
if (!str)
1615+
return;
1616+
str++;
1617+
1618+
for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
1619+
{
1620+
char *e = strchr(token, '=');
1621+
1622+
if (e)
1623+
{
1624+
char *name;
1625+
char *value;
1626+
UColAttribute uattr = -1;
1627+
UColAttributeValue uvalue = -1;
1628+
UErrorCode status;
1629+
1630+
status = U_ZERO_ERROR;
1631+
1632+
*e = '\0';
1633+
name = token;
1634+
value = e + 1;
1635+
1636+
/*
1637+
* See attribute name and value lists in ICU i18n/coll.cpp
1638+
*/
1639+
if (strcmp(name, "colstrength") == 0)
1640+
uattr = UCOL_STRENGTH;
1641+
else if (strcmp(name, "colbackwards") == 0)
1642+
uattr = UCOL_FRENCH_COLLATION;
1643+
else if (strcmp(name, "colcaselevel") == 0)
1644+
uattr = UCOL_CASE_LEVEL;
1645+
else if (strcmp(name, "colcasefirst") == 0)
1646+
uattr = UCOL_CASE_FIRST;
1647+
else if (strcmp(name, "colalternate") == 0)
1648+
uattr = UCOL_ALTERNATE_HANDLING;
1649+
else if (strcmp(name, "colnormalization") == 0)
1650+
uattr = UCOL_NORMALIZATION_MODE;
1651+
else if (strcmp(name, "colnumeric") == 0)
1652+
uattr = UCOL_NUMERIC_COLLATION;
1653+
/* ignore if unknown */
1654+
1655+
if (strcmp(value, "primary") == 0)
1656+
uvalue = UCOL_PRIMARY;
1657+
else if (strcmp(value, "secondary") == 0)
1658+
uvalue = UCOL_SECONDARY;
1659+
else if (strcmp(value, "tertiary") == 0)
1660+
uvalue = UCOL_TERTIARY;
1661+
else if (strcmp(value, "quaternary") == 0)
1662+
uvalue = UCOL_QUATERNARY;
1663+
else if (strcmp(value, "identical") == 0)
1664+
uvalue = UCOL_IDENTICAL;
1665+
else if (strcmp(value, "no") == 0)
1666+
uvalue = UCOL_OFF;
1667+
else if (strcmp(value, "yes") == 0)
1668+
uvalue = UCOL_ON;
1669+
else if (strcmp(value, "shifted") == 0)
1670+
uvalue = UCOL_SHIFTED;
1671+
else if (strcmp(value, "non-ignorable") == 0)
1672+
uvalue = UCOL_NON_IGNORABLE;
1673+
else if (strcmp(value, "lower") == 0)
1674+
uvalue = UCOL_LOWER_FIRST;
1675+
else if (strcmp(value, "upper") == 0)
1676+
uvalue = UCOL_UPPER_FIRST;
1677+
else
1678+
status = U_ILLEGAL_ARGUMENT_ERROR;
1679+
1680+
if (uattr != -1 && uvalue != -1)
1681+
ucol_setAttribute(collator, uattr, uvalue, &status);
1682+
1683+
/*
1684+
* Pretend the error came from ucol_open(), for consistent error
1685+
* message across ICU versions.
1686+
*/
1687+
if (U_FAILURE(status))
1688+
ereport(ERROR,
1689+
(errmsg("could not open collator for locale \"%s\": %s",
1690+
loc, u_errorName(status))));
1691+
}
1692+
}
1693+
}
1694+
15911695
#endif /* USE_ICU */
15921696

15931697
/*

src/test/regress/expected/collate.icu.utf8.out

+39
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,45 @@ select textrange_en_us('A','Z') @> 'b'::text;
11001100

11011101
drop type textrange_c;
11021102
drop type textrange_en_us;
1103+
-- test ICU collation customization
1104+
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
1105+
SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
1106+
?column? | ?column?
1107+
----------+----------
1108+
t | t
1109+
(1 row)
1110+
1111+
CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
1112+
SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
1113+
?column? | ?column?
1114+
----------+----------
1115+
t | t
1116+
(1 row)
1117+
1118+
CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
1119+
CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
1120+
SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
1121+
?column? | ?column?
1122+
----------+----------
1123+
t | t
1124+
(1 row)
1125+
1126+
CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
1127+
SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
1128+
?column? | ?column?
1129+
----------+----------
1130+
t | t
1131+
(1 row)
1132+
1133+
CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
1134+
SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
1135+
?column? | ?column?
1136+
----------+----------
1137+
t | t
1138+
(1 row)
1139+
1140+
CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
1141+
ERROR: could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR
11031142
-- cleanup
11041143
SET client_min_messages TO warning;
11051144
DROP SCHEMA collate_tests CASCADE;

src/test/regress/sql/collate.icu.utf8.sql

+21
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,27 @@ drop type textrange_c;
425425
drop type textrange_en_us;
426426

427427

428+
-- test ICU collation customization
429+
430+
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
431+
SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
432+
433+
CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
434+
SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
435+
436+
CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
437+
CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
438+
SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
439+
440+
CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
441+
SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
442+
443+
CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
444+
SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
445+
446+
CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
447+
448+
428449
-- cleanup
429450
SET client_min_messages TO warning;
430451
DROP SCHEMA collate_tests CASCADE;

0 commit comments

Comments
 (0)