summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Peter Eisentraut, 2017-08-21 15:22:00 +0000
committer: Peter Eisentraut, 2017-08-21 23:21:07 +0000
commit: 2bfd1b1ee562c4e4fd065c7f7d1beaa9b9852070 (patch)
tree: 5f22baf585a1b4aa406f48d46d85348e0ebb038b
parent: 51e225da306e14616b690308a59fd89e22335035 (diff)
Don't install ICU collation keyword variants
Users can still create them themselves. Instead, document Unicode TR 35 collation options for ICU, so users can create all this themselves.
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
-rw-r--r-- doc/src/sgml/charset.sgml 98
-rw-r--r-- src/backend/commands/collationcmds.c 71
2 files changed, 84 insertions, 85 deletions
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index f2a4acc115..44e43503a6 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -665,13 +665,6 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
</varlistentry>
<varlistentry>
- <term><literal>de-u-co-phonebk-x-icu</literal></term>
- <listitem>
- <para>German collation, phone book variant</para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
<term><literal>de-AT-x-icu</literal></term>
<listitem>
<para>German collation for Austria, default variant</para>
@@ -684,13 +677,6 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
</varlistentry>
<varlistentry>
- <term><literal>de-AT-u-co-phonebk-x-icu</literal></term>
- <listitem>
- <para>German collation for Austria, phone book variant</para>
- </listitem>
- </varlistentry>
-
- <varlistentry>
<term><literal>und-x-icu</literal> (for <quote>undefined</quote>)</term>
<listitem>
<para>
@@ -709,6 +695,90 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
will draw an error along the lines of <quote>collation "de-x-icu" for
encoding "WIN874" does not exist</>.
</para>
+
+ <para>
+ ICU allows collations to be customized beyond the basic language+country
+ set that is preloaded by <command>initdb</command>. Users are encouraged
+ to define their own collation objects that make use of these facilities to
+ suit the sorting behavior to their requirements. Here are some examples:
+
+ <variablelist>
+ <varlistentry>
+ <term><literal>CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk')</literal></term>
+ <listitem>
+ <para>German collation with phone book collation type</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji')</literal></term>
+ <listitem>
+ <para>
+ Root collation with Emoji collation type, per Unicode Technical Standard #51
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit')</literal></term>
+ <listitem>
+ <para>
+ Sort digits after Latin letters. (The default is digits before letters.)
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper')</literal></term>
+ <listitem>
+ <para>
+ Sort upper-case letters before lower-case letters. (The default is
+ lower-case letters first.)
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit')</literal></term>
+ <listitem>
+ <para>
+ Combines both of the above options.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true')</literal></term>
+ <listitem>
+ <para>
+ Numeric ordering, sorts sequences of digits by their numeric value,
+ for example: <literal>A-21</literal> &lt; <literal>A-123</literal>
+ (also known as natural sort).
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+
+ See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
+ Technical Standard #35</ulink>
+ and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
+ details. The list of possible collation types (<literal>co</literal>
+ subtag) can be found in
+ the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
+ repository</ulink>.
+ The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
+ Explorer</ulink> can be used to check the details of a particular locale
+ definition.
+ </para>
+
+ <para>
+ Note that while this system allows creating collations that <quote>ignore
+ case</quote> or <quote>ignore accents</quote> or similar (using
+ the <literal>ks</literal> key), PostgreSQL does not at the moment allow
+ such collations to act in a truly case- or accent-insensitive manner. Any
+ strings that compare equal according to the collation but are not
+ byte-wise equal will be sorted according to their byte values.
+ </para>
</sect4>
</sect3>
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index d36ce53560..9437731276 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -687,30 +687,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
*/
for (i = -1; i < uloc_countAvailable(); i++)
{
- /*
- * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns
- * values that will not be accepted by uloc_toLanguageTag(). Skip
- * loading keyword variants in that version. (Both
- * ucol_getKeywordValuesForLocale() and uloc_toLanguageTag() are
- * new in ICU 4.2, so older versions are not supported at all.)
- *
- * XXX We have no information about ICU 4.3 through 4.7, but we
- * know the code below works with 4.8.
- */
-#if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM > 2)
-#define LOAD_ICU_KEYWORD_VARIANTS
-#endif
-
const char *name;
char *langtag;
char *icucomment;
const char *collcollate;
Oid collid;
-#ifdef LOAD_ICU_KEYWORD_VARIANTS
- UEnumeration *en;
- UErrorCode status;
- const char *val;
-#endif
if (i == -1)
name = ""; /* ICU root locale */
@@ -744,58 +725,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
CreateComments(collid, CollationRelationId, 0,
icucomment);
}
-
- /*
- * Add keyword variants, if enabled.
- */
-#ifdef LOAD_ICU_KEYWORD_VARIANTS
- status = U_ZERO_ERROR;
- en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("could not get keyword values for locale \"%s\": %s",
- name, u_errorName(status))));
-
- status = U_ZERO_ERROR;
- uenum_reset(en, &status);
- while ((val = uenum_next(en, NULL, &status)))
- {
- char *localeid = psprintf("%s@collation=%s", name, val);
-
- langtag = get_icu_language_tag(localeid);
- collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid;
-
- /*
- * Be paranoid about not allowing any non-ASCII strings into
- * pg_collation
- */
- if (!is_all_ascii(langtag) || !is_all_ascii(collcollate))
- continue;
-
- collid = CollationCreate(psprintf("%s-x-icu", langtag),
- nspid, GetUserId(),
- COLLPROVIDER_ICU, -1,
- collcollate, collcollate,
- get_collation_actual_version(COLLPROVIDER_ICU, collcollate),
- true, true);
- if (OidIsValid(collid))
- {
- ncreated++;
-
- CommandCounterIncrement();
-
- icucomment = get_icu_locale_comment(localeid);
- if (icucomment)
- CreateComments(collid, CollationRelationId, 0,
- icucomment);
- }
- }
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("could not get keyword values for locale \"%s\": %s",
- name, u_errorName(status))));
- uenum_close(en);
-#endif /* LOAD_ICU_KEYWORD_VARIANTS */
}
}
#endif /* USE_ICU */