Skip to content

Commit f85a485

Browse files
committed
Add support for automatically updating Unicode derived files
We currently have several sets of files generated from data provided by Unicode. These all have ad hoc rules and instructions for updating when new Unicode versions appear, and it's not done consistently. This patch centralizes and automates the process and makes it part of the release checklist. The Unicode and CLDR versions are specified in Makefile.global.in. There is a new make target "update-unicode" that downloads all the relevant files and runs the generation script. There is also a new script for generating the table of combining characters for ucs_wcwidth(). That table is now in a separate include file rather than hardcoded into the middle of other code. This is based on the script that was used for generating d8594d1, but the script itself wasn't committed at that time. Reviewed-by: John Naylor <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/[email protected]
1 parent f5fd995 commit f85a485

File tree

13 files changed

+313
-94
lines changed

13 files changed

+313
-94
lines changed

GNUmakefile.in

+4
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ $(call recurse,installcheck-world,src/test src/pl src/interfaces/ecpg contrib sr
7575
GNUmakefile: GNUmakefile.in $(top_builddir)/config.status
7676
./config.status $@
7777

78+
update-unicode: | submake-generated-headers submake-libpgport
79+
$(MAKE) -C src/common/unicode $@
80+
$(MAKE) -C contrib/unaccent $@
81+
7882

7983
##########################################################################
8084

contrib/unaccent/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@
22
/log/
33
/results/
44
/tmp_check/
5+
6+
# Downloaded files
7+
/Latin-ASCII.xml

contrib/unaccent/Makefile

+19
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,22 @@ top_builddir = ../..
2626
include $(top_builddir)/src/Makefile.global
2727
include $(top_srcdir)/contrib/contrib-global.mk
2828
endif
29+
30+
update-unicode: unaccent.rules
31+
32+
# Allow running this even without --with-python
33+
PYTHON ?= python
34+
35+
unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml
36+
$(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@
37+
38+
# Only download it once; dependencies must match src/common/unicode/
39+
../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global
40+
$(MAKE) -C $(@D) $(@F)
41+
42+
# Dependency on Makefile.global is for CLDR_VERSION
43+
Latin-ASCII.xml: $(top_builddir)/src/Makefile.global
44+
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml
45+
46+
distclean:
47+
rm -f Latin-ASCII.xml

contrib/unaccent/generate_unaccent_rules.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,11 @@
2020
# option is enabled, the XML file of this transliterator [2] -- given as a
2121
# command line argument -- will be parsed and used.
2222
#
23-
# Ideally you should use the latest release for each data set. For
24-
# Latin-ASCII.xml, the latest data sets released can be browsed directly
25-
# via [3]. Note that this script is compatible with at least release 29.
23+
# Ideally you should use the latest release for each data set. This
24+
# script is compatible with at least CLDR release 29.
2625
#
27-
# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
28-
# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
29-
# [3] https://github.com/unicode-org/cldr/tags
26+
# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt
27+
# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml
3028

3129
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
3230
# The approach is to be Python3 compatible with Python2 "backports".

src/Makefile.global.in

+17-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ standard_targets = all install installdirs uninstall distprep clean distclean ma
2323
# these targets should recurse even into subdirectories not being built:
2424
standard_always_targets = distprep clean distclean maintainer-clean
2525

26-
.PHONY: $(standard_targets) install-strip html man installcheck-parallel
26+
.PHONY: $(standard_targets) install-strip html man installcheck-parallel update-unicode
2727

2828
# make `all' the default target
2929
all:
@@ -352,6 +352,22 @@ XGETTEXT = @XGETTEXT@
352352
GZIP = gzip
353353
BZIP2 = bzip2
354354

355+
DOWNLOAD = wget -O $@ --no-use-server-timestamps
356+
#DOWNLOAD = curl -o $@
357+
358+
359+
# Unicode data information
360+
361+
# Before each major release, update these and run make update-unicode.
362+
363+
# Pick a release from here: <https://www.unicode.org/Public/>. Note
364+
# that the most recent release listed there is often a pre-release;
365+
# don't pick that one, except for testing.
366+
UNICODE_VERSION = 12.1.0
367+
368+
# Pick a release from here: <http://cldr.unicode.org/index/downloads>
369+
CLDR_VERSION = 34
370+
355371

356372
# Tree-wide build support
357373

src/backend/utils/mb/Unicode/Makefile

-3
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,6 @@ maintainer-clean: distclean
115115
rm -f $(MAPS)
116116

117117

118-
DOWNLOAD = wget -O $@ --no-use-server-timestamps
119-
#DOWNLOAD = curl -o $@
120-
121118
BIG5.TXT CNS11643.TXT:
122119
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
123120

src/backend/utils/mb/wchar.c

+1-67
Original file line numberDiff line numberDiff line change
@@ -643,73 +643,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
643643
static int
644644
ucs_wcwidth(pg_wchar ucs)
645645
{
646-
/* sorted list of non-overlapping intervals of non-spacing characters */
647-
static const struct mbinterval combining[] = {
648-
{0x0300, 0x036F}, {0x0483, 0x0489}, {0x0591, 0x05BD},
649-
{0x05BF, 0x05BF}, {0x05C1, 0x05C2}, {0x05C4, 0x05C5},
650-
{0x05C7, 0x05C7}, {0x0610, 0x061A}, {0x064B, 0x065F},
651-
{0x0670, 0x0670}, {0x06D6, 0x06DC}, {0x06DF, 0x06E4},
652-
{0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x0711, 0x0711},
653-
{0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x07EB, 0x07F3},
654-
{0x07FD, 0x07FD}, {0x0816, 0x0819}, {0x081B, 0x0823},
655-
{0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B},
656-
{0x08D3, 0x08E1}, {0x08E3, 0x0902}, {0x093A, 0x093A},
657-
{0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
658-
{0x0951, 0x0957}, {0x0962, 0x0963}, {0x0981, 0x0981},
659-
{0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
660-
{0x09E2, 0x09E3}, {0x09FE, 0x0A02}, {0x0A3C, 0x0A3C},
661-
{0x0A41, 0x0A51}, {0x0A70, 0x0A71}, {0x0A75, 0x0A75},
662-
{0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC8},
663-
{0x0ACD, 0x0ACD}, {0x0AE2, 0x0AE3}, {0x0AFA, 0x0B01},
664-
{0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B44},
665-
{0x0B4D, 0x0B56}, {0x0B62, 0x0B63}, {0x0B82, 0x0B82},
666-
{0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C00, 0x0C00},
667-
{0x0C04, 0x0C04}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C56},
668-
{0x0C62, 0x0C63}, {0x0C81, 0x0C81}, {0x0CBC, 0x0CBC},
669-
{0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
670-
{0x0CE2, 0x0CE3}, {0x0D00, 0x0D01}, {0x0D3B, 0x0D3C},
671-
{0x0D41, 0x0D44}, {0x0D4D, 0x0D4D}, {0x0D62, 0x0D63},
672-
{0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD6}, {0x0E31, 0x0E31},
673-
{0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
674-
{0x0EB4, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19},
675-
{0x0F35, 0x0F35}, {0x0F37, 0x0F37}, {0x0F39, 0x0F39},
676-
{0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, {0x0F86, 0x0F87},
677-
{0x0F8D, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
678-
{0x1032, 0x1037}, {0x1039, 0x103A}, {0x103D, 0x103E},
679-
{0x1058, 0x1059}, {0x105E, 0x1060}, {0x1071, 0x1074},
680-
{0x1082, 0x1082}, {0x1085, 0x1086}, {0x108D, 0x108D},
681-
{0x109D, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714},
682-
{0x1732, 0x1734}, {0x1752, 0x1753}, {0x1772, 0x1773},
683-
{0x17B4, 0x17B5}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
684-
{0x17C9, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D},
685-
{0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
686-
{0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
687-
{0x1A17, 0x1A18}, {0x1A1B, 0x1A1B}, {0x1A56, 0x1A56},
688-
{0x1A58, 0x1A60}, {0x1A62, 0x1A62}, {0x1A65, 0x1A6C},
689-
{0x1A73, 0x1A7F}, {0x1AB0, 0x1B03}, {0x1B34, 0x1B34},
690-
{0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
691-
{0x1B6B, 0x1B73}, {0x1B80, 0x1B81}, {0x1BA2, 0x1BA5},
692-
{0x1BA8, 0x1BA9}, {0x1BAB, 0x1BAD}, {0x1BE6, 0x1BE6},
693-
{0x1BE8, 0x1BE9}, {0x1BED, 0x1BED}, {0x1BEF, 0x1BF1},
694-
{0x1C2C, 0x1C33}, {0x1C36, 0x1C37}, {0x1CD0, 0x1CD2},
695-
{0x1CD4, 0x1CE0}, {0x1CE2, 0x1CE8}, {0x1CED, 0x1CED},
696-
{0x1CF4, 0x1CF4}, {0x1CF8, 0x1CF9}, {0x1DC0, 0x1DFF},
697-
{0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F},
698-
{0x2DE0, 0x2DFF}, {0x302A, 0x302D}, {0x3099, 0x309A},
699-
{0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F},
700-
{0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806},
701-
{0xA80B, 0xA80B}, {0xA825, 0xA826}, {0xA8C4, 0xA8C5},
702-
{0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D},
703-
{0xA947, 0xA951}, {0xA980, 0xA982}, {0xA9B3, 0xA9B3},
704-
{0xA9B6, 0xA9B9}, {0xA9BC, 0xA9BD}, {0xA9E5, 0xA9E5},
705-
{0xAA29, 0xAA2E}, {0xAA31, 0xAA32}, {0xAA35, 0xAA36},
706-
{0xAA43, 0xAA43}, {0xAA4C, 0xAA4C}, {0xAA7C, 0xAA7C},
707-
{0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8},
708-
{0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEC, 0xAAED},
709-
{0xAAF6, 0xAAF6}, {0xABE5, 0xABE5}, {0xABE8, 0xABE8},
710-
{0xABED, 0xABED}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
711-
{0xFE20, 0xFE2F},
712-
};
646+
#include "common/unicode_combining_table.h"
713647

714648
/* test for 8-bit control characters */
715649
if (ucs == 0)

src/common/unicode/.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/norm_test
22
/norm_test_table.h
33

4-
# Files downloaded from the Unicode Character Database
4+
# Downloaded files
55
/CompositionExclusions.txt
66
/NormalizationTest.txt
77
/UnicodeData.txt

src/common/unicode/Makefile

+10-4
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,24 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
DOWNLOAD = wget -O $@ --no-use-server-timestamps
21+
update-unicode: unicode_norm_table.h unicode_combining_table.h
22+
$(MAKE) normalization-check
23+
mv unicode_norm_table.h unicode_combining_table.h ../../../src/include/common/
2224

2325
# These files are part of the Unicode Character Database. Download
24-
# them on demand.
25-
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
26-
$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
26+
# them on demand. The dependency on Makefile.global is for
27+
# UNICODE_VERSION.
28+
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
29+
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
2730

2831
# Generation of conversion tables used for string normalization with
2932
# UTF-8 strings.
3033
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
3134
$(PERL) generate-unicode_norm_table.pl
3235

36+
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
37+
$(PERL) $^ >$@
38+
3339
# Test suite
3440
normalization-check: norm_test
3541
./norm_test

src/common/unicode/README

+5-12
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,11 @@ of Unicode.
88
Generating unicode_norm_table.h
99
-------------------------------
1010

11-
1. Download the Unicode data file, UnicodeData.txt, from the Unicode
12-
consortium and place it to the current directory. Run the perl script
13-
"generate-unicode_norm_table.pl", to process it, and to generate the
14-
"unicode_norm_table.h" file. The Makefile contains a rule to download the
15-
data files if they don't exist.
16-
17-
make unicode_norm_table.h
18-
19-
2. Inspect the resulting header file. Once you're happy with it, copy it to
20-
the right location.
21-
22-
cp unicode_norm_table.h ../../../src/include/common/
11+
Run
2312

13+
make update-unicode
2414

15+
from the top level of the source tree and commit the result.
2516

2617
Tests
2718
-----
@@ -33,3 +24,5 @@ normalization code with all the test strings in NormalizationTest.txt.
3324
To download NormalizationTest.txt and run the tests:
3425

3526
make normalization-check
27+
28+
This is also run as part of the update-unicode target.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
#!/usr/bin/perl
#
# Generate sorted list of non-overlapping intervals of non-spacing
# characters, using Unicode data files as input.  Pass UnicodeData.txt
# as argument.  The output is on stdout.
#
# Copyright (c) 2019, PostgreSQL Global Development Group

use strict;
use warnings;

my $range_start = undef;    # first codepoint of the currently open range
my $prev_codepoint;         # last BMP codepoint processed

print "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";

print "static const struct mbinterval combining[] = {\n";

# Process UnicodeData.txt one line at a time rather than slurping it
# into a list.  Fields are semicolon-separated; field 0 is the
# codepoint in hex, field 2 is the general category.
while (my $line = <ARGV>)
{
	chomp $line;
	my @fields = split ';', $line;
	my $codepoint = hex $fields[0];

	# The consumer of this table only deals with BMP characters.
	next if $codepoint > 0xFFFF;

	if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
	{
		# combining character, remember where the range started
		$range_start = $codepoint if !defined($range_start);
	}
	elsif (defined($range_start))
	{
		# not a combining character, close out the previous range
		printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
		$range_start = undef;
	}

	# Only lines we did not skip update the range end.
	$prev_codepoint = $codepoint;
}

# Flush a range still open at end of input.  This cannot happen with
# the real UnicodeData.txt (the BMP does not end in a combining
# character), but don't silently drop the final range if it ever does.
printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint
  if defined($range_start);

print "};\n";

0 commit comments

Comments
 (0)