Skip to content

Commit bab9821

Browse files
committed
Update display widths as part of updating Unicode
The hardcoded "wide character" set in ucs_wcwidth() was last updated around the Unicode 5.0 era. This led to misalignment when printing emojis and other codepoints that have since been designated wide or full-width. To fix and keep up to date, extend update-unicode to download the list of wide and full-width codepoints from the official sources. In passing, remove some comments about non-spacing characters that haven't been accurate since we removed the former hardcoded logic. Jacob Champion Reported and reviewed by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
1 parent 1563ecb commit bab9821

File tree

5 files changed

+220
-27
lines changed

5 files changed

+220
-27
lines changed

src/common/unicode/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
# Downloaded files
55
/CompositionExclusions.txt
66
/DerivedNormalizationProps.txt
7+
/EastAsianWidth.txt
78
/NormalizationTest.txt
89
/UnicodeData.txt

src/common/unicode/Makefile

+6-3
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
21+
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
2222
mv $^ ../../../src/include/common/
2323
$(MAKE) normalization-check
2424

2525
# These files are part of the Unicode Character Database. Download
2626
# them on demand. The dependency on Makefile.global is for
2727
# UNICODE_VERSION.
28-
UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
28+
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
2929
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3030

3131
# Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
3838
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
3939
$(PERL) $^ >$@
4040

41+
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
42+
$(PERL) $^ >$@
43+
4144
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
4245
$(PERL) $^ >$@
4346

@@ -64,6 +67,6 @@ clean:
6467
rm -f $(OBJS) norm_test norm_test.o
6568

6669
distclean: clean
67-
rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
70+
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
6871

6972
maintainer-clean: distclean
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument.  The output is on stdout.
#
# Each data line has the form "XXXX;P" or "XXXX..YYYY;P", optionally followed
# by a "#" comment, where XXXX/YYYY are hexadecimal codepoints and P is the
# width property.  Whitespace around the fields is not significant.  Entries
# with property F or W that directly follow one another (no gap in codepoints
# and no other property in between) are merged into a single output interval.
#
# Copyright (c) 2019-2021, PostgreSQL Global Development Group

use strict;
use warnings;

# First codepoint of the wide range being accumulated; undef if none is open.
my $range_start = undef;

# Codepoint bounds of the entry on the current line.
my ($first, $last);

# Upper bound of the entry seen on the previous line.
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read one line at a time; there is no need to hold the whole file in memory.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip comments, then skip lines with nothing left on them.
	$line =~ s/\s*#.*$//;
	next if $line =~ /^\s*$/;

	# Parse "first[..last] ; width", tolerating padding around the fields.
	# Refuse anything else instead of silently producing a wrong table.
	my ($first_hex, $last_hex, $width) =
	  $line =~ /^\s*([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)\s*$/
	  or die "$ARGV:$.: unrecognized line: $line\n";

	# A single codepoint is a range of length one.
	$first = hex($first_hex);
	$last = defined($last_hex) ? hex($last_hex) : $first;

	if ($width eq 'F' || $width eq 'W')
	{
		# fullwidth/wide characters
		if (!defined($range_start))
		{
			# save for start of range if one hasn't been started yet
			$range_start = $first;
		}
		elsif ($first != $prev_last + 1)
		{
			# ranges aren't contiguous; emit the last and start a new one
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = $first;
		}
	}
	else
	{
		# not wide characters, print out previous range if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = undef;
		}
	}
}
continue
{
	# Also runs for skipped lines, where $last still holds the previous
	# entry's bound, so $prev_last is unaffected by them.
	$prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";

src/common/wchar.c

+17-24
Original file line numberDiff line numberDiff line change
@@ -583,8 +583,8 @@ pg_utf_mblen(const unsigned char *s)
583583

584584
struct mbinterval
585585
{
586-
unsigned short first;
587-
unsigned short last;
586+
unsigned int first;
587+
unsigned int last;
588588
};
589589

590590
/* auxiliary function for binary search in interval table */
@@ -623,12 +623,6 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
623623
* category code Mn or Me in the Unicode database) have a
624624
* column width of 0.
625625
*
626-
* - Other format characters (general category code Cf in the Unicode
627-
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
628-
*
629-
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
630-
* have a column width of 0.
631-
*
632626
* - Spacing characters in the East Asian Wide (W) or East Asian
633627
* FullWidth (F) category as defined in Unicode Technical
634628
* Report #11 have a column width of 2.
@@ -645,6 +639,7 @@ static int
645639
ucs_wcwidth(pg_wchar ucs)
646640
{
647641
#include "common/unicode_combining_table.h"
642+
#include "common/unicode_east_asian_fw_table.h"
648643

649644
/* test for 8-bit control characters */
650645
if (ucs == 0)
@@ -653,27 +648,25 @@ ucs_wcwidth(pg_wchar ucs)
653648
if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
654649
return -1;
655650

656-
/* binary search in table of non-spacing characters */
651+
/*
652+
* binary search in table of non-spacing characters
653+
*
654+
* XXX: In the official Unicode sources, it is possible for a character to
655+
* be described as both non-spacing and wide at the same time. As of
656+
* Unicode 13.0, treating the non-spacing property as the determining
657+
* factor for display width leads to the correct behavior, so do that
658+
* search first.
659+
*/
657660
if (mbbisearch(ucs, combining,
658661
sizeof(combining) / sizeof(struct mbinterval) - 1))
659662
return 0;
660663

661-
/*
662-
* if we arrive here, ucs is not a combining or C0/C1 control character
663-
*/
664+
/* binary search in table of wide characters */
665+
if (mbbisearch(ucs, east_asian_fw,
666+
sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
667+
return 2;
664668

665-
return 1 +
666-
(ucs >= 0x1100 &&
667-
(ucs <= 0x115f || /* Hangul Jamo init. consonants */
668-
(ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
669-
ucs != 0x303f) || /* CJK ... Yi */
670-
(ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
671-
(ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
672-
* Ideographs */
673-
(ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
674-
(ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
675-
(ucs >= 0xffe0 && ucs <= 0xffe6) ||
676-
(ucs >= 0x20000 && ucs <= 0x2ffff)));
669+
return 1;
677670
}
678671

679672
/*
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */
2+
3+
static const struct mbinterval east_asian_fw[] = {
4+
{0x1100, 0x115F},
5+
{0x231A, 0x231B},
6+
{0x2329, 0x232A},
7+
{0x23E9, 0x23EC},
8+
{0x23F0, 0x23F0},
9+
{0x23F3, 0x23F3},
10+
{0x25FD, 0x25FE},
11+
{0x2614, 0x2615},
12+
{0x2648, 0x2653},
13+
{0x267F, 0x267F},
14+
{0x2693, 0x2693},
15+
{0x26A1, 0x26A1},
16+
{0x26AA, 0x26AB},
17+
{0x26BD, 0x26BE},
18+
{0x26C4, 0x26C5},
19+
{0x26CE, 0x26CE},
20+
{0x26D4, 0x26D4},
21+
{0x26EA, 0x26EA},
22+
{0x26F2, 0x26F3},
23+
{0x26F5, 0x26F5},
24+
{0x26FA, 0x26FA},
25+
{0x26FD, 0x26FD},
26+
{0x2705, 0x2705},
27+
{0x270A, 0x270B},
28+
{0x2728, 0x2728},
29+
{0x274C, 0x274C},
30+
{0x274E, 0x274E},
31+
{0x2753, 0x2755},
32+
{0x2757, 0x2757},
33+
{0x2795, 0x2797},
34+
{0x27B0, 0x27B0},
35+
{0x27BF, 0x27BF},
36+
{0x2B1B, 0x2B1C},
37+
{0x2B50, 0x2B50},
38+
{0x2B55, 0x2B55},
39+
{0x2E80, 0x2E99},
40+
{0x2E9B, 0x2EF3},
41+
{0x2F00, 0x2FD5},
42+
{0x2FF0, 0x2FFB},
43+
{0x3000, 0x303E},
44+
{0x3041, 0x3096},
45+
{0x3099, 0x30FF},
46+
{0x3105, 0x312F},
47+
{0x3131, 0x318E},
48+
{0x3190, 0x31E3},
49+
{0x31F0, 0x321E},
50+
{0x3220, 0x3247},
51+
{0x3250, 0x4DBF},
52+
{0x4E00, 0xA48C},
53+
{0xA490, 0xA4C6},
54+
{0xA960, 0xA97C},
55+
{0xAC00, 0xD7A3},
56+
{0xF900, 0xFAFF},
57+
{0xFE10, 0xFE19},
58+
{0xFE30, 0xFE52},
59+
{0xFE54, 0xFE66},
60+
{0xFE68, 0xFE6B},
61+
{0xFF01, 0xFF60},
62+
{0xFFE0, 0xFFE6},
63+
{0x16FE0, 0x16FE4},
64+
{0x16FF0, 0x16FF1},
65+
{0x17000, 0x187F7},
66+
{0x18800, 0x18CD5},
67+
{0x18D00, 0x18D08},
68+
{0x1B000, 0x1B11E},
69+
{0x1B150, 0x1B152},
70+
{0x1B164, 0x1B167},
71+
{0x1B170, 0x1B2FB},
72+
{0x1F004, 0x1F004},
73+
{0x1F0CF, 0x1F0CF},
74+
{0x1F18E, 0x1F18E},
75+
{0x1F191, 0x1F19A},
76+
{0x1F200, 0x1F202},
77+
{0x1F210, 0x1F23B},
78+
{0x1F240, 0x1F248},
79+
{0x1F250, 0x1F251},
80+
{0x1F260, 0x1F265},
81+
{0x1F300, 0x1F320},
82+
{0x1F32D, 0x1F335},
83+
{0x1F337, 0x1F37C},
84+
{0x1F37E, 0x1F393},
85+
{0x1F3A0, 0x1F3CA},
86+
{0x1F3CF, 0x1F3D3},
87+
{0x1F3E0, 0x1F3F0},
88+
{0x1F3F4, 0x1F3F4},
89+
{0x1F3F8, 0x1F43E},
90+
{0x1F440, 0x1F440},
91+
{0x1F442, 0x1F4FC},
92+
{0x1F4FF, 0x1F53D},
93+
{0x1F54B, 0x1F54E},
94+
{0x1F550, 0x1F567},
95+
{0x1F57A, 0x1F57A},
96+
{0x1F595, 0x1F596},
97+
{0x1F5A4, 0x1F5A4},
98+
{0x1F5FB, 0x1F64F},
99+
{0x1F680, 0x1F6C5},
100+
{0x1F6CC, 0x1F6CC},
101+
{0x1F6D0, 0x1F6D2},
102+
{0x1F6D5, 0x1F6D7},
103+
{0x1F6EB, 0x1F6EC},
104+
{0x1F6F4, 0x1F6FC},
105+
{0x1F7E0, 0x1F7EB},
106+
{0x1F90C, 0x1F93A},
107+
{0x1F93C, 0x1F945},
108+
{0x1F947, 0x1F978},
109+
{0x1F97A, 0x1F9CB},
110+
{0x1F9CD, 0x1F9FF},
111+
{0x1FA70, 0x1FA74},
112+
{0x1FA78, 0x1FA7A},
113+
{0x1FA80, 0x1FA86},
114+
{0x1FA90, 0x1FAA8},
115+
{0x1FAB0, 0x1FAB6},
116+
{0x1FAC0, 0x1FAC2},
117+
{0x1FAD0, 0x1FAD6},
118+
{0x20000, 0x2FFFD},
119+
{0x30000, 0x3FFFD},
120+
};

0 commit comments

Comments
 (0)