Skip to content

Commit 8b25e38

Browse files
committed
Fix conversion of EUC-CN text (and add test suite)
- Flag truncated multi-byte characters as erroneous.
- Don't allow ASCII control characters to appear in the middle of a multi-byte character.
- Fix a bug whereby some unrecognized Unicode codepoints were passed through unchanged to the output when converting Unicode to EUC-CN.
- Stick to the original EUC-CN standard, rather than CP936 (an extended version invented by Microsoft).
1 parent 69c979a commit 8b25e38

File tree

3 files changed

+7659
-54
lines changed

3 files changed

+7659
-54
lines changed

ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c

Lines changed: 70 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
#include "unicode_table_cp936.h"
3434

35+
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter);
36+
3537
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
3638
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3739
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -70,7 +72,7 @@ const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
7072
mbfl_filt_conv_common_ctor,
7173
NULL,
7274
mbfl_filt_conv_euccn_wchar,
73-
mbfl_filt_conv_common_flush,
75+
mbfl_filt_conv_euccn_wchar_flush,
7476
NULL,
7577
};
7678

@@ -86,51 +88,46 @@ const struct mbfl_convert_vtbl vtbl_wchar_euccn = {
8688

8789
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
8890

89-
/*
90-
* EUC-CN => wchar
91-
*/
92-
int
93-
mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
91+
int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
9492
{
9593
int c1, w;
9694

9795
switch (filter->status) {
9896
case 0:
99-
if (c >= 0 && c < 0x80) { /* latin */
97+
if (c >= 0 && c < 0x80) { /* latin */
10098
CK((*filter->output_function)(c, filter->data));
101-
} else if (c > 0xa0 && c < 0xff) { /* dbcs lead byte */
99+
} else if ((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) { /* dbcs lead byte */
102100
filter->status = 1;
103101
filter->cache = c;
104102
} else {
105-
w = c & MBFL_WCSGROUP_MASK;
106-
w |= MBFL_WCSGROUP_THROUGH;
107-
CK((*filter->output_function)(w, filter->data));
103+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
108104
}
109105
break;
110106

111-
case 1: /* dbcs second byte */
107+
case 1: /* dbcs second byte */
112108
filter->status = 0;
113109
c1 = filter->cache;
114-
if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) {
110+
if (c > 0xA0 && c < 0xFF) {
115111
w = (c1 - 0x81)*192 + (c - 0x40);
116112
if (w >= 0 && w < cp936_ucs_table_size) {
117-
w = cp936_ucs_table[w];
113+
if (w == 0x1864) {
114+
w = 0x30FB;
115+
} else if (w == 0x186A) {
116+
w = 0x2015;
117+
} else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
118+
w = 0;
119+
} else {
120+
w = cp936_ucs_table[w];
121+
}
118122
} else {
119123
w = 0;
120124
}
121125
if (w <= 0) {
122-
w = (c1 << 8) | c;
123-
w &= MBFL_WCSPLANE_MASK;
124-
w |= MBFL_WCSPLANE_GB2312;
126+
w = (c1 << 8) | c | MBFL_WCSPLANE_GB2312;
125127
}
126128
CK((*filter->output_function)(w, filter->data));
127-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
128-
CK((*filter->output_function)(c, filter->data));
129129
} else {
130-
w = (c1 << 8) | c;
131-
w &= MBFL_WCSGROUP_MASK;
132-
w |= MBFL_WCSGROUP_THROUGH;
133-
CK((*filter->output_function)(w, filter->data));
130+
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
134131
}
135132
break;
136133

@@ -142,62 +139,81 @@ mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
142139
return c;
143140
}
144141

145-
/*
146-
* wchar => EUC-CN
147-
*/
148-
int
149-
mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
142+
int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
150143
{
151-
int c1, c2, s;
144+
int s = 0;
152145

153-
s = 0;
154146
if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
155-
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
147+
if (c == 0xB7 || c == 0x144 || c == 0x148 || c == 0x251 || c == 0x261) {
148+
s = 0;
149+
} else {
150+
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
151+
}
156152
} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
157-
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
153+
if (c == 0x2015) {
154+
s = 0xA1AA;
155+
} else if (c == 0x2014 || (c >= 0x2170 && c <= 0x2179)) {
156+
s = 0;
157+
} else {
158+
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
159+
}
158160
} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
159-
s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
161+
if (c == 0x30FB) {
162+
s = 0xA1A4;
163+
} else {
164+
s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
165+
}
160166
} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
161167
s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
162168
} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
163-
if (c == 0xff04) {
164-
s = 0xa1e7;
165-
} else if (c == 0xff5e) {
166-
s = 0xa1ab;
167-
} else if (c >= 0xff01 && c <= 0xff5d) {
168-
s = c - 0xff01 + 0xa3a1;
169-
} else if (c >= 0xffe0 && c <= 0xffe5) {
170-
s = ucs_hff_s_cp936_table[c-0xffe0];
169+
if (c == 0xFF04) {
170+
s = 0xA1E7;
171+
} else if (c == 0xFF5E) {
172+
s = 0xA1AB;
173+
} else if (c >= 0xFF01 && c <= 0xFF5D) {
174+
s = c - 0xFF01 + 0xA3A1;
175+
} else if (c >= 0xFFE0 && c <= 0xFFE5) {
176+
s = ucs_hff_s_cp936_table[c - 0xFFE0];
171177
}
172178
}
173-
c1 = (s >> 8) & 0xff;
174-
c2 = s & 0xff;
175179

176-
if (c1 < 0xa1 || c2 < 0xa1) { /* exclude CP936 extension */
177-
s = c;
180+
/* exclude CP936 extensions */
181+
if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
182+
s = 0;
178183
}
179184

180185
if (s <= 0) {
181-
c1 = c & ~MBFL_WCSPLANE_MASK;
182-
if (c1 == MBFL_WCSPLANE_GB2312) {
183-
s = c & MBFL_WCSPLANE_MASK;
184-
}
185-
if (c == 0) {
186-
s = 0;
186+
if (c < 0x80) {
187+
s = c;
187188
} else if (s <= 0) {
188189
s = -1;
189190
}
190191
}
192+
191193
if (s >= 0) {
192-
if (s < 0x80) { /* latin */
194+
if (s < 0x80) { /* latin */
193195
CK((*filter->output_function)(s, filter->data));
194196
} else {
195-
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
196-
CK((*filter->output_function)(s & 0xff, filter->data));
197+
CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
198+
CK((*filter->output_function)(s & 0xFF, filter->data));
197199
}
198200
} else {
199201
CK(mbfl_filt_conv_illegal_output(c, filter));
200202
}
201203

202204
return c;
203205
}
206+
207+
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter)
208+
{
209+
if (filter->status == 1) {
210+
/* 2-byte character was truncated */
211+
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
212+
}
213+
214+
if (filter->flush_function) {
215+
(*filter->flush_function)(filter->data);
216+
}
217+
218+
return 0;
219+
}

0 commit comments

Comments
 (0)