Skip to content

Commit 69c979a

Browse files
committed
Fix conversion of EUC-KR text (and add test suite)
- Treat truncated multi-byte characters as an error.
- Don't allow ASCII control characters to appear in the middle of a multi-byte character.
- Fix a bug whereby some unrecognized Unicode codepoints were passed through to the output unchanged when converting Unicode to EUC-KR.
1 parent ddea066 commit 69c979a

File tree

3 files changed

+8401
-31
lines changed

3 files changed

+8401
-31
lines changed

ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c

+31-31
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@
3131
#include "mbfilter_euc_kr.h"
3232
#include "unicode_table_uhc.h"
3333

34+
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter);
35+
3436
static const unsigned char mblen_table_euckr[] = { /* 0xA1-0xFE */
3537
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3638
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -69,7 +71,7 @@ const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
6971
mbfl_filt_conv_common_ctor,
7072
NULL,
7173
mbfl_filt_conv_euckr_wchar,
72-
mbfl_filt_conv_common_flush,
74+
mbfl_filt_conv_euckr_wchar_flush,
7375
NULL,
7476
};
7577

@@ -85,22 +87,19 @@ const struct mbfl_convert_vtbl vtbl_wchar_euckr = {
8587

8688
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
8789

88-
int
89-
mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
90+
int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
9091
{
9192
int c1, w, flag;
9293

9394
switch (filter->status) {
9495
case 0:
9596
if (c >= 0 && c < 0x80) { /* latin */
9697
CK((*filter->output_function)(c, filter->data));
97-
} else if (c > 0xa0 && c < 0xff && c != 0xc9) { /* dbcs lead byte */
98+
} else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9) { /* dbcs lead byte */
9899
filter->status = 1;
99100
filter->cache = c;
100101
} else {
101-
w = c & MBFL_WCSGROUP_MASK;
102-
w |= MBFL_WCSGROUP_THROUGH;
103-
CK((*filter->output_function)(w, filter->data));
102+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
104103
}
105104
break;
106105

@@ -114,7 +113,7 @@ mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
114113
flag = 2;
115114
}
116115
if (flag > 0 && c >= 0xa1 && c <= 0xfe) {
117-
if (flag == 1){ /* 1st: 0xa1..0xc6, 2nd: 0x41..0x7a, 0x81..0xfe */
116+
if (flag == 1) { /* 1st: 0xa1..0xc6, 2nd: 0x41..0x7a, 0x81..0xfe */
118117
w = (c1 - 0xa1)*190 + (c - 0x41);
119118
if (w >= 0 && w < uhc2_ucs_table_size) {
120119
w = uhc2_ucs_table[w];
@@ -131,18 +130,11 @@ mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
131130
}
132131

133132
if (w <= 0) {
134-
w = (c1 << 8) | c;
135-
w &= MBFL_WCSPLANE_MASK;
136-
w |= MBFL_WCSPLANE_KSC5601;
133+
w = (c1 << 8) | c | MBFL_WCSPLANE_KSC5601;
137134
}
138135
CK((*filter->output_function)(w, filter->data));
139-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
140-
CK((*filter->output_function)(c, filter->data));
141136
} else {
142-
w = (c1 << 8) | c;
143-
w &= MBFL_WCSGROUP_MASK;
144-
w |= MBFL_WCSGROUP_THROUGH;
145-
CK((*filter->output_function)(w, filter->data));
137+
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
146138
}
147139
break;
148140

@@ -154,10 +146,9 @@ mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
154146
return c;
155147
}
156148

157-
int
158-
mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
149+
int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
159150
{
160-
int c1, c2, s = 0;
151+
int s = 0;
161152

162153
if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
163154
s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
@@ -175,24 +166,19 @@ mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
175166
s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
176167
}
177168

178-
c1 = (s >> 8) & 0xff;
179-
c2 = s & 0xff;
180169
/* exclude UHC extension area (although we are using the UHC conversion tables) */
181-
if (c1 < 0xa1 || c2 < 0xa1) {
182-
s = c;
170+
if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
171+
s = 0;
183172
}
184173

185174
if (s <= 0) {
186-
c1 = c & ~MBFL_WCSPLANE_MASK;
187-
if (c1 == MBFL_WCSPLANE_KSC5601) {
188-
s = c & MBFL_WCSPLANE_MASK;
189-
}
190-
if (c == 0) {
191-
s = 0;
192-
} else if (s <= 0) {
175+
if (c < 0x80) {
176+
s = c;
177+
} else {
193178
s = -1;
194179
}
195180
}
181+
196182
if (s >= 0) {
197183
if (s < 0x80) { /* latin */
198184
CK((*filter->output_function)(s, filter->data));
@@ -206,3 +192,17 @@ mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
206192

207193
return c;
208194
}
195+
196+
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter)
197+
{
198+
if (filter->status == 1) {
199+
/* 2-byte character was truncated */
200+
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
201+
}
202+
203+
if (filter->flush_function) {
204+
(*filter->flush_function)(filter->data);
205+
}
206+
207+
return 0;
208+
}

0 commit comments

Comments
 (0)