|
93 | 93 | #include "filters/mbfilter_singlebyte.h"
|
94 | 94 | #include "filters/mbfilter_utf8.h"
|
95 | 95 |
|
96 |
| -#include "rare_cp_bitvec.h" |
97 |
| - |
98 |
| -/* |
99 |
| - * encoding detector |
100 |
| - */ |
101 |
| -static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data) |
102 |
| -{ |
103 |
| - mbfl_encoding_detector_data *data = void_data; |
104 |
| - unsigned int c = input_cp; |
105 |
| - |
106 |
| - /* Receive wchars decoded from input string using candidate encoding. |
107 |
| - * If the string was invalid in the candidate encoding, we assume |
108 |
| - * it's the wrong one. Otherwise, give the candidate many 'demerits' |
109 |
| - * for each 'rare' codepoint found, a smaller number for each ASCII |
110 |
| - * punctuation character, and 1 for all other codepoints. |
111 |
| - * |
112 |
| - * The 'common' codepoints should cover the vast majority of |
113 |
| - * codepoints we are likely to see in practice, while only covering |
114 |
| - * a small minority of the entire Unicode encoding space. Why? |
115 |
| - * Well, if the test string happens to be valid in an incorrect |
116 |
| - * candidate encoding, the bogus codepoints which it decodes to will |
117 |
| - * be more or less random. By treating the majority of codepoints as |
118 |
| - * 'rare', we ensure that in almost all such cases, the bogus |
119 |
| - * codepoints will include plenty of 'rares', thus giving the |
120 |
| - * incorrect candidate encoding lots of demerits. See |
121 |
| - * common_codepoints.txt for the actual list used. |
122 |
| - * |
123 |
| - * So, why give extra demerits for ASCII punctuation characters? It's |
124 |
| - * because there are some text encodings, like UTF-7, HZ, and ISO-2022, |
125 |
| - * which deliberately only use bytes in the ASCII range. When |
126 |
| - * misinterpreted as ASCII/UTF-8, strings in these encodings will |
127 |
| - * have an unusually high number of ASCII punctuation characters. |
128 |
| - * So giving extra demerits for such characters will improve |
129 |
| - * detection accuracy for UTF-7 and similar encodings. |
130 |
| - * |
131 |
| - * Finally, why 1 demerit for all other characters? That penalizes |
132 |
| - * long strings, meaning we will tend to choose a candidate encoding |
133 |
| - * in which the test string decodes to a smaller number of |
134 |
| - * codepoints. That prevents single-byte encodings in which almost |
135 |
| - * every possible input byte decodes to a 'common' codepoint from |
136 |
| - * being favored too much. */ |
137 |
| - if (c == MBFL_BAD_INPUT) { |
138 |
| - data->num_illegalchars++; |
139 |
| - } else if (c > 0xFFFF) { |
140 |
| - data->score += 40; |
141 |
| - } else if (c >= 0x21 && c <= 0x2F) { |
142 |
| - data->score += 6; |
143 |
| - } else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) { |
144 |
| - data->score += 30; |
145 |
| - } else { |
146 |
| - data->score += 1; |
147 |
| - } |
148 |
| - return 0; |
149 |
| -} |
150 |
| - |
151 |
| -mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) |
152 |
| -{ |
153 |
| - if (!elistsz) { |
154 |
| - return NULL; |
155 |
| - } |
156 |
| - |
157 |
| - mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector)); |
158 |
| - identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*)); |
159 |
| - identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data)); |
160 |
| - |
161 |
| - int filter_list_size = 0; |
162 |
| - for (int i = 0; i < elistsz; i++) { |
163 |
| - mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar, |
164 |
| - mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]); |
165 |
| - if (filter) { |
166 |
| - identd->filter_list[filter_list_size++] = filter; |
167 |
| - } |
168 |
| - } |
169 |
| - identd->filter_list_size = filter_list_size; |
170 |
| - identd->strict = strict; |
171 |
| - return identd; |
172 |
| -} |
173 |
| - |
174 |
| -void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) |
175 |
| -{ |
176 |
| - for (int i = 0; i < identd->filter_list_size; i++) { |
177 |
| - mbfl_convert_filter_delete(identd->filter_list[i]); |
178 |
| - } |
179 |
| - efree(identd->filter_list); |
180 |
| - efree(identd->filter_data); |
181 |
| - efree(identd); |
182 |
| -} |
183 |
| - |
184 |
| -int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string) |
185 |
| -{ |
186 |
| - int num = identd->filter_list_size; |
187 |
| - size_t n = string->len; |
188 |
| - unsigned char *p = string->val; |
189 |
| - int bad = 0; |
190 |
| - |
191 |
| - if (identd->strict) { |
192 |
| - for (int i = 0; i < num; i++) { |
193 |
| - mbfl_convert_filter *filter = identd->filter_list[i]; |
194 |
| - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
195 |
| - if (filter->from->check != NULL && !(filter->from->check)(p, n)) { |
196 |
| - data->num_illegalchars++; |
197 |
| - } |
198 |
| - } |
199 |
| - } |
200 |
| - |
201 |
| - while (n--) { |
202 |
| - for (int i = 0; i < num; i++) { |
203 |
| - mbfl_convert_filter *filter = identd->filter_list[i]; |
204 |
| - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
205 |
| - if (!data->num_illegalchars) { |
206 |
| - (*filter->filter_function)(*p, filter); |
207 |
| - if (data->num_illegalchars) { |
208 |
| - bad++; |
209 |
| - } |
210 |
| - } |
211 |
| - } |
212 |
| - if ((num - 1) <= bad && !identd->strict) { |
213 |
| - return 1; |
214 |
| - } |
215 |
| - p++; |
216 |
| - } |
217 |
| - |
218 |
| - for (int i = 0; i < num; i++) { |
219 |
| - mbfl_convert_filter *filter = identd->filter_list[i]; |
220 |
| - (filter->filter_flush)(filter); |
221 |
| - } |
222 |
| - |
223 |
| - return 0; |
224 |
| -} |
225 |
| - |
226 |
| -const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd) |
227 |
| -{ |
228 |
| - size_t best_score = SIZE_MAX; /* Low score is 'better' */ |
229 |
| - const mbfl_encoding *enc = NULL; |
230 |
| - |
231 |
| - for (int i = 0; i < identd->filter_list_size; i++) { |
232 |
| - mbfl_convert_filter *filter = identd->filter_list[i]; |
233 |
| - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
234 |
| - if (!data->num_illegalchars && data->score < best_score) { |
235 |
| - enc = filter->from; |
236 |
| - best_score = data->score; |
237 |
| - } |
238 |
| - } |
239 |
| - |
240 |
| - return enc; |
241 |
| -} |
242 |
| - |
243 | 96 | /*
|
244 | 97 | * strcut
|
245 | 98 | */
|
|
0 commit comments