Skip to content

Commit 6df7557

Browse files
committed
mb_parse_str, mb_http_input, and mb_convert_variables use fast text conversion code for automatic encoding detection
For mb_parse_str, when mbstring.http_input (an INI parameter) is a list of multiple possible text encodings (which is not the case by default), this new implementation is about 25% faster. When mbstring.http_input is a single value, nothing changes, because no automatic encoding detection is done in that case.
1 parent 6ebd08b commit 6df7557

File tree

9 files changed

+260
-330
lines changed

9 files changed

+260
-330
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 0 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -93,153 +93,6 @@
9393
#include "filters/mbfilter_singlebyte.h"
9494
#include "filters/mbfilter_utf8.h"
9595

96-
#include "rare_cp_bitvec.h"
97-
98-
/*
99-
* encoding detector
100-
*/
101-
static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
102-
{
103-
mbfl_encoding_detector_data *data = void_data;
104-
unsigned int c = input_cp;
105-
106-
/* Receive wchars decoded from input string using candidate encoding.
107-
* If the string was invalid in the candidate encoding, we assume
108-
* it's the wrong one. Otherwise, give the candidate many 'demerits'
109-
* for each 'rare' codepoint found, a smaller number for each ASCII
110-
* punctuation character, and 1 for all other codepoints.
111-
*
112-
* The 'common' codepoints should cover the vast majority of
113-
* codepoints we are likely to see in practice, while only covering
114-
* a small minority of the entire Unicode encoding space. Why?
115-
* Well, if the test string happens to be valid in an incorrect
116-
* candidate encoding, the bogus codepoints which it decodes to will
117-
* be more or less random. By treating the majority of codepoints as
118-
* 'rare', we ensure that in almost all such cases, the bogus
119-
* codepoints will include plenty of 'rares', thus giving the
120-
* incorrect candidate encoding lots of demerits. See
121-
* common_codepoints.txt for the actual list used.
122-
*
123-
* So, why give extra demerits for ASCII punctuation characters? It's
124-
* because there are some text encodings, like UTF-7, HZ, and ISO-2022,
125-
* which deliberately only use bytes in the ASCII range. When
126-
* misinterpreted as ASCII/UTF-8, strings in these encodings will
127-
* have an unusually high number of ASCII punctuation characters.
128-
* So giving extra demerits for such characters will improve
129-
* detection accuracy for UTF-7 and similar encodings.
130-
*
131-
* Finally, why 1 demerit for all other characters? That penalizes
132-
* long strings, meaning we will tend to choose a candidate encoding
133-
* in which the test string decodes to a smaller number of
134-
* codepoints. That prevents single-byte encodings in which almost
135-
* every possible input byte decodes to a 'common' codepoint from
136-
* being favored too much. */
137-
if (c == MBFL_BAD_INPUT) {
138-
data->num_illegalchars++;
139-
} else if (c > 0xFFFF) {
140-
data->score += 40;
141-
} else if (c >= 0x21 && c <= 0x2F) {
142-
data->score += 6;
143-
} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
144-
data->score += 30;
145-
} else {
146-
data->score += 1;
147-
}
148-
return 0;
149-
}
150-
151-
mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
152-
{
153-
if (!elistsz) {
154-
return NULL;
155-
}
156-
157-
mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
158-
identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
159-
identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
160-
161-
int filter_list_size = 0;
162-
for (int i = 0; i < elistsz; i++) {
163-
mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
164-
mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
165-
if (filter) {
166-
identd->filter_list[filter_list_size++] = filter;
167-
}
168-
}
169-
identd->filter_list_size = filter_list_size;
170-
identd->strict = strict;
171-
return identd;
172-
}
173-
174-
void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
175-
{
176-
for (int i = 0; i < identd->filter_list_size; i++) {
177-
mbfl_convert_filter_delete(identd->filter_list[i]);
178-
}
179-
efree(identd->filter_list);
180-
efree(identd->filter_data);
181-
efree(identd);
182-
}
183-
184-
int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
185-
{
186-
int num = identd->filter_list_size;
187-
size_t n = string->len;
188-
unsigned char *p = string->val;
189-
int bad = 0;
190-
191-
if (identd->strict) {
192-
for (int i = 0; i < num; i++) {
193-
mbfl_convert_filter *filter = identd->filter_list[i];
194-
mbfl_encoding_detector_data *data = &identd->filter_data[i];
195-
if (filter->from->check != NULL && !(filter->from->check)(p, n)) {
196-
data->num_illegalchars++;
197-
}
198-
}
199-
}
200-
201-
while (n--) {
202-
for (int i = 0; i < num; i++) {
203-
mbfl_convert_filter *filter = identd->filter_list[i];
204-
mbfl_encoding_detector_data *data = &identd->filter_data[i];
205-
if (!data->num_illegalchars) {
206-
(*filter->filter_function)(*p, filter);
207-
if (data->num_illegalchars) {
208-
bad++;
209-
}
210-
}
211-
}
212-
if ((num - 1) <= bad && !identd->strict) {
213-
return 1;
214-
}
215-
p++;
216-
}
217-
218-
for (int i = 0; i < num; i++) {
219-
mbfl_convert_filter *filter = identd->filter_list[i];
220-
(filter->filter_flush)(filter);
221-
}
222-
223-
return 0;
224-
}
225-
226-
const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
227-
{
228-
size_t best_score = SIZE_MAX; /* Low score is 'better' */
229-
const mbfl_encoding *enc = NULL;
230-
231-
for (int i = 0; i < identd->filter_list_size; i++) {
232-
mbfl_convert_filter *filter = identd->filter_list[i];
233-
mbfl_encoding_detector_data *data = &identd->filter_data[i];
234-
if (!data->num_illegalchars && data->score < best_score) {
235-
enc = filter->from;
236-
best_score = data->score;
237-
}
238-
}
239-
240-
return enc;
241-
}
242-
24396
/*
24497
* strcut
24598
*/

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -125,28 +125,6 @@
125125
#define MIN(a,b) ((a)<(b)?(a):(b))
126126
#endif
127127

128-
/*
129-
* encoding detector
130-
*/
131-
typedef struct _mbfl_encoding_detector mbfl_encoding_detector;
132-
133-
typedef struct {
134-
size_t num_illegalchars;
135-
size_t score;
136-
} mbfl_encoding_detector_data;
137-
138-
struct _mbfl_encoding_detector {
139-
mbfl_convert_filter **filter_list;
140-
mbfl_encoding_detector_data *filter_data;
141-
int filter_list_size;
142-
int strict;
143-
};
144-
145-
MBFLAPI extern mbfl_encoding_detector * mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict);
146-
MBFLAPI extern void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd);
147-
MBFLAPI extern int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string);
148-
MBFLAPI extern const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd);
149-
150128
/* Lengths -1 through -16 are reserved for error return values */
151129
static inline int mbfl_is_error(size_t len) {
152130
return len >= (size_t) -16;

ext/mbstring/mb_gpc.c

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i
177177
size_t n, num = 1, *len_list = NULL;
178178
size_t new_val_len;
179179
const mbfl_encoding *from_encoding = NULL;
180-
mbfl_encoding_detector *identd = NULL;
181180

182181
if (!res || *res == '\0') {
183182
goto out;
@@ -235,23 +234,7 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i
235234
} else if (info->num_from_encodings == 1) {
236235
from_encoding = info->from_encodings[0];
237236
} else {
238-
/* auto detect */
239-
from_encoding = NULL;
240-
identd = mbfl_encoding_detector_new(info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection));
241-
if (identd != NULL) {
242-
n = 0;
243-
while (n < num) {
244-
mbfl_string string;
245-
string.val = (unsigned char *)val_list[n];
246-
string.len = len_list[n];
247-
if (mbfl_encoding_detector_feed(identd, &string)) {
248-
break;
249-
}
250-
n++;
251-
}
252-
from_encoding = mbfl_encoding_detector_judge(identd);
253-
mbfl_encoding_detector_delete(identd);
254-
}
237+
from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection));
255238
if (!from_encoding) {
256239
if (info->report_errors) {
257240
php_error_docref(NULL, E_WARNING, "Unable to detect encoding");

0 commit comments

Comments
 (0)