Skip to content

Commit e683c18

Browse files
committed
Merge branch 'PHP-7.4'
2 parents 7a3306a + d77ad27 commit e683c18

File tree

5 files changed

+403
-0
lines changed

5 files changed

+403
-0
lines changed

Diff for: ext/mbstring/mbstring.c

+170
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,12 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_output_handler, 0, 0, 2)
210210
ZEND_ARG_INFO(0, status)
211211
ZEND_END_ARG_INFO()
212212

213+
/* Argument info for mb_str_split(): three by-value arguments are declared
 * (str, split_length, encoding), of which only the first is required. */
ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_str_split, 0, 0, 1)
214+
ZEND_ARG_INFO(0, str)
215+
ZEND_ARG_INFO(0, split_length)
216+
ZEND_ARG_INFO(0, encoding)
217+
ZEND_END_ARG_INFO()
218+
213219
ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_strlen, 0, 0, 1)
214220
ZEND_ARG_INFO(0, str)
215221
ZEND_ARG_INFO(0, encoding)
@@ -507,6 +513,7 @@ static const zend_function_entry mbstring_functions[] = {
507513
PHP_FE(mb_parse_str, arginfo_mb_parse_str)
508514
PHP_FE(mb_output_handler, arginfo_mb_output_handler)
509515
PHP_FE(mb_preferred_mime_name, arginfo_mb_preferred_mime_name)
516+
PHP_FE(mb_str_split, arginfo_mb_str_split)
510517
PHP_FE(mb_strlen, arginfo_mb_strlen)
511518
PHP_FE(mb_strpos, arginfo_mb_strpos)
512519
PHP_FE(mb_strrpos, arginfo_mb_strrpos)
@@ -2173,6 +2180,169 @@ PHP_FUNCTION(mb_output_handler)
21732180
}
21742181
/* }}} */
21752182

2183+
/* {{{ proto array mb_str_split(string str [, int split_length [, string encoding]])
2184+
Convert a multibyte string to an array. If split_length is specified,
2185+
break the string down into chunks each split_length characters long. */
2186+
2187+
/* State handed to the mbfl_split_output() callback through its void *data
 * argument: where finished chunks go, how long a chunk is, and the filter
 * every decoded character is forwarded to. */
2188+
struct mbfl_split_params {
2189+
zval *return_value; /* PHP array that receives one string element per finished chunk */
2190+
mbfl_string *result_string; /* scratch mbfl_string the callback fills with each finished chunk */
2191+
size_t mb_chunk_length; /* characters collected in the current chunk; reset to 0 after each chunk */
2192+
size_t split_length; /* number of characters that completes a chunk */
2193+
mbfl_convert_filter *next_filter; /* filter fed with every wide char; the callback reads its ->data as the mbfl_memory_device holding the chunk bytes */
2194+
};
2195+
2196+
/* callback function to fill split array */
2197+
static int mbfl_split_output(int c, void *data)
2198+
{
2199+
struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
2200+
2201+
(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
2202+
2203+
if(params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
2204+
mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
2205+
mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
2206+
mbfl_string *chunk = params->result_string;
2207+
mbfl_memory_device_result(device, chunk); /* make chunk */
2208+
add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
2209+
efree(chunk->val);
2210+
params->mb_chunk_length = 0; /* reset mb_chunk size */
2211+
}
2212+
return 0;
2213+
}
2214+
2215+
PHP_FUNCTION(mb_str_split)
2216+
{
2217+
zend_string *str, *encoding = NULL;
2218+
size_t mb_len, chunks, chunk_len;
2219+
const char *p, *last; /* pointer for the string cursor and last string char */
2220+
mbfl_string string, result_string;
2221+
const mbfl_encoding *mbfl_encoding;
2222+
zend_long split_length = 1;
2223+
2224+
ZEND_PARSE_PARAMETERS_START(1, 3)
2225+
Z_PARAM_STR(str)
2226+
Z_PARAM_OPTIONAL
2227+
Z_PARAM_LONG(split_length)
2228+
Z_PARAM_STR(encoding)
2229+
ZEND_PARSE_PARAMETERS_END();
2230+
2231+
if (split_length <= 0) {
2232+
php_error_docref(NULL, E_WARNING, "The length of each segment must be greater than zero");
2233+
RETURN_FALSE;
2234+
}
2235+
2236+
/* fill mbfl_string structure */
2237+
string.val = (unsigned char *) ZSTR_VAL(str);
2238+
string.len = ZSTR_LEN(str);
2239+
string.no_language = MBSTRG(language);
2240+
string.encoding = php_mb_get_encoding(encoding);
2241+
if (!string.encoding) {
2242+
RETURN_FALSE;
2243+
}
2244+
2245+
p = ZSTR_VAL(str); /* string cursor pointer */
2246+
last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
2247+
2248+
mbfl_encoding = string.encoding;
2249+
2250+
/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
2251+
if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
2252+
mb_len = string.len;
2253+
chunk_len = (size_t)split_length; /* chunk length in bytes */
2254+
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
2255+
mb_len = string.len / 2;
2256+
chunk_len = split_length * 2;
2257+
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
2258+
mb_len = string.len / 4;
2259+
chunk_len = split_length * 4;
2260+
} else if (mbfl_encoding->mblen_table != NULL) {
2261+
/* second scenario: variable width encodings with length table */
2262+
char unsigned const *mbtab = mbfl_encoding->mblen_table;
2263+
2264+
/* assume that we have 1-bytes characters */
2265+
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
2266+
2267+
while (p < last) { /* split cycle work until the cursor has reached the last byte */
2268+
char const *chunk_p = p; /* chunk first byte pointer */
2269+
chunk_len = 0; /* chunk length in bytes */
2270+
for (zend_long char_count = 0; char_count < split_length && p < last; ++char_count) {
2271+
char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
2272+
chunk_len += m;
2273+
p += m;
2274+
}
2275+
if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
2276+
add_next_index_stringl(return_value, chunk_p, chunk_len);
2277+
}
2278+
return;
2279+
} else {
2280+
/* third scenario: other multibyte encodings */
2281+
mbfl_convert_filter *filter, *decoder;
2282+
2283+
/* assume that we have 1-bytes characters */
2284+
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
2285+
2286+
/* decoder filter to decode wchar to encoding */
2287+
mbfl_memory_device device;
2288+
mbfl_memory_device_init(&device, split_length + 1, 0);
2289+
2290+
decoder = mbfl_convert_filter_new(
2291+
&mbfl_encoding_wchar,
2292+
string.encoding,
2293+
mbfl_memory_device_output,
2294+
NULL,
2295+
&device);
2296+
/* if something wrong with the decoded */
2297+
if (decoder == NULL) {
2298+
RETURN_FALSE;
2299+
}
2300+
2301+
/* wchar filter */
2302+
mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
2303+
struct mbfl_split_params params = { /* init callback function params structure */
2304+
.return_value = return_value,
2305+
.result_string = &result_string,
2306+
.mb_chunk_length = 0,
2307+
.split_length = (size_t)split_length,
2308+
.next_filter = decoder,
2309+
};
2310+
2311+
filter = mbfl_convert_filter_new(
2312+
string.encoding,
2313+
&mbfl_encoding_wchar,
2314+
mbfl_split_output,
2315+
NULL,
2316+
&params);
2317+
/* if something wrong with the filter */
2318+
if (filter == NULL){
2319+
mbfl_convert_filter_delete(decoder); /* this will free allocated memory for the decoded */
2320+
RETURN_FALSE;
2321+
}
2322+
2323+
while (p < last - 1) { /* cycle each byte except last with callback function */
2324+
(*filter->filter_function)(*p++, filter);
2325+
}
2326+
params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
2327+
(*filter->filter_function)(*p++, filter); /*process last char */
2328+
2329+
mbfl_convert_filter_delete(decoder);
2330+
mbfl_convert_filter_delete(filter);
2331+
return;
2332+
}
2333+
2334+
/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
2335+
chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
2336+
array_init_size(return_value, chunks);
2337+
if (chunks != 0) {
2338+
for (zend_long i = 0; i < chunks - 1; p += chunk_len, ++i) {
2339+
add_next_index_stringl(return_value, p, chunk_len);
2340+
}
2341+
add_next_index_stringl(return_value, p, last - p);
2342+
}
2343+
}
2344+
/* }}} */
2345+
21762346
/* {{{ proto int mb_strlen(string str [, string encoding])
21772347
Get character numbers of a string */
21782348
PHP_FUNCTION(mb_strlen)

Diff for: ext/mbstring/mbstring.h

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ PHP_FUNCTION(mb_substitute_character);
7878
PHP_FUNCTION(mb_preferred_mime_name);
7979
PHP_FUNCTION(mb_parse_str);
8080
PHP_FUNCTION(mb_output_handler);
81+
PHP_FUNCTION(mb_str_split);
8182
PHP_FUNCTION(mb_strlen);
8283
PHP_FUNCTION(mb_strpos);
8384
PHP_FUNCTION(mb_strrpos);

Diff for: ext/mbstring/tests/mb_str_split_jp.phpt

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
--TEST--
2+
mb_str_split() tests for the japanese language
3+
--SKIPIF--
4+
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5+
--INI--
6+
output_handler=
7+
mbstring.func_overload=0
8+
--FILE--
9+
<?php
/* Exercises mb_str_split() on a two-character Japanese string converted to
 * each encoding listed in $charset, then on a long ISO-2022-JP string. */
10+
ini_set('include_path','.');
11+
include_once('common.inc');
12+
13+
$string = "日本"; /* 2 chars */
14+
$len = 2;
15+
$charset = [
16+
"BIG-5",
17+
"EUC-JP",
18+
"ISO-2022-JP",
19+
"SJIS",
20+
"UTF-16BE",
21+
"UTF-16LE",
22+
"UTF-32BE",
23+
"UTF-32LE",
24+
"UTF-8"
25+
];
26+
27+
28+
foreach($charset as $cs){
29+
/* the UTF-8 source literal is converted to the encoding under test */
$enc = mb_convert_encoding($string, $cs, "UTF-8");
30+
$split = mb_str_split($enc, 1, $cs);
31+
32+
/* check chunks number: every split length from 1 to $len must give ceil($len / $i) chunks */
33+
for($i = 1; $i <= $len; ++$i){
34+
$ceil = ceil($len / $i);
35+
$cnt = count(mb_str_split($enc,$i,$cs));
36+
if($ceil != $cnt){
37+
echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
38+
}
39+
}
40+
41+
/* check content: print each single-character chunk as hex for comparison with --EXPECT-- */
42+
echo "$cs:";
43+
for($i = 0; $i < $len; ++$i){
44+
echo " " . unpack("H*", $split[$i])[1];
45+
}
46+
echo "\n";
47+
}
48+
49+
/* long string test */
50+
$size = 50000;
51+
$long = str_repeat($string, $size); /* 50k x 2 chars = 1e5 chars */
52+
$enc = mb_convert_encoding($long, "ISO-2022-JP", "UTF-8");
53+
/* chunks of $len characters, so one chunk per repetition of $string */
$array = mb_str_split($enc, $len, "ISO-2022-JP");
54+
$count = count($array);
55+
56+
/* check array size (nothing is printed on success) */
57+
if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
58+
59+
/* compare initial string and last array element after splitting */
60+
$enc = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8");
61+
if(end($array) !== $enc){
62+
printf("Long string splitting error:
63+
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
64+
}
65+
66+
?>
67+
--EXPECT--
68+
BIG-5: a4e9 a5bb
69+
EUC-JP: c6fc cbdc
70+
ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
71+
SJIS: 93fa 967b
72+
UTF-16BE: 65e5 672c
73+
UTF-16LE: e565 2c67
74+
UTF-32BE: 000065e5 0000672c
75+
UTF-32LE: e5650000 2c670000
76+
UTF-8: e697a5 e69cac

Diff for: ext/mbstring/tests/mb_str_split_ru.phpt

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
--TEST--
2+
mb_str_split() tests for the russian language
3+
--SKIPIF--
4+
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5+
--INI--
6+
output_handler=
7+
mbstring.func_overload=0
8+
--FILE--
9+
<?php
/* Exercises mb_str_split() on a twelve-character Russian string converted to
 * each encoding listed in $charset, then on a long EUC-JP string. */
10+
ini_set('include_path','.');
11+
include_once('common.inc');
12+
13+
$string = "рай рай рай "; /* 12 chars */
14+
$len = 12;
15+
$charset = [
16+
"EUC-JP",
17+
"CP866",
18+
"KOI8-R",
19+
"UTF-16BE",
20+
"UTF-16LE",
21+
"UTF-32BE",
22+
"UTF-32LE",
23+
"UTF-8"
24+
];
25+
26+
27+
foreach($charset as $cs){
28+
/* the UTF-8 source literal is converted to the encoding under test */
$enc = mb_convert_encoding($string, $cs, "UTF-8");
29+
$split = mb_str_split($enc, 1, $cs);
30+
31+
32+
/* check chunks number: every split length from 1 to $len must give ceil($len / $i) chunks */
33+
for($i = 1; $i <= $len; ++$i){
34+
$ceil = ceil($len / $i);
35+
$cnt = count(mb_str_split($enc,$i,$cs));
36+
if($ceil != $cnt){
37+
echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
38+
}
39+
}
40+
41+
/* check content: print each single-character chunk as hex for comparison with --EXPECT-- */
42+
echo "$cs:";
43+
for($i = 0; $i < $len; ++$i){
44+
echo " " . unpack("H*", $split[$i])[1];
45+
}
46+
echo "\n";
47+
}
48+
49+
/* long string test */
50+
$size = 25000;
51+
$long = str_repeat($string, $size); /* 25k x 12 chars = 3e5 chars */
52+
$enc = mb_convert_encoding($long, "EUC-JP", "UTF-8");
53+
/* chunks of $len characters, so one chunk per repetition of $string */
$array = mb_str_split($enc, $len, "EUC-JP");
54+
$count = count($array);
55+
56+
/* check array size (nothing is printed on success) */
57+
if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
58+
59+
/* compare initial string and last array element after splitting */
60+
$enc = mb_convert_encoding($string, "EUC-JP", "UTF-8");
61+
if(end($array) !== $enc){
62+
printf("Long string splitting error:
63+
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
64+
}
65+
66+
?>
67+
--EXPECT--
68+
EUC-JP: a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20
69+
CP866: e0 a0 a9 20 e0 a0 a9 20 e0 a0 a9 20
70+
KOI8-R: d2 c1 ca 20 d2 c1 ca 20 d2 c1 ca 20
71+
UTF-16BE: 0440 0430 0439 0020 0440 0430 0439 0020 0440 0430 0439 0020
72+
UTF-16LE: 4004 3004 3904 2000 4004 3004 3904 2000 4004 3004 3904 2000
73+
UTF-32BE: 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020
74+
UTF-32LE: 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000
75+
UTF-8: d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 d180 d0b0 d0b9 20

0 commit comments

Comments
 (0)