Skip to content

Commit a80b6d7

Browse files
youkidearitai, nielsdos, and Girgias
authored and committed Nov 24, 2023
Add mb_trim function
Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Co-authored-by: Gina Peter Banyard <girgias@php.net>
1 parent 3665e90 commit a80b6d7

File tree

5 files changed

+289
-3
lines changed

5 files changed

+289
-3
lines changed
 

‎ext/mbstring/mbstring.c

+139
Original file line numberDiff line numberDiff line change
@@ -2945,6 +2945,145 @@ PHP_FUNCTION(mb_strtolower)
29452945
RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
29462946
}
29472947

2948+
/* Bit flags selecting which end(s) of a string are trimmed.
 * The trimming loop tests them with `mode & MB_LTRIM` / `mode & MB_RTRIM`. */
typedef enum {
	MB_LTRIM     = 1,                   /* strip the leading run */
	MB_RTRIM     = 2,                   /* strip the trailing run */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM  /* strip both ends (== 3) */
} mb_trim_mode;
2953+
2954+
/* Returns true when code point `w` is present as an integer key in `ht`,
 * i.e. when it belongs to the set of characters that should be stripped. */
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
{
	return zend_hash_index_exists(ht, w);
}
2958+
2959+
/*
 * Strips code points found in `what_ht` from the start and/or end of `str`.
 *
 *   str      string to trim, decoded with `enc`
 *   what_ht  set of code points to strip (looked up by integer key)
 *   mode     MB_LTRIM, MB_RTRIM or MB_BOTH_TRIM
 *   enc      encoding used to decode `str` and to cut out the result
 *
 * Returns whatever mb_get_substr() yields for the retained range, expressed
 * as (number of leading code points to skip, number of code points to keep).
 */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
{
	enum { WCHAR_BUF_LEN = 128 };

	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	size_t in_len = ZSTR_LEN(str);
	uint32_t wchar_buf[WCHAR_BUF_LEN];
	unsigned int state = 0;
	size_t left = 0;      /* length of the leading trimmable run */
	size_t right = 0;     /* length of the trailing trimmable run seen so far */
	size_t total_len = 0; /* code points decoded so far */

	/* Decode the input chunk by chunk; the counters carry over between chunks. */
	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, WCHAR_BUF_LEN, &state);
		ZEND_ASSERT(out_len <= WCHAR_BUF_LEN);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			if (is_trim_wchar(wchar_buf[i], what_ht)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First non-trimmable code point: the leading run is over,
				 * so stop counting it for the rest of the string. */
				mode &= ~MB_LTRIM;
				/* Any trailing run seen so far was not really trailing. */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	/* If every code point was trimmable and both ends are trimmed, each one was
	 * counted in both `left` and `right`; clamp so the length cannot wrap around. */
	size_t trimmed = left + right;
	size_t keep = trimmed < total_len ? total_len - trimmed : 0;

	return mb_get_substr(str, left, keep, enc);
}
2995+
2996+
/*
 * Trims `str` using the built-in set of code points; called when the caller
 * supplied no explicit character list.
 *
 *   str   string to trim
 *   mode  which end(s) to trim
 *   enc   encoding of `str`
 *
 * Returns the string produced by trim_each_wchar().
 */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Constant table: keep it in static storage instead of rebuilding it on each call. */
	static const uint32_t trim_default_chars[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	const size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(trim_default_chars[0]);

	/* Only the keys matter; every entry stores the same `true` value. */
	zval val;
	ZVAL_TRUE(&val);

	HashTable what_ht;
	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);

	/* The table has no repeated entries, so the "_new" variant is used. */
	for (size_t i = 0; i < trim_default_chars_length; i++) {
		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
3020+
3021+
/*
 * Trims `str` using the code points contained in `what`.
 *
 *   str   string to trim
 *   what  characters to strip, decoded with `enc`
 *   mode  which end(s) to trim
 *   enc   encoding of both `str` and `what`
 *
 * Returns the string produced by trim_each_wchar().
 */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	enum { WCHAR_BUF_LEN = 128 };

	unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
	size_t what_len = ZSTR_LEN(what);
	uint32_t what_wchar_buf[WCHAR_BUF_LEN];
	unsigned int state = 0;

	/* Only the keys matter; every entry stores the same `true` value. */
	zval val;
	ZVAL_TRUE(&val);

	/* The byte length of `what` is passed as the initial size of the table. */
	HashTable what_ht;
	zend_hash_init(&what_ht, what_len, NULL, NULL, false);

	/* Decode `what` chunk by chunk and record each code point as a key. */
	while (what_len) {
		size_t what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, WCHAR_BUF_LEN, &state);
		ZEND_ASSERT(what_out_len <= WCHAR_BUF_LEN);
		for (size_t i = 0; i < what_out_len; i++) {
			/* `what` may repeat a character, hence add rather than add_new. */
			zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
		}
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
3046+
3047+
/*
 * Shared implementation behind mb_trim(), mb_ltrim() and mb_rtrim().
 *
 * Userland arguments: a required string, then an optional character list and
 * an optional (nullable) encoding name. `mode` selects which end(s) to trim.
 * If php_mb_get_encoding() yields no encoding, the function returns through
 * RETURN_THROWS().
 */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	/* 3 is the position of the encoding argument. */
	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (!enc) {
		RETURN_THROWS();
	}

	/* `what` stays NULL when the caller omitted it: use the built-in set then. */
	zend_string *result = what
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);

	RETURN_STR(result);
}
3071+
3072+
PHP_FUNCTION(mb_trim)
3073+
{
3074+
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
3075+
}
3076+
3077+
PHP_FUNCTION(mb_ltrim)
3078+
{
3079+
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
3080+
}
3081+
3082+
PHP_FUNCTION(mb_rtrim)
3083+
{
3084+
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
3085+
}
3086+
29483087
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
29493088
{
29503089
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);

‎ext/mbstring/mbstring.stub.php

+6
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
135135
/** @refcount 1 */
136136
function mb_strtolower(string $string, ?string $encoding = null): string {}
137137

138+
/**
 * Strips the characters listed in $characters from both ends of $string.
 * The default list matches the built-in code point table used by the C
 * implementation when no list is passed.
 */
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
139+
140+
/**
 * Strips the characters listed in $characters from the start of $string.
 * The default list matches the built-in code point table used by the C
 * implementation when no list is passed.
 */
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
141+
142+
/**
 * Strips the characters listed in $characters from the end of $string.
 * The default list matches the built-in code point table used by the C
 * implementation when no list is passed.
 */
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
143+
138144
/** @refcount 1 */
139145
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}
140146

‎ext/mbstring/mbstring_arginfo.h

+17-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎ext/mbstring/tests/mb_trim.phpt

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
--TEST--
2+
mb_trim() function tests
3+
--EXTENSIONS--
4+
mbstring
5+
--FILE--
6+
<?php
7+
mb_internal_encoding("UTF-8");
8+
9+
echo "== Copy from trim ==\n";
10+
var_dump('ABC' === mb_trim('ABC'));
11+
var_dump('ABC' === mb_ltrim('ABC'));
12+
var_dump('ABC' === mb_rtrim('ABC'));
13+
var_dump('ABC' === mb_trim(" \0\t\nABC \0\t\n"));
14+
var_dump("ABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n"));
15+
var_dump(" \0\t\nABC" === mb_rtrim(" \0\t\nABC \0\t\n"));
16+
var_dump(" \0\t\nABC \0\t\n" === mb_trim(" \0\t\nABC \0\t\n",''));
17+
var_dump(" \0\t\nABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n",''));
18+
var_dump(" \0\t\nABC \0\t\n" === mb_rtrim(" \0\t\nABC \0\t\n",''));
19+
echo "== Empty string ==\n";
20+
var_dump(mb_trim(""));
21+
var_dump(mb_ltrim(""));
22+
var_dump(mb_rtrim(""));
23+
24+
echo "== Single string ==\n";
25+
var_dump(mb_ltrim(' test ', ''));
26+
var_dump(mb_trim(" あいうえおあお ", " ", "UTF-8"));
27+
var_dump(mb_trim('foo BAR Spaß', 'ß', "UTF-8"));
28+
var_dump(mb_trim('foo BAR Spaß', 'f', "UTF-8"));
29+
30+
echo "== Multi strings ==\n";
31+
var_dump(mb_trim('foo BAR Spaß', 'ßf', "UTF-8"));
32+
var_dump(mb_trim('foo BAR Spaß', 'fß', "UTF-8"));
33+
var_dump(mb_trim(" あいうおえお  あ", " あ", "UTF-8"));
34+
var_dump(mb_trim(" あいうおえお  あ", "あ ", "UTF-8"));
35+
var_dump(mb_trim(" あいうおえお  a", "あa", "UTF-8"));
36+
var_dump(mb_trim(" あいうおえお  a", "\xe3", "UTF-8"));
37+
38+
echo "== Many strings ==\n";
39+
var_dump(mb_trim(str_repeat(" ", 129)));
40+
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
41+
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
42+
43+
echo "== mb_ltrim ==\n";
44+
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
45+
echo "== mb_rtrim ==\n";
46+
var_dump(mb_rtrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
47+
48+
echo "== default params ==\n";
49+
var_dump(mb_trim(" \f\n\r\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"));
50+
51+
echo "== Byte Order Mark ==\n";
52+
var_dump(mb_ltrim("\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"));
53+
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
54+
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));
55+
56+
echo "== Empty string ==\n";
57+
var_dump(mb_trim(" abcd ", ""));
58+
var_dump(mb_ltrim(" abcd ", ""));
59+
var_dump(mb_rtrim(" abcd ", ""));
60+
61+
echo "== SJIS ==\n";
62+
var_dump(mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));
63+
64+
echo "== Same strings ==\n";
65+
var_dump(mb_trim("foo", "oo"));
66+
67+
echo "== \$encoding throws ValueError ==\n";
68+
try {
69+
var_dump(mb_trim( "\u{180F}", "", "NULL"));
70+
} catch (ValueError $e) {
71+
var_dump($e->getMessage());
72+
}
73+
74+
?>
75+
--EXPECT--
76+
== Copy from trim ==
77+
bool(true)
78+
bool(true)
79+
bool(true)
80+
bool(true)
81+
bool(true)
82+
bool(true)
83+
bool(true)
84+
bool(true)
85+
bool(true)
86+
== Empty string ==
87+
string(0) ""
88+
string(0) ""
89+
string(0) ""
90+
== Single string ==
91+
string(6) " test "
92+
string(21) "あいうえおあお"
93+
string(11) "foo BAR Spa"
94+
string(12) "oo BAR Spaß"
95+
== Multi strings ==
96+
string(10) "oo BAR Spa"
97+
string(10) "oo BAR Spa"
98+
string(16) "いうおえお "
99+
string(16) "いうおえお "
100+
string(25) " あいうおえお  "
101+
string(26) " あいうおえお  a"
102+
== Many strings ==
103+
string(0) ""
104+
string(1) "a"
105+
string(388) "                                                                                                                                 a"
106+
== mb_ltrim ==
107+
string(15) "いああああ"
108+
== mb_rtrim ==
109+
string(102) "あああああああああああああああああああああああああああああああああい"
110+
== default params ==
111+
string(0) ""
112+
== Byte Order Mark ==
113+
string(6) "漢字"
114+
string(8) "226f575b"
115+
string(8) "6f225b57"
116+
== Empty string ==
117+
string(6) " abcd "
118+
string(6) " abcd "
119+
string(6) " abcd "
120+
== SJIS ==
121+
string(3) "あ"
122+
== Same strings ==
123+
string(1) "f"
124+
== $encoding throws ValueError ==
125+
string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given"

‎ext/mbstring/tests/mbregex_stack_limit2.phpt

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) {
1212
?>
1313
--FILE--
1414
<?php
15-
function mb_trim( $string, $chars = "", $chars_array = array() )
15+
function mb_trim_regex( $string, $chars = "", $chars_array = array() )
1616
{
1717
for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) );
1818
$encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) );
@@ -23,7 +23,7 @@ function mb_trim( $string, $chars = "", $chars_array = array() )
2323
}
2424

2525
ini_set('mbstring.regex_stack_limit', 10000);
26-
var_dump(mb_trim(str_repeat(' ', 10000)));
26+
var_dump(mb_trim_regex(str_repeat(' ', 10000)));
2727

2828
echo 'OK';
2929
?>

0 commit comments

Comments
 (0)
Please sign in to comment.