Add mb_trim function

Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Co-authored-by: Gina Peter Banyard <girgias@php.net>
This commit is contained in:
Yuya Hamada 2023-10-01 02:11:59 +09:00 committed by Alex Dowad
parent 3665e90061
commit a80b6d7b99
5 changed files with 289 additions and 3 deletions

View file

@ -2945,6 +2945,145 @@ PHP_FUNCTION(mb_strtolower)
RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc)); RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
} }
/* Which end(s) of the string a trim operation works on. The values are bit
 * flags, so a mode can be tested with `mode & MB_LTRIM` / `mode & MB_RTRIM`. */
typedef enum {
	MB_LTRIM     = 1 << 0,               /* strip leading characters only */
	MB_RTRIM     = 1 << 1,               /* strip trailing characters only */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM   /* strip both ends */
} mb_trim_mode;
/* Report whether code point `w` is present as an integer key in `ht`.
 * Only key existence is consulted; the stored value is never read. */
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
{
	const bool in_set = zend_hash_index_exists(ht, w);
	return in_set;
}
/*
 * Strip the code points contained in `what_ht` from the start and/or the end
 * of `str`.
 *
 * str      String to trim; decoded chunk by chunk through enc->to_wchar().
 * what_ht  Set of code points to strip (only key existence is tested).
 * mode     MB_LTRIM, MB_RTRIM or MB_BOTH_TRIM.
 * enc      Encoding used to decode `str` and to cut out the result.
 *
 * Returns the result of mb_get_substr() for the character range that survives
 * trimming.
 */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	const size_t wchar_buf_cap = sizeof(wchar_buf) / sizeof(wchar_buf[0]);
	size_t in_len = ZSTR_LEN(str);
	unsigned int state = 0;
	size_t left = 0;      /* number of leading characters to drop */
	size_t right = 0;     /* length of the current trailing run of trim characters */
	size_t total_len = 0; /* characters decoded so far */

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, wchar_buf_cap, &state);
		ZEND_ASSERT(out_len <= wchar_buf_cap);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First kept character seen: the leading run is over for good. */
				mode &= ~MB_LTRIM;
				/* A kept character interrupts any trailing run counted so far. */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	/* When the whole string consists of trim characters and both ends are
	 * being trimmed, every character is counted in `left` AND in `right`, so
	 * left + right exceeds total_len. Clamp the kept length to zero instead of
	 * letting the unsigned subtraction wrap around to a huge value. */
	const size_t dropped = left + right;
	const size_t keep_len = dropped < total_len ? total_len - dropped : 0;

	return mb_get_substr(str, left, keep_len, enc);
}
/*
 * Trim `str` using the built-in default set of code points.
 *
 * str   String to trim.
 * mode  MB_LTRIM, MB_RTRIM or MB_BOTH_TRIM.
 * enc   Encoding of `str`.
 *
 * Builds a temporary hash set keyed by code point, delegates to
 * trim_each_wchar(), then destroys the set before returning its result.
 */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Immutable table: `static` so it is not re-materialised on every call. */
	static const uint32_t trim_default_chars[] = {
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		0x85, 0x180E
	};
	/* Element count derived from the array itself, so it follows the element type. */
	const size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(trim_default_chars[0]);

	HashTable what_ht;
	zval val;
	ZVAL_TRUE(&val);

	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);

	/* The table holds no duplicates, so the "add new" variant is used. */
	for (size_t i = 0; i < trim_default_chars_length; i++) {
		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);
	return retval;
}
/*
 * Trim `str` using the code points found in `what`.
 *
 * str   String to trim.
 * what  String whose decoded code points form the set to strip.
 * mode  MB_LTRIM, MB_RTRIM or MB_BOTH_TRIM.
 * enc   Encoding used to decode both `what` and `str`.
 *
 * Decodes `what` in chunks into a temporary hash set keyed by code point,
 * delegates to trim_each_wchar(), then destroys the set.
 */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t remaining = ZSTR_LEN(what);
	uint32_t decoded[128];
	unsigned int state = 0;
	HashTable what_ht;
	zval marker;

	ZVAL_TRUE(&marker);
	/* Sized by the byte length of `what` (captured before decoding consumes it). */
	zend_hash_init(&what_ht, remaining, NULL, NULL, false);

	while (remaining) {
		size_t produced = enc->to_wchar(&cursor, &remaining, decoded, 128, &state);
		ZEND_ASSERT(produced <= 128);

		/* Plain "add" (not "add new"): `what` may repeat a character. */
		for (size_t k = 0; k < produced; k++) {
			zend_hash_index_add(&what_ht, decoded[k], &marker);
		}
	}

	zend_string *result = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);
	return result;
}
/*
 * Shared implementation behind mb_trim(), mb_ltrim() and mb_rtrim().
 *
 * Parses (string $string[, string $characters[, ?string $encoding]]),
 * resolves the encoding (reported as argument #3 on failure) and returns the
 * trimmed string. When $characters is not passed, the built-in default set is
 * used.
 */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	/* `what` stays NULL only when the caller omitted $characters. */
	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);

	RETURN_STR(trimmed);
}
PHP_FUNCTION(mb_trim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
}
PHP_FUNCTION(mb_ltrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
}
PHP_FUNCTION(mb_rtrim)
{
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size) static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{ {
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0); const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);

View file

@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
/** @refcount 1 */ /** @refcount 1 */
function mb_strtolower(string $string, ?string $encoding = null): string {} function mb_strtolower(string $string, ?string $encoding = null): string {}
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
/** @refcount 1 */ /** @refcount 1 */
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {} function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}

View file

@ -1,5 +1,5 @@
/* This is a generated file, edit the .stub.php file instead. /* This is a generated file, edit the .stub.php file instead.
* Stub hash: 141073d610f862b525406fb7f48ac58b6691080e */ * Stub hash: 4071d9df39c4ec0d544edd9ff74e5d85f8863b0d */
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL)
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null") ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null")
@ -118,6 +118,16 @@ ZEND_END_ARG_INFO()
#define arginfo_mb_strtolower arginfo_mb_strtoupper #define arginfo_mb_strtolower arginfo_mb_strtoupper
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_trim, 0, 1, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, characters, IS_STRING, 0, "\" \\f\\n\\r\\t\\v\\x00   …\"")
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null")
ZEND_END_ARG_INFO()
#define arginfo_mb_ltrim arginfo_mb_trim
#define arginfo_mb_rtrim arginfo_mb_trim
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE)
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null") ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null")
@ -339,6 +349,9 @@ ZEND_FUNCTION(mb_convert_encoding);
ZEND_FUNCTION(mb_convert_case); ZEND_FUNCTION(mb_convert_case);
ZEND_FUNCTION(mb_strtoupper); ZEND_FUNCTION(mb_strtoupper);
ZEND_FUNCTION(mb_strtolower); ZEND_FUNCTION(mb_strtolower);
ZEND_FUNCTION(mb_trim);
ZEND_FUNCTION(mb_ltrim);
ZEND_FUNCTION(mb_rtrim);
ZEND_FUNCTION(mb_detect_encoding); ZEND_FUNCTION(mb_detect_encoding);
ZEND_FUNCTION(mb_list_encodings); ZEND_FUNCTION(mb_list_encodings);
ZEND_FUNCTION(mb_encoding_aliases); ZEND_FUNCTION(mb_encoding_aliases);
@ -434,6 +447,9 @@ static const zend_function_entry ext_functions[] = {
ZEND_FE(mb_convert_case, arginfo_mb_convert_case) ZEND_FE(mb_convert_case, arginfo_mb_convert_case)
ZEND_FE(mb_strtoupper, arginfo_mb_strtoupper) ZEND_FE(mb_strtoupper, arginfo_mb_strtoupper)
ZEND_FE(mb_strtolower, arginfo_mb_strtolower) ZEND_FE(mb_strtolower, arginfo_mb_strtolower)
ZEND_FE(mb_trim, arginfo_mb_trim)
ZEND_FE(mb_ltrim, arginfo_mb_ltrim)
ZEND_FE(mb_rtrim, arginfo_mb_rtrim)
ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding) ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding)
ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings) ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings)
ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases) ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases)

View file

@ -0,0 +1,125 @@
--TEST--
mb_trim() function tests
--EXTENSIONS--
mbstring
--FILE--
<?php
mb_internal_encoding("UTF-8");
echo "== Copy from trim ==\n";
var_dump('ABC' === mb_trim('ABC'));
var_dump('ABC' === mb_ltrim('ABC'));
var_dump('ABC' === mb_rtrim('ABC'));
var_dump('ABC' === mb_trim(" \0\t\nABC \0\t\n"));
var_dump("ABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC" === mb_rtrim(" \0\t\nABC \0\t\n"));
var_dump(" \0\t\nABC \0\t\n" === mb_trim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n",''));
var_dump(" \0\t\nABC \0\t\n" === mb_rtrim(" \0\t\nABC \0\t\n",''));
echo "== Empty string ==\n";
var_dump(mb_trim(""));
var_dump(mb_ltrim(""));
var_dump(mb_rtrim(""));
echo "== Single string ==\n";
var_dump(mb_ltrim(' test ', ''));
var_dump(mb_trim(" あいうえおあお ", " ", "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'ß', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'f', "UTF-8"));
echo "== Multi strings ==\n";
var_dump(mb_trim('foo BAR Spaß', 'ßf', "UTF-8"));
var_dump(mb_trim('foo BAR Spaß', 'fß', "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", " あ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  あ", "あ ", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "あa", "UTF-8"));
var_dump(mb_trim(" あいうおえお  a", "\xe3", "UTF-8"));
echo "== Many strings ==\n";
var_dump(mb_trim(str_repeat(" ", 129)));
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
echo "== mb_ltrim ==\n";
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
echo "== mb_rtrim ==\n";
var_dump(mb_rtrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
echo "== default params ==\n";
// Input is made of every character in the default $characters set (including "\t"),
// so trimming with the defaults must leave an empty string.
var_dump(mb_trim(" \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"));
echo "== Byte Order Mark ==\n";
var_dump(mb_ltrim("\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));
echo "== Empty string ==\n";
var_dump(mb_trim(" abcd ", ""));
var_dump(mb_ltrim(" abcd ", ""));
var_dump(mb_rtrim(" abcd ", ""));
echo "== SJIS ==\n";
var_dump(mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));
echo "== Same strings ==\n";
var_dump(mb_trim("foo", "oo"));
echo "== \$encoding throws ValueError ==\n";
try {
var_dump(mb_trim( "\u{180F}", "", "NULL"));
} catch (ValueError $e) {
var_dump($e->getMessage());
}
?>
--EXPECT--
== Copy from trim ==
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
== Empty string ==
string(0) ""
string(0) ""
string(0) ""
== Single string ==
string(6) " test "
string(21) "あいうえおあお"
string(11) "foo BAR Spa"
string(12) "oo BAR Spaß"
== Multi strings ==
string(10) "oo BAR Spa"
string(10) "oo BAR Spa"
string(16) "いうおえお "
string(16) "いうおえお "
string(25) " あいうおえお  "
string(26) " あいうおえお  a"
== Many strings ==
string(0) ""
string(1) "a"
string(388) "                                                                                                                                 a"
== mb_ltrim ==
string(15) "いああああ"
== mb_rtrim ==
string(102) "あああああああああああああああああああああああああああああああああい"
== default params ==
string(0) ""
== Byte Order Mark ==
string(6) "漢字"
string(8) "226f575b"
string(8) "6f225b57"
== Empty string ==
string(6) " abcd "
string(6) " abcd "
string(6) " abcd "
== SJIS ==
string(3) "あ"
== Same strings ==
string(1) "f"
== $encoding throws ValueError ==
string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given"

View file

@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) {
?> ?>
--FILE-- --FILE--
<?php <?php
function mb_trim( $string, $chars = "", $chars_array = array() ) function mb_trim_regex( $string, $chars = "", $chars_array = array() )
{ {
for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) ); for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) );
$encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) ); $encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) );
@ -23,7 +23,7 @@ function mb_trim( $string, $chars = "", $chars_array = array() )
} }
ini_set('mbstring.regex_stack_limit', 10000); ini_set('mbstring.regex_stack_limit', 10000);
var_dump(mb_trim(str_repeat(' ', 10000))); var_dump(mb_trim_regex(str_repeat(' ', 10000)));
echo 'OK'; echo 'OK';
?> ?>