mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
Add mb_trim function
Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Co-authored-by: Gina Peter Banyard <girgias@php.net>
This commit is contained in:
parent
3665e90061
commit
a80b6d7b99
5 changed files with 289 additions and 3 deletions
|
@ -2945,6 +2945,145 @@ PHP_FUNCTION(mb_strtolower)
|
|||
RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
|
||||
}
|
||||
|
||||
/*
 * Selects which end(s) of a string the mb_*trim() family strips.
 *
 * The values are bit flags: the trimming loop tests them with `mode & MB_LTRIM`
 * and `mode & MB_RTRIM`, and clears MB_LTRIM once the leading run has ended.
 * MB_BOTH_TRIM is therefore defined as the union of the two flags rather than
 * as an unrelated constant.
 */
typedef enum {
	MB_LTRIM     = 1,                   /* strip from the start of the string */
	MB_RTRIM     = 2,                   /* strip from the end of the string */
	MB_BOTH_TRIM = MB_LTRIM | MB_RTRIM  /* strip from both ends (== 3) */
} mb_trim_mode;
|
||||
|
||||
/*
 * Tell whether code point `w` should be stripped, i.e. whether it is present
 * as an integer key in `ht`. Only key presence is consulted; the stored value
 * is never read here.
 */
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
{
	const bool is_member = zend_hash_index_exists(ht, w);

	return is_member;
}
|
||||
|
||||
/*
 * Strip the code points contained in `what_ht` from the start and/or the end
 * of `str`.
 *
 * `str` is decoded with `enc->to_wchar` in chunks of at most 128 code points.
 * While decoding, two counters are maintained:
 *   - `left`:  length of the leading run of trim characters (MB_LTRIM only)
 *   - `right`: length of the current trailing run of trim characters
 *              (MB_RTRIM only); reset whenever a non-trim character is seen
 * The part that survives is then extracted with mb_get_substr(), which is
 * given an offset and a length counted in decoded code points.
 *
 * `mode` is a bitmask of MB_LTRIM / MB_RTRIM. The result is handed to
 * RETURN_STR() by the callers.
 */
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
{
	unsigned char *in = (unsigned char*)ZSTR_VAL(str);
	uint32_t wchar_buf[128];
	const size_t wchar_buf_len = sizeof(wchar_buf) / sizeof(wchar_buf[0]);
	size_t in_len = ZSTR_LEN(str);
	unsigned int state = 0;
	size_t left = 0;
	size_t right = 0;
	size_t total_len = 0;

	while (in_len) {
		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, wchar_buf_len, &state);
		ZEND_ASSERT(out_len <= wchar_buf_len);
		total_len += out_len;

		for (size_t i = 0; i < out_len; i++) {
			uint32_t w = wchar_buf[i];
			if (is_trim_wchar(w, what_ht)) {
				if (mode & MB_LTRIM) {
					left += 1;
				}
				if (mode & MB_RTRIM) {
					right += 1;
				}
			} else {
				/* First non-trim character: the leading run is over for good. */
				mode &= ~MB_LTRIM;
				/* Any trailing run counted so far was not at the end after all. */
				if (mode & MB_RTRIM) {
					right = 0;
				}
			}
		}
	}

	/* If every decoded character is a trim character, MB_BOTH_TRIM has counted
	 * each of them in both `left` and `right`, so `total_len - (right + left)`
	 * would wrap around as an unsigned value. Nothing survives in that case
	 * (this also covers input that decodes to zero characters), so return the
	 * empty string explicitly instead of passing a bogus length on. */
	if (left + right >= total_len) {
		return ZSTR_EMPTY_ALLOC();
	}

	return mb_get_substr(str, left, total_len - (right + left), enc);
}
|
||||
|
||||
/*
 * Trim `str` using the built-in default set of characters (used when the
 * caller supplied no explicit character list).
 *
 * The code points are loaded as integer keys into a temporary HashTable, which
 * is what trim_each_wchar() consults, and the table is destroyed again before
 * returning.
 */
static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
{
	/* Read-only table: keep a single static copy instead of re-filling a
	 * stack array on every call. */
	static const uint32_t trim_default_chars[] = {
		/* space, FF, LF, CR, TAB, VT, NUL, NBSP, Ogham space mark */
		0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
		/* U+2000..U+200A: fixed-width spaces */
		0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
		/* ..., line/paragraph separator, narrow NBSP, medium math space, ideographic space */
		0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
		/* NEL, Mongolian vowel separator */
		0x85, 0x180E
	};
	const size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(trim_default_chars[0]);

	HashTable what_ht;
	zval val;
	ZVAL_TRUE(&val);

	zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);

	for (size_t i = 0; i < trim_default_chars_length; i++) {
		/* All entries of the table are distinct, so no key is inserted twice. */
		zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
	}

	zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
	zend_hash_destroy(&what_ht);

	return retval;
}
|
||||
|
||||
/*
 * Trim `str` using the caller-supplied character list `what`.
 *
 * `what` is decoded with `enc` in chunks of at most 128 code points, and each
 * code point becomes an integer key of a temporary HashTable. That table is
 * passed to trim_each_wchar() and destroyed before returning.
 */
static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
{
	uint32_t decoded[128];
	unsigned char *cursor = (unsigned char*)ZSTR_VAL(what);
	size_t remaining = ZSTR_LEN(what);
	unsigned int decoder_state = 0;
	HashTable trim_set;
	zval marker;

	ZVAL_TRUE(&marker);
	/* The byte length of `what` is used as the initial size of the table. */
	zend_hash_init(&trim_set, remaining, NULL, NULL, false);

	while (remaining) {
		size_t decoded_len = enc->to_wchar(&cursor, &remaining, decoded, 128, &decoder_state);
		ZEND_ASSERT(decoded_len <= 128);

		for (size_t idx = 0; idx < decoded_len; idx++) {
			/* Plain "add": `what` may list the same character more than once. */
			zend_hash_index_add(&trim_set, decoded[idx], &marker);
		}
	}

	zend_string *trimmed = trim_each_wchar(str, &trim_set, mode, enc);
	zend_hash_destroy(&trim_set);

	return trimmed;
}
|
||||
|
||||
/*
 * Shared implementation behind mb_trim(), mb_ltrim() and mb_rtrim().
 *
 * Parses (string, [characters, [encoding]]), resolves the encoding, and
 * dispatches to the explicit-character-list or default-character-set variant
 * depending on whether a character list was passed. `mode` selects which
 * end(s) are trimmed.
 */
static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
{
	zend_string *str;
	zend_string *what = NULL;     /* stays NULL when the argument is omitted */
	zend_string *encoding = NULL;

	ZEND_PARSE_PARAMETERS_START(1, 3)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(what)
		Z_PARAM_STR_OR_NULL(encoding)
	ZEND_PARSE_PARAMETERS_END();

	/* 3 = position of $encoding, used when reporting an invalid encoding. */
	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
	if (enc == NULL) {
		RETURN_THROWS();
	}

	zend_string *trimmed = (what != NULL)
		? mb_trim_what_chars(str, what, mode, enc)
		: mb_trim_default_chars(str, mode, enc);

	RETURN_STR(trimmed);
}
|
||||
|
||||
PHP_FUNCTION(mb_trim)
|
||||
{
|
||||
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_BOTH_TRIM);
|
||||
}
|
||||
|
||||
PHP_FUNCTION(mb_ltrim)
|
||||
{
|
||||
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_LTRIM);
|
||||
}
|
||||
|
||||
PHP_FUNCTION(mb_rtrim)
|
||||
{
|
||||
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
|
||||
}
|
||||
|
||||
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
|
||||
{
|
||||
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
|
||||
|
|
|
@ -135,6 +135,12 @@ function mb_strtoupper(string $string, ?string $encoding = null): string {}
|
|||
/** @refcount 1 */
|
||||
function mb_strtolower(string $string, ?string $encoding = null): string {}
|
||||
|
||||
// Strips every character listed in $characters from both ends of $string.
// The default for $characters lists the whitespace-like code points that are
// stripped when the argument is omitted.
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
|
||||
|
||||
// Strips every character listed in $characters from the start of $string only.
// The default for $characters lists the whitespace-like code points that are
// stripped when the argument is omitted.
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
|
||||
|
||||
// Strips every character listed in $characters from the end of $string only.
// The default for $characters lists the whitespace-like code points that are
// stripped when the argument is omitted.
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string {}
|
||||
|
||||
/** @refcount 1 */
|
||||
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}
|
||||
|
||||
|
|
18
ext/mbstring/mbstring_arginfo.h
generated
18
ext/mbstring/mbstring_arginfo.h
generated
|
@ -1,5 +1,5 @@
|
|||
/* This is a generated file, edit the .stub.php file instead.
|
||||
* Stub hash: 141073d610f862b525406fb7f48ac58b6691080e */
|
||||
* Stub hash: 4071d9df39c4ec0d544edd9ff74e5d85f8863b0d */
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL)
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null")
|
||||
|
@ -118,6 +118,16 @@ ZEND_END_ARG_INFO()
|
|||
|
||||
#define arginfo_mb_strtolower arginfo_mb_strtoupper
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_trim, 0, 1, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, characters, IS_STRING, 0, "\" \\f\\n\\r\\t\\v\\x00
\"")
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null")
|
||||
ZEND_END_ARG_INFO()
|
||||
|
||||
#define arginfo_mb_ltrim arginfo_mb_trim
|
||||
|
||||
#define arginfo_mb_rtrim arginfo_mb_trim
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE)
|
||||
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null")
|
||||
|
@ -339,6 +349,9 @@ ZEND_FUNCTION(mb_convert_encoding);
|
|||
ZEND_FUNCTION(mb_convert_case);
|
||||
ZEND_FUNCTION(mb_strtoupper);
|
||||
ZEND_FUNCTION(mb_strtolower);
|
||||
ZEND_FUNCTION(mb_trim);
|
||||
ZEND_FUNCTION(mb_ltrim);
|
||||
ZEND_FUNCTION(mb_rtrim);
|
||||
ZEND_FUNCTION(mb_detect_encoding);
|
||||
ZEND_FUNCTION(mb_list_encodings);
|
||||
ZEND_FUNCTION(mb_encoding_aliases);
|
||||
|
@ -434,6 +447,9 @@ static const zend_function_entry ext_functions[] = {
|
|||
ZEND_FE(mb_convert_case, arginfo_mb_convert_case)
|
||||
ZEND_FE(mb_strtoupper, arginfo_mb_strtoupper)
|
||||
ZEND_FE(mb_strtolower, arginfo_mb_strtolower)
|
||||
ZEND_FE(mb_trim, arginfo_mb_trim)
|
||||
ZEND_FE(mb_ltrim, arginfo_mb_ltrim)
|
||||
ZEND_FE(mb_rtrim, arginfo_mb_rtrim)
|
||||
ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding)
|
||||
ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings)
|
||||
ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases)
|
||||
|
|
125
ext/mbstring/tests/mb_trim.phpt
Normal file
125
ext/mbstring/tests/mb_trim.phpt
Normal file
|
@ -0,0 +1,125 @@
|
|||
--TEST--
|
||||
mb_trim() function tests
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
mb_internal_encoding("UTF-8");
|
||||
|
||||
echo "== Copy from trim ==\n";
|
||||
var_dump('ABC' === mb_trim('ABC'));
|
||||
var_dump('ABC' === mb_ltrim('ABC'));
|
||||
var_dump('ABC' === mb_rtrim('ABC'));
|
||||
var_dump('ABC' === mb_trim(" \0\t\nABC \0\t\n"));
|
||||
var_dump("ABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n"));
|
||||
var_dump(" \0\t\nABC" === mb_rtrim(" \0\t\nABC \0\t\n"));
|
||||
var_dump(" \0\t\nABC \0\t\n" === mb_trim(" \0\t\nABC \0\t\n",''));
|
||||
var_dump(" \0\t\nABC \0\t\n" === mb_ltrim(" \0\t\nABC \0\t\n",''));
|
||||
var_dump(" \0\t\nABC \0\t\n" === mb_rtrim(" \0\t\nABC \0\t\n",''));
|
||||
echo "== Empty string ==\n";
|
||||
var_dump(mb_trim(""));
|
||||
var_dump(mb_ltrim(""));
|
||||
var_dump(mb_rtrim(""));
|
||||
|
||||
echo "== Single string ==\n";
|
||||
var_dump(mb_ltrim(' test ', ''));
|
||||
var_dump(mb_trim(" あいうえおあお ", " ", "UTF-8"));
|
||||
var_dump(mb_trim('foo BAR Spaß', 'ß', "UTF-8"));
|
||||
var_dump(mb_trim('foo BAR Spaß', 'f', "UTF-8"));
|
||||
|
||||
echo "== Multi strings ==\n";
|
||||
var_dump(mb_trim('foo BAR Spaß', 'ßf', "UTF-8"));
|
||||
var_dump(mb_trim('foo BAR Spaß', 'fß', "UTF-8"));
|
||||
var_dump(mb_trim(" あいうおえお あ", " あ", "UTF-8"));
|
||||
var_dump(mb_trim(" あいうおえお あ", "あ ", "UTF-8"));
|
||||
var_dump(mb_trim(" あいうおえお a", "あa", "UTF-8"));
|
||||
var_dump(mb_trim(" あいうおえお a", "\xe3", "UTF-8"));
|
||||
|
||||
echo "== Many strings ==\n";
|
||||
var_dump(mb_trim(str_repeat(" ", 129)));
|
||||
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
|
||||
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
|
||||
|
||||
echo "== mb_ltrim ==\n";
|
||||
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
|
||||
echo "== mb_rtrim ==\n";
|
||||
var_dump(mb_rtrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
|
||||
|
||||
echo "== default params ==\n";
|
||||
var_dump(mb_trim(" \f\n\r\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"));
|
||||
|
||||
echo "== Byte Order Mark ==\n";
|
||||
var_dump(mb_ltrim("\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"));
|
||||
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
|
||||
var_dump(bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));
|
||||
|
||||
echo "== Empty string ==\n";
|
||||
var_dump(mb_trim(" abcd ", ""));
|
||||
var_dump(mb_ltrim(" abcd ", ""));
|
||||
var_dump(mb_rtrim(" abcd ", ""));
|
||||
|
||||
echo "== SJIS ==\n";
|
||||
var_dump(mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));
|
||||
|
||||
echo "== Same strings ==\n";
|
||||
var_dump(mb_trim("foo", "oo"));
|
||||
|
||||
echo "== \$encoding throws ValueError ==\n";
|
||||
try {
|
||||
var_dump(mb_trim( "\u{180F}", "", "NULL"));
|
||||
} catch (ValueError $e) {
|
||||
var_dump($e->getMessage());
|
||||
}
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
== Copy from trim ==
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
bool(true)
|
||||
== Empty string ==
|
||||
string(0) ""
|
||||
string(0) ""
|
||||
string(0) ""
|
||||
== Single string ==
|
||||
string(6) " test "
|
||||
string(21) "あいうえおあお"
|
||||
string(11) "foo BAR Spa"
|
||||
string(12) "oo BAR Spaß"
|
||||
== Multi strings ==
|
||||
string(10) "oo BAR Spa"
|
||||
string(10) "oo BAR Spa"
|
||||
string(16) "いうおえお "
|
||||
string(16) "いうおえお "
|
||||
string(25) " あいうおえお "
|
||||
string(26) " あいうおえお a"
|
||||
== Many strings ==
|
||||
string(0) ""
|
||||
string(1) "a"
|
||||
string(388) " a"
|
||||
== mb_ltrim ==
|
||||
string(15) "いああああ"
|
||||
== mb_rtrim ==
|
||||
string(102) "あああああああああああああああああああああああああああああああああい"
|
||||
== default params ==
|
||||
string(0) ""
|
||||
== Byte Order Mark ==
|
||||
string(6) "漢字"
|
||||
string(8) "226f575b"
|
||||
string(8) "6f225b57"
|
||||
== Empty string ==
|
||||
string(6) " abcd "
|
||||
string(6) " abcd "
|
||||
string(6) " abcd "
|
||||
== SJIS ==
|
||||
string(3) "あ"
|
||||
== Same strings ==
|
||||
string(1) "f"
|
||||
== $encoding throws ValueError ==
|
||||
string(73) "mb_trim(): Argument #3 ($encoding) must be a valid encoding, "NULL" given"
|
|
@ -12,7 +12,7 @@ if (version_compare(MB_ONIGURUMA_VERSION, '6.9.3') < 0) {
|
|||
?>
|
||||
--FILE--
|
||||
<?php
|
||||
function mb_trim( $string, $chars = "", $chars_array = array() )
|
||||
function mb_trim_regex( $string, $chars = "", $chars_array = array() )
|
||||
{
|
||||
for( $x=0; $x<iconv_strlen( $chars ); $x++ ) $chars_array[] = preg_quote( iconv_substr( $chars, $x, 1 ) );
|
||||
$encoded_char_list = implode( "|", array_merge( array( "\s","\t","\n","\r", "\0", "\x0B" ), $chars_array ) );
|
||||
|
@ -23,7 +23,7 @@ function mb_trim( $string, $chars = "", $chars_array = array() )
|
|||
}
|
||||
|
||||
ini_set('mbstring.regex_stack_limit', 10000);
|
||||
var_dump(mb_trim(str_repeat(' ', 10000)));
|
||||
var_dump(mb_trim_regex(str_repeat(' ', 10000)));
|
||||
|
||||
echo 'OK';
|
||||
?>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue