mb_scrub does not attempt to scrub known-valid UTF-8 strings

This commit is contained in:
Alex Dowad 2023-01-20 10:28:26 +02:00
parent f4dd35ea53
commit 6f53dbb83e
2 changed files with 16 additions and 5 deletions

View file

@ -5066,12 +5066,10 @@ PHP_FUNCTION(mb_chr)
/* {{{ */
PHP_FUNCTION(mb_scrub)
{
char* str;
size_t str_len;
zend_string *enc_name = NULL;
zend_string *str, *enc_name = NULL;
ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STRING(str, str_len)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_STR_OR_NULL(enc_name)
ZEND_PARSE_PARAMETERS_END();
@ -5081,7 +5079,12 @@ PHP_FUNCTION(mb_scrub)
RETURN_THROWS();
}
RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc));
if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
/* A valid UTF-8 string will not be changed by mb_scrub, so just increment the refcount and return it */
RETURN_STR_COPY(str);
}
RETURN_STR(php_mb_convert_encoding_ex(ZSTR_VAL(str), ZSTR_LEN(str), enc, enc));
}
/* }}} */

View file

@ -8,7 +8,15 @@ var_dump(
"?" === mb_scrub("\x80"),
"?" === mb_scrub("\x80", 'UTF-8')
);
$utf8str = "abc 日本語 Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞";
// Check $utf8str so that it is marked as 'valid UTF-8'
// This will enable the optimized implementation of mb_scrub
if (!mb_check_encoding($utf8str, 'UTF-8'))
die("Test string should be valid UTF-8");
var_dump(mb_scrub($utf8str));
?>
--EXPECT--
bool(true)
bool(true)
string(122) "abc 日本語 Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞"