Return false on invalid codepoint in mb_chr()

Return false instead of the encoded form of the current substitution
character. This allows a robust check for the failure case. The
substitution character (especially the default of "?") is also
a valid output of mb_chr() for a valid input (for "?" that would be
0x3f), so it's a bad choice for an error value.
This commit is contained in:
Nikita Popov 2017-08-03 22:32:31 +02:00
parent 41e9ba6333
commit e53162a32b
2 changed files with 27 additions and 46 deletions

View file

@ -106,8 +106,6 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc);
static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
/* }}} */
@ -3172,13 +3170,6 @@ static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding
}
/* See mbfl_no_encoding definition for list of unicode encodings */
static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc)
{
return (no_enc >= mbfl_no_encoding_ucs4 && no_enc <= mbfl_no_encoding_utf8_sb);
}
/* See mbfl_no_encoding definition for list of UTF-8 encodings */
static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
{
@ -5143,10 +5134,18 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
}
}
if (php_mb_is_no_encoding_utf8(no_enc)) {
if (php_mb_is_unsupported_no_encoding(no_enc)) {
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
return NULL;
}
if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) {
cp = MBSTRG(current_filter_illegal_substchar);
if (cp < 0 || cp > 0x10ffff) {
return NULL;
}
if (php_mb_is_no_encoding_utf8(no_enc)) {
if (cp > 0xd7ff && 0xe000 > cp) {
return NULL;
}
if (cp < 0x80) {
@ -5182,20 +5181,6 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
}
return ret;
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
return NULL;
}
if (0 > cp || 0x10ffff < cp) {
if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
cp = MBSTRG(current_filter_illegal_substchar);
} else {
cp = 0x3f;
}
}
buf_len = 4;
@ -5206,9 +5191,21 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
buf[3] = cp & 0xff;
buf[4] = 0;
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
efree(buf);
{
long orig_illegalchars = MBSTRG(illegalchars);
MBSTRG(illegalchars) = 0;
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
if (MBSTRG(illegalchars) != 0) {
efree(buf);
efree(ret);
MBSTRG(illegalchars) = orig_illegalchars;
return NULL;
}
MBSTRG(illegalchars) = orig_illegalchars;
}
efree(buf);
if (output_len) {
*output_len = ret_len;
}

View file

@ -7,22 +7,8 @@ mb_chr()
var_dump(
"\u{20bb7}" === mb_chr(0x20bb7),
"\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"),
"?" === mb_chr(0xd800)
);
mb_internal_encoding("UCS-4BE");
mb_substitute_character(0xfffd);
var_dump(
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
);
var_dump(
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
);
mb_internal_encoding("EUC-JP");
mb_substitute_character(0xa4a2);
var_dump(
"\u{a4a2}" === mb_chr(0xd800, "UTF-8")
false === mb_chr(0xd800),
false === mb_chr(0x1f600, "EUC-JP-2004")
);
// Invalid
@ -39,8 +25,6 @@ bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
bool(true)
Warning: mb_chr(): Unknown encoding "typo" in %s on line %d