mirror of
https://github.com/php/php-src.git
synced 2025-08-15 21:48:51 +02:00
Merge branch 'PHP-8.1'
* PHP-8.1: Error handling for UTF-8 complies with WHATWG specification
This commit is contained in:
commit
3f12d26e3a
4 changed files with 93 additions and 83 deletions
|
@ -127,9 +127,7 @@ retry:
|
|||
CK((*filter->output_function)(s, filter->data));
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
|
||||
|
@ -144,9 +142,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
|
||||
|
@ -161,9 +157,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
|
||||
|
@ -172,9 +166,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
|
|||
unsigned char c2 = *p++;
|
||||
if ((c2 & 0xC0) != 0x80) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
|
||||
p--;
|
||||
}
|
||||
p--;
|
||||
} else {
|
||||
*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
|
||||
}
|
||||
|
@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
|
|||
unsigned char c3 = *p++;
|
||||
if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
|
||||
p -= 2;
|
||||
} else {
|
||||
p--;
|
||||
}
|
||||
p -= 2;
|
||||
} else if ((c3 & 0xC0) != 0x80) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
|
||||
p--;
|
||||
}
|
||||
p--;
|
||||
} else {
|
||||
uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
|
||||
if (decoded >= 0xD800 && decoded <= 0xDFFF) {
|
||||
if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
} else {
|
||||
*out++ = (decoded < 0x800) ? MBFL_BAD_INPUT : decoded;
|
||||
*out++ = decoded;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
|
||||
while (p < e) {
|
||||
c = *p;
|
||||
if ((c & 0xC0) != 0x80) {
|
||||
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
|
||||
p++;
|
||||
break;
|
||||
}
|
||||
while (p < e && (*p & 0xC0) == 0x80) {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
|
@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
|
|||
unsigned char c2 = *p++;
|
||||
unsigned char c3 = *p++;
|
||||
unsigned char c4 = *p++;
|
||||
if ((c2 & 0xC0) != 0x80) {
|
||||
/* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have
|
||||
* fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is
|
||||
* greater than U+10FFFF, which is the highest legal codepoint */
|
||||
if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
|
||||
p -= 3;
|
||||
} else {
|
||||
p -= 2;
|
||||
}
|
||||
} else if ((c3 & 0xC0) != 0x80 || !((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
|
||||
p -= 3;
|
||||
} else if ((c3 & 0xC0) != 0x80) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
|
||||
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) {
|
||||
p -= 2;
|
||||
} else {
|
||||
p -= 3;
|
||||
}
|
||||
} else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
|
||||
p -= 2;
|
||||
} else {
|
||||
p--;
|
||||
}
|
||||
p -= 2;
|
||||
} else if ((c4 & 0xC0) != 0x80) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4)) {
|
||||
p--;
|
||||
}
|
||||
p--;
|
||||
} else {
|
||||
uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
|
||||
*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded;
|
||||
}
|
||||
} else {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
|
||||
if (p < e) {
|
||||
unsigned char c2 = *p;
|
||||
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
|
||||
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4))
|
||||
p++;
|
||||
} else {
|
||||
while (p < e) {
|
||||
c = *p;
|
||||
if ((c & 0xC0) != 0x80) {
|
||||
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
|
||||
p++;
|
||||
break;
|
||||
}
|
||||
if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || c == 0xF2 || c == 0xF3) {
|
||||
while (p < e && (*p & 0xC0) == 0x80) {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,28 +21,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
|
|||
var_dump(chk_enc("\x41\x42\x43", 0));
|
||||
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
|
||||
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
|
||||
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
|
||||
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
|
||||
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
|
||||
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
|
||||
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
|
||||
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
|
||||
|
||||
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
|
||||
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
|
||||
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
|
||||
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
|
||||
|
||||
var_dump(chk_enc("\xc1\xbf", 2));
|
||||
var_dump(chk_enc("\xc2\x80", 0));
|
||||
var_dump(chk_enc("\xdf\xbf", 0));
|
||||
var_dump(chk_enc("\xe0\x9f\xff", 2));
|
||||
var_dump(chk_enc("\xe0\x9f\xff", 3));
|
||||
var_dump(chk_enc("\xe0\xa0\x80", 2));
|
||||
var_dump(chk_enc("\xef\xbf\xbf", 0));
|
||||
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
|
||||
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
|
||||
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
|
||||
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
|
||||
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
|
||||
|
@ -57,7 +57,7 @@ echo "UTF-8 and surrogates area\n";
|
|||
$out = '';
|
||||
$cnt = 0;
|
||||
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
|
||||
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
|
||||
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
|
||||
if ($s === false) {
|
||||
$cnt++;
|
||||
} else {
|
||||
|
|
56
ext/mbstring/tests/utf8_error_handling.phpt
Normal file
56
ext/mbstring/tests/utf8_error_handling.phpt
Normal file
|
@ -0,0 +1,56 @@
|
|||
--TEST--
|
||||
Confirm error handling for UTF-8 complies with WHATWG spec
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
/* The WHATWG specifies not just how web browsers should handle _valid_
|
||||
* UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
|
||||
* as how many error markers each invalid byte sequence should decode
|
||||
* to).
|
||||
* That specification is followed by the JavaScript Encoding API.
|
||||
*
|
||||
* The API documentation for mb_convert_encoding does not specify how
|
||||
* many error markers we will emit for each possible invalid byte
|
||||
* sequence, so we might as well comply with the WHATWG specification.
|
||||
*
|
||||
* Thanks to Martin Auswöger for pointing this out... and another big
|
||||
* thanks for providing test cases!
|
||||
*
|
||||
* Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
|
||||
*/
|
||||
mb_substitute_character(0x25);
|
||||
|
||||
$testCases = [
|
||||
["\x80", "%"],
|
||||
["\xFF", "%"],
|
||||
["\xC2\x7F", "%\x7F"],
|
||||
["\xC2\x80", "\xC2\x80"],
|
||||
["\xDF\xBF", "\xDF\xBF"],
|
||||
["\xDF\xC0", "%%"],
|
||||
["\xE0\xA0\x7F", "%\x7F"],
|
||||
["\xE0\xA0\x80", "\xE0\xA0\x80"],
|
||||
["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
|
||||
["\xEF\xBF\xC0", "%%"],
|
||||
["\xF0\x90\x80\x7F", "%\x7F"],
|
||||
["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
|
||||
["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
|
||||
["\xF4\x8F\xBF\xC0", "%%"],
|
||||
["\xFA\x80\x80\x80\x80", "%%%%%"],
|
||||
["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
|
||||
["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
|
||||
["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
|
||||
];
|
||||
|
||||
foreach ($testCases as $testCase) {
|
||||
$result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
|
||||
if ($result !== $testCase[1]) {
|
||||
die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
|
||||
}
|
||||
}
|
||||
|
||||
echo "All done!\n";
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
All done!
|
|
@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
|
|||
|
||||
$invalid = array(
|
||||
// Codepoints outside of valid 0-0x10FFFF range for Unicode
|
||||
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
|
||||
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
|
||||
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
|
||||
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
|
||||
|
||||
// Reserved range for UTF-16 surrogate pairs
|
||||
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
|
||||
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
|
||||
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
|
||||
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
|
||||
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
|
||||
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
|
||||
|
||||
// Truncated characters
|
||||
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
|
||||
|
@ -788,8 +788,8 @@ $invalid = array(
|
|||
|
||||
// Multi-byte characters which end too soon and go to a junk byte
|
||||
// (Which isn't even valid to start a new character)
|
||||
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
|
||||
"\xF0\xBF\xFF" => "\x00\x00\x00%",
|
||||
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
||||
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
||||
|
||||
// Continuation bytes which appear outside of a MB char
|
||||
"\x80" => "\x00\x00\x00%",
|
||||
|
@ -799,8 +799,8 @@ $invalid = array(
|
|||
// Overlong code units
|
||||
// (Using more bytes than needed to encode a character)
|
||||
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
|
||||
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
|
||||
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
|
||||
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
|
||||
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
|
||||
);
|
||||
|
||||
testInvalidCodepoints($invalid, 'UTF-8');
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue