Merge branch 'PHP-8.1'

* PHP-8.1:
  Error handling for UTF-8 complies with WHATWG specification
This commit is contained in:
Alex Dowad 2022-04-16 20:32:12 +02:00
commit 3f12d26e3a
4 changed files with 93 additions and 83 deletions

View file

@ -127,9 +127,7 @@ retry:
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@ -144,9 +142,7 @@ retry:
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@ -161,9 +157,7 @@ retry:
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@ -172,9 +166,7 @@ retry:
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
unsigned char c2 = *p++;
if ((c2 & 0xC0) != 0x80) {
*out++ = MBFL_BAD_INPUT;
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
p--;
}
p--;
} else {
*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
}
@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
unsigned char c3 = *p++;
if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) {
*out++ = MBFL_BAD_INPUT;
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
p -= 2;
} else {
p--;
}
p -= 2;
} else if ((c3 & 0xC0) != 0x80) {
*out++ = MBFL_BAD_INPUT;
if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
p--;
}
p--;
} else {
uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
if (decoded >= 0xD800 && decoded <= 0xDFFF) {
if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
*out++ = MBFL_BAD_INPUT;
} else {
*out++ = (decoded < 0x800) ? MBFL_BAD_INPUT : decoded;
*out++ = decoded;
}
}
} else {
*out++ = MBFL_BAD_INPUT;
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
while (p < e) {
c = *p;
if ((c & 0xC0) != 0x80) {
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
p++;
break;
}
while (p < e && (*p & 0xC0) == 0x80) {
p++;
}
}
@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
unsigned char c2 = *p++;
unsigned char c3 = *p++;
unsigned char c4 = *p++;
if ((c2 & 0xC0) != 0x80) {
/* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have
* fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is
* greater than U+10FFFF, which is the highest legal codepoint */
if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
*out++ = MBFL_BAD_INPUT;
if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
p -= 3;
} else {
p -= 2;
}
} else if ((c3 & 0xC0) != 0x80 || !((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
p -= 3;
} else if ((c3 & 0xC0) != 0x80) {
*out++ = MBFL_BAD_INPUT;
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) {
p -= 2;
} else {
p -= 3;
}
} else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
p -= 2;
} else {
p--;
}
p -= 2;
} else if ((c4 & 0xC0) != 0x80) {
*out++ = MBFL_BAD_INPUT;
if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4)) {
p--;
}
p--;
} else {
uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded;
}
} else {
*out++ = MBFL_BAD_INPUT;
/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
if (p < e) {
unsigned char c2 = *p;
if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4))
p++;
} else {
while (p < e) {
c = *p;
if ((c & 0xC0) != 0x80) {
if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
p++;
break;
}
if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || c == 0xF2 || c == 0xF3) {
while (p < e && (*p & 0xC0) == 0x80) {
p++;
}
}

View file

@ -21,28 +21,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
var_dump(chk_enc("\xe0\x9f\xff", 2));
var_dump(chk_enc("\xe0\x9f\xff", 3));
var_dump(chk_enc("\xe0\xa0\x80", 2));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@ -57,7 +57,7 @@ echo "UTF-8 and surrogates area\n";
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
if ($s === false) {
$cnt++;
} else {

View file

@ -0,0 +1,56 @@
--TEST--
Confirm error handling for UTF-8 complies with WHATWG spec
--EXTENSIONS--
mbstring
--FILE--
<?php
/* Check that invalid UTF-8 input decodes to the number of error markers
 * laid down by the WHATWG Encoding Standard.
 *
 * The WHATWG standard covers more than how web browsers should decode
 * _valid_ UTF-8 text: it also fixes how _invalid_ UTF-8 text is to be
 * treated, including how many error markers each invalid byte sequence
 * should decode to. The JavaScript Encoding API follows that standard.
 *
 * The API documentation for mb_convert_encoding leaves the number of
 * error markers emitted per invalid byte sequence unspecified, so we
 * might as well comply with the WHATWG specification.
 *
 * Thanks to Martin Auswöger for pointing this out... and another big
 * thanks for providing test cases!
 *
 * Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
 */

/* 0x25 is '%', the error marker which appears in the expected strings below */
mb_substitute_character(0x25);

/* Each entry: [input bytes, expected result of converting UTF-8 -> UTF-8] */
$cases = [
    ["\x80", "%"],
    ["\xFF", "%"],
    ["\xC2\x7F", "%\x7F"],
    ["\xC2\x80", "\xC2\x80"],
    ["\xDF\xBF", "\xDF\xBF"],
    ["\xDF\xC0", "%%"],
    ["\xE0\xA0\x7F", "%\x7F"],
    ["\xE0\xA0\x80", "\xE0\xA0\x80"],
    ["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
    ["\xEF\xBF\xC0", "%%"],
    ["\xF0\x90\x80\x7F", "%\x7F"],
    ["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
    ["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
    ["\xF4\x8F\xBF\xC0", "%%"],
    ["\xFA\x80\x80\x80\x80", "%%%%%"],
    ["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
    ["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
    ["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
];

foreach ($cases as [$input, $expected]) {
    $actual = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
    if ($actual === $expected) {
        continue;
    }

    /* Stop at the first mismatch; all three strings are shown as hex */
    $message = "Expected UTF-8 string " . bin2hex($input) . " to convert to UTF-8 string " . bin2hex($expected) . "; got " . bin2hex($actual);
    die($message);
}

echo "All done!\n";
?>
--EXPECT--
All done!

View file

@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
$invalid = array(
// Codepoints outside of valid 0-0x10FFFF range for Unicode
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
// Reserved range for UTF-16 surrogate pairs
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
// Truncated characters
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
@ -788,8 +788,8 @@ $invalid = array(
// Multi-byte characters which end too soon and go to a junk byte
// (Which isn't even valid to start a new character)
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
"\xF0\xBF\xFF" => "\x00\x00\x00%",
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
// Continuation bytes which appear outside of a MB char
"\x80" => "\x00\x00\x00%",
@ -799,8 +799,8 @@ $invalid = array(
// Overlong code units
// (Using more bytes than needed to encode a character)
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
);
testInvalidCodepoints($invalid, 'UTF-8');