mirror of
https://github.com/php/php-src.git
synced 2025-08-15 21:48:51 +02:00
Error handling for UTF-8 complies with WHATWG specification
In 7502c86342
, I adjusted the number of error markers emitted on
invalid UTF-8 text to be more consistent with mbstring's behavior on
other text encodings (generally, it emits one error marker for one
unexpected byte). I didn't expect that anybody would actually care one
way or the other, but felt that it was better to be consistent than
not.
Later, Martin Auswöger kindly pointed out that the WHATWG encoding
specification, which governs how various text encodings are handled
by web browsers, does actually specify how many error markers should
be generated for any given piece of invalid UTF-8 text.
Until now, we have never really paid much attention to the WHATWG
specification, but we do want to comply with as many relevant
specifications as possible. And since PHP is commonly used for web
applications, compatibility with the behavior of web browsers is
obviously a good thing.
This commit is contained in:
parent
b0ab5d0fb0
commit
04e59c916f
5 changed files with 81 additions and 29 deletions
4
NEWS
4
NEWS
|
@ -22,6 +22,10 @@ PHP NEWS
|
|||
- Hash:
|
||||
. Fixed bug #81714 (segfault when serializing finalized HashContext). (cmb)
|
||||
|
||||
- MBString:
|
||||
. Number of error markers emitted for invalid UTF-8 text matches WHATWG specification.
|
||||
This is a return to the behavior of PHP 8.0 and earlier. (alexdowad)
|
||||
|
||||
- MySQLi:
|
||||
. Fixed bug GH-8267 (MySQLi uses unsupported format specifier on Windows).
|
||||
(cmb)
|
||||
|
|
|
@ -122,9 +122,7 @@ retry:
|
|||
CK((*filter->output_function)(s, filter->data));
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
|
||||
|
@ -139,9 +137,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
|
||||
|
@ -156,9 +152,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
|
||||
|
@ -167,9 +161,7 @@ retry:
|
|||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
goto retry;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
|
|||
var_dump(chk_enc("\x41\x42\x43", 0));
|
||||
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
|
||||
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
|
||||
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
|
||||
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
|
||||
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
|
||||
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
|
||||
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
|
||||
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
|
||||
|
||||
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
|
||||
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
|
||||
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
|
||||
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
|
||||
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
|
||||
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
|
||||
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
|
||||
|
||||
var_dump(chk_enc("\xc1\xbf", 2));
|
||||
var_dump(chk_enc("\xc2\x80", 0));
|
||||
var_dump(chk_enc("\xdf\xbf", 0));
|
||||
var_dump(chk_enc("\xe0\x9f\xff", 2));
|
||||
var_dump(chk_enc("\xe0\x9f\xff", 3));
|
||||
var_dump(chk_enc("\xe0\xa0\x80", 2));
|
||||
var_dump(chk_enc("\xef\xbf\xbf", 0));
|
||||
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
|
||||
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
|
||||
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
|
||||
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
|
||||
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
|
||||
|
@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
|
|||
$out = '';
|
||||
$cnt = 0;
|
||||
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
|
||||
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
|
||||
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
|
||||
if ($s === false) {
|
||||
$cnt++;
|
||||
} else {
|
||||
|
|
56
ext/mbstring/tests/utf8_error_handling.phpt
Normal file
56
ext/mbstring/tests/utf8_error_handling.phpt
Normal file
|
@ -0,0 +1,56 @@
|
|||
--TEST--
|
||||
Confirm error handling for UTF-8 complies with WHATWG spec
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
/* The WHATWG specifies not just how web browsers should handle _valid_
|
||||
* UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
|
||||
* as how many error markers each invalid byte sequence should decode
|
||||
* to).
|
||||
* That specification is followed by the JavaScript Encoding API.
|
||||
*
|
||||
* The API documentation for mb_convert_encoding does not specify how
|
||||
* many error markers we will emit for each possible invalid byte
|
||||
* sequence, so we might as well comply with the WHATWG specification.
|
||||
*
|
||||
* Thanks to Martin Auswöger for pointing this out... and another big
|
||||
* thanks for providing test cases!
|
||||
*
|
||||
* Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
|
||||
*/
|
||||
mb_substitute_character(0x25);
|
||||
|
||||
$testCases = [
|
||||
["\x80", "%"],
|
||||
["\xFF", "%"],
|
||||
["\xC2\x7F", "%\x7F"],
|
||||
["\xC2\x80", "\xC2\x80"],
|
||||
["\xDF\xBF", "\xDF\xBF"],
|
||||
["\xDF\xC0", "%%"],
|
||||
["\xE0\xA0\x7F", "%\x7F"],
|
||||
["\xE0\xA0\x80", "\xE0\xA0\x80"],
|
||||
["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
|
||||
["\xEF\xBF\xC0", "%%"],
|
||||
["\xF0\x90\x80\x7F", "%\x7F"],
|
||||
["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
|
||||
["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
|
||||
["\xF4\x8F\xBF\xC0", "%%"],
|
||||
["\xFA\x80\x80\x80\x80", "%%%%%"],
|
||||
["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
|
||||
["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
|
||||
["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
|
||||
];
|
||||
|
||||
foreach ($testCases as $testCase) {
|
||||
$result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
|
||||
if ($result !== $testCase[1]) {
|
||||
die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
|
||||
}
|
||||
}
|
||||
|
||||
echo "All done!\n";
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
All done!
|
|
@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
|
|||
|
||||
$invalid = array(
|
||||
// Codepoints outside of valid 0-0x10FFFF range for Unicode
|
||||
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
|
||||
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
|
||||
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
|
||||
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
|
||||
|
||||
// Reserved range for UTF-16 surrogate pairs
|
||||
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
|
||||
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
|
||||
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
|
||||
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
|
||||
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
|
||||
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
|
||||
|
||||
// Truncated characters
|
||||
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
|
||||
|
@ -788,8 +788,8 @@ $invalid = array(
|
|||
|
||||
// Multi-byte characters which end too soon and go to a junk byte
|
||||
// (Which isn't even valid to start a new character)
|
||||
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
|
||||
"\xF0\xBF\xFF" => "\x00\x00\x00%",
|
||||
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
||||
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
|
||||
|
||||
// Continuation bytes which appear outside of a MB char
|
||||
"\x80" => "\x00\x00\x00%",
|
||||
|
@ -799,8 +799,8 @@ $invalid = array(
|
|||
// Overlong code units
|
||||
// (Using more bytes than needed to encode a character)
|
||||
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
|
||||
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
|
||||
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
|
||||
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
|
||||
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
|
||||
);
|
||||
|
||||
testInvalidCodepoints($invalid, 'UTF-8');
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue