diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 83d2158c37d..3705259b452 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -127,9 +127,7 @@ retry: CK((*filter->output_function)(s, filter->data)); } else { CK(mbfl_filt_put_invalid_char(filter)); - if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) { - goto retry; - } + goto retry; } break; case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */ @@ -144,9 +142,7 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter)); - if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) { - goto retry; - } + goto retry; } break; case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */ @@ -161,9 +157,7 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter)); - if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) { - goto retry; - } + goto retry; } break; case 0x31: /* 4byte code 3rd char: 0x80-0xbf */ @@ -172,9 +166,7 @@ retry: filter->status++; } else { CK(mbfl_filt_put_invalid_char(filter)); - if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) { - goto retry; - } + goto retry; } break; @@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf unsigned char c2 = *p++; if ((c2 & 0xC0) != 0x80) { *out++ = MBFL_BAD_INPUT; - if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) { - p--; - } + p--; } else { *out++ = ((c & 0x1F) << 6) | (c2 & 0x3F); } @@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf unsigned char c3 = *p++; if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) { *out++ = MBFL_BAD_INPUT; - if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) { - p -= 2; - } else { - p--; - } + p -= 2; } else if ((c3 & 0xC0) != 0x80) { *out++ = MBFL_BAD_INPUT; - if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) { - 
p--; - } + p--; } else { uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); - if (decoded >= 0xD800 && decoded <= 0xDFFF) { + if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) { *out++ = MBFL_BAD_INPUT; } else { - *out++ = (decoded < 0x800) ? MBFL_BAD_INPUT : decoded; + *out++ = decoded; } } } else { *out++ = MBFL_BAD_INPUT; - /* Skip over some number of bytes to duplicate error-handling behavior of old implementation */ - while (p < e) { - c = *p; - if ((c & 0xC0) != 0x80) { - if (c >= 0x80 && (c < 0xC2 || c > 0xF4)) - p++; - break; - } + while (p < e && (*p & 0xC0) == 0x80) { p++; } } @@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf unsigned char c2 = *p++; unsigned char c3 = *p++; unsigned char c4 = *p++; - if ((c2 & 0xC0) != 0x80) { + /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have + * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is + * greater than U+10FFFF, which is the highest legal codepoint */ + if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { *out++ = MBFL_BAD_INPUT; - if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) { - p -= 3; - } else { - p -= 2; - } - } else if ((c3 & 0xC0) != 0x80 || !((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) { + p -= 3; + } else if ((c3 & 0xC0) != 0x80) { *out++ = MBFL_BAD_INPUT; - if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) { - if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) { - p -= 2; - } else { - p -= 3; - } - } else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) { - p -= 2; - } else { - p--; - } + p -= 2; } else if ((c4 & 0xC0) != 0x80) { *out++ = MBFL_BAD_INPUT; - if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4)) { - p--; - } + p--; } else { uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F); *out++ = (decoded < 0x10000) ? 
MBFL_BAD_INPUT : decoded; } } else { *out++ = MBFL_BAD_INPUT; - /* Skip over some number of bytes to duplicate error-handling behavior of old implementation */ if (p < e) { unsigned char c2 = *p; - if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) { - if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) - p++; - } else { - while (p < e) { - c = *p; - if ((c & 0xC0) != 0x80) { - if (c >= 0x80 && (c < 0xC2 || c > 0xF4)) - p++; - break; - } + if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c >= 0xF1 && c <= 0xF3)) { /* lead bytes 0xF1-0xF3 place no extra restriction on c2 */ + while (p < e && (*p & 0xC0) == 0x80) { p++; } } diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt index b29827be44d..6eddaf341e0 100644 --- a/ext/mbstring/tests/illformed_utf_sequences.phpt +++ b/ext/mbstring/tests/illformed_utf_sequences.phpt @@ -21,28 +21,28 @@ var_dump(chk_enc("\x31\x32\x33", 0)); var_dump(chk_enc("\x41\x42\x43", 0)); var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6)); var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6)); -var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6)); -var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6)); -var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9)); -var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8)); +var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9)); +var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9)); +var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12)); +var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11)); var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15)); var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15)); var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18)); var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18)); 
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0)); -var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6)); -var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9)); +var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9)); +var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12)); var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15)); var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18)); var_dump(chk_enc("\xc1\xbf", 2)); var_dump(chk_enc("\xc2\x80", 0)); var_dump(chk_enc("\xdf\xbf", 0)); -var_dump(chk_enc("\xe0\x9f\xff", 2)); +var_dump(chk_enc("\xe0\x9f\xff", 3)); var_dump(chk_enc("\xe0\xa0\x80", 2)); var_dump(chk_enc("\xef\xbf\xbf", 0)); -var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3)); +var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4)); var_dump(chk_enc("\xf0\x90\x80\x80", 0)); var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4)); var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5)); @@ -57,7 +57,7 @@ echo "UTF-8 and surrogates area\n"; $out = ''; $cnt = 0; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { - $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2); + $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3); if ($s === false) { $cnt++; } else { diff --git a/ext/mbstring/tests/utf8_error_handling.phpt b/ext/mbstring/tests/utf8_error_handling.phpt new file mode 100644 index 00000000000..0592778cd1c --- /dev/null +++ b/ext/mbstring/tests/utf8_error_handling.phpt @@ -0,0 +1,56 @@ +--TEST-- +Confirm error handling for UTF-8 complies with WHATWG spec +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +All done! 
diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 038a7703300..37fd7f32426 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE'); $invalid = array( // Codepoints outside of valid 0-0x10FFFF range for Unicode - "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000 + "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000 "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000 "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF // Reserved range for UTF-16 surrogate pairs - "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800 - "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF - "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF + "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800 + "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF + "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF // Truncated characters "\xDF" => "\x00\x00\x00%", // should have been 2-byte @@ -788,8 +788,8 @@ $invalid = array( // Multi-byte characters which end too soon and go to a junk byte // (Which isn't even valid to start a new character) - "\xF0\xBF\xBF\xFF" => "\x00\x00\x00%", - "\xF0\xBF\xFF" => "\x00\x00\x00%", + "\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2), + "\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2), // Continuation bytes which appear outside of a MB char "\x80" => "\x00\x00\x00%", @@ -799,8 +799,8 @@ $invalid = array( // Overlong code units // (Using more bytes than needed to encode a character) "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes - "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes - "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes + "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes + 
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes ); testInvalidCodepoints($invalid, 'UTF-8');