Simplify decoding filter for UTF-8

When decoding a 3-byte UTF-8 sequence, redundant checks for an overlong
encoding and for illegal code points in the range U+D800-DFFF were included.
Both of these conditions are already caught by the line which reads:

    if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) {

As such, there is no reason to check for the same error conditions again.

Likewise, when decoding a 4-byte UTF-8 sequence, there was a
redundant check for an overlong encoding. That was already caught by the
line which reads:

    if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
This commit is contained in:
Alex Dowad 2022-11-28 08:51:21 +02:00
parent 50e32015ae
commit 0109aa62ec

View file

@@ -249,11 +249,9 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
p--; p--;
} else { } else {
uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) { ZEND_ASSERT(decoded >= 0x800); /* Not an overlong code unit */
*out++ = MBFL_BAD_INPUT; ZEND_ASSERT(decoded < 0xD800 || decoded > 0xDFFF); /* U+D800-DFFF are reserved, illegal code points */
} else { *out++ = decoded;
*out++ = decoded;
}
} }
} else { } else {
*out++ = MBFL_BAD_INPUT; *out++ = MBFL_BAD_INPUT;
@@ -283,7 +281,8 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
p--; p--;
} else { } else {
uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F); uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded; ZEND_ASSERT(decoded >= 0x10000); /* Not an overlong code unit */
*out++ = decoded;
} }
} else { } else {
*out++ = MBFL_BAD_INPUT; *out++ = MBFL_BAD_INPUT;