From 0109aa62ec2b5f33b307c956555c31f36bd7b6a9 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 28 Nov 2022 08:51:21 +0200 Subject: [PATCH] Simplify decoding filter for UTF-8 When decoding a 3-byte UTF-8 code unit, redundant checks for overlong code unit and for illegal codepoints from U+D800-DFFF were included. Both of these conditions are caught by the line which reads: if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { As such, there is no reason to check for the same error conditions again. Likewise, when decoding a 4-byte UTF-8 code unit, there was a redundant check for overlong code unit. That was already caught by the line which reads: if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { --- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 6c7bad0e805..46bddd17a73 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -249,11 +249,9 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf p--; } else { uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); - if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) { - *out++ = MBFL_BAD_INPUT; - } else { - *out++ = decoded; - } + ZEND_ASSERT(decoded >= 0x800); /* Not an overlong code unit */ + ZEND_ASSERT(decoded < 0xD800 || decoded > 0xDFFF); /* U+D800-DFFF are reserved, illegal code points */ + *out++ = decoded; } } else { *out++ = MBFL_BAD_INPUT; @@ -283,7 +281,8 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf p--; } else { uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F); - *out++ = (decoded < 0x10000) ? 
MBFL_BAD_INPUT : decoded; + ZEND_ASSERT(decoded >= 0x10000); /* Not an overlong code unit */ + *out++ = decoded; } } else { *out++ = MBFL_BAD_INPUT;