Simplify decoding filter for UTF-8

When decoding a 3-byte UTF-8 sequence, redundant checks for an overlong encoding and for the illegal code points U+D800-DFFF were included. Both of these conditions are already caught by the line which reads:

    if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) {

As such, there is no reason to check for the same error conditions again.

Likewise, when decoding a 4-byte UTF-8 sequence, there was a redundant check for an overlong encoding. That was already caught by the line which reads:

    if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {

The removed runtime checks are replaced by ZEND_ASSERT statements which document these invariants.
2025-08-16 05:58:45 +02:00 · 2022-11-28 08:51:21 +02:00 · 2022-11-28 08:51:21 +02:00 · 0109aa62ec
commit 0109aa62ec
parent 50e32015ae
1 changed file with 5 additions and 6 deletions
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
@@ -249,11 +249,9 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 					p--;
 				} else {
 					uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
-					if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
-						*out++ = MBFL_BAD_INPUT;
-					} else {
-						*out++ = decoded;
-					}
+					ZEND_ASSERT(decoded >= 0x800); /* Not an overlong code unit */
+					ZEND_ASSERT(decoded < 0xD800 || decoded > 0xDFFF); /* U+D800-DFFF are reserved, illegal code points */
+					*out++ = decoded;
 				}
 			} else {
 				*out++ = MBFL_BAD_INPUT;
@@ -283,7 +281,8 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 					p--;
 				} else {
 					uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
-					*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded;
+					ZEND_ASSERT(decoded >= 0x10000); /* Not an overlong code unit */
+					*out++ = decoded;
 				}
 			} else {
 				*out++ = MBFL_BAD_INPUT;