Add AVX2-accelerated UTF-16 decoding/encoding routines

As with other SIMD-accelerated functions in php-src, the new UTF-16 encoding and decoding routines can be compiled either with AVX2 acceleration "always on", "always off", or else with runtime detection of AVX2 support. With the new UTF-16 decoder/encoder, conversion of extremely short strings (as in several bytes) has the same performance as before, and conversion of medium-length (~100 character) strings is about 65% faster, but conversion of long (~10,000 character) strings is around 6 times faster. Many other mbstring functions will also be faster now when handling UTF-16; for example, mb_strlen is almost 3 times faster on medium strings, and almost 9 times faster on long strings. (Why does mb_strlen benefit more from AVX2 acceleration than mb_convert_encoding? It's because mb_strlen only needs to decode, but not re-encode, the input string, and the UTF-16 decoder benefits much more from SIMD acceleration than the UTF-16 encoder.)
2025-08-15 21:48:51 +02:00 · 2023-01-25 22:28:10 +02:00 · 2023-01-25 22:28:10 +02:00 · c8ec2ed730
commit c8ec2ed730
parent d5d9900661
7 changed files with 572 additions and 10 deletions
--- a/Zend/zend_bitset.h
+++ b/Zend/zend_bitset.h
@ -19,6 +19,13 @@
 #ifndef _ZEND_BITSET_H_
 #define _ZEND_BITSET_H_
 #include <stdint.h>
 #include <stdbool.h>
 #include <string.h>
 #include "zend_portability.h"
 #include "zend_long.h"
 typedef zend_ulong *zend_bitset;
 #define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong)
--- a/build/php.m4
+++ b/build/php.m4
@ -2643,6 +2643,27 @@ AC_DEFUN([PHP_CHECK_BUILTIN_SADDLL_OVERFLOW], [
   [$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow])
 ])
 dnl
 dnl PHP_CHECK_BUILTIN_USUB_OVERFLOW
 dnl
 dnl Link-tests a tiny program that calls __builtin_usub_overflow and records
 dnl the outcome in PHP_HAVE_BUILTIN_USUB_OVERFLOW.
 dnl
 dnl Note that the symbol is defined in BOTH cases: to 1 when the link test
 dnl succeeds and to 0 when it fails. C code must therefore test its value
 dnl (#if PHP_HAVE_BUILTIN_USUB_OVERFLOW), not merely whether it is defined.
 dnl
 AC_DEFUN([PHP_CHECK_BUILTIN_USUB_OVERFLOW], [
  AC_MSG_CHECKING([for __builtin_usub_overflow])
  AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[
    unsigned int tmpvar;
    return __builtin_usub_overflow(3, 7, &tmpvar);
  ]])], [
    have_builtin_usub_overflow=1
    AC_MSG_RESULT([yes])
  ], [
    have_builtin_usub_overflow=0
    AC_MSG_RESULT([no])
  ])
  AC_DEFINE_UNQUOTED([PHP_HAVE_BUILTIN_USUB_OVERFLOW],
   [$have_builtin_usub_overflow], [Whether the compiler supports __builtin_usub_overflow])
 ])
 dnl
 dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
 dnl
--- a/configure.ac
+++ b/configure.ac
@ -504,6 +504,8 @@ dnl Check __builtin_saddl_overflow
 PHP_CHECK_BUILTIN_SADDL_OVERFLOW
 dnl Check __builtin_saddll_overflow
 PHP_CHECK_BUILTIN_SADDLL_OVERFLOW
 dnl Check __builtin_usub_overflow
 PHP_CHECK_BUILTIN_USUB_OVERFLOW
 dnl Check __builtin_ssubl_overflow
 PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
 dnl Check __builtin_ssubll_overflow
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@ -27,15 +27,154 @@
 *
 */
 #include "zend_bitset.h"
 #include "mbfilter.h"
 #include "mbfilter_utf16.h"
 #ifdef ZEND_INTRIN_AVX2_NATIVE
 /* We are building AVX2-only binary */
 # include <immintrin.h>
 # define mb_utf16be_to_wchar mb_utf16be_to_wchar_avx2
 # define mb_utf16le_to_wchar mb_utf16le_to_wchar_avx2
 # define mb_wchar_to_utf16be mb_wchar_to_utf16be_avx2
 # define mb_wchar_to_utf16le mb_wchar_to_utf16le_avx2
 static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 #elif defined(ZEND_INTRIN_AVX2_RESOLVER)
 /* We are building binary which works with or without AVX2; whether or not to use
 * AVX2-accelerated functions will be determined at runtime */
 # include <immintrin.h>
 # include "Zend/zend_cpuinfo.h"
 static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
 /* Dynamic linker will decide whether or not to use AVX2-based functions and
 * resolve symbols accordingly */
 ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
 ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
 ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
 ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
 size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16be_wchar")));
 void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16be")));
 size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16le_wchar")));
 void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16le")));
 /* ifunc resolver for mb_utf16be_to_wchar: hands back the AVX2 decoder when
  * zend_cpu_supports_avx2() says so, and the portable decoder otherwise */
 ZEND_NO_SANITIZE_ADDRESS
 ZEND_ATTRIBUTE_UNUSED
 static mb_to_wchar_fn resolve_utf16be_wchar(void)
 {
 	if (zend_cpu_supports_avx2()) {
 		return mb_utf16be_to_wchar_avx2;
 	}
 	return mb_utf16be_to_wchar_default;
 }
 /* ifunc resolver for mb_wchar_to_utf16be: hands back the AVX2 encoder when
  * zend_cpu_supports_avx2() says so, and the portable encoder otherwise */
 ZEND_NO_SANITIZE_ADDRESS
 ZEND_ATTRIBUTE_UNUSED
 static mb_from_wchar_fn resolve_wchar_utf16be(void)
 {
 	if (zend_cpu_supports_avx2()) {
 		return mb_wchar_to_utf16be_avx2;
 	}
 	return mb_wchar_to_utf16be_default;
 }
 /* ifunc resolver for mb_utf16le_to_wchar: hands back the AVX2 decoder when
  * zend_cpu_supports_avx2() says so, and the portable decoder otherwise */
 ZEND_NO_SANITIZE_ADDRESS
 ZEND_ATTRIBUTE_UNUSED
 static mb_to_wchar_fn resolve_utf16le_wchar(void)
 {
 	if (zend_cpu_supports_avx2()) {
 		return mb_utf16le_to_wchar_avx2;
 	}
 	return mb_utf16le_to_wchar_default;
 }
 /* ifunc resolver for mb_wchar_to_utf16le: hands back the AVX2 encoder when
  * zend_cpu_supports_avx2() says so, and the portable encoder otherwise */
 ZEND_NO_SANITIZE_ADDRESS
 ZEND_ATTRIBUTE_UNUSED
 static mb_from_wchar_fn resolve_wchar_utf16le(void)
 {
 	if (zend_cpu_supports_avx2()) {
 		return mb_wchar_to_utf16le_avx2;
 	}
 	return mb_wchar_to_utf16le_default;
 }
 # else /* ZEND_INTRIN_AVX2_FUNC_PTR */
 /* We are compiling for a target where the dynamic linker will not be able to
 * resolve symbols according to whether the host supports AVX2 or not; so instead,
 * we can make calls go through a function pointer and set the function pointer
 * on module load */
 #ifdef HAVE_FUNC_ATTRIBUTE_TARGET
 static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
 static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
 static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
 static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
 #else
 static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 #endif
 static mb_to_wchar_fn utf16be_to_wchar_ptr = NULL;
 static mb_from_wchar_fn wchar_to_utf16be_ptr = NULL;
 static mb_to_wchar_fn utf16le_to_wchar_ptr = NULL;
 static mb_from_wchar_fn wchar_to_utf16le_ptr = NULL;
 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	return utf16be_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
 }
 /* Trampoline: encode to UTF-16BE via whichever implementation
  * init_convert_utf16() stored in wchar_to_utf16be_ptr */
 static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	(*wchar_to_utf16be_ptr)(in, len, buf, end);
 }
 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	return utf16le_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
 }
 /* Trampoline: encode to UTF-16LE via whichever implementation
  * init_convert_utf16() stored in wchar_to_utf16le_ptr */
 static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	(*wchar_to_utf16le_ptr)(in, len, buf, end);
 }
 /* Point the four UTF-16 conversion function pointers at either the AVX2 or the
  * portable implementations, depending on what zend_cpu_supports_avx2() reports.
  * The trampolines above call through these pointers. */
 void init_convert_utf16(void)
 {
 	const bool use_avx2 = zend_cpu_supports_avx2();
 	utf16be_to_wchar_ptr = use_avx2 ? mb_utf16be_to_wchar_avx2 : mb_utf16be_to_wchar_default;
 	wchar_to_utf16be_ptr = use_avx2 ? mb_wchar_to_utf16be_avx2 : mb_wchar_to_utf16be_default;
 	utf16le_to_wchar_ptr = use_avx2 ? mb_utf16le_to_wchar_avx2 : mb_utf16le_to_wchar_default;
 	wchar_to_utf16le_ptr = use_avx2 ? mb_wchar_to_utf16le_avx2 : mb_wchar_to_utf16le_default;
 }
 # endif
 #else
 /* No AVX2 support */
 # define mb_utf16be_to_wchar mb_utf16be_to_wchar_default
 # define mb_utf16le_to_wchar mb_utf16le_to_wchar_default
 # define mb_wchar_to_utf16be mb_wchar_to_utf16be_default
 # define mb_wchar_to_utf16le mb_wchar_to_utf16le_default
 static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 #endif
 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
 static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
@ -366,7 +505,7 @@ static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu
 	return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
 }
-static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
 	unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -419,7 +558,7 @@ static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
 	return out - buf;
 }
-static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	unsigned char *out, *limit;
 	MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -436,7 +575,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 			out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
 		} else {
-			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be);
+			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		}
 	}
@ -444,7 +583,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
-static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
 	unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -497,7 +636,7 @@ static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
 	return out - buf;
 }
-static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	unsigned char *out, *limit;
 	MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -514,10 +653,387 @@ static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, b
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 			out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
 		} else {
-			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le);
+			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		}
 	}
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
 /* AVX2-accelerated UTF-16BE -> wchar decoder.
  *
  * in, in_len: pointer to / number of remaining input bytes; on the vectorized
  *              path both are updated to describe the unconsumed remainder
  * buf, bufsize: output array of wchars and its capacity (in wchars)
  * state:       not used by this function
  * Returns the number of wchars written to `buf`.
  *
  * Input is consumed in 32-byte (16 code unit) blocks while at least 32 bytes of
  * input and 16 wchars of output space remain. Anything which does not fill a
  * block, and any call which starts out below those thresholds, is handed to
  * mb_utf16be_to_wchar_default. */
 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
 size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 #else
 static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 #endif
 {
 	size_t len = *in_len;
 	if (len >= 32 && bufsize >= 16) {
 		unsigned char *p = *in;
 		uint32_t *out = buf;
 		/* Used to determine if a block of input bytes contains any surrogates */
 		const __m256i _f8 = _mm256_set1_epi16(0xF8);
 		const __m256i _d8 = _mm256_set1_epi16(0xD8);
 		/* wchars must be in host byte order, which is little-endian on x86;
 		 * Since we are reading in (big-endian) UTF-16BE, use this vector to swap byte order for output */
 		const __m256i swap_bytes = _mm256_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
 		do {
 			__m256i operand = _mm256_loadu_si256((__m256i*)p); /* Load 32 bytes */
 			/* Two mask bits per 16-bit code unit; both are set iff that unit is a surrogate */
 			uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
 			if (surrogate_bitvec == 0) {
 				/* There are no surrogates among these 16 characters
 				 * So converting the UTF-16 input to wchars is very simple; just extend each 16-bit value
 				 * to a 32-bit value, filling in zero bits in the high end */
 				operand = _mm256_shuffle_epi8(operand, swap_bytes);
 				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
 				_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
 				out += 16;
 				bufsize -= 16;
 				p += sizeof(__m256i);
 				len -= sizeof(__m256i);
 			} else if ((surrogate_bitvec & 1) == 0) {
 				/* Some prefix of the current block is non-surrogates; output those */
 				uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
 				operand = _mm256_shuffle_epi8(operand, swap_bytes);
 				/* We know that the output buffer has at least 64 bytes of space available
 				 * So don't bother trimming the output down to only include the non-surrogate prefix;
 				 * rather, write out an entire block of 64 (or 32) bytes, then bump our output pointer
 				 * forward just past the 'good part', so the 'bad part' will be overwritten on the next
 				 * iteration of this loop */
 				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
 				if (n_chars > 8) {
 					_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
 				}
 				out += n_chars;
 				bufsize -= n_chars;
 				p += n_chars * 2;
 				len -= n_chars * 2;
 			} else {
 				/* Some prefix of the current block is (valid or invalid) surrogates
 				 * Handle those using non-vectorized code */
 				surrogate_bitvec = ~surrogate_bitvec;
 				unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
 				do {
 					unsigned char c1 = *p++;
 					unsigned char c2 = *p++;
 					if (c1 & 0x4 || len < 4) {
 						/* 2nd part of surrogate pair has come first OR string ended abruptly
 						 * after 1st part of surrogate pair */
 						*out++ = MBFL_BAD_INPUT;
 						bufsize--;
 						n_chars--;
 						len -= 2;
 						continue;
 					}
 					uint16_t n = (c1 << 8) | c2;
 					unsigned char c3 = *p++;
 					unsigned char c4 = *p++;
 					if ((c3 & 0xFC) == 0xDC) {
 						/* Valid surrogate pair */
 						uint16_t n2 = (c3 << 8) | c4;
 						*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
 						bufsize--;
 						len -= 4;
 /* PHP_HAVE_BUILTIN_USUB_OVERFLOW is defined by configure to 0 or 1, so its
  * value (not just its presence) must be tested here */
 #if defined(PHP_HAVE_BUILTIN_USUB_OVERFLOW) && PHP_HAVE_BUILTIN_USUB_OVERFLOW
 						/* Subtracting 2 from `n_chars` will automatically set the CPU's flags;
 						 * branch directly off the appropriate flag (CF on x86) rather than using
 						 * another instruction (CMP on x86) to check for underflow */
 						if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
 							/* The last 2 bytes of this block and the first 2 bytes of the following
 							 * block form a valid surrogate pair; now just make sure we don't get
 							 * stuck in this loop due to underflow of the loop index */
 							break;
 						}
 #else
 						/* Portable equivalent: n_chars is at least 1 here, so the only
 						 * possible wrap-around is 1 - 2 == UINT_MAX */
 						n_chars -= 2;
 						if (n_chars == UINT_MAX) {
 							break;
 						}
 #endif
 					} else {
 						/* First half of surrogate pair was followed by another first half
 						 * OR by a non-surrogate character */
 						*out++ = MBFL_BAD_INPUT;
 						bufsize--;
 						n_chars--;
 						len -= 2;
 						p -= 2; /* Back up so the last 2 bytes will be processed again */
 					}
 				} while (n_chars);
 			}
 		} while (len >= 32 && bufsize >= 16);
 		if (len && bufsize >= 4) {
 			/* Finish up trailing bytes which don't fill a 32-byte block */
 			out += mb_utf16be_to_wchar_default(&p, &len, out, bufsize, NULL);
 		}
 		*in = p;
 		*in_len = len;
 		return out - buf;
 	} else if (len) {
 		return mb_utf16be_to_wchar_default(in, in_len, buf, bufsize, NULL);
 	} else {
 		return 0;
 	}
 }
 /* AVX2-accelerated wchar -> UTF-16BE encoder.
  *
  * in, len: wchars to encode and how many there are
  * buf:     conversion buffer which receives the encoded bytes
  * end:     only forwarded to mb_wchar_to_utf16be_default, and only when wchars
  *          remain after the vectorized loop
  *
  * wchars are processed eight at a time while at least eight remain; the
  * remainder (or the whole input, if it is shorter than eight wchars) is handed
  * to mb_wchar_to_utf16be_default. */
 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
 void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 #else
 static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 #endif
 {
 	if (len >= 8) {
 		unsigned char *out, *limit;
 		MB_CONVERT_BUF_LOAD(buf, out, limit);
 		/* NOTE(review): presumably reserves at least `len * 2` output bytes up front,
 		 * which is what lets the 16-byte vector stores below go unchecked -- confirm
 		 * against the macro definition */
 		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
 		const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
 		/* Used to extract 16 bits which we want from each of eight 32-bit values */
 		/* (the two low bytes of each wchar are picked in swapped order, i.e. high byte first) */
 		const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 13, 8, 9, 4, 5, 0, 1, 12, 13, 8, 9, 4, 5, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1);
 		do {
 			__m256i operand = _mm256_loadu_si256((__m256i*)in); /* Load 32 bytes */
 			/* Four mask bits per wchar; all four are set iff the wchar is <= 0xFFFF */
 			uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
 			if (bmp_bitvec == 0xFFFFFFFF) {
 				/* All eight wchars are in the BMP
 				 * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
 				 * (which is equivalent to an XMM register) */
 				operand = _mm256_shuffle_epi8(operand, pack_8x16);
 				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
 				operand = _mm256_alignr_epi8(operand2, operand, 8);
 				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); /* Store 16 bytes */
 				out += 16;
 				len -= 8;
 				in += 8;
 			} else if (bmp_bitvec & 1) {
 				/* Some prefix of this block are codepoints in the BMP */
 				/* n_bytes counts INPUT bytes in that prefix (4 per wchar) */
 				unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
 				operand = _mm256_shuffle_epi8(operand, pack_8x16);
 				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
 				operand = _mm256_alignr_epi8(operand2, operand, 8);
 				/* Store 16 bytes, but bump output pointer forward just past the 'good part',
 				 * so the 'bad part' will be overwritten on the next iteration of this loop */
 				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
 				out += n_bytes >> 1; /* 2 output bytes per wchar */
 				len -= n_bytes >> 2; /* 1 wchar per 4 input bytes */
 				in += n_bytes >> 2;
 			} else {
 				/* Some prefix of this block is codepoints outside the BMP OR error markers
 				 * Handle them using non-vectorized code */
 				unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
 				do {
 					uint32_t w = *in++;
 					n_words--;
 					len--;
 					if (w < MBFL_WCSPLANE_UTF32MAX) {
 						/* Split into a high/low surrogate pair, each written high byte first */
 						uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
 						uint16_t n2 = (w & 0x3FF) | 0xDC00;
 						MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 						out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
 					} else {
 						/* NOTE(review): the portable encoder is what gets passed to the error
 						 * macro here; presumably it is used to emit the replacement output --
 						 * confirm against MB_CONVERT_ERROR */
 						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
 						MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 					}
 				} while (n_words);
 			}
 		} while (len >= 8);
 		MB_CONVERT_BUF_STORE(buf, out, limit);
 	}
 	if (len) {
 		/* Fewer than eight wchars left (or given to begin with) */
 		mb_wchar_to_utf16be_default(in, len, buf, end);
 	}
 }
 /* AVX2-accelerated UTF-16LE -> wchar decoder.
  *
  * in, in_len: pointer to / number of remaining input bytes; on the vectorized
  *              path both are updated to describe the unconsumed remainder
  * buf, bufsize: output array of wchars and its capacity (in wchars)
  * state:       not used by this function
  * Returns the number of wchars written to `buf`.
  *
  * Input is consumed in 32-byte (16 code unit) blocks while at least 32 bytes of
  * input and 16 wchars of output space remain. Anything which does not fill a
  * block, and any call which starts out below those thresholds, is handed to
  * mb_utf16le_to_wchar_default. */
 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
 size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 #else
 static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 #endif
 {
 	/* Most of this function is the same as `mb_utf16be_to_wchar_avx2`, above;
 	 * See it for more detailed code comments */
 	size_t len = *in_len;
 	if (len >= 32 && bufsize >= 16) {
 		unsigned char *p = *in;
 		uint32_t *out = buf;
 		/* The surrogate-identifying bits live in the second byte of each little-endian code unit */
 		const __m256i _f8 = _mm256_set1_epi16(0xF800);
 		const __m256i _d8 = _mm256_set1_epi16(0xD800);
 		do {
 			__m256i operand = _mm256_loadu_si256((__m256i*)p);
 			/* Two mask bits per 16-bit code unit; both are set iff that unit is a surrogate */
 			uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
 			if (surrogate_bitvec == 0) {
 				/* There are no surrogates among these 16 characters */
 				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
 				_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
 				out += 16;
 				bufsize -= 16;
 				p += sizeof(__m256i);
 				len -= sizeof(__m256i);
 			} else if ((surrogate_bitvec & 1) == 0) {
 				/* Some prefix of the current block is non-surrogates */
 				uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
 				/* Write whole vectors, then advance only past the valid prefix; the rest
 				 * gets overwritten on the next iteration */
 				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
 				if (n_chars > 8) {
 					_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
 				}
 				out += n_chars;
 				bufsize -= n_chars;
 				p += n_chars * 2;
 				len -= n_chars * 2;
 			} else {
 				/* Some prefix of the current block is (valid or invalid) surrogates */
 				surrogate_bitvec = ~surrogate_bitvec;
 				unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
 				do {
 					unsigned char c1 = *p++;
 					unsigned char c2 = *p++;
 					if (c2 & 0x4 || len < 4) {
 						/* 2nd part of surrogate pair has come first OR string ended abruptly
 						 * after 1st part of surrogate pair */
 						*out++ = MBFL_BAD_INPUT;
 						bufsize--;
 						n_chars--;
 						len -= 2;
 						continue;
 					}
 					uint16_t n = (c2 << 8) | c1;
 					unsigned char c3 = *p++;
 					unsigned char c4 = *p++;
 					if ((c4 & 0xFC) == 0xDC) {
 						/* Valid surrogate pair */
 						uint16_t n2 = (c4 << 8) | c3;
 						*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
 						bufsize--;
 						len -= 4;
 /* PHP_HAVE_BUILTIN_USUB_OVERFLOW is defined by configure to 0 or 1, so its
  * value (not just its presence) must be tested here */
 #if defined(PHP_HAVE_BUILTIN_USUB_OVERFLOW) && PHP_HAVE_BUILTIN_USUB_OVERFLOW
 						/* Underflow means the pair straddled the end of this block; leave the
 						 * loop rather than spinning on a wrapped-around counter */
 						if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
 							break;
 						}
 #else
 						/* Portable equivalent: n_chars is at least 1 here, so the only
 						 * possible wrap-around is 1 - 2 == UINT_MAX */
 						n_chars -= 2;
 						if (n_chars == UINT_MAX) {
 							break;
 						}
 #endif
 					} else {
 						/* First half of surrogate pair was followed by another first half
 						 * OR by a non-surrogate character */
 						*out++ = MBFL_BAD_INPUT;
 						bufsize--;
 						n_chars--;
 						len -= 2;
 						p -= 2; /* Back up so the last 2 bytes will be processed again */
 					}
 				} while (n_chars);
 			}
 		} while (len >= 32 && bufsize >= 16);
 		if (len && bufsize >= 4) {
 			/* Finish up trailing bytes which don't fill a 32-byte block */
 			out += mb_utf16le_to_wchar_default(&p, &len, out, bufsize, NULL);
 		}
 		*in = p;
 		*in_len = len;
 		return out - buf;
 	} else if (len) {
 		return mb_utf16le_to_wchar_default(in, in_len, buf, bufsize, NULL);
 	} else {
 		return 0;
 	}
 }
 /* AVX2-accelerated wchar -> UTF-16LE encoder.
  *
  * in, len: wchars to encode and how many there are
  * buf:     conversion buffer which receives the encoded bytes
  * end:     only forwarded to mb_wchar_to_utf16le_default, and only when wchars
  *          remain after the vectorized loop
  *
  * wchars are processed eight at a time while at least eight remain; the
  * remainder (or the whole input, if it is shorter than eight wchars) is handed
  * to mb_wchar_to_utf16le_default. */
 #ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
 void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 #else
 static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 #endif
 {
 	if (len >= 8) {
 		unsigned char *out, *limit;
 		MB_CONVERT_BUF_LOAD(buf, out, limit);
 		/* NOTE(review): presumably reserves at least `len * 2` output bytes up front,
 		 * which is what lets the 16-byte vector stores below go unchecked -- confirm
 		 * against the macro definition */
 		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
 		const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
 		/* Used to extract 16 bits which we want from each of eight 32-bit values */
 		/* (the two low bytes of each wchar are kept in their original order, i.e. low byte first) */
 		const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 13, 12, 9, 8, 5, 4, 1, 0, 13, 12, 9, 8, 5, 4, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1);
 		do {
 			__m256i operand = _mm256_loadu_si256((__m256i*)in);
 			/* Four mask bits per wchar; all four are set iff the wchar is <= 0xFFFF */
 			uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
 			if (bmp_bitvec == 0xFFFFFFFF) {
 				/* All eight wchars are in the BMP
 				 * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
 				 * (which is equivalent to an XMM register) */
 				operand = _mm256_shuffle_epi8(operand, pack_8x16);
 				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
 				operand = _mm256_alignr_epi8(operand2, operand, 8);
 				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
 				out += 16;
 				len -= 8;
 				in += 8;
 			} else if (bmp_bitvec & 1) {
 				/* Some prefix of this block are codepoints in the BMP */
 				/* n_bytes counts INPUT bytes in that prefix (4 per wchar) */
 				unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
 				operand = _mm256_shuffle_epi8(operand, pack_8x16);
 				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
 				operand = _mm256_alignr_epi8(operand2, operand, 8);
 				/* A full 16 bytes are stored, but `out` only advances past the BMP prefix;
 				 * the rest is overwritten on the next iteration */
 				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
 				out += n_bytes >> 1; /* 2 output bytes per wchar */
 				len -= n_bytes >> 2; /* 1 wchar per 4 input bytes */
 				in += n_bytes >> 2;
 			} else {
 				/* Some prefix of this block is codepoints outside the BMP OR error markers */
 				unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
 				do {
 					uint32_t w = *in++;
 					n_words--;
 					len--;
 					if (w < MBFL_WCSPLANE_UTF32MAX) {
 						/* Split into a high/low surrogate pair, each written low byte first */
 						uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
 						uint16_t n2 = (w & 0x3FF) | 0xDC00;
 						MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 						out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
 					} else {
 						/* NOTE(review): the portable encoder is what gets passed to the error
 						 * macro here; presumably it is used to emit the replacement output --
 						 * confirm against MB_CONVERT_ERROR */
 						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
 						MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 					}
 				} while (n_words);
 			}
 		} while (len >= 8);
 		MB_CONVERT_BUF_STORE(buf, out, limit);
 	}
 	if (len) {
 		/* Fewer than eight wchars left (or given to begin with) */
 		mb_wchar_to_utf16le_default(in, len, buf, end);
 	}
 }
 #endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
@ -47,4 +47,8 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter);
 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
 void init_convert_utf16(void);
 #endif
 #endif /* MBFL_MBFILTER_UTF16_H */
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@ -1080,6 +1080,7 @@ ZEND_TSRMLS_CACHE_UPDATE();
 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
 	init_check_utf8();
 	init_convert_utf16();
 #endif
 	return SUCCESS;
--- a/ext/mbstring/tests/utf_encodings.phpt
+++ b/ext/mbstring/tests/utf_encodings.phpt
@ -895,6 +895,17 @@ testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
 convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
 convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');
 // Regression tests for bugs with initial AVX2-accelerated implementation
 convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
 convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');
 // This string caused an out-of-bounds read; it was found by a fuzzer
 $str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
 convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');
 $str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
 convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');
 echo "== UTF-32 ==\n";
 testValidCodepoints("UTF-32LE");