Add AVX2-accelerated UTF-16 decoding/encoding routines

As with other SIMD-accelerated functions in php-src, the new UTF-16 encoding and decoding routines can be compiled either with AVX2 acceleration "always on", "always off", or else with runtime detection of AVX2 support. With the new UTF-16 decoder/encoder, conversion of extremely short strings (as in several bytes) has the same performance as before, and conversion of medium-length (~100 character) strings is about 65% faster, but conversion of long (~10,000 character) strings is around 6 times faster. Many other mbstring functions will also be faster now when handling UTF-16; for example, mb_strlen is almost 3 times faster on medium strings, and almost 9 times faster on long strings. (Why does mb_strlen benefit more from AVX2 acceleration than mb_convert_encoding? It's because mb_strlen only needs to decode, but not re-encode, the input string, and the UTF-16 decoder benefits much more from SIMD acceleration than the UTF-16 encoder.)
2025-08-15 13:38:49 +02:00 · 2023-01-25 22:28:10 +02:00 · 2023-01-25 22:28:10 +02:00 · c8ec2ed730
commit c8ec2ed730
parent d5d9900661
7 changed files with 572 additions and 10 deletions
--- a/Zend/zend_bitset.h
+++ b/Zend/zend_bitset.h
@ -19,6 +19,13 @@
 #ifndef _ZEND_BITSET_H_
 #define _ZEND_BITSET_H_

+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "zend_portability.h"
+#include "zend_long.h"
+
 typedef zend_ulong *zend_bitset;

 #define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong)
--- a/build/php.m4
+++ b/build/php.m4
@ -2643,6 +2643,27 @@ AC_DEFUN([PHP_CHECK_BUILTIN_SADDLL_OVERFLOW], [
   [$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow])
 ])

+dnl
+dnl PHP_CHECK_BUILTIN_USUB_OVERFLOW
+dnl
+AC_DEFUN([PHP_CHECK_BUILTIN_USUB_OVERFLOW], [
+  AC_MSG_CHECKING([for __builtin_usub_overflow])
+
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[
+    unsigned int tmpvar;
+    return __builtin_usub_overflow(3, 7, &tmpvar);
+  ]])], [
+    have_builtin_usub_overflow=1
+    AC_MSG_RESULT([yes])
+  ], [
+    have_builtin_usub_overflow=0
+    AC_MSG_RESULT([no])
+  ])
+
+  AC_DEFINE_UNQUOTED([PHP_HAVE_BUILTIN_USUB_OVERFLOW],
+   [$have_builtin_usub_overflow], [Whether the compiler supports __builtin_usub_overflow])
+])
+
 dnl
 dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
 dnl
--- a/configure.ac
+++ b/configure.ac
@ -504,6 +504,8 @@ dnl Check __builtin_saddl_overflow
 PHP_CHECK_BUILTIN_SADDL_OVERFLOW
 dnl Check __builtin_saddll_overflow
 PHP_CHECK_BUILTIN_SADDLL_OVERFLOW
+dnl Check __builtin_usub_overflow
+PHP_CHECK_BUILTIN_USUB_OVERFLOW
 dnl Check __builtin_ssubl_overflow
 PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
 dnl Check __builtin_ssubll_overflow
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@ -27,15 +27,154 @@
 *
 */

+#include "zend_bitset.h"
 #include "mbfilter.h"
 #include "mbfilter_utf16.h"

+#ifdef ZEND_INTRIN_AVX2_NATIVE
+
+/* We are building AVX2-only binary */
+# include <immintrin.h>
+# define mb_utf16be_to_wchar mb_utf16be_to_wchar_avx2
+# define mb_utf16le_to_wchar mb_utf16le_to_wchar_avx2
+# define mb_wchar_to_utf16be mb_wchar_to_utf16be_avx2
+# define mb_wchar_to_utf16le mb_wchar_to_utf16le_avx2
+
+static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+
+#elif defined(ZEND_INTRIN_AVX2_RESOLVER)
+
+/* We are building binary which works with or without AVX2; whether or not to use
+ * AVX2-accelerated functions will be determined at runtime */
+# include <immintrin.h>
+# include "Zend/zend_cpuinfo.h"
+
+static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+
+# ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
+/* Dynamic linker will decide whether or not to use AVX2-based functions and
+ * resolve symbols accordingly */
+
+ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
+ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
+ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
+ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
+
+size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16be_wchar")));
+void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16be")));
+size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16le_wchar")));
+void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16le")));
+
+ZEND_NO_SANITIZE_ADDRESS
+ZEND_ATTRIBUTE_UNUSED
+static mb_to_wchar_fn resolve_utf16be_wchar(void)
+{
+	return zend_cpu_supports_avx2() ? mb_utf16be_to_wchar_avx2 : mb_utf16be_to_wchar_default;
+}
+
+ZEND_NO_SANITIZE_ADDRESS
+ZEND_ATTRIBUTE_UNUSED
+static mb_from_wchar_fn resolve_wchar_utf16be(void)
+{
+	return zend_cpu_supports_avx2() ? mb_wchar_to_utf16be_avx2 : mb_wchar_to_utf16be_default;
+}
+
+ZEND_NO_SANITIZE_ADDRESS
+ZEND_ATTRIBUTE_UNUSED
+static mb_to_wchar_fn resolve_utf16le_wchar(void)
+{
+	return zend_cpu_supports_avx2() ? mb_utf16le_to_wchar_avx2 : mb_utf16le_to_wchar_default;
+}
+
+ZEND_NO_SANITIZE_ADDRESS
+ZEND_ATTRIBUTE_UNUSED
+static mb_from_wchar_fn resolve_wchar_utf16le(void)
+{
+	return zend_cpu_supports_avx2() ? mb_wchar_to_utf16le_avx2 : mb_wchar_to_utf16le_default;
+}
+
+# else /* ZEND_INTRIN_AVX2_FUNC_PTR */
+/* We are compiling for a target where the dynamic linker will not be able to
+ * resolve symbols according to whether the host supports AVX2 or not; so instead,
+ * we can make calls go through a function pointer and set the function pointer
+ * on module load */
+
+#ifdef HAVE_FUNC_ATTRIBUTE_TARGET
+static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
+static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
+static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
+static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
+#else
+static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+#endif
+
+static mb_to_wchar_fn utf16be_to_wchar_ptr = NULL;
+static mb_from_wchar_fn wchar_to_utf16be_ptr = NULL;
+static mb_to_wchar_fn utf16le_to_wchar_ptr = NULL;
+static mb_from_wchar_fn wchar_to_utf16le_ptr = NULL;
+
+static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+{
+	return utf16be_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
+}
+
+static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+{
+	wchar_to_utf16be_ptr(in, len, buf, end);
+}
+
+static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+{
+	return utf16le_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
+}
+
+static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+{
+	wchar_to_utf16le_ptr(in, len, buf, end);
+}
+
+void init_convert_utf16(void)
+{
+	if (zend_cpu_supports_avx2()) {
+		utf16be_to_wchar_ptr = mb_utf16be_to_wchar_avx2;
+		wchar_to_utf16be_ptr = mb_wchar_to_utf16be_avx2;
+		utf16le_to_wchar_ptr = mb_utf16le_to_wchar_avx2;
+		wchar_to_utf16le_ptr = mb_wchar_to_utf16le_avx2;
+	} else {
+		utf16be_to_wchar_ptr = mb_utf16be_to_wchar_default;
+		wchar_to_utf16be_ptr = mb_wchar_to_utf16be_default;
+		utf16le_to_wchar_ptr = mb_utf16le_to_wchar_default;
+		wchar_to_utf16le_ptr = mb_wchar_to_utf16le_default;
+	}
+}
+# endif
+
+#else
+
+/* No AVX2 support */
+# define mb_utf16be_to_wchar mb_utf16be_to_wchar_default
+# define mb_utf16le_to_wchar mb_utf16le_to_wchar_default
+# define mb_wchar_to_utf16be mb_wchar_to_utf16be_default
+# define mb_wchar_to_utf16le mb_wchar_to_utf16le_default
+
+static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
+static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+
+#endif
+
 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
-static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
-static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
-static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
-static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};

@ -366,7 +505,7 @@ static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu
 	return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
 }

-static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
 	unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -419,7 +558,7 @@ static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
 	return out - buf;
 }

-static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	unsigned char *out, *limit;
 	MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -436,7 +575,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 			out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
 		} else {
-			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be);
+			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		}
 	}
@ -444,7 +583,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }

-static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
 	unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -497,7 +636,7 @@ static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
 	return out - buf;
 }

-static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
 {
 	unsigned char *out, *limit;
 	MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -514,10 +653,387 @@ static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, b
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
 			out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
 		} else {
-			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le);
+			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
 		}
 	}

 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
+
+#if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
+
+#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
+size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+#else
+static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+#endif
+{
+	size_t len = *in_len;
+
+	if (len >= 32 && bufsize >= 16) {
+		unsigned char *p = *in;
+		uint32_t *out = buf;
+
+		/* Used to determine if a block of input bytes contains any surrogates */
+		const __m256i _f8 = _mm256_set1_epi16(0xF8);
+		const __m256i _d8 = _mm256_set1_epi16(0xD8);
+		/* wchars must be in host byte order, which is little-endian on x86;
+		 * Since we are reading in (big-endian) UTF-16BE, use this vector to swap byte order for output */
+		const __m256i swap_bytes = _mm256_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+
+		do {
+			__m256i operand = _mm256_loadu_si256((__m256i*)p); /* Load 32 bytes */
+
+			uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
+			if (surrogate_bitvec == 0) {
+				/* There are no surrogates among these 16 characters
+				 * So converting the UTF-16 input to wchars is very simple; just extend each 16-bit value
+				 * to a 32-bit value, filling in zero bits in the high end */
+				operand = _mm256_shuffle_epi8(operand, swap_bytes);
+				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
+				_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
+				out += 16;
+				bufsize -= 16;
+				p += sizeof(__m256i);
+				len -= sizeof(__m256i);
+			} else if ((surrogate_bitvec & 1) == 0) {
+				/* Some prefix of the current block is non-surrogates; output those */
+				uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
+				operand = _mm256_shuffle_epi8(operand, swap_bytes);
+				/* We know that the output buffer has at least 64 bytes of space available
+				 * So don't bother trimming the output down to only include the non-surrogate prefix;
+				 * rather, write out an entire block of 64 (or 32) bytes, then bump our output pointer
+				 * forward just past the 'good part', so the 'bad part' will be overwritten on the next
+				 * iteration of this loop */
+				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
+				if (n_chars > 8) {
+					_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
+				}
+				out += n_chars;
+				bufsize -= n_chars;
+				p += n_chars * 2;
+				len -= n_chars * 2;
+			} else {
+				/* Some prefix of the current block is (valid or invalid) surrogates
+				 * Handle those using non-vectorized code */
+				surrogate_bitvec = ~surrogate_bitvec;
+				unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
+				do {
+					unsigned char c1 = *p++;
+					unsigned char c2 = *p++;
+
+					if (c1 & 0x4 || len < 4) {
+						/* 2nd part of surrogate pair has come first OR string ended abruptly
+						 * after 1st part of surrogate pair */
+						*out++ = MBFL_BAD_INPUT;
+						bufsize--;
+						n_chars--;
+						len -= 2;
+						continue;
+					}
+
+					uint16_t n = (c1 << 8) | c2;
+					unsigned char c3 = *p++;
+					unsigned char c4 = *p++;
+
+					if ((c3 & 0xFC) == 0xDC) {
+						/* Valid surrogate pair */
+						uint16_t n2 = (c3 << 8) | c4;
+						*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
+						bufsize--;
+						len -= 4;
+#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
+						/* Subtracting 2 from `n_chars` will automatically set the CPU's flags;
+						 * branch directly off the appropriate flag (CF on x86) rather than using
+						 * another instruction (CMP on x86) to check for underflow */
+						if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
+							/* The last 2 bytes of this block and the first 2 bytes of the following
+							 * block form a valid surrogate pair; now just make sure we don't get
+							 * stuck in this loop due to underflow of the loop index */
+							break;
+						}
+#else
+						n_chars -= 2;
+						if (n_chars == UINT_MAX) {
+							break;
+						}
+#endif
+					} else {
+						/* First half of surrogate pair was followed by another first half
+						 * OR by a non-surrogate character */
+						*out++ = MBFL_BAD_INPUT;
+						bufsize--;
+						n_chars--;
+						len -= 2;
+						p -= 2; /* Back up so the last 2 bytes will be processed again */
+					}
+				} while (n_chars);
+			}
+		} while (len >= 32 && bufsize >= 16);
+
+		if (len && bufsize >= 4) {
+			/* Finish up trailing bytes which don't fill a 32-byte block */
+			out += mb_utf16be_to_wchar_default(&p, &len, out, bufsize, NULL);
+		}
+
+		*in = p;
+		*in_len = len;
+		return out - buf;
+	} else if (len) {
+		return mb_utf16be_to_wchar_default(in, in_len, buf, bufsize, NULL);
+	} else {
+		return 0;
+	}
+}
+
+#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
+void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+#else
+static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+#endif
+{
+	if (len >= 8) {
+		unsigned char *out, *limit;
+		MB_CONVERT_BUF_LOAD(buf, out, limit);
+		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
+
+		/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
+		const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
+		/* Used to extract 16 bits which we want from each of eight 32-bit values */
+		const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 13, 8, 9, 4, 5, 0, 1, 12, 13, 8, 9, 4, 5, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1);
+
+		do {
+			__m256i operand = _mm256_loadu_si256((__m256i*)in); /* Load 32 bytes */
+
+			uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
+			if (bmp_bitvec == 0xFFFFFFFF) {
+				/* All eight wchars are in the BMP
+				 * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
+				 * (which is equivalent to an XMM register) */
+				operand = _mm256_shuffle_epi8(operand, pack_8x16);
+				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
+				operand = _mm256_alignr_epi8(operand2, operand, 8);
+				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); /* Store 16 bytes */
+				out += 16;
+				len -= 8;
+				in += 8;
+			} else if (bmp_bitvec & 1) {
+				/* Some prefix of this block are codepoints in the BMP */
+				unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
+				operand = _mm256_shuffle_epi8(operand, pack_8x16);
+				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
+				operand = _mm256_alignr_epi8(operand2, operand, 8);
+				/* Store 16 bytes, but bump output pointer forward just past the 'good part',
+				 * so the 'bad part' will be overwritten on the next iteration of this loop */
+				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
+				out += n_bytes >> 1;
+				len -= n_bytes >> 2;
+				in += n_bytes >> 2;
+			} else {
+				/* Some prefix of this block is codepoints outside the BMP OR error markers
+				 * Handle them using non-vectorized code */
+				unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
+				do {
+					uint32_t w = *in++;
+					n_words--;
+					len--;
+
+					if (w < MBFL_WCSPLANE_UTF32MAX) {
+						uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
+						uint16_t n2 = (w & 0x3FF) | 0xDC00;
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
+						out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
+					} else {
+						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
+					}
+				} while (n_words);
+			}
+		} while (len >= 8);
+
+		MB_CONVERT_BUF_STORE(buf, out, limit);
+	}
+
+	if (len) {
+		mb_wchar_to_utf16be_default(in, len, buf, end);
+	}
+}
+
+#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
+size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+#else
+static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
+#endif
+{
+	/* Most of this function is the same as `mb_utf16be_to_wchar_avx2`, above;
+	 * See it for more detailed code comments */
+
+	size_t len = *in_len;
+
+	if (len >= 32 && bufsize >= 16) {
+		unsigned char *p = *in;
+		uint32_t *out = buf;
+
+		const __m256i _f8 = _mm256_set1_epi16(0xF800);
+		const __m256i _d8 = _mm256_set1_epi16(0xD800);
+
+		do {
+			__m256i operand = _mm256_loadu_si256((__m256i*)p);
+
+			uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
+			if (surrogate_bitvec == 0) {
+				/* There are no surrogates among these 16 characters */
+				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
+				_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
+				out += 16;
+				bufsize -= 16;
+				p += sizeof(__m256i);
+				len -= sizeof(__m256i);
+			} else if ((surrogate_bitvec & 1) == 0) {
+				/* Some prefix of the current block is non-surrogates */
+				uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
+				_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
+				if (n_chars > 8) {
+					_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
+				}
+				out += n_chars;
+				bufsize -= n_chars;
+				p += n_chars * 2;
+				len -= n_chars * 2;
+			} else {
+				/* Some prefix of the current block is (valid or invalid) surrogates */
+				surrogate_bitvec = ~surrogate_bitvec;
+				unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
+				do {
+					unsigned char c1 = *p++;
+					unsigned char c2 = *p++;
+
+					if (c2 & 0x4 || len < 4) {
+						/* 2nd part of surrogate pair has come first OR string ended abruptly
+						 * after 1st part of surrogate pair */
+						*out++ = MBFL_BAD_INPUT;
+						bufsize--;
+						n_chars--;
+						len -= 2;
+						continue;
+					}
+
+					uint16_t n = (c2 << 8) | c1;
+					unsigned char c3 = *p++;
+					unsigned char c4 = *p++;
+
+					if ((c4 & 0xFC) == 0xDC) {
+						/* Valid surrogate pair */
+						uint16_t n2 = (c4 << 8) | c3;
+						*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
+						bufsize--;
+						len -= 4;
+#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
+						if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
+							break;
+						}
+#else
+						n_chars -= 2;
+						if (n_chars == UINT_MAX) {
+							break;
+						}
+#endif
+					} else {
+						/* First half of surrogate pair was followed by another first half
+						 * OR by a non-surrogate character */
+						*out++ = MBFL_BAD_INPUT;
+						bufsize--;
+						n_chars--;
+						len -= 2;
+						p -= 2; /* Back up so the last 2 bytes will be processed again */
+					}
+				} while (n_chars);
+			}
+		} while (len >= 32 && bufsize >= 16);
+
+		if (len && bufsize >= 4) {
+			out += mb_utf16le_to_wchar_default(&p, &len, out, bufsize, NULL);
+		}
+
+		*in = p;
+		*in_len = len;
+		return out - buf;
+	} else if (len) {
+		return mb_utf16le_to_wchar_default(in, in_len, buf, bufsize, NULL);
+	} else {
+		return 0;
+	}
+}
+
+#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
+void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+#else
+static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
+#endif
+{
+	if (len >= 8) {
+		unsigned char *out, *limit;
+		MB_CONVERT_BUF_LOAD(buf, out, limit);
+		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
+
+		/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
+		const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
+		/* Used to extract 16 bits which we want from each of eight 32-bit values */
+		const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 13, 12, 9, 8, 5, 4, 1, 0, 13, 12, 9, 8, 5, 4, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+
+		do {
+			__m256i operand = _mm256_loadu_si256((__m256i*)in);
+
+			uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
+			if (bmp_bitvec == 0xFFFFFFFF) {
+				/* All eight wchars are in the BMP
+				 * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
+				 * (which is equivalent to an XMM register) */
+				operand = _mm256_shuffle_epi8(operand, pack_8x16);
+				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
+				operand = _mm256_alignr_epi8(operand2, operand, 8);
+				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
+				out += 16;
+				len -= 8;
+				in += 8;
+			} else if (bmp_bitvec & 1) {
+				/* Some prefix of this block are codepoints in the BMP */
+				unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
+				operand = _mm256_shuffle_epi8(operand, pack_8x16);
+				__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
+				operand = _mm256_alignr_epi8(operand2, operand, 8);
+				_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
+				out += n_bytes >> 1;
+				len -= n_bytes >> 2;
+				in += n_bytes >> 2;
+			} else {
+				/* Some prefix of this block is codepoints outside the BMP OR error markers */
+				unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
+				do {
+					uint32_t w = *in++;
+					n_words--;
+					len--;
+
+					if (w < MBFL_WCSPLANE_UTF32MAX) {
+						uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
+						uint16_t n2 = (w & 0x3FF) | 0xDC00;
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
+						out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
+					} else {
+						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
+						MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
+					}
+				} while (n_words);
+			}
+		} while (len >= 8);
+
+		MB_CONVERT_BUF_STORE(buf, out, limit);
+	}
+
+	if (len) {
+		mb_wchar_to_utf16le_default(in, len, buf, end);
+	}
+}
+
+#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h
@ -47,4 +47,8 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter);
 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter);

+#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
+void init_convert_utf16(void);
+#endif
+
 #endif /* MBFL_MBFILTER_UTF16_H */
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@ -1080,6 +1080,7 @@ ZEND_TSRMLS_CACHE_UPDATE();

 #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
 	init_check_utf8();
+	init_convert_utf16();
 #endif

 	return SUCCESS;
--- a/ext/mbstring/tests/utf_encodings.phpt
+++ b/ext/mbstring/tests/utf_encodings.phpt
@ -895,6 +895,17 @@ testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
 convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
 convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');

+// Regression tests for bugs with initial AVX2-accelerated implementation
+convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
+convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');
+
+// This string caused an out-of-bounds read; it was found by a fuzzer
+$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
+convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');
+
+$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
+convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');
+
 echo "== UTF-32 ==\n";

 testValidCodepoints("UTF-32LE");