From c8ec2ed730334c266ddf01a54d501f3777704219 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Wed, 25 Jan 2023 22:28:10 +0200 Subject: [PATCH] Add AVX2-accelerated UTF-16 decoding/encoding routines As with other SIMD-accelerated functions in php-src, the new UTF-16 encoding and decoding routines can be compiled either with AVX2 acceleration "always on", "always off", or else with runtime detection of AVX2 support. With the new UTF-16 decoder/encoder, conversion of extremely short strings (as in several bytes) has the same performance as before, and conversion of medium-length (~100 character) strings is about 65% faster, but conversion of long (~10,000 character) strings is around 6 times faster. Many other mbstring functions will also be faster now when handling UTF-16; for example, mb_strlen is almost 3 times faster on medium strings, and almost 9 times faster on long strings. (Why does mb_strlen benefit more from AVX2 acceleration than mb_convert_encoding? It's because mb_strlen only needs to decode, but not re-encode, the input string, and the UTF-16 decoder benefits much more from SIMD acceleration than the UTF-16 encoder.) --- Zend/zend_bitset.h | 7 + build/php.m4 | 21 + configure.ac | 2 + ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 536 +++++++++++++++++- ext/mbstring/libmbfl/filters/mbfilter_utf16.h | 4 + ext/mbstring/mbstring.c | 1 + ext/mbstring/tests/utf_encodings.phpt | 11 + 7 files changed, 572 insertions(+), 10 deletions(-) diff --git a/Zend/zend_bitset.h b/Zend/zend_bitset.h index fdb6ab79a1e..262fab24a5e 100644 --- a/Zend/zend_bitset.h +++ b/Zend/zend_bitset.h @@ -19,6 +19,13 @@ #ifndef _ZEND_BITSET_H_ #define _ZEND_BITSET_H_ +#include +#include +#include + +#include "zend_portability.h" +#include "zend_long.h" + typedef zend_ulong *zend_bitset; #define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong) diff --git a/build/php.m4 b/build/php.m4 index 698a9195e57..9287d2fc60c 100644 --- a/build/php.m4 +++ b/build/php.m4 @@ -2643,6 +2643,27 @@ AC_DEFUN([PHP_CHECK_BUILTIN_SADDLL_OVERFLOW], [ [$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow]) ]) +dnl +dnl PHP_CHECK_BUILTIN_USUB_OVERFLOW +dnl +AC_DEFUN([PHP_CHECK_BUILTIN_USUB_OVERFLOW], [ + AC_MSG_CHECKING([for __builtin_usub_overflow]) + + AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[ + unsigned int tmpvar; + return __builtin_usub_overflow(3, 7, &tmpvar); + ]])], [ + have_builtin_usub_overflow=1 + AC_MSG_RESULT([yes]) + ], [ + have_builtin_usub_overflow=0 + AC_MSG_RESULT([no]) + ]) + + AC_DEFINE_UNQUOTED([PHP_HAVE_BUILTIN_USUB_OVERFLOW], + [$have_builtin_usub_overflow], [Whether the compiler supports __builtin_usub_overflow]) +]) + dnl dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW dnl diff --git a/configure.ac b/configure.ac index b0697e3208d..cd77a89ff20 100644 --- a/configure.ac +++ b/configure.ac @@ -504,6 +504,8 @@ dnl Check __builtin_saddl_overflow PHP_CHECK_BUILTIN_SADDL_OVERFLOW dnl Check __builtin_saddll_overflow PHP_CHECK_BUILTIN_SADDLL_OVERFLOW +dnl Check __builtin_usub_overflow +PHP_CHECK_BUILTIN_USUB_OVERFLOW dnl Check __builtin_ssubl_overflow PHP_CHECK_BUILTIN_SSUBL_OVERFLOW dnl Check __builtin_ssubll_overflow diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index eddd56f3627..9e0c98370b9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -27,15 +27,154 @@ * */ +#include "zend_bitset.h" #include "mbfilter.h" #include "mbfilter_utf16.h" +#ifdef ZEND_INTRIN_AVX2_NATIVE + +/* We are building AVX2-only binary */ +# include +# define mb_utf16be_to_wchar mb_utf16be_to_wchar_avx2 +# define mb_utf16le_to_wchar mb_utf16le_to_wchar_avx2 +# define mb_wchar_to_utf16be mb_wchar_to_utf16be_avx2 +# define mb_wchar_to_utf16le mb_wchar_to_utf16le_avx2 + +static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + +#elif defined(ZEND_INTRIN_AVX2_RESOLVER) + +/* We are building binary which works with or without AVX2; whether or not to use + * AVX2-accelerated functions will be determined at runtime */ +# include +# include "Zend/zend_cpuinfo.h" + +static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + +# ifdef ZEND_INTRIN_AVX2_FUNC_PROTO +/* Dynamic linker will decide whether or not to use AVX2-based functions and + * resolve symbols accordingly */ + +ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)); +ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)); +ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)); +ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)); + +size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16be_wchar"))); +void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16be"))); +size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16le_wchar"))); +void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16le"))); + +ZEND_NO_SANITIZE_ADDRESS +ZEND_ATTRIBUTE_UNUSED +static mb_to_wchar_fn resolve_utf16be_wchar(void) +{ + return zend_cpu_supports_avx2() ? mb_utf16be_to_wchar_avx2 : mb_utf16be_to_wchar_default; +} + +ZEND_NO_SANITIZE_ADDRESS +ZEND_ATTRIBUTE_UNUSED +static mb_from_wchar_fn resolve_wchar_utf16be(void) +{ + return zend_cpu_supports_avx2() ? mb_wchar_to_utf16be_avx2 : mb_wchar_to_utf16be_default; +} + +ZEND_NO_SANITIZE_ADDRESS +ZEND_ATTRIBUTE_UNUSED +static mb_to_wchar_fn resolve_utf16le_wchar(void) +{ + return zend_cpu_supports_avx2() ? mb_utf16le_to_wchar_avx2 : mb_utf16le_to_wchar_default; +} + +ZEND_NO_SANITIZE_ADDRESS +ZEND_ATTRIBUTE_UNUSED +static mb_from_wchar_fn resolve_wchar_utf16le(void) +{ + return zend_cpu_supports_avx2() ? mb_wchar_to_utf16le_avx2 : mb_wchar_to_utf16le_default; +} + +# else /* ZEND_INTRIN_AVX2_FUNC_PTR */ +/* We are compiling for a target where the dynamic linker will not be able to + * resolve symbols according to whether the host supports AVX2 or not; so instead, + * we can make calls go through a function pointer and set the function pointer + * on module load */ + +#ifdef HAVE_FUNC_ATTRIBUTE_TARGET +static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2"))); +static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2"))); +static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2"))); +static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2"))); +#else +static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +#endif + +static mb_to_wchar_fn utf16be_to_wchar_ptr = NULL; +static mb_from_wchar_fn wchar_to_utf16be_ptr = NULL; +static mb_to_wchar_fn utf16le_to_wchar_ptr = NULL; +static mb_from_wchar_fn wchar_to_utf16le_ptr = NULL; + +static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return utf16be_to_wchar_ptr(in, in_len, buf, bufsize, NULL); +} + +static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + wchar_to_utf16be_ptr(in, len, buf, end); +} + +static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return utf16le_to_wchar_ptr(in, in_len, buf, bufsize, NULL); +} + +static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + wchar_to_utf16le_ptr(in, len, buf, end); +} + +void init_convert_utf16(void) +{ + if (zend_cpu_supports_avx2()) { + utf16be_to_wchar_ptr = mb_utf16be_to_wchar_avx2; + wchar_to_utf16be_ptr = mb_wchar_to_utf16be_avx2; + utf16le_to_wchar_ptr = mb_utf16le_to_wchar_avx2; + wchar_to_utf16le_ptr = mb_wchar_to_utf16le_avx2; + } else { + utf16be_to_wchar_ptr = mb_utf16be_to_wchar_default; + wchar_to_utf16be_ptr = mb_wchar_to_utf16be_default; + utf16le_to_wchar_ptr = mb_utf16le_to_wchar_default; + wchar_to_utf16le_ptr = mb_wchar_to_utf16le_default; + } +} +# endif + +#else + +/* No AVX2 support */ +# define mb_utf16be_to_wchar mb_utf16be_to_wchar_default +# define mb_utf16le_to_wchar mb_utf16le_to_wchar_default +# define mb_wchar_to_utf16be mb_wchar_to_utf16be_default +# define mb_wchar_to_utf16le mb_wchar_to_utf16le_default + +static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + +#endif + static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); -static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); -static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); -static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); -static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; @@ -366,7 +505,7 @@ static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL); } -static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */ unsigned char *p = *in, *e = p + (*in_len & ~1); @@ -419,7 +558,7 @@ static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t * return out - buf; } -static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) { unsigned char *out, *limit; MB_CONVERT_BUF_LOAD(buf, out, limit); @@ -436,7 +575,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF); } else { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be); + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default); MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); } } @@ -444,7 +583,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b MB_CONVERT_BUF_STORE(buf, out, limit); } -static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */ unsigned char *p = *in, *e = p + (*in_len & ~1); @@ -497,7 +636,7 @@ static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t * return out - buf; } -static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) { unsigned char *out, *limit; MB_CONVERT_BUF_LOAD(buf, out, limit); @@ -514,10 +653,387 @@ static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, b MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF); } else { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le); + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default); MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); } } MB_CONVERT_BUF_STORE(buf, out, limit); } + +#if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) + +#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO +size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +#else +static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +#endif +{ + size_t len = *in_len; + + if (len >= 32 && bufsize >= 16) { + unsigned char *p = *in; + uint32_t *out = buf; + + /* Used to determine if a block of input bytes contains any surrogates */ + const __m256i _f8 = _mm256_set1_epi16(0xF8); + const __m256i _d8 = _mm256_set1_epi16(0xD8); + /* wchars must be in host byte order, which is little-endian on x86; + * Since we are reading in (big-endian) UTF-16BE, use this vector to swap byte order for output */ + const __m256i swap_bytes = _mm256_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + + do { + __m256i operand = _mm256_loadu_si256((__m256i*)p); /* Load 32 bytes */ + + uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8)); + if (surrogate_bitvec == 0) { + /* There are no surrogates among these 16 characters + * So converting the UTF-16 input to wchars is very simple; just extend each 16-bit value + * to a 32-bit value, filling in zero bits in the high end */ + operand = _mm256_shuffle_epi8(operand, swap_bytes); + _mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand))); + _mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1))); + out += 16; + bufsize -= 16; + p += sizeof(__m256i); + len -= sizeof(__m256i); + } else if ((surrogate_bitvec & 1) == 0) { + /* Some prefix of the current block is non-surrogates; output those */ + uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1; + operand = _mm256_shuffle_epi8(operand, swap_bytes); + /* We know that the output buffer has at least 64 bytes of space available + * So don't bother trimming the output down to only include the non-surrogate prefix; + * rather, write out an entire block of 64 (or 32) bytes, then bump our output pointer + * forward just past the 'good part', so the 'bad part' will be overwritten on the next + * iteration of this loop */ + _mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand))); + if (n_chars > 8) { + _mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1))); + } + out += n_chars; + bufsize -= n_chars; + p += n_chars * 2; + len -= n_chars * 2; + } else { + /* Some prefix of the current block is (valid or invalid) surrogates + * Handle those using non-vectorized code */ + surrogate_bitvec = ~surrogate_bitvec; + unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16; + do { + unsigned char c1 = *p++; + unsigned char c2 = *p++; + + if (c1 & 0x4 || len < 4) { + /* 2nd part of surrogate pair has come first OR string ended abruptly + * after 1st part of surrogate pair */ + *out++ = MBFL_BAD_INPUT; + bufsize--; + n_chars--; + len -= 2; + continue; + } + + uint16_t n = (c1 << 8) | c2; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + + if ((c3 & 0xFC) == 0xDC) { + /* Valid surrogate pair */ + uint16_t n2 = (c3 << 8) | c4; + *out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000; + bufsize--; + len -= 4; +#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW + /* Subtracting 2 from `n_chars` will automatically set the CPU's flags; + * branch directly off the appropriate flag (CF on x86) rather than using + * another instruction (CMP on x86) to check for underflow */ + if (__builtin_usub_overflow(n_chars, 2, &n_chars)) { + /* The last 2 bytes of this block and the first 2 bytes of the following + * block form a valid surrogate pair; now just make sure we don't get + * stuck in this loop due to underflow of the loop index */ + break; + } +#else + n_chars -= 2; + if (n_chars == UINT_MAX) { + break; + } +#endif + } else { + /* First half of surrogate pair was followed by another first half + * OR by a non-surrogate character */ + *out++ = MBFL_BAD_INPUT; + bufsize--; + n_chars--; + len -= 2; + p -= 2; /* Back up so the last 2 bytes will be processed again */ + } + } while (n_chars); + } + } while (len >= 32 && bufsize >= 16); + + if (len && bufsize >= 4) { + /* Finish up trailing bytes which don't fill a 32-byte block */ + out += mb_utf16be_to_wchar_default(&p, &len, out, bufsize, NULL); + } + + *in = p; + *in_len = len; + return out - buf; + } else if (len) { + return mb_utf16be_to_wchar_default(in, in_len, buf, bufsize, NULL); + } else { + return 0; + } +} + +#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO +void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +#else +static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +#endif +{ + if (len >= 8) { + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); + + /* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */ + const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF); + /* Used to extract 16 bits which we want from each of eight 32-bit values */ + const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 13, 8, 9, 4, 5, 0, 1, 12, 13, 8, 9, 4, 5, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1); + + do { + __m256i operand = _mm256_loadu_si256((__m256i*)in); /* Load 32 bytes */ + + uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand)); + if (bmp_bitvec == 0xFFFFFFFF) { + /* All eight wchars are in the BMP + * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register + * (which is equivalent to an XMM register) */ + operand = _mm256_shuffle_epi8(operand, pack_8x16); + __m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1); + operand = _mm256_alignr_epi8(operand2, operand, 8); + _mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); /* Store 16 bytes */ + out += 16; + len -= 8; + in += 8; + } else if (bmp_bitvec & 1) { + /* Some prefix of this block are codepoints in the BMP */ + unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec); + operand = _mm256_shuffle_epi8(operand, pack_8x16); + __m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1); + operand = _mm256_alignr_epi8(operand2, operand, 8); + /* Store 16 bytes, but bump output pointer forward just past the 'good part', + * so the 'bad part' will be overwritten on the next iteration of this loop */ + _mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); + out += n_bytes >> 1; + len -= n_bytes >> 2; + in += n_bytes >> 2; + } else { + /* Some prefix of this block is codepoints outside the BMP OR error markers + * Handle them using non-vectorized code */ + unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8; + do { + uint32_t w = *in++; + n_words--; + len--; + + if (w < MBFL_WCSPLANE_UTF32MAX) { + uint16_t n1 = ((w >> 10) - 0x40) | 0xD800; + uint16_t n2 = (w & 0x3FF) | 0xDC00; + MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); + out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); + } + } while (n_words); + } + } while (len >= 8); + + MB_CONVERT_BUF_STORE(buf, out, limit); + } + + if (len) { + mb_wchar_to_utf16be_default(in, len, buf, end); + } +} + +#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO +size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +#else +static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +#endif +{ + /* Most of this function is the same as `mb_utf16be_to_wchar_avx2`, above; + * See it for more detailed code comments */ + + size_t len = *in_len; + + if (len >= 32 && bufsize >= 16) { + unsigned char *p = *in; + uint32_t *out = buf; + + const __m256i _f8 = _mm256_set1_epi16(0xF800); + const __m256i _d8 = _mm256_set1_epi16(0xD800); + + do { + __m256i operand = _mm256_loadu_si256((__m256i*)p); + + uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8)); + if (surrogate_bitvec == 0) { + /* There are no surrogates among these 16 characters */ + _mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand))); + _mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1))); + out += 16; + bufsize -= 16; + p += sizeof(__m256i); + len -= sizeof(__m256i); + } else if ((surrogate_bitvec & 1) == 0) { + /* Some prefix of the current block is non-surrogates */ + uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1; + _mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand))); + if (n_chars > 8) { + _mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1))); + } + out += n_chars; + bufsize -= n_chars; + p += n_chars * 2; + len -= n_chars * 2; + } else { + /* Some prefix of the current block is (valid or invalid) surrogates */ + surrogate_bitvec = ~surrogate_bitvec; + unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16; + do { + unsigned char c1 = *p++; + unsigned char c2 = *p++; + + if (c2 & 0x4 || len < 4) { + /* 2nd part of surrogate pair has come first OR string ended abruptly + * after 1st part of surrogate pair */ + *out++ = MBFL_BAD_INPUT; + bufsize--; + n_chars--; + len -= 2; + continue; + } + + uint16_t n = (c2 << 8) | c1; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + + if ((c4 & 0xFC) == 0xDC) { + /* Valid surrogate pair */ + uint16_t n2 = (c4 << 8) | c3; + *out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000; + bufsize--; + len -= 4; +#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW + if (__builtin_usub_overflow(n_chars, 2, &n_chars)) { + break; + } +#else + n_chars -= 2; + if (n_chars == UINT_MAX) { + break; + } +#endif + } else { + /* First half of surrogate pair was followed by another first half + * OR by a non-surrogate character */ + *out++ = MBFL_BAD_INPUT; + bufsize--; + n_chars--; + len -= 2; + p -= 2; /* Back up so the last 2 bytes will be processed again */ + } + } while (n_chars); + } + } while (len >= 32 && bufsize >= 16); + + if (len && bufsize >= 4) { + out += mb_utf16le_to_wchar_default(&p, &len, out, bufsize, NULL); + } + + *in = p; + *in_len = len; + return out - buf; + } else if (len) { + return mb_utf16le_to_wchar_default(in, in_len, buf, bufsize, NULL); + } else { + return 0; + } +} + +#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO +void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +#else +static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +#endif +{ + if (len >= 8) { + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); + + /* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */ + const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF); + /* Used to extract 16 bits which we want from each of eight 32-bit values */ + const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 13, 12, 9, 8, 5, 4, 1, 0, 13, 12, 9, 8, 5, 4, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1); + + do { + __m256i operand = _mm256_loadu_si256((__m256i*)in); + + uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand)); + if (bmp_bitvec == 0xFFFFFFFF) { + /* All eight wchars are in the BMP + * Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register + * (which is equivalent to an XMM register) */ + operand = _mm256_shuffle_epi8(operand, pack_8x16); + __m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1); + operand = _mm256_alignr_epi8(operand2, operand, 8); + _mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); + out += 16; + len -= 8; + in += 8; + } else if (bmp_bitvec & 1) { + /* Some prefix of this block are codepoints in the BMP */ + unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec); + operand = _mm256_shuffle_epi8(operand, pack_8x16); + __m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1); + operand = _mm256_alignr_epi8(operand2, operand, 8); + _mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); + out += n_bytes >> 1; + len -= n_bytes >> 2; + in += n_bytes >> 2; + } else { + /* Some prefix of this block is codepoints outside the BMP OR error markers */ + unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8; + do { + uint32_t w = *in++; + n_words--; + len--; + + if (w < MBFL_WCSPLANE_UTF32MAX) { + uint16_t n1 = ((w >> 10) - 0x40) | 0xD800; + uint16_t n2 = (w & 0x3FF) | 0xDC00; + MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); + out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); + } + } while (n_words); + } + } while (len >= 8); + + MB_CONVERT_BUF_STORE(buf, out, limit); + } + + if (len) { + mb_wchar_to_utf16le_default(in, len, buf, end); + } +} + +#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h index 727c231b347..291628549de 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h @@ -47,4 +47,8 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter); int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter); int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter); +#ifdef ZEND_INTRIN_AVX2_FUNC_PTR +void init_convert_utf16(void); +#endif + #endif /* MBFL_MBFILTER_UTF16_H */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index fa466842936..270dc2d36d7 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1080,6 +1080,7 @@ ZEND_TSRMLS_CACHE_UPDATE(); #ifdef ZEND_INTRIN_AVX2_FUNC_PTR init_check_utf8(); + init_convert_utf16(); #endif return SUCCESS; diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 55a79274e3f..634d070ea27 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -895,6 +895,17 @@ testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false); convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE'); convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE'); +// Regression tests for bugs with initial AVX2-accelerated implementation +convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE'); +convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE'); + +// This string caused an out-of-bounds read; it was found by a fuzzer +$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8"; +convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE'); + +$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb"; +convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE'); + echo "== UTF-32 ==\n"; testValidCodepoints("UTF-32LE");