Add AVX2-accelerated UTF-16 decoding/encoding routines

As with other SIMD-accelerated functions in php-src, the new UTF-16
encoding and decoding routines can be compiled either with AVX2
acceleration "always on", "always off", or else with runtime detection
of AVX2 support.

With the new UTF-16 decoder/encoder, conversion of extremely short
strings (as in several bytes) has the same performance as before,
and conversion of medium-length (~100 character) strings is about 65%
faster, but conversion of long (~10,000 character) strings is around
6 times faster.

Many other mbstring functions will also be faster now when handling
UTF-16; for example, mb_strlen is almost 3 times faster on medium
strings, and almost 9 times faster on long strings. (Why does mb_strlen
benefit more from AVX2 acceleration than mb_convert_encoding? It's
because mb_strlen only needs to decode, but not re-encode, the input
string, and the UTF-16 decoder benefits much more from SIMD
acceleration than the UTF-16 encoder.)
This commit is contained in:
Alex Dowad 2023-01-25 22:28:10 +02:00
parent d5d9900661
commit c8ec2ed730
7 changed files with 572 additions and 10 deletions

View file

@ -19,6 +19,13 @@
#ifndef _ZEND_BITSET_H_ #ifndef _ZEND_BITSET_H_
#define _ZEND_BITSET_H_ #define _ZEND_BITSET_H_
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include "zend_portability.h"
#include "zend_long.h"
typedef zend_ulong *zend_bitset; typedef zend_ulong *zend_bitset;
#define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong) #define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong)

View file

@ -2643,6 +2643,27 @@ AC_DEFUN([PHP_CHECK_BUILTIN_SADDLL_OVERFLOW], [
[$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow]) [$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow])
]) ])
dnl
dnl PHP_CHECK_BUILTIN_USUB_OVERFLOW
dnl
AC_DEFUN([PHP_CHECK_BUILTIN_USUB_OVERFLOW], [
AC_MSG_CHECKING([for __builtin_usub_overflow])
AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[
unsigned int tmpvar;
return __builtin_usub_overflow(3, 7, &tmpvar);
]])], [
have_builtin_usub_overflow=1
AC_MSG_RESULT([yes])
], [
have_builtin_usub_overflow=0
AC_MSG_RESULT([no])
])
AC_DEFINE_UNQUOTED([PHP_HAVE_BUILTIN_USUB_OVERFLOW],
[$have_builtin_usub_overflow], [Whether the compiler supports __builtin_usub_overflow])
])
dnl dnl
dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
dnl dnl

View file

@ -504,6 +504,8 @@ dnl Check __builtin_saddl_overflow
PHP_CHECK_BUILTIN_SADDL_OVERFLOW PHP_CHECK_BUILTIN_SADDL_OVERFLOW
dnl Check __builtin_saddll_overflow dnl Check __builtin_saddll_overflow
PHP_CHECK_BUILTIN_SADDLL_OVERFLOW PHP_CHECK_BUILTIN_SADDLL_OVERFLOW
dnl Check __builtin_usub_overflow
PHP_CHECK_BUILTIN_USUB_OVERFLOW
dnl Check __builtin_ssubl_overflow dnl Check __builtin_ssubl_overflow
PHP_CHECK_BUILTIN_SSUBL_OVERFLOW PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
dnl Check __builtin_ssubll_overflow dnl Check __builtin_ssubll_overflow

View file

@ -27,15 +27,154 @@
* *
*/ */
#include "zend_bitset.h"
#include "mbfilter.h" #include "mbfilter.h"
#include "mbfilter_utf16.h" #include "mbfilter_utf16.h"
#ifdef ZEND_INTRIN_AVX2_NATIVE
/* We are building AVX2-only binary */
# include <immintrin.h>
# define mb_utf16be_to_wchar mb_utf16be_to_wchar_avx2
# define mb_utf16le_to_wchar mb_utf16le_to_wchar_avx2
# define mb_wchar_to_utf16be mb_wchar_to_utf16be_avx2
# define mb_wchar_to_utf16le mb_wchar_to_utf16le_avx2
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
#elif defined(ZEND_INTRIN_AVX2_RESOLVER)
/* We are building binary which works with or without AVX2; whether or not to use
* AVX2-accelerated functions will be determined at runtime */
# include <immintrin.h>
# include "Zend/zend_cpuinfo.h"
static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
# ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
/* Dynamic linker will decide whether or not to use AVX2-based functions and
* resolve symbols accordingly */
ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16be_wchar")));
void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16be")));
size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16le_wchar")));
void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16le")));
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED
static mb_to_wchar_fn resolve_utf16be_wchar(void)
{
return zend_cpu_supports_avx2() ? mb_utf16be_to_wchar_avx2 : mb_utf16be_to_wchar_default;
}
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED
static mb_from_wchar_fn resolve_wchar_utf16be(void)
{
return zend_cpu_supports_avx2() ? mb_wchar_to_utf16be_avx2 : mb_wchar_to_utf16be_default;
}
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED
static mb_to_wchar_fn resolve_utf16le_wchar(void)
{
return zend_cpu_supports_avx2() ? mb_utf16le_to_wchar_avx2 : mb_utf16le_to_wchar_default;
}
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED
static mb_from_wchar_fn resolve_wchar_utf16le(void)
{
return zend_cpu_supports_avx2() ? mb_wchar_to_utf16le_avx2 : mb_wchar_to_utf16le_default;
}
# else /* ZEND_INTRIN_AVX2_FUNC_PTR */
/* We are compiling for a target where the dynamic linker will not be able to
* resolve symbols according to whether the host supports AVX2 or not; so instead,
* we can make calls go through a function pointer and set the function pointer
* on module load */
#ifdef HAVE_FUNC_ATTRIBUTE_TARGET
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
#else
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
#endif
static mb_to_wchar_fn utf16be_to_wchar_ptr = NULL;
static mb_from_wchar_fn wchar_to_utf16be_ptr = NULL;
static mb_to_wchar_fn utf16le_to_wchar_ptr = NULL;
static mb_from_wchar_fn wchar_to_utf16le_ptr = NULL;
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
return utf16be_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
}
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
wchar_to_utf16be_ptr(in, len, buf, end);
}
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
return utf16le_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
}
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
wchar_to_utf16le_ptr(in, len, buf, end);
}
void init_convert_utf16(void)
{
if (zend_cpu_supports_avx2()) {
utf16be_to_wchar_ptr = mb_utf16be_to_wchar_avx2;
wchar_to_utf16be_ptr = mb_wchar_to_utf16be_avx2;
utf16le_to_wchar_ptr = mb_utf16le_to_wchar_avx2;
wchar_to_utf16le_ptr = mb_wchar_to_utf16le_avx2;
} else {
utf16be_to_wchar_ptr = mb_utf16be_to_wchar_default;
wchar_to_utf16be_ptr = mb_wchar_to_utf16be_default;
utf16le_to_wchar_ptr = mb_utf16le_to_wchar_default;
wchar_to_utf16le_ptr = mb_wchar_to_utf16le_default;
}
}
# endif
#else
/* No AVX2 support */
# define mb_utf16be_to_wchar mb_utf16be_to_wchar_default
# define mb_utf16le_to_wchar mb_utf16le_to_wchar_default
# define mb_wchar_to_utf16be mb_wchar_to_utf16be_default
# define mb_wchar_to_utf16le mb_wchar_to_utf16le_default
static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
#endif
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
@ -366,7 +505,7 @@ static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu
return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL); return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
} }
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{ {
/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */ /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
unsigned char *p = *in, *e = p + (*in_len & ~1); unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -419,7 +558,7 @@ static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
return out - buf; return out - buf;
} }
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{ {
unsigned char *out, *limit; unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit); MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -436,7 +575,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF); out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
} else { } else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be); MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
} }
} }
@ -444,7 +583,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
MB_CONVERT_BUF_STORE(buf, out, limit); MB_CONVERT_BUF_STORE(buf, out, limit);
} }
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{ {
/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */ /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
unsigned char *p = *in, *e = p + (*in_len & ~1); unsigned char *p = *in, *e = p + (*in_len & ~1);
@ -497,7 +636,7 @@ static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
return out - buf; return out - buf;
} }
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{ {
unsigned char *out, *limit; unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit); MB_CONVERT_BUF_LOAD(buf, out, limit);
@ -514,10 +653,387 @@ static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, b
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF); out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
} else { } else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le); MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
} }
} }
MB_CONVERT_BUF_STORE(buf, out, limit); MB_CONVERT_BUF_STORE(buf, out, limit);
} }
#if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
#else
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
#endif
{
size_t len = *in_len;
if (len >= 32 && bufsize >= 16) {
unsigned char *p = *in;
uint32_t *out = buf;
/* Used to determine if a block of input bytes contains any surrogates */
const __m256i _f8 = _mm256_set1_epi16(0xF8);
const __m256i _d8 = _mm256_set1_epi16(0xD8);
/* wchars must be in host byte order, which is little-endian on x86;
* Since we are reading in (big-endian) UTF-16BE, use this vector to swap byte order for output */
const __m256i swap_bytes = _mm256_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
do {
__m256i operand = _mm256_loadu_si256((__m256i*)p); /* Load 32 bytes */
uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
if (surrogate_bitvec == 0) {
/* There are no surrogates among these 16 characters
* So converting the UTF-16 input to wchars is very simple; just extend each 16-bit value
* to a 32-bit value, filling in zero bits in the high end */
operand = _mm256_shuffle_epi8(operand, swap_bytes);
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
out += 16;
bufsize -= 16;
p += sizeof(__m256i);
len -= sizeof(__m256i);
} else if ((surrogate_bitvec & 1) == 0) {
/* Some prefix of the current block is non-surrogates; output those */
uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
operand = _mm256_shuffle_epi8(operand, swap_bytes);
/* We know that the output buffer has at least 64 bytes of space available
* So don't bother trimming the output down to only include the non-surrogate prefix;
* rather, write out an entire block of 64 (or 32) bytes, then bump our output pointer
* forward just past the 'good part', so the 'bad part' will be overwritten on the next
* iteration of this loop */
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
if (n_chars > 8) {
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
}
out += n_chars;
bufsize -= n_chars;
p += n_chars * 2;
len -= n_chars * 2;
} else {
/* Some prefix of the current block is (valid or invalid) surrogates
* Handle those using non-vectorized code */
surrogate_bitvec = ~surrogate_bitvec;
unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
do {
unsigned char c1 = *p++;
unsigned char c2 = *p++;
if (c1 & 0x4 || len < 4) {
/* 2nd part of surrogate pair has come first OR string ended abruptly
* after 1st part of surrogate pair */
*out++ = MBFL_BAD_INPUT;
bufsize--;
n_chars--;
len -= 2;
continue;
}
uint16_t n = (c1 << 8) | c2;
unsigned char c3 = *p++;
unsigned char c4 = *p++;
if ((c3 & 0xFC) == 0xDC) {
/* Valid surrogate pair */
uint16_t n2 = (c3 << 8) | c4;
*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
bufsize--;
len -= 4;
#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
/* Subtracting 2 from `n_chars` will automatically set the CPU's flags;
* branch directly off the appropriate flag (CF on x86) rather than using
* another instruction (CMP on x86) to check for underflow */
if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
/* The last 2 bytes of this block and the first 2 bytes of the following
* block form a valid surrogate pair; now just make sure we don't get
* stuck in this loop due to underflow of the loop index */
break;
}
#else
n_chars -= 2;
if (n_chars == UINT_MAX) {
break;
}
#endif
} else {
/* First half of surrogate pair was followed by another first half
* OR by a non-surrogate character */
*out++ = MBFL_BAD_INPUT;
bufsize--;
n_chars--;
len -= 2;
p -= 2; /* Back up so the last 2 bytes will be processed again */
}
} while (n_chars);
}
} while (len >= 32 && bufsize >= 16);
if (len && bufsize >= 4) {
/* Finish up trailing bytes which don't fill a 32-byte block */
out += mb_utf16be_to_wchar_default(&p, &len, out, bufsize, NULL);
}
*in = p;
*in_len = len;
return out - buf;
} else if (len) {
return mb_utf16be_to_wchar_default(in, in_len, buf, bufsize, NULL);
} else {
return 0;
}
}
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
#else
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
#endif
{
if (len >= 8) {
unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
/* Used to extract 16 bits which we want from each of eight 32-bit values */
const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 13, 8, 9, 4, 5, 0, 1, 12, 13, 8, 9, 4, 5, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1);
do {
__m256i operand = _mm256_loadu_si256((__m256i*)in); /* Load 32 bytes */
uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
if (bmp_bitvec == 0xFFFFFFFF) {
/* All eight wchars are in the BMP
* Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
* (which is equivalent to an XMM register) */
operand = _mm256_shuffle_epi8(operand, pack_8x16);
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
operand = _mm256_alignr_epi8(operand2, operand, 8);
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); /* Store 16 bytes */
out += 16;
len -= 8;
in += 8;
} else if (bmp_bitvec & 1) {
/* Some prefix of this block are codepoints in the BMP */
unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
operand = _mm256_shuffle_epi8(operand, pack_8x16);
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
operand = _mm256_alignr_epi8(operand2, operand, 8);
/* Store 16 bytes, but bump output pointer forward just past the 'good part',
* so the 'bad part' will be overwritten on the next iteration of this loop */
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
out += n_bytes >> 1;
len -= n_bytes >> 2;
in += n_bytes >> 2;
} else {
/* Some prefix of this block is codepoints outside the BMP OR error markers
* Handle them using non-vectorized code */
unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
do {
uint32_t w = *in++;
n_words--;
len--;
if (w < MBFL_WCSPLANE_UTF32MAX) {
uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
uint16_t n2 = (w & 0x3FF) | 0xDC00;
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
} else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
}
} while (n_words);
}
} while (len >= 8);
MB_CONVERT_BUF_STORE(buf, out, limit);
}
if (len) {
mb_wchar_to_utf16be_default(in, len, buf, end);
}
}
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
#else
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
#endif
{
/* Most of this function is the same as `mb_utf16be_to_wchar_avx2`, above;
* See it for more detailed code comments */
size_t len = *in_len;
if (len >= 32 && bufsize >= 16) {
unsigned char *p = *in;
uint32_t *out = buf;
const __m256i _f8 = _mm256_set1_epi16(0xF800);
const __m256i _d8 = _mm256_set1_epi16(0xD800);
do {
__m256i operand = _mm256_loadu_si256((__m256i*)p);
uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
if (surrogate_bitvec == 0) {
/* There are no surrogates among these 16 characters */
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
out += 16;
bufsize -= 16;
p += sizeof(__m256i);
len -= sizeof(__m256i);
} else if ((surrogate_bitvec & 1) == 0) {
/* Some prefix of the current block is non-surrogates */
uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
if (n_chars > 8) {
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
}
out += n_chars;
bufsize -= n_chars;
p += n_chars * 2;
len -= n_chars * 2;
} else {
/* Some prefix of the current block is (valid or invalid) surrogates */
surrogate_bitvec = ~surrogate_bitvec;
unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
do {
unsigned char c1 = *p++;
unsigned char c2 = *p++;
if (c2 & 0x4 || len < 4) {
/* 2nd part of surrogate pair has come first OR string ended abruptly
* after 1st part of surrogate pair */
*out++ = MBFL_BAD_INPUT;
bufsize--;
n_chars--;
len -= 2;
continue;
}
uint16_t n = (c2 << 8) | c1;
unsigned char c3 = *p++;
unsigned char c4 = *p++;
if ((c4 & 0xFC) == 0xDC) {
/* Valid surrogate pair */
uint16_t n2 = (c4 << 8) | c3;
*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
bufsize--;
len -= 4;
#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
break;
}
#else
n_chars -= 2;
if (n_chars == UINT_MAX) {
break;
}
#endif
} else {
/* First half of surrogate pair was followed by another first half
* OR by a non-surrogate character */
*out++ = MBFL_BAD_INPUT;
bufsize--;
n_chars--;
len -= 2;
p -= 2; /* Back up so the last 2 bytes will be processed again */
}
} while (n_chars);
}
} while (len >= 32 && bufsize >= 16);
if (len && bufsize >= 4) {
out += mb_utf16le_to_wchar_default(&p, &len, out, bufsize, NULL);
}
*in = p;
*in_len = len;
return out - buf;
} else if (len) {
return mb_utf16le_to_wchar_default(in, in_len, buf, bufsize, NULL);
} else {
return 0;
}
}
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
#else
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
#endif
{
if (len >= 8) {
unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
/* Used to extract 16 bits which we want from each of eight 32-bit values */
const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 13, 12, 9, 8, 5, 4, 1, 0, 13, 12, 9, 8, 5, 4, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1);
do {
__m256i operand = _mm256_loadu_si256((__m256i*)in);
uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
if (bmp_bitvec == 0xFFFFFFFF) {
/* All eight wchars are in the BMP
* Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
* (which is equivalent to an XMM register) */
operand = _mm256_shuffle_epi8(operand, pack_8x16);
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
operand = _mm256_alignr_epi8(operand2, operand, 8);
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
out += 16;
len -= 8;
in += 8;
} else if (bmp_bitvec & 1) {
/* Some prefix of this block are codepoints in the BMP */
unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
operand = _mm256_shuffle_epi8(operand, pack_8x16);
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
operand = _mm256_alignr_epi8(operand2, operand, 8);
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
out += n_bytes >> 1;
len -= n_bytes >> 2;
in += n_bytes >> 2;
} else {
/* Some prefix of this block is codepoints outside the BMP OR error markers */
unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
do {
uint32_t w = *in++;
n_words--;
len--;
if (w < MBFL_WCSPLANE_UTF32MAX) {
uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
uint16_t n2 = (w & 0x3FF) | 0xDC00;
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
} else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
}
} while (n_words);
}
} while (len >= 8);
MB_CONVERT_BUF_STORE(buf, out, limit);
}
if (len) {
mb_wchar_to_utf16le_default(in, len, buf, end);
}
}
#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */

View file

@ -47,4 +47,8 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter); int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter); int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter);
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
void init_convert_utf16(void);
#endif
#endif /* MBFL_MBFILTER_UTF16_H */ #endif /* MBFL_MBFILTER_UTF16_H */

View file

@ -1080,6 +1080,7 @@ ZEND_TSRMLS_CACHE_UPDATE();
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR #ifdef ZEND_INTRIN_AVX2_FUNC_PTR
init_check_utf8(); init_check_utf8();
init_convert_utf16();
#endif #endif
return SUCCESS; return SUCCESS;

View file

@ -895,6 +895,17 @@ testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE'); convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE'); convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');
// Regression tests for bugs with initial AVX2-accelerated implementation
convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');
// This string caused an out-of-bounds read; it was found by a fuzzer
$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');
$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');
echo "== UTF-32 ==\n"; echo "== UTF-32 ==\n";
testValidCodepoints("UTF-32LE"); testValidCodepoints("UTF-32LE");