mirror of
https://github.com/php/php-src.git
synced 2025-08-15 13:38:49 +02:00
Add AVX2-accelerated UTF-16 decoding/encoding routines
As with other SIMD-accelerated functions in php-src, the new UTF-16 encoding and decoding routines can be compiled either with AVX2 acceleration "always on", "always off", or else with runtime detection of AVX2 support. With the new UTF-16 decoder/encoder, conversion of extremely short strings (as in several bytes) has the same performance as before, and conversion of medium-length (~100 character) strings is about 65% faster, but conversion of long (~10,000 character) strings is around 6 times faster. Many other mbstring functions will also be faster now when handling UTF-16; for example, mb_strlen is almost 3 times faster on medium strings, and almost 9 times faster on long strings. (Why does mb_strlen benefit more from AVX2 acceleration than mb_convert_encoding? It's because mb_strlen only needs to decode, but not re-encode, the input string, and the UTF-16 decoder benefits much more from SIMD acceleration than the UTF-16 encoder.)
This commit is contained in:
parent
d5d9900661
commit
c8ec2ed730
7 changed files with 572 additions and 10 deletions
|
@ -19,6 +19,13 @@
|
|||
#ifndef _ZEND_BITSET_H_
|
||||
#define _ZEND_BITSET_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "zend_portability.h"
|
||||
#include "zend_long.h"
|
||||
|
||||
typedef zend_ulong *zend_bitset;
|
||||
|
||||
#define ZEND_BITSET_ELM_SIZE sizeof(zend_ulong)
|
||||
|
|
21
build/php.m4
21
build/php.m4
|
@ -2643,6 +2643,27 @@ AC_DEFUN([PHP_CHECK_BUILTIN_SADDLL_OVERFLOW], [
|
|||
[$have_builtin_saddll_overflow], [Whether the compiler supports __builtin_saddll_overflow])
|
||||
])
|
||||
|
||||
dnl
|
||||
dnl PHP_CHECK_BUILTIN_USUB_OVERFLOW
|
||||
dnl
|
||||
AC_DEFUN([PHP_CHECK_BUILTIN_USUB_OVERFLOW], [
|
||||
AC_MSG_CHECKING([for __builtin_usub_overflow])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[
|
||||
unsigned int tmpvar;
|
||||
return __builtin_usub_overflow(3, 7, &tmpvar);
|
||||
]])], [
|
||||
have_builtin_usub_overflow=1
|
||||
AC_MSG_RESULT([yes])
|
||||
], [
|
||||
have_builtin_usub_overflow=0
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
|
||||
AC_DEFINE_UNQUOTED([PHP_HAVE_BUILTIN_USUB_OVERFLOW],
|
||||
[$have_builtin_usub_overflow], [Whether the compiler supports __builtin_usub_overflow])
|
||||
])
|
||||
|
||||
dnl
|
||||
dnl PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
|
||||
dnl
|
||||
|
|
|
@ -504,6 +504,8 @@ dnl Check __builtin_saddl_overflow
|
|||
PHP_CHECK_BUILTIN_SADDL_OVERFLOW
|
||||
dnl Check __builtin_saddll_overflow
|
||||
PHP_CHECK_BUILTIN_SADDLL_OVERFLOW
|
||||
dnl Check __builtin_usub_overflow
|
||||
PHP_CHECK_BUILTIN_USUB_OVERFLOW
|
||||
dnl Check __builtin_ssubl_overflow
|
||||
PHP_CHECK_BUILTIN_SSUBL_OVERFLOW
|
||||
dnl Check __builtin_ssubll_overflow
|
||||
|
|
|
@ -27,15 +27,154 @@
|
|||
*
|
||||
*/
|
||||
|
||||
#include "zend_bitset.h"
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf16.h"
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_NATIVE
|
||||
|
||||
/* We are building AVX2-only binary */
|
||||
# include <immintrin.h>
|
||||
# define mb_utf16be_to_wchar mb_utf16be_to_wchar_avx2
|
||||
# define mb_utf16le_to_wchar mb_utf16le_to_wchar_avx2
|
||||
# define mb_wchar_to_utf16be mb_wchar_to_utf16be_avx2
|
||||
# define mb_wchar_to_utf16le mb_wchar_to_utf16le_avx2
|
||||
|
||||
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
|
||||
#elif defined(ZEND_INTRIN_AVX2_RESOLVER)
|
||||
|
||||
/* We are building binary which works with or without AVX2; whether or not to use
|
||||
* AVX2-accelerated functions will be determined at runtime */
|
||||
# include <immintrin.h>
|
||||
# include "Zend/zend_cpuinfo.h"
|
||||
|
||||
static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
|
||||
# ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
|
||||
/* Dynamic linker will decide whether or not to use AVX2-based functions and
|
||||
* resolve symbols accordingly */
|
||||
|
||||
ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
|
||||
ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
|
||||
ZEND_INTRIN_AVX2_FUNC_DECL(size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state));
|
||||
ZEND_INTRIN_AVX2_FUNC_DECL(void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end));
|
||||
|
||||
size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16be_wchar")));
|
||||
void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16be")));
|
||||
size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((ifunc("resolve_utf16le_wchar")));
|
||||
void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((ifunc("resolve_wchar_utf16le")));
|
||||
|
||||
ZEND_NO_SANITIZE_ADDRESS
|
||||
ZEND_ATTRIBUTE_UNUSED
|
||||
static mb_to_wchar_fn resolve_utf16be_wchar(void)
|
||||
{
|
||||
return zend_cpu_supports_avx2() ? mb_utf16be_to_wchar_avx2 : mb_utf16be_to_wchar_default;
|
||||
}
|
||||
|
||||
ZEND_NO_SANITIZE_ADDRESS
|
||||
ZEND_ATTRIBUTE_UNUSED
|
||||
static mb_from_wchar_fn resolve_wchar_utf16be(void)
|
||||
{
|
||||
return zend_cpu_supports_avx2() ? mb_wchar_to_utf16be_avx2 : mb_wchar_to_utf16be_default;
|
||||
}
|
||||
|
||||
ZEND_NO_SANITIZE_ADDRESS
|
||||
ZEND_ATTRIBUTE_UNUSED
|
||||
static mb_to_wchar_fn resolve_utf16le_wchar(void)
|
||||
{
|
||||
return zend_cpu_supports_avx2() ? mb_utf16le_to_wchar_avx2 : mb_utf16le_to_wchar_default;
|
||||
}
|
||||
|
||||
ZEND_NO_SANITIZE_ADDRESS
|
||||
ZEND_ATTRIBUTE_UNUSED
|
||||
static mb_from_wchar_fn resolve_wchar_utf16le(void)
|
||||
{
|
||||
return zend_cpu_supports_avx2() ? mb_wchar_to_utf16le_avx2 : mb_wchar_to_utf16le_default;
|
||||
}
|
||||
|
||||
# else /* ZEND_INTRIN_AVX2_FUNC_PTR */
|
||||
/* We are compiling for a target where the dynamic linker will not be able to
|
||||
* resolve symbols according to whether the host supports AVX2 or not; so instead,
|
||||
* we can make calls go through a function pointer and set the function pointer
|
||||
* on module load */
|
||||
|
||||
#ifdef HAVE_FUNC_ATTRIBUTE_TARGET
|
||||
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
|
||||
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
|
||||
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) __attribute__((target("avx2")));
|
||||
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) __attribute__((target("avx2")));
|
||||
#else
|
||||
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
#endif
|
||||
|
||||
static mb_to_wchar_fn utf16be_to_wchar_ptr = NULL;
|
||||
static mb_from_wchar_fn wchar_to_utf16be_ptr = NULL;
|
||||
static mb_to_wchar_fn utf16le_to_wchar_ptr = NULL;
|
||||
static mb_from_wchar_fn wchar_to_utf16le_ptr = NULL;
|
||||
|
||||
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
{
|
||||
return utf16be_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
|
||||
}
|
||||
|
||||
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
{
|
||||
wchar_to_utf16be_ptr(in, len, buf, end);
|
||||
}
|
||||
|
||||
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
{
|
||||
return utf16le_to_wchar_ptr(in, in_len, buf, bufsize, NULL);
|
||||
}
|
||||
|
||||
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
{
|
||||
wchar_to_utf16le_ptr(in, len, buf, end);
|
||||
}
|
||||
|
||||
void init_convert_utf16(void)
|
||||
{
|
||||
if (zend_cpu_supports_avx2()) {
|
||||
utf16be_to_wchar_ptr = mb_utf16be_to_wchar_avx2;
|
||||
wchar_to_utf16be_ptr = mb_wchar_to_utf16be_avx2;
|
||||
utf16le_to_wchar_ptr = mb_utf16le_to_wchar_avx2;
|
||||
wchar_to_utf16le_ptr = mb_wchar_to_utf16le_avx2;
|
||||
} else {
|
||||
utf16be_to_wchar_ptr = mb_utf16be_to_wchar_default;
|
||||
wchar_to_utf16be_ptr = mb_wchar_to_utf16be_default;
|
||||
utf16le_to_wchar_ptr = mb_utf16le_to_wchar_default;
|
||||
wchar_to_utf16le_ptr = mb_wchar_to_utf16le_default;
|
||||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
#else
|
||||
|
||||
/* No AVX2 support */
|
||||
# define mb_utf16be_to_wchar mb_utf16be_to_wchar_default
|
||||
# define mb_utf16le_to_wchar mb_utf16le_to_wchar_default
|
||||
# define mb_wchar_to_utf16be mb_wchar_to_utf16be_default
|
||||
# define mb_wchar_to_utf16le mb_wchar_to_utf16le_default
|
||||
|
||||
static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
|
||||
#endif
|
||||
|
||||
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
|
||||
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
|
||||
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
|
||||
|
||||
|
@ -366,7 +505,7 @@ static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu
|
|||
return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
|
||||
}
|
||||
|
||||
static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
static size_t mb_utf16be_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
{
|
||||
/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
|
||||
unsigned char *p = *in, *e = p + (*in_len & ~1);
|
||||
|
@ -419,7 +558,7 @@ static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
|
|||
return out - buf;
|
||||
}
|
||||
|
||||
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
static void mb_wchar_to_utf16be_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
{
|
||||
unsigned char *out, *limit;
|
||||
MB_CONVERT_BUF_LOAD(buf, out, limit);
|
||||
|
@ -436,7 +575,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
|
|||
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
|
||||
out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
|
||||
} else {
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be);
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
}
|
||||
}
|
||||
|
@ -444,7 +583,7 @@ static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, b
|
|||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
static size_t mb_utf16le_to_wchar_default(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
{
|
||||
/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
|
||||
unsigned char *p = *in, *e = p + (*in_len & ~1);
|
||||
|
@ -497,7 +636,7 @@ static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
|
|||
return out - buf;
|
||||
}
|
||||
|
||||
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
{
|
||||
unsigned char *out, *limit;
|
||||
MB_CONVERT_BUF_LOAD(buf, out, limit);
|
||||
|
@ -514,10 +653,387 @@ static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, b
|
|||
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
|
||||
out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
|
||||
} else {
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le);
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
}
|
||||
}
|
||||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
#if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
|
||||
size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
#else
|
||||
static size_t mb_utf16be_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
#endif
|
||||
{
|
||||
size_t len = *in_len;
|
||||
|
||||
if (len >= 32 && bufsize >= 16) {
|
||||
unsigned char *p = *in;
|
||||
uint32_t *out = buf;
|
||||
|
||||
/* Used to determine if a block of input bytes contains any surrogates */
|
||||
const __m256i _f8 = _mm256_set1_epi16(0xF8);
|
||||
const __m256i _d8 = _mm256_set1_epi16(0xD8);
|
||||
/* wchars must be in host byte order, which is little-endian on x86;
|
||||
* Since we are reading in (big-endian) UTF-16BE, use this vector to swap byte order for output */
|
||||
const __m256i swap_bytes = _mm256_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
||||
|
||||
do {
|
||||
__m256i operand = _mm256_loadu_si256((__m256i*)p); /* Load 32 bytes */
|
||||
|
||||
uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
|
||||
if (surrogate_bitvec == 0) {
|
||||
/* There are no surrogates among these 16 characters
|
||||
* So converting the UTF-16 input to wchars is very simple; just extend each 16-bit value
|
||||
* to a 32-bit value, filling in zero bits in the high end */
|
||||
operand = _mm256_shuffle_epi8(operand, swap_bytes);
|
||||
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
|
||||
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
|
||||
out += 16;
|
||||
bufsize -= 16;
|
||||
p += sizeof(__m256i);
|
||||
len -= sizeof(__m256i);
|
||||
} else if ((surrogate_bitvec & 1) == 0) {
|
||||
/* Some prefix of the current block is non-surrogates; output those */
|
||||
uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
|
||||
operand = _mm256_shuffle_epi8(operand, swap_bytes);
|
||||
/* We know that the output buffer has at least 64 bytes of space available
|
||||
* So don't bother trimming the output down to only include the non-surrogate prefix;
|
||||
* rather, write out an entire block of 64 (or 32) bytes, then bump our output pointer
|
||||
* forward just past the 'good part', so the 'bad part' will be overwritten on the next
|
||||
* iteration of this loop */
|
||||
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
|
||||
if (n_chars > 8) {
|
||||
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
|
||||
}
|
||||
out += n_chars;
|
||||
bufsize -= n_chars;
|
||||
p += n_chars * 2;
|
||||
len -= n_chars * 2;
|
||||
} else {
|
||||
/* Some prefix of the current block is (valid or invalid) surrogates
|
||||
* Handle those using non-vectorized code */
|
||||
surrogate_bitvec = ~surrogate_bitvec;
|
||||
unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
|
||||
do {
|
||||
unsigned char c1 = *p++;
|
||||
unsigned char c2 = *p++;
|
||||
|
||||
if (c1 & 0x4 || len < 4) {
|
||||
/* 2nd part of surrogate pair has come first OR string ended abruptly
|
||||
* after 1st part of surrogate pair */
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
bufsize--;
|
||||
n_chars--;
|
||||
len -= 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
uint16_t n = (c1 << 8) | c2;
|
||||
unsigned char c3 = *p++;
|
||||
unsigned char c4 = *p++;
|
||||
|
||||
if ((c3 & 0xFC) == 0xDC) {
|
||||
/* Valid surrogate pair */
|
||||
uint16_t n2 = (c3 << 8) | c4;
|
||||
*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
|
||||
bufsize--;
|
||||
len -= 4;
|
||||
#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
|
||||
/* Subtracting 2 from `n_chars` will automatically set the CPU's flags;
|
||||
* branch directly off the appropriate flag (CF on x86) rather than using
|
||||
* another instruction (CMP on x86) to check for underflow */
|
||||
if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
|
||||
/* The last 2 bytes of this block and the first 2 bytes of the following
|
||||
* block form a valid surrogate pair; now just make sure we don't get
|
||||
* stuck in this loop due to underflow of the loop index */
|
||||
break;
|
||||
}
|
||||
#else
|
||||
n_chars -= 2;
|
||||
if (n_chars == UINT_MAX) {
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
/* First half of surrogate pair was followed by another first half
|
||||
* OR by a non-surrogate character */
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
bufsize--;
|
||||
n_chars--;
|
||||
len -= 2;
|
||||
p -= 2; /* Back up so the last 2 bytes will be processed again */
|
||||
}
|
||||
} while (n_chars);
|
||||
}
|
||||
} while (len >= 32 && bufsize >= 16);
|
||||
|
||||
if (len && bufsize >= 4) {
|
||||
/* Finish up trailing bytes which don't fill a 32-byte block */
|
||||
out += mb_utf16be_to_wchar_default(&p, &len, out, bufsize, NULL);
|
||||
}
|
||||
|
||||
*in = p;
|
||||
*in_len = len;
|
||||
return out - buf;
|
||||
} else if (len) {
|
||||
return mb_utf16be_to_wchar_default(in, in_len, buf, bufsize, NULL);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
|
||||
void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
#else
|
||||
static void mb_wchar_to_utf16be_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
#endif
|
||||
{
|
||||
if (len >= 8) {
|
||||
unsigned char *out, *limit;
|
||||
MB_CONVERT_BUF_LOAD(buf, out, limit);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
|
||||
/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
|
||||
const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
|
||||
/* Used to extract 16 bits which we want from each of eight 32-bit values */
|
||||
const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 13, 8, 9, 4, 5, 0, 1, 12, 13, 8, 9, 4, 5, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
|
||||
do {
|
||||
__m256i operand = _mm256_loadu_si256((__m256i*)in); /* Load 32 bytes */
|
||||
|
||||
uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
|
||||
if (bmp_bitvec == 0xFFFFFFFF) {
|
||||
/* All eight wchars are in the BMP
|
||||
* Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
|
||||
* (which is equivalent to an XMM register) */
|
||||
operand = _mm256_shuffle_epi8(operand, pack_8x16);
|
||||
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
|
||||
operand = _mm256_alignr_epi8(operand2, operand, 8);
|
||||
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand)); /* Store 16 bytes */
|
||||
out += 16;
|
||||
len -= 8;
|
||||
in += 8;
|
||||
} else if (bmp_bitvec & 1) {
|
||||
/* Some prefix of this block are codepoints in the BMP */
|
||||
unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
|
||||
operand = _mm256_shuffle_epi8(operand, pack_8x16);
|
||||
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
|
||||
operand = _mm256_alignr_epi8(operand2, operand, 8);
|
||||
/* Store 16 bytes, but bump output pointer forward just past the 'good part',
|
||||
* so the 'bad part' will be overwritten on the next iteration of this loop */
|
||||
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
|
||||
out += n_bytes >> 1;
|
||||
len -= n_bytes >> 2;
|
||||
in += n_bytes >> 2;
|
||||
} else {
|
||||
/* Some prefix of this block is codepoints outside the BMP OR error markers
|
||||
* Handle them using non-vectorized code */
|
||||
unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
|
||||
do {
|
||||
uint32_t w = *in++;
|
||||
n_words--;
|
||||
len--;
|
||||
|
||||
if (w < MBFL_WCSPLANE_UTF32MAX) {
|
||||
uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
|
||||
uint16_t n2 = (w & 0x3FF) | 0xDC00;
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
|
||||
out = mb_convert_buf_add4(out, (n1 >> 8) & 0xFF, n1 & 0xFF, (n2 >> 8) & 0xFF, n2 & 0xFF);
|
||||
} else {
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16be_default);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
}
|
||||
} while (n_words);
|
||||
}
|
||||
} while (len >= 8);
|
||||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
mb_wchar_to_utf16be_default(in, len, buf, end);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
|
||||
size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
#else
|
||||
static size_t mb_utf16le_to_wchar_avx2(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
|
||||
#endif
|
||||
{
|
||||
/* Most of this function is the same as `mb_utf16be_to_wchar_avx2`, above;
|
||||
* See it for more detailed code comments */
|
||||
|
||||
size_t len = *in_len;
|
||||
|
||||
if (len >= 32 && bufsize >= 16) {
|
||||
unsigned char *p = *in;
|
||||
uint32_t *out = buf;
|
||||
|
||||
const __m256i _f8 = _mm256_set1_epi16(0xF800);
|
||||
const __m256i _d8 = _mm256_set1_epi16(0xD800);
|
||||
|
||||
do {
|
||||
__m256i operand = _mm256_loadu_si256((__m256i*)p);
|
||||
|
||||
uint32_t surrogate_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi16(_mm256_and_si256(operand, _f8), _d8));
|
||||
if (surrogate_bitvec == 0) {
|
||||
/* There are no surrogates among these 16 characters */
|
||||
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
|
||||
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
|
||||
out += 16;
|
||||
bufsize -= 16;
|
||||
p += sizeof(__m256i);
|
||||
len -= sizeof(__m256i);
|
||||
} else if ((surrogate_bitvec & 1) == 0) {
|
||||
/* Some prefix of the current block is non-surrogates */
|
||||
uint8_t n_chars = zend_ulong_ntz(surrogate_bitvec) >> 1;
|
||||
_mm256_storeu_si256((__m256i*)out, _mm256_cvtepu16_epi32(_mm256_castsi256_si128(operand)));
|
||||
if (n_chars > 8) {
|
||||
_mm256_storeu_si256((__m256i*)(out + 8), _mm256_cvtepu16_epi32(_mm256_extracti128_si256(operand, 1)));
|
||||
}
|
||||
out += n_chars;
|
||||
bufsize -= n_chars;
|
||||
p += n_chars * 2;
|
||||
len -= n_chars * 2;
|
||||
} else {
|
||||
/* Some prefix of the current block is (valid or invalid) surrogates */
|
||||
surrogate_bitvec = ~surrogate_bitvec;
|
||||
unsigned int n_chars = surrogate_bitvec ? zend_ulong_ntz(surrogate_bitvec) >> 1 : 16;
|
||||
do {
|
||||
unsigned char c1 = *p++;
|
||||
unsigned char c2 = *p++;
|
||||
|
||||
if (c2 & 0x4 || len < 4) {
|
||||
/* 2nd part of surrogate pair has come first OR string ended abruptly
|
||||
* after 1st part of surrogate pair */
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
bufsize--;
|
||||
n_chars--;
|
||||
len -= 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
uint16_t n = (c2 << 8) | c1;
|
||||
unsigned char c3 = *p++;
|
||||
unsigned char c4 = *p++;
|
||||
|
||||
if ((c4 & 0xFC) == 0xDC) {
|
||||
/* Valid surrogate pair */
|
||||
uint16_t n2 = (c4 << 8) | c3;
|
||||
*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
|
||||
bufsize--;
|
||||
len -= 4;
|
||||
#ifdef PHP_HAVE_BUILTIN_USUB_OVERFLOW
|
||||
if (__builtin_usub_overflow(n_chars, 2, &n_chars)) {
|
||||
break;
|
||||
}
|
||||
#else
|
||||
n_chars -= 2;
|
||||
if (n_chars == UINT_MAX) {
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
/* First half of surrogate pair was followed by another first half
|
||||
* OR by a non-surrogate character */
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
bufsize--;
|
||||
n_chars--;
|
||||
len -= 2;
|
||||
p -= 2; /* Back up so the last 2 bytes will be processed again */
|
||||
}
|
||||
} while (n_chars);
|
||||
}
|
||||
} while (len >= 32 && bufsize >= 16);
|
||||
|
||||
if (len && bufsize >= 4) {
|
||||
out += mb_utf16le_to_wchar_default(&p, &len, out, bufsize, NULL);
|
||||
}
|
||||
|
||||
*in = p;
|
||||
*in_len = len;
|
||||
return out - buf;
|
||||
} else if (len) {
|
||||
return mb_utf16le_to_wchar_default(in, in_len, buf, bufsize, NULL);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
|
||||
void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
#else
|
||||
static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
|
||||
#endif
|
||||
{
|
||||
if (len >= 8) {
|
||||
unsigned char *out, *limit;
|
||||
MB_CONVERT_BUF_LOAD(buf, out, limit);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
|
||||
/* Used to find wchars which are outside the Unicode BMP (Basic Multilingual Plane) */
|
||||
const __m256i bmp_mask = _mm256_set1_epi32(0xFFFF);
|
||||
/* Used to extract 16 bits which we want from each of eight 32-bit values */
|
||||
const __m256i pack_8x16 = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 13, 12, 9, 8, 5, 4, 1, 0, 13, 12, 9, 8, 5, 4, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
|
||||
do {
|
||||
__m256i operand = _mm256_loadu_si256((__m256i*)in);
|
||||
|
||||
uint32_t bmp_bitvec = _mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_and_si256(operand, bmp_mask), operand));
|
||||
if (bmp_bitvec == 0xFFFFFFFF) {
|
||||
/* All eight wchars are in the BMP
|
||||
* Shuffle bytes around to get the 16 bytes we want into the low 16 bytes of YMM register
|
||||
* (which is equivalent to an XMM register) */
|
||||
operand = _mm256_shuffle_epi8(operand, pack_8x16);
|
||||
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
|
||||
operand = _mm256_alignr_epi8(operand2, operand, 8);
|
||||
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
|
||||
out += 16;
|
||||
len -= 8;
|
||||
in += 8;
|
||||
} else if (bmp_bitvec & 1) {
|
||||
/* Some prefix of this block are codepoints in the BMP */
|
||||
unsigned int n_bytes = zend_ulong_ntz(~bmp_bitvec);
|
||||
operand = _mm256_shuffle_epi8(operand, pack_8x16);
|
||||
__m256i operand2 = _mm256_permute2x128_si256(operand, operand, 1);
|
||||
operand = _mm256_alignr_epi8(operand2, operand, 8);
|
||||
_mm_storeu_si128((__m128i*)out, _mm256_castsi256_si128(operand));
|
||||
out += n_bytes >> 1;
|
||||
len -= n_bytes >> 2;
|
||||
in += n_bytes >> 2;
|
||||
} else {
|
||||
/* Some prefix of this block is codepoints outside the BMP OR error markers */
|
||||
unsigned int n_words = bmp_bitvec ? zend_ulong_ntz(bmp_bitvec) >> 2 : 8;
|
||||
do {
|
||||
uint32_t w = *in++;
|
||||
n_words--;
|
||||
len--;
|
||||
|
||||
if (w < MBFL_WCSPLANE_UTF32MAX) {
|
||||
uint16_t n1 = ((w >> 10) - 0x40) | 0xD800;
|
||||
uint16_t n2 = (w & 0x3FF) | 0xDC00;
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
|
||||
out = mb_convert_buf_add4(out, n1 & 0xFF, (n1 >> 8) & 0xFF, n2 & 0xFF, (n2 >> 8) & 0xFF);
|
||||
} else {
|
||||
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf16le_default);
|
||||
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
|
||||
}
|
||||
} while (n_words);
|
||||
}
|
||||
} while (len >= 8);
|
||||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
mb_wchar_to_utf16le_default(in, len, buf, end);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
|
||||
|
|
|
@ -47,4 +47,8 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter);
|
|||
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter);
|
||||
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter);
|
||||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
|
||||
void init_convert_utf16(void);
|
||||
#endif
|
||||
|
||||
#endif /* MBFL_MBFILTER_UTF16_H */
|
||||
|
|
|
@ -1080,6 +1080,7 @@ ZEND_TSRMLS_CACHE_UPDATE();
|
|||
|
||||
#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
|
||||
init_check_utf8();
|
||||
init_convert_utf16();
|
||||
#endif
|
||||
|
||||
return SUCCESS;
|
||||
|
|
|
@ -895,6 +895,17 @@ testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
|
|||
convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
|
||||
convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');
|
||||
|
||||
// Regression tests for bugs with initial AVX2-accelerated implementation
|
||||
convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
|
||||
convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');
|
||||
|
||||
// This string caused an out-of-bounds read; it was found by a fuzzer
|
||||
$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
|
||||
convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');
|
||||
|
||||
$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
|
||||
convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');
|
||||
|
||||
echo "== UTF-32 ==\n";
|
||||
|
||||
testValidCodepoints("UTF-32LE");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue