diff --git a/ext/opcache/zend_accelerator_util_funcs.c b/ext/opcache/zend_accelerator_util_funcs.c
index c5efdc3f33a..b99a50b2128 100644
--- a/ext/opcache/zend_accelerator_util_funcs.c
+++ b/ext/opcache/zend_accelerator_util_funcs.c
@@ -27,6 +27,11 @@
 #include "zend_shared_alloc.h"
 #include "zend_observer.h"
 
+#ifdef __SSE2__
+/* For SSE2 adler32 */
+#include <immintrin.h>
+#endif
+
 typedef int (*id_function_t)(void *, void *);
 typedef void (*unique_copy_ctor_func_t)(void *pElement);
 
@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
 #define ADLER32_NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
 
-#define ADLER32_DO1(buf) {s1 += *(buf); s2 += s1;}
-#define ADLER32_DO2(buf, i) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
-#define ADLER32_DO4(buf, i) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
-#define ADLER32_DO8(buf, i) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
-#define ADLER32_DO16(buf) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
+#define ADLER32_SCALAR_DO1(buf) {s1 += *(buf); s2 += s1;}
+#define ADLER32_SCALAR_DO2(buf, i) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
+#define ADLER32_SCALAR_DO4(buf, i) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
+#define ADLER32_SCALAR_DO8(buf, i) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
+#define ADLER32_SCALAR_DO16(buf) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
+
+static zend_always_inline void adler32_do16_loop(unsigned char *buf, unsigned char *end, unsigned int *s1_out, unsigned int *s2_out)
+{
+	unsigned int s1 = *s1_out;
+	unsigned int s2 = *s2_out;
+
+#ifdef __SSE2__
+	const __m128i zero = _mm_setzero_si128();
+
+	__m128i accumulate_s2 = zero;
+	unsigned int accumulate_s1 = 0;
+
+	do {
+		__m128i read = _mm_loadu_si128((__m128i *) buf); /* [A:P] */
+
+		/* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
+		__m128i lower = _mm_unpacklo_epi8(read, zero); /* [A:H] zero-extended to 16-bits */
+		__m128i higher = _mm_unpackhi_epi8(read, zero); /* [I:P] zero-extended to 16-bits */
+		lower = _mm_madd_epi16(lower, _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16)); /* [A * 16:H * 9] */
+		higher = _mm_madd_epi16(higher, _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); /* [I * 8:P * 1] */
+
+		/* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
+		 * The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
+		 * That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
+		__m128i sum = _mm_add_epi32(lower, higher); /* [A * 16 + I * 8:H * 9 + P * 1] */
+		accumulate_s2 = _mm_add_epi32(accumulate_s2, sum);
+		accumulate_s1 += s1;
+
+		/* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
+		sum = _mm_sad_epu8(read, zero);
+		s1 += _mm_cvtsi128_si32(sum) + _mm_extract_epi16(sum, 4);
+
+		buf += 16;
+	} while (buf != end);
+
+	/* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
+	__m128i shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(1, 0, 0, 2)); /* [Y, X, X, Z] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [X + Y, Y + X, Z + X, W + Z] */
+	shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(3, 3, 3, 3)); /* [X + Y, X + Y, X + Y, X + Y] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [/, /, /, W + Z + X + Y] */
+	s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32(accumulate_s2);
+#else
+	do {
+		ADLER32_SCALAR_DO16(buf);
+		buf += 16;
+	} while (buf != end);
+#endif
+
+	*s1_out = s1;
+	*s2_out = s2;
+}
 
 unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t len)
 {
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 	while (len >= ADLER32_NMAX) {
 		len -= ADLER32_NMAX;
 		end = buf + ADLER32_NMAX;
-		do {
-			ADLER32_DO16(buf);
-			buf += 16;
-		} while (buf != end);
+		adler32_do16_loop(buf, end, &s1, &s2);
+		buf = end;
 		s1 %= ADLER32_BASE;
 		s2 %= ADLER32_BASE;
 	}
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 	if (len >= 16) {
 		end = buf + (len & 0xfff0);
 		len &= 0xf;
-		do {
-			ADLER32_DO16(buf);
-			buf += 16;
-		} while (buf != end);
+		adler32_do16_loop(buf, end, &s1, &s2);
+		buf = end;
 	}
 	if (len) {
 		end = buf + len;
 		do {
-			ADLER32_DO1(buf);
+			ADLER32_SCALAR_DO1(buf);
 			buf++;
 		} while (buf != end);
 	}
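Note (not part of the patch): the SSE2 body above is a vectorized form of the usual per-16-byte-block Adler-32 step. For a block b[0..15] processed starting from running sums (s1, s2), s2 grows by 16 * s1 plus the weighted sum 16*b[0] + 15*b[1] + ... + 1*b[15], and s1 grows by the plain byte sum. In the intrinsics, _mm_sad_epu8 supplies the byte sum feeding s1, the two _mm_madd_epi16 calls supply the 16..1 weighted sum accumulated into accumulate_s2, and the deferred 16 * s1 terms are added once at the end as accumulate_s1 * 16. The standalone sketch below is purely illustrative (the file name adler32_block_demo.c and everything in it are made up here, not taken from the patch); it checks that block decomposition against the byte-at-a-time definition used by ADLER32_SCALAR_DO1.

/* adler32_block_demo.c - hypothetical standalone check of the 16-byte block
 * decomposition that the SSE2 path relies on.  Build with: cc -O2 adler32_block_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define ADLER32_BASE 65521

int main(void)
{
	unsigned char buf[16 * 8];
	unsigned int s1_ref = 1, s2_ref = 0;   /* byte-at-a-time reference */
	unsigned int s1_blk = 1, s2_blk = 0;   /* 16-byte block formula */
	size_t i, j;

	srand(42);
	for (i = 0; i < sizeof(buf); i++) {
		buf[i] = (unsigned char) rand();
	}

	/* Reference: the classic DO1 recurrence, one byte at a time. */
	for (i = 0; i < sizeof(buf); i++) {
		s1_ref += buf[i];
		s2_ref += s1_ref;
	}

	/* Block form: for each 16-byte block,
	 *   s2 += 16 * s1_old + sum((16 - j) * b[j])
	 *   s1 += sum(b[j])
	 * which is what the madd/sad accumulation computes per iteration. */
	for (i = 0; i < sizeof(buf); i += 16) {
		unsigned int byte_sum = 0, weighted_sum = 0;
		for (j = 0; j < 16; j++) {
			byte_sum += buf[i + j];
			weighted_sum += (16 - (unsigned int) j) * buf[i + j];
		}
		s2_blk += 16 * s1_blk + weighted_sum;
		s1_blk += byte_sum;
	}

	assert(s1_ref == s1_blk && s2_ref == s2_blk);
	printf("adler32 pieces match: s1=%u s2=%u -> checksum=%08x\n",
	       s1_blk, s2_blk,
	       ((s2_blk % ADLER32_BASE) << 16) | (s1_blk % ADLER32_BASE));
	return 0;
}

Deferring the 16 * s1 contribution into a single accumulate_s1 * 16 at the end is what keeps the inner SSE2 loop free of scalar multiplications; the ADLER32_NMAX chunking in zend_adler32() still bounds how much accumulates before the %= ADLER32_BASE reductions.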