Implement an SSE2 accelerated version of zend_adler32 (#10507)

When benchmarking the file cache of opcache on index.php from a dummy
WordPress install, I noticed that 36.42% of the time was spent in
zend_adler32 verifying the checksums of the files. Callgrind reported
that 332,731,216 instructions were executed during that run, and the
average time to execute the index file was around 91ms.

This patch implements an SSE2 accelerated version of zend_adler32, which
reduces the number of instructions executed on that benchmark to
248,600,983, a reduction of ~25%. There is also a measurable decrease in
wall-clock time of around 10ms. Now only 16.05% of the time is spent
computing checksums.

Instruction counts were measured with Callgrind, and wall-clock time
with `time`. These tests were executed multiple times and their results
were averaged. The WordPress install contains only two almost-blank
posts.
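
For context, Adler-32 maintains two running sums: s1, the sum of all bytes seen so far, and s2, the sum of every intermediate value of s1, both reduced modulo 65521 (the largest prime below 2^16); the final checksum is (s2 << 16) | s1. Below is a minimal byte-at-a-time sketch of what the patch accelerates; the names are illustrative, not php-src symbols, and php-src batches the modulo using the NMAX bound instead of reducing after every byte:

    #include <stdint.h>

    #define ADLER32_BASE 65521 /* largest prime smaller than 2^16 */

    /* Illustrative byte-at-a-time Adler-32 reference. */
    static uint32_t adler32(uint32_t checksum, const unsigned char *buf, uint32_t len)
    {
        uint32_t s1 = checksum & 0xffff;         /* running sum of bytes */
        uint32_t s2 = (checksum >> 16) & 0xffff; /* running sum of s1 values */
        while (len--) {
            s1 = (s1 + *buf++) % ADLER32_BASE;
            s2 = (s2 + s1) % ADLER32_BASE;
        }
        return (s2 << 16) | s1;
    }

The SSE2 version in the diff computes the same two sums 16 bytes at a time: PSADBW (_mm_sad_epu8) produces the byte sum feeding s1, and PMADDWD (_mm_madd_epi16) applies the weights 16..1 that each byte within a block contributes to s2.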
Author: Niels Dossche
Date: 2023-02-05 16:58:39 +01:00 (committed by GitHub)
Parent: d3abcae4a2
Commit: 722fbd01a3
@@ -27,6 +27,11 @@
 #include "zend_shared_alloc.h"
 #include "zend_observer.h"
 
+#ifdef __SSE2__
+/* For SSE2 adler32 */
+#include <immintrin.h>
+#endif
+
 typedef int (*id_function_t)(void *, void *);
 typedef void (*unique_copy_ctor_func_t)(void *pElement);
 
@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
 #define ADLER32_NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
 
-#define ADLER32_DO1(buf) {s1 += *(buf); s2 += s1;}
-#define ADLER32_DO2(buf, i) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
-#define ADLER32_DO4(buf, i) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
-#define ADLER32_DO8(buf, i) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
-#define ADLER32_DO16(buf) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
+#define ADLER32_SCALAR_DO1(buf) {s1 += *(buf); s2 += s1;}
+#define ADLER32_SCALAR_DO2(buf, i) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
+#define ADLER32_SCALAR_DO4(buf, i) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
+#define ADLER32_SCALAR_DO8(buf, i) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
+#define ADLER32_SCALAR_DO16(buf) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
+
+static zend_always_inline void adler32_do16_loop(unsigned char *buf, unsigned char *end, unsigned int *s1_out, unsigned int *s2_out)
+{
+	unsigned int s1 = *s1_out;
+	unsigned int s2 = *s2_out;
+
+#ifdef __SSE2__
+	const __m128i zero = _mm_setzero_si128();
+	__m128i accumulate_s2 = zero;
+	unsigned int accumulate_s1 = 0;
+
+	do {
+		__m128i read = _mm_loadu_si128((__m128i *) buf); /* [A:P] */
+
+		/* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
+		__m128i lower = _mm_unpacklo_epi8(read, zero); /* [A:H] zero-extended to 16-bits */
+		__m128i higher = _mm_unpackhi_epi8(read, zero); /* [I:P] zero-extended to 16-bits */
+		lower = _mm_madd_epi16(lower, _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16)); /* [A * 16:H * 9] */
+		higher = _mm_madd_epi16(higher, _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); /* [I * 8:P * 1] */
+
+		/* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
+		 * The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
+		 * That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
+		__m128i sum = _mm_add_epi32(lower, higher); /* [A * 16 + I * 8:H * 9 + P * 1] */
+		accumulate_s2 = _mm_add_epi32(accumulate_s2, sum);
+		accumulate_s1 += s1;
+
+		/* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
+		sum = _mm_sad_epu8(read, zero);
+		s1 += _mm_cvtsi128_si32(sum) + _mm_extract_epi16(sum, 4);
+
+		buf += 16;
+	} while (buf != end);
+
+	/* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
+	__m128i shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(1, 0, 0, 2)); /* [Y, X, X, Z] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [X + Y, Y + X, Z + X, W + Z] */
+	shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(3, 3, 3, 3)); /* [X + Y, X + Y, X + Y, X + Y] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [/, /, /, W + Z + X + Y] */
+	s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32(accumulate_s2);
+#else
+	do {
+		ADLER32_SCALAR_DO16(buf);
+		buf += 16;
+	} while (buf != end);
+#endif
+
+	*s1_out = s1;
+	*s2_out = s2;
+}
 
 unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t len)
 {
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 	while (len >= ADLER32_NMAX) {
 		len -= ADLER32_NMAX;
 		end = buf + ADLER32_NMAX;
-		do {
-			ADLER32_DO16(buf);
-			buf += 16;
-		} while (buf != end);
+		adler32_do16_loop(buf, end, &s1, &s2);
+		buf = end;
 		s1 %= ADLER32_BASE;
 		s2 %= ADLER32_BASE;
 	}
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 	if (len >= 16) {
 		end = buf + (len & 0xfff0);
 		len &= 0xf;
-		do {
-			ADLER32_DO16(buf);
-			buf += 16;
-		} while (buf != end);
+		adler32_do16_loop(buf, end, &s1, &s2);
+		buf = end;
 	}
 	if (len) {
 		end = buf + len;
 		do {
-			ADLER32_DO1(buf);
+			ADLER32_SCALAR_DO1(buf);
 			buf++;
 		} while (buf != end);
 	}
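
To sanity-check a vector path like this against the scalar definition, a small standalone harness along the following lines can be used. adler32_scalar and adler32_sse2_block are hypothetical local helpers that mirror the patch's per-block logic; unlike the patch, this sketch reduces modulo ADLER32_BASE after every 16-byte block rather than deferring to the NMAX boundary, which keeps the overflow reasoning trivial:

    /* Hypothetical standalone harness (not part of the patch).
     * Build with e.g. `cc -O2 -msse2 adler32_check.c`. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #ifdef __SSE2__
    #include <immintrin.h>
    #endif

    #define ADLER32_BASE 65521

    /* Scalar reference: reduces after every byte, so it is trivially correct. */
    static uint32_t adler32_scalar(uint32_t checksum, const unsigned char *buf, uint32_t len)
    {
        uint32_t s1 = checksum & 0xffff;
        uint32_t s2 = (checksum >> 16) & 0xffff;
        while (len--) {
            s1 = (s1 + *buf++) % ADLER32_BASE;
            s2 = (s2 + s1) % ADLER32_BASE;
        }
        return (s2 << 16) | s1;
    }

    #ifdef __SSE2__
    /* One 16-byte block, mirroring the patch: PMADDWD applies the weights
     * 16..1 each byte contributes to s2, PSADBW sums the bytes for s1. */
    static void adler32_sse2_block(const unsigned char *buf, uint32_t *s1, uint32_t *s2)
    {
        const __m128i zero = _mm_setzero_si128();
        __m128i read = _mm_loadu_si128((const __m128i *) buf);
        __m128i lower = _mm_madd_epi16(_mm_unpacklo_epi8(read, zero),
                _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16));
        __m128i higher = _mm_madd_epi16(_mm_unpackhi_epi8(read, zero),
                _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8));
        __m128i weighted = _mm_add_epi32(lower, higher);
        /* Horizontal sum of the four 32-bit lanes via two shuffle/add rounds. */
        weighted = _mm_add_epi32(weighted, _mm_shuffle_epi32(weighted, _MM_SHUFFLE(1, 0, 3, 2)));
        weighted = _mm_add_epi32(weighted, _mm_shuffle_epi32(weighted, _MM_SHUFFLE(2, 3, 0, 1)));
        uint32_t weighted_sum = (uint32_t) _mm_cvtsi128_si32(weighted);
        /* PSADBW against zero yields the byte sum in two 64-bit halves. */
        __m128i sad = _mm_sad_epu8(read, zero);
        uint32_t byte_sum = (uint32_t) (_mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4));
        *s2 = (*s2 + 16 * *s1 + weighted_sum) % ADLER32_BASE; /* uses the old s1 */
        *s1 = (*s1 + byte_sum) % ADLER32_BASE;
    }
    #endif

    int main(void)
    {
        unsigned char buf[4096]; /* multiple of 16 on purpose */
        for (size_t i = 0; i < sizeof(buf); i++) {
            buf[i] = (unsigned char) rand();
        }
        uint32_t expected = adler32_scalar(1, buf, sizeof(buf));
    #ifdef __SSE2__
        uint32_t s1 = 1, s2 = 0; /* Adler-32 starts at checksum 1 */
        for (size_t i = 0; i < sizeof(buf); i += 16) {
            adler32_sse2_block(buf + i, &s1, &s2);
        }
        uint32_t got = (s2 << 16) | s1;
        printf("scalar=%08x sse2=%08x %s\n", (unsigned) expected, (unsigned) got,
                expected == got ? "OK" : "MISMATCH");
    #else
        printf("scalar=%08x (SSE2 unavailable)\n", (unsigned) expected);
    #endif
        return 0;
    }

The two shuffle/add rounds in the harness are a generic horizontal sum and end with the total in the low lane, just as the patch's _MM_SHUFFLE(1, 0, 0, 2) / _MM_SHUFFLE(3, 3, 3, 3) sequence does before _mm_cvtsi128_si32.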