Use SSE2 in bc_copy_and_shift_numbers() if possible

This commit is contained in:
Niels Dossche 2024-05-01 18:27:03 +02:00
parent 4964c5cb23
commit a604d1d342

View file

@ -16,6 +16,9 @@
#include "bcmath.h"
#include "convert.h"
#ifdef __SSE2__
# include <emmintrin.h>
#endif
/* A word with every byte set to 0x01: dividing the all-ones size_t value by
 * 0xFF yields 0x01010101 when size_t is 32-bit and 0x0101010101010101 when
 * size_t is 64-bit. */
#define SWAR_ONES (~((size_t) 0) / 0xFF)
@ -31,6 +34,19 @@ static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source,
shift = -shift;
}
#ifdef __SSE2__
/* SIMD SSE2 bulk shift + copy */
__m128i shift_vector = _mm_set1_epi8(shift);
while (source + sizeof(__m128i) <= source_end) {
__m128i bytes = _mm_loadu_si128((const __m128i *) source);
bytes = _mm_add_epi8(bytes, shift_vector);
_mm_storeu_si128((__m128i *) dest, bytes);
source += sizeof(__m128i);
dest += sizeof(__m128i);
}
#endif
/* Handle sizeof(size_t) (i.e. 4/8) bytes at once.
* We know that adding/subtracting an individual byte cannot overflow,
* so it is possible to add/subtract an entire word of bytes at once