Refactor BCMath _bc_do_sub (#14132)

_bc_do_sub now uses SIMD to perform calculations at high speed.

Moved the macros used for SIMD to `private.h`, and added some constants
and macros.
This commit is contained in:
Saki Takamachi 2024-05-07 11:39:31 +09:00 committed by GitHub
parent bb21d195c1
commit 02732007f7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 126 additions and 17 deletions

View file

@ -16,16 +16,11 @@
#include "bcmath.h"
#include "convert.h"
#include "private.h"
#ifdef __SSE2__
# include <emmintrin.h>
#endif
/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
#define SWAR_ONES (~((size_t) 0) / 0xFF)
/* This repeats a byte `x` into an entire 32/64-bit word.
* Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
#define SWAR_REPEAT(x) (SWAR_ONES * (x))
static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source, const char *source_end, unsigned char shift, bool add)
{
size_t bulk_shift = SWAR_REPEAT(shift);

View file

@ -124,17 +124,19 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
{
bc_num diff;
size_t diff_scale, diff_len;
size_t min_scale, min_len;
size_t borrow, count;
/* The caller guarantees that n1 is the larger of the two numbers. */
size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
/* Same condition as EXPECTED before, but using EXPECTED again will make it slower. */
size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
size_t min_scale = MIN(n1->n_scale, n2->n_scale);
size_t min_bytes = min_len + min_scale;
size_t borrow = 0;
size_t count;
int val;
char *n1ptr, *n2ptr, *diffptr;
/* Allocate temporary storage. */
diff_len = MAX(n1->n_len, n2->n_len);
diff_scale = MAX(n1->n_scale, n2->n_scale);
min_len = MIN(n1->n_len, n2->n_len);
min_scale = MIN(n1->n_scale, n2->n_scale);
diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
/* Initialize the subtract. */
@ -142,9 +144,6 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
/* Subtract the numbers. */
borrow = 0;
/* Take care of the longer scaled number. */
if (n1->n_scale != min_scale) {
/* n1 has the longer scale */
@ -166,7 +165,59 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
}
/* Now do the equal length scale and integer parts. */
for (count = 0; count < min_len + min_scale; count++) {
count = 0;
/* Uses SIMD to perform calculations at high speed. */
if (min_bytes >= sizeof(BC_UINT_T)) {
diffptr++;
n1ptr++;
n2ptr++;
while (count + sizeof(BC_UINT_T) <= min_bytes) {
diffptr -= sizeof(BC_UINT_T);
n1ptr -= sizeof(BC_UINT_T);
n2ptr -= sizeof(BC_UINT_T);
BC_UINT_T n1bytes;
BC_UINT_T n2bytes;
memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
#if BC_LITTLE_ENDIAN
/* Little endian requires changing the order of bytes. */
n1bytes = BC_BSWAP(n1bytes);
n2bytes = BC_BSWAP(n2bytes);
#endif
n1bytes -= n2bytes + borrow;
/* If the most significant bit of the word is 1, a borrow out of this word has occurred. */
bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
/*
* Check the most significant bit of each byte; if it is 1, that byte had to borrow from the
* next more significant byte. A byte that borrowed wrapped around in binary (modulo 16 in its
* lower 4 bits) rather than in decimal (modulo 10), so its lower 4 bits are 6 too large.
* Example: 3 - 5 yields 0xFE; the lower 4 bits are 0xE (14), and 14 - 6 = 8, the correct digit.
* Therefore, clear the upper 4 bits of every byte and subtract 6 from the lower 4 bits of each
* byte that borrowed, which turns the result back into valid decimal digits.
*/
BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT(0x80)) >> 7) * 0x06;
n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
#if BC_LITTLE_ENDIAN
/* Little endian requires changing the order of bytes back. */
n1bytes = BC_BSWAP(n1bytes);
#endif
memcpy(diffptr, &n1bytes, sizeof(n1bytes));
borrow = tmp_borrow;
count += sizeof(BC_UINT_T);
}
diffptr--;
n1ptr--;
n2ptr--;
}
/* Calculate the remaining bytes that are less than the size of BC_UINT_T using a normal loop. */
for (; count < min_bytes; count++) {
val = *n1ptr-- - *n2ptr-- - borrow;
if (val < 0) {
val += BASE;

View file

@ -34,6 +34,68 @@
#include <stdbool.h>
#include <stddef.h>
/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
#define SWAR_ONES (~((size_t) 0) / 0xFF)
/* This repeats a byte `x` into an entire 32/64-bit word.
* Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
#define SWAR_REPEAT(x) (SWAR_ONES * (x))
/*
 * Byte swap.
 * Selects a compiler-provided byte-order reversal for 32-bit and 64-bit values
 * and exposes it as BSWAP32(u) / BSWAP64(u):
 *   - MSVC: the _byteswap_ulong / _byteswap_uint64 functions from <stdlib.h>.
 *   - Compilers that provide __has_builtin: each __builtin_bswapNN is used only
 *     if the compiler reports that it exists.
 *   - Other compilers defining __GNUC__: the builtins are used without probing.
 * If a macro is still undefined after this section, the `#ifndef BSWAP32` /
 * `#ifndef BSWAP64` sections that follow supply a portable function instead.
 */
#if defined(_MSC_VER)
# include <stdlib.h>
# define BSWAP32(u) _byteswap_ulong(u)
# define BSWAP64(u) _byteswap_uint64(u)
#else
# ifdef __has_builtin
/* Probe each builtin separately: one may exist without the other. */
# if __has_builtin(__builtin_bswap32)
# define BSWAP32(u) __builtin_bswap32(u)
# endif // __has_builtin(__builtin_bswap32)
# if __has_builtin(__builtin_bswap64)
# define BSWAP64(u) __builtin_bswap64(u)
# endif // __has_builtin(__builtin_bswap64)
# elif defined(__GNUC__)
/* No __has_builtin available to probe with; the builtins are used unconditionally here. */
# define BSWAP32(u) __builtin_bswap32(u)
# define BSWAP64(u) __builtin_bswap64(u)
# endif // __has_builtin
#endif // defined(_MSC_VER)
#ifndef BSWAP32
/*
 * Portable fallback for BSWAP32, compiled only when no compiler intrinsic was
 * selected for it. Returns `u` with the order of its four bytes reversed
 * (0x11223344 becomes 0x44332211).
 *
 * Declared `static inline` rather than plain `inline`: in C99 and later a
 * plain `inline` definition does not provide an external definition, so any
 * call the compiler decides not to inline (for example at -O0) would be left
 * as an unresolved symbol at link time. `static` gives every including source
 * file its own private copy and avoids that.
 */
static inline uint32_t BSWAP32(uint32_t u)
{
	return (((u & 0xff000000) >> 24)
		| ((u & 0x00ff0000) >> 8)
		| ((u & 0x0000ff00) << 8)
		| ((u & 0x000000ff) << 24));
}
#endif
#ifndef BSWAP64
/*
 * Portable fallback for BSWAP64, compiled only when no compiler intrinsic was
 * selected for it. Returns `u` with the order of its eight bytes reversed
 * (0x0102030405060708 becomes 0x0807060504030201).
 *
 * Declared `static inline` rather than plain `inline`: in C99 and later a
 * plain `inline` definition does not provide an external definition, so any
 * call the compiler decides not to inline (for example at -O0) would be left
 * as an unresolved symbol at link time. `static` gives every including source
 * file its own private copy and avoids that.
 */
static inline uint64_t BSWAP64(uint64_t u)
{
	return (((u & 0xff00000000000000ULL) >> 56)
		| ((u & 0x00ff000000000000ULL) >> 40)
		| ((u & 0x0000ff0000000000ULL) >> 24)
		| ((u & 0x000000ff00000000ULL) >> 8)
		| ((u & 0x00000000ff000000ULL) << 8)
		| ((u & 0x0000000000ff0000ULL) << 24)
		| ((u & 0x000000000000ff00ULL) << 40)
		| ((u & 0x00000000000000ffULL) << 56));
}
#endif
/*
 * Word type used for multi-byte-at-a-time processing, and the byte swap that
 * matches its width: 64-bit when SIZEOF_SIZE_T is at least 8, 32-bit otherwise.
 * NOTE(review): SIZEOF_SIZE_T is not defined in this header; presumably it
 * comes from the build configuration -- confirm.
 */
#if SIZEOF_SIZE_T >= 8
#define BC_BSWAP(u) BSWAP64(u)
#define BC_UINT_T uint64_t
#else
#define BC_BSWAP(u) BSWAP32(u)
#define BC_UINT_T uint32_t
#endif
/*
 * BC_LITTLE_ENDIAN is always defined, as 0 or 1, so it can be tested with
 * `#if`. It is 0 when WORDS_BIGENDIAN is defined and 1 otherwise.
 * NOTE(review): WORDS_BIGENDIAN is presumably supplied by the build
 * configuration -- confirm.
 */
#ifdef WORDS_BIGENDIAN
#define BC_LITTLE_ENDIAN 0
#else
#define BC_LITTLE_ENDIAN 1
#endif
/* routines */
int _bc_do_compare (bc_num n1, bc_num n2, bool use_sign);
bc_num _bc_do_add (bc_num n1, bc_num n2, size_t scale_min);

View file

@ -31,6 +31,7 @@
#include "bcmath.h"
#include "convert.h"
#include "private.h"
#include <stdbool.h>
#include <stddef.h>
#ifdef __SSE2__