8319577: x86_64 AVX2 intrinsics for Arrays.sort methods (int, float arrays)

Reviewed-by: sviswanathan, ihse, jbhateja, kvn
2025-08-25 22:04:51 +02:00 · 2023-12-08 22:52:48 +00:00 · 2023-12-08 22:52:48 +00:00 · ce108446ca
commit ce108446ca
parent 5c12a182e3
24 changed files with 2471 additions and 1720 deletions
--- a/src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp
+++ b/src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp
@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
+ * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// This implementation is based on x86-simd-sort (https://github.com/intel/x86-simd-sort)
+
+#ifndef AVX2_EMU_FUNCS
+#define AVX2_EMU_FUNCS
+
+#include <array>
+#include <utility>
+
+#include "xss-common-qsort.h"
+
+// Compile-time lookup table indexed by an 8-bit mask value (0..255).
+// Row i holds eight 32-bit lanes: lane j is 0xFFFFFFFF when bit j of i is
+// set and 0 otherwise. The table is produced by an immediately-invoked
+// constexpr lambda.
+constexpr auto avx2_mask_helper_lut32 = [] {
+    std::array<std::array<int32_t, 8>, 256> table{};
+    for (int64_t bits = 0; bits < 256; ++bits) {
+        std::array<int32_t, 8> lanes{};
+        for (int lane = 0; lane < 8; ++lane) {
+            const bool bitSet = ((bits >> lane) & 1) != 0;
+            if (bitSet) {
+                lanes[lane] = 0xFFFFFFFF;
+            } else {
+                lanes[lane] = 0;
+            }
+        }
+        table[bits] = lanes;
+    }
+    return table;
+}();
+
+// Builds, at compile time, a pair of lookup tables that are both indexed by
+// an 8-bit mask value (0..255):
+//   [0] permutation indices - lane numbers whose mask bit is clear fill the
+//       row from slot 0 upwards; lane numbers whose mask bit is set fill it
+//       from slot 7 downwards, in the order they are encountered.
+//   [1] store mask - 0xFFFFFFFF in the first N slots, where N is the number
+//       of clear bits in the mask value, and 0 in the remaining slots.
+constexpr auto avx2_compressstore_lut32_gen = [] {
+    std::array<std::array<std::array<int32_t, 8>, 256>, 2> tables{};
+    auto &permTable = tables[0];
+    auto &leftTable = tables[1];
+    for (int64_t bits = 0; bits < 256; ++bits) {
+        std::array<int32_t, 8> order{};
+        std::array<int32_t, 8> keep{};
+        int lo = 0;  // next free slot at the front of the row
+        int hi = 7;  // next free slot at the back of the row
+        for (int lane = 0; lane < 8; ++lane) {
+            if (((bits >> lane) & 1) == 0) {
+                order[lo] = lane;
+                keep[lo] = 0xFFFFFFFF;
+                ++lo;
+            } else {
+                order[hi] = lane;
+                --hi;
+            }
+        }
+        permTable[bits] = order;
+        leftTable[bits] = keep;
+    }
+    return tables;
+}();
+
+// Separate named copies of the two generated tables: element 0 is the
+// permutation-index table, element 1 is the matching store-mask table.
+constexpr std::array<std::array<int32_t, 8>, 256>
+    avx2_compressstore_lut32_perm = std::get<0>(avx2_compressstore_lut32_gen);
+constexpr std::array<std::array<int32_t, 8>, 256>
+    avx2_compressstore_lut32_left = std::get<1>(avx2_compressstore_lut32_gen);
+
+
+// Expands an integer bitmask into a 256-bit vector mask by loading the
+// matching row of avx2_mask_helper_lut32 with an unaligned load.
+// m is used directly as the table index and is not range-checked, so it
+// must lie in [0, 255].
+X86_SIMD_SORT_INLINE
+__m256i convert_int_to_avx2_mask(int32_t m) {
+    const int32_t *row = avx2_mask_helper_lut32[m].data();
+    return _mm256_loadu_si256((const __m256i *)row);
+}
+
+// Collapses a 256-bit vector mask into an integer bitmask. The register is
+// reinterpreted (not converted) as packed floats so that
+// _mm256_movemask_ps can collect the top bit of each 32-bit lane.
+X86_SIMD_SORT_INLINE
+int32_t convert_avx2_mask_to_int(__m256i m) {
+    const __m256 asFloat = _mm256_castsi256_ps(m);
+    return _mm256_movemask_ps(asFloat);
+}
+
+// Emulators for intrinsics missing from AVX2 compared to AVX512
+// Horizontal max reduction over a vector of T.
+// Two shuffle+max rounds are applied, the result is spilled to a local
+// array, and the larger of elements 0 and 7 is returned.
+// NOTE(review): reading only elements 0 and 7 presumably relies on
+// vtype::shuffle operating within each 128-bit half, so that each half ends
+// up holding its own maximum in every element - confirm against
+// avx2_vector<T>::shuffle.
+template <typename T>
+T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x) {
+    using vtype = avx2_vector<T>;
+    using reg_t = typename vtype::reg_t;
+
+    // Round 1: combine each element with its SHUFFLE_MASK(2, 3, 0, 1) partner.
+    reg_t partner1 = vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x);
+    reg_t round1 = vtype::max(x, partner1);
+
+    // Round 2: combine with the SHUFFLE_MASK(1, 0, 3, 2) partner.
+    reg_t partner2 =
+        vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(round1);
+    reg_t round2 = vtype::max(round1, partner2);
+
+    T lanes[vtype::numlanes];
+    vtype::storeu(lanes, round2);
+    return std::max(lanes[0], lanes[7]);
+}
+
+// Horizontal min reduction over a vector of T.
+// Two shuffle+min rounds are applied, the result is spilled to a local
+// array, and the smaller of elements 0 and 7 is returned.
+// NOTE(review): reading only elements 0 and 7 presumably relies on
+// vtype::shuffle operating within each 128-bit half, so that each half ends
+// up holding its own minimum in every element - confirm against
+// avx2_vector<T>::shuffle.
+template <typename T>
+T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x) {
+    using vtype = avx2_vector<T>;
+    using reg_t = typename vtype::reg_t;
+
+    // Round 1: combine each element with its SHUFFLE_MASK(2, 3, 0, 1) partner.
+    reg_t partner1 = vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x);
+    reg_t round1 = vtype::min(x, partner1);
+
+    // Round 2: combine with the SHUFFLE_MASK(1, 0, 3, 2) partner.
+    reg_t partner2 =
+        vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(round1);
+    reg_t round2 = vtype::min(round1, partner2);
+
+    T lanes[vtype::numlanes];
+    vtype::storeu(lanes, round2);
+    return std::min(lanes[0], lanes[7]);
+}
+
+// Emulated masked compress-store for vectors of 32-bit lanes.
+//   base_addr - destination, accessed as T*
+//   k         - vector mask; reduced to an 8-bit table index with
+//               convert_avx2_mask_to_int
+//   reg       - source vector
+// reg is permuted with the avx2_compressstore_lut32_perm row selected by the
+// mask and then written with vtype::mask_storeu, using the matching
+// avx2_compressstore_lut32_left row as the store mask.
+// NOTE(review): given how those tables are generated, the lanes written
+// appear to be the ones whose bit in k is CLEAR (assuming permutevar gathers
+// reg[perm[i]] into slot i), which is the opposite of the AVX-512
+// compress-store convention - confirm that callers expect this.
+template <typename T>
+void avx2_emu_mask_compressstoreu32(void *base_addr,
+                                    typename avx2_vector<T>::opmask_t k,
+                                    typename avx2_vector<T>::reg_t reg) {
+    using vtype = avx2_vector<T>;
+    using reg_t = typename vtype::reg_t;
+
+    const int32_t maskBits = convert_avx2_mask_to_int(k);
+    const int32_t *permRow = avx2_compressstore_lut32_perm[maskBits].data();
+    const int32_t *leftRow = avx2_compressstore_lut32_left[maskBits].data();
+
+    const __m256i permIdx = _mm256_loadu_si256((const __m256i *)permRow);
+    const __m256i storeMask = _mm256_loadu_si256((const __m256i *)leftRow);
+
+    reg_t packed = vtype::permutevar(reg, permIdx);
+
+    T *dst = (T *)base_addr;
+    vtype::mask_storeu(dst, storeMask, packed);
+}
+
+
+// Permutes reg according to mask k and writes the permuted vector to two
+// destinations.
+//   left_addr / right_addr - destinations, accessed as T*; the same permuted
+//                            vector is written to each with vtype::storeu
+//                            (no store mask is applied here)
+//   k                      - vector mask; reduced to an 8-bit table index
+//                            with convert_avx2_mask_to_int
+//   reg                    - source vector
+// Returns the number of set bits in the 8-bit mask.
+template <typename T>
+int avx2_double_compressstore32(void *left_addr, void *right_addr,
+                                typename avx2_vector<T>::opmask_t k,
+                                typename avx2_vector<T>::reg_t reg) {
+    using vtype = avx2_vector<T>;
+    using reg_t = typename vtype::reg_t;
+
+    const int32_t maskBits = convert_avx2_mask_to_int(k);
+    const int32_t *permRow = avx2_compressstore_lut32_perm[maskBits].data();
+    const __m256i permIdx = _mm256_loadu_si256((const __m256i *)permRow);
+
+    reg_t packed = vtype::permutevar(reg, permIdx);
+
+    T *leftDst = (T *)left_addr;
+    T *rightDst = (T *)right_addr;
+    vtype::storeu(leftDst, packed);
+    vtype::storeu(rightDst, packed);
+
+    return _mm_popcnt_u32(maskBits);
+}
+
+
+// Element-wise maximum built from a compare followed by a blend: where
+// vtype::gt(x, y) flags an element the result takes it from x, otherwise
+// from y. The blend is _mm256_blendv_pd, which selects per 64-bit element;
+// the integer registers are only reinterpreted as packed doubles for the
+// blend, their bits are not converted.
+template <typename T>
+typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
+                                            typename avx2_vector<T>::reg_t y) {
+    using vtype = avx2_vector<T>;
+    typename vtype::opmask_t xGreater = vtype::gt(x, y);
+
+    const __m256d whenClear = _mm256_castsi256_pd(y);
+    const __m256d whenSet = _mm256_castsi256_pd(x);
+    const __m256d selector = _mm256_castsi256_pd(xGreater);
+
+    return _mm256_castpd_si256(
+        _mm256_blendv_pd(whenClear, whenSet, selector));
+}
+
+// Element-wise minimum built from a compare followed by a blend: where
+// vtype::gt(x, y) flags an element the result takes it from y, otherwise
+// from x. The blend is _mm256_blendv_pd, which selects per 64-bit element;
+// the integer registers are only reinterpreted as packed doubles for the
+// blend, their bits are not converted.
+template <typename T>
+typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
+                                            typename avx2_vector<T>::reg_t y) {
+    using vtype = avx2_vector<T>;
+    typename vtype::opmask_t xGreater = vtype::gt(x, y);
+
+    const __m256d whenClear = _mm256_castsi256_pd(x);
+    const __m256d whenSet = _mm256_castsi256_pd(y);
+    const __m256d selector = _mm256_castsi256_pd(xGreater);
+
+    return _mm256_castpd_si256(
+        _mm256_blendv_pd(whenClear, whenSet, selector));
+}
+
+#endif