Try to fix the failure on Ubuntu

xia-mc · xia-mc · commit 2fb0e56ce977 · 2024-11-16T20:05:40.000+08:00
diff --git a/pyfastutil/src/utils/simd/BitonicSort.cpp b/pyfastutil/src/utils/simd/BitonicSort.cpp
@@ -371,10 +371,39 @@ namespace simd {
      * Merge sorted blocks with SIMD optimization, or fallback
      * make sure aligned
      */
-    template <typename T, typename = std::enable_if_t<std::is_same_v<T, int> || std::is_same_v<T, long long>>>
-    __forceinline void mergeSortedBlocks(std::vector<T, AlignedAllocator<T, 64>> &data, const size_t &blockSize) {
+    __forceinline void mergeSortedBlocks(std::vector<int, AlignedAllocator<int, 64>> &data, const size_t &blockSize) {
         const size_t total = data.size();
-        auto temp = std::vector<T, AlignedAllocator<T, 64>>(total);
+        auto temp = std::vector<int, AlignedAllocator<int, 64>>(total);
+
+        bool cycle = true;
+        size_t mid;
+        size_t right;
+        for (size_t size = blockSize; size < total; size *= 2) {
+            size_t left = 0;
+            prefetchL1(&data + left);
+            prefetchL1(&temp + left);
+            for (; left < total; left += 2 * size) {
+                mid = std::min(left + size - 1, total - 1);
+                right = std::min(left + 2 * size - 1, total - 1);
+                if (cycle) {
+                    doSingleMerge(left, mid, right, data, temp);
+                } else {
+                    doSingleMerge(left, mid, right, temp, data);
+                }
+            }
+
+            cycle = !cycle;
+        }
+
+        // copy the final result
+        if (!cycle) {
+            simdMemCpyAligned(temp.data(), data.data(), temp.size());
+        }
+    }
+
+    __forceinline void mergeSortedBlocks(std::vector<long long, AlignedAllocator<long long, 64>> &data, const size_t &blockSize) {
+        const size_t total = data.size();
+        auto temp = std::vector<long long, AlignedAllocator<long long, 64>>(total);
 
         bool cycle = true;
         size_t mid;
@@ -405,11 +434,41 @@ namespace simd {
     /**
      * Merge sorted blocks with SIMD optimization reversed, or fallback
      */
-    template <typename T, typename = std::enable_if_t<std::is_same_v<T, int> || std::is_same_v<T, long long>>>
     __forceinline void
-    mergeSortedBlocksReversed(std::vector<T, AlignedAllocator<T, 64>> &data, const size_t &blockSize) {
+    mergeSortedBlocksReversed(std::vector<int, AlignedAllocator<int, 64>> &data, const size_t &blockSize) {
+        const size_t total = data.size();
+        auto temp = std::vector<int, AlignedAllocator<int, 64>>(total);
+
+        bool cycle = true;
+        size_t mid;
+        size_t right;
+        for (size_t size = blockSize; size < total; size *= 2) {
+            size_t left = 0;
+            prefetchL1(&data + left);
+            prefetchL1(&temp + left);
+            for (; left < total; left += 2 * size) {
+                mid = std::min(left + size - 1, total - 1);
+                right = std::min(left + 2 * size - 1, total - 1);
+                if (cycle) {
+                    doSingleMergeReversed(left, mid, right, data, temp);
+                } else {
+                    doSingleMergeReversed(left, mid, right, temp, data);
+                }
+            }
+
+            cycle = !cycle;
+        }
+
+        // copy the final result
+        if (!cycle) {
+            simdMemCpyAligned(temp.data(), data.data(), temp.size());
+        }
+    }
+
+    __forceinline void
+    mergeSortedBlocksReversed(std::vector<long long, AlignedAllocator<long long, 64>> &data, const size_t &blockSize) {
         const size_t total = data.size();
-        auto temp = std::vector<T, AlignedAllocator<T, 64>>(total);
+        auto temp = std::vector<long long, AlignedAllocator<long long, 64>>(total);
 
         bool cycle = true;
         size_t mid;