@@ -371,10 +371,39 @@ namespace simd {
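371371 * Merge adjacent sorted blocks into one sorted sequence, using the SIMD path where available and the fallback otherwise.
372372 * The vector's storage is expected to be aligned (it is allocated through AlignedAllocator<T, 64>).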
373373 */
374- template <typename T, typename = std::enable_if_t <std::is_same_v<T, int > || std::is_same_v<T, long long >>>
375- __forceinline void mergeSortedBlocks (std::vector<T, AlignedAllocator<T, 64 >> &data, const size_t &blockSize) {
374+ __forceinline void mergeSortedBlocks (std::vector<int , AlignedAllocator<int , 64 >> &data, const size_t &blockSize) {
376375 const size_t total = data.size ();
377- auto temp = std::vector<T, AlignedAllocator<T, 64 >>(total);
376+ auto temp = std::vector<int , AlignedAllocator<int , 64 >>(total);
377+
378+ bool cycle = true ;
379+ size_t mid;
380+ size_t right;
381+ for (size_t size = blockSize; size < total; size *= 2 ) {
382+ size_t left = 0 ;
383+ prefetchL1 (&data + left);
384+ prefetchL1 (&temp + left);
385+ for (; left < total; left += 2 * size) {
386+ mid = std::min (left + size - 1 , total - 1 );
387+ right = std::min (left + 2 * size - 1 , total - 1 );
388+ if (cycle) {
389+ doSingleMerge (left, mid, right, data, temp);
390+ } else {
391+ doSingleMerge (left, mid, right, temp, data);
392+ }
393+ }
394+
395+ cycle = !cycle;
396+ }
397+
398+ // copy the final result
399+ if (!cycle) {
400+ simdMemCpyAligned (temp.data (), data.data (), temp.size ());
401+ }
402+ }
403+
404+ __forceinline void mergeSortedBlocks (std::vector<long long , AlignedAllocator<long long , 64 >> &data, const size_t &blockSize) {
405+ const size_t total = data.size ();
406+ auto temp = std::vector<long long , AlignedAllocator<long long , 64 >>(total);
378407
379408 bool cycle = true ;
380409 size_t mid;
@@ -405,11 +434,41 @@ namespace simd {
405434 /* *
406435 * Merge sorted blocks with SIMD optimization reversed, or fallback
407436 */
408- template <typename T, typename = std::enable_if_t <std::is_same_v<T, int > || std::is_same_v<T, long long >>>
409437 __forceinline void
410- mergeSortedBlocksReversed (std::vector<T, AlignedAllocator<T, 64 >> &data, const size_t &blockSize) {
438+ mergeSortedBlocksReversed (std::vector<int , AlignedAllocator<int , 64 >> &data, const size_t &blockSize) {
439+ const size_t total = data.size ();
440+ auto temp = std::vector<int , AlignedAllocator<int , 64 >>(total);
441+
442+ bool cycle = true ;
443+ size_t mid;
444+ size_t right;
445+ for (size_t size = blockSize; size < total; size *= 2 ) {
446+ size_t left = 0 ;
447+ prefetchL1 (&data + left);
448+ prefetchL1 (&temp + left);
449+ for (; left < total; left += 2 * size) {
450+ mid = std::min (left + size - 1 , total - 1 );
451+ right = std::min (left + 2 * size - 1 , total - 1 );
452+ if (cycle) {
453+ doSingleMergeReversed (left, mid, right, data, temp);
454+ } else {
455+ doSingleMergeReversed (left, mid, right, temp, data);
456+ }
457+ }
458+
459+ cycle = !cycle;
460+ }
461+
462+ // copy the final result
463+ if (!cycle) {
464+ simdMemCpyAligned (temp.data (), data.data (), temp.size ());
465+ }
466+ }
467+
468+ __forceinline void
469+ mergeSortedBlocksReversed (std::vector<long long , AlignedAllocator<long long , 64 >> &data, const size_t &blockSize) {
411470 const size_t total = data.size ();
412- auto temp = std::vector<T , AlignedAllocator<T , 64 >>(total);
471+ auto temp = std::vector<long long , AlignedAllocator<long long , 64 >>(total);
413472
414473 bool cycle = true ;
415474 size_t mid;
0 commit comments