11#ifndef  XSS_NETWORK_QSORT
22#define  XSS_NETWORK_QSORT 
33
4+ #include  " avx512-common-qsort.h" 
5+ 
46template  <typename  vtype,
57          int64_t  numVecs,
68          typename  reg_t  = typename  vtype::reg_t >
79X86_SIMD_SORT_INLINE void  bitonic_clean_n_vec (reg_t  *regs)
810{
9- # pragma  GCC unroll 64 
11+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
1012    for  (int  num = numVecs / 2 ; num >= 2 ; num /= 2 ) {
11- # pragma  GCC unroll 64 
13+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
1214        for  (int  j = 0 ; j < numVecs; j += num) {
13- # pragma  GCC unroll 64 
15+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
1416            for  (int  i = 0 ; i < num / 2 ; i++) {
1517                COEX<vtype>(regs[i + j], regs[i + j + num / 2 ]);
1618            }
@@ -30,7 +32,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
3032    }
3133    else  if  constexpr  (numVecs > 2 ) {
3234//  Reverse upper half
33- # pragma  GCC unroll 64 
35+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
3436        for  (int  i = 0 ; i < numVecs / 2 ; i++) {
3537            reg_t  rev = vtype::reverse (regs[numVecs - i - 1 ]);
3638            reg_t  maxV = vtype::max (regs[i], rev);
@@ -44,7 +46,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
4446    bitonic_clean_n_vec<vtype, numVecs>(regs);
4547
4648//  Now do bitonic_merge
47- # pragma  GCC unroll 64 
49+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
4850    for  (int  i = 0 ; i < numVecs; i++) {
4951        regs[i] = vtype::bitonic_merge (regs[i]);
5052    }
@@ -59,7 +61,7 @@ X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs)
5961    if  constexpr  (numPer > numVecs)
6062        return ;
6163    else  {
62- # pragma  GCC unroll 64 
64+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
6365        for  (int  i = 0 ; i < numVecs / numPer; i++) {
6466            bitonic_merge_n_vec<vtype, numPer>(regs + i * numPer);
6567        }
@@ -79,7 +81,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
7981
8082    //  Generate masks for loading and storing
8183    typename  vtype::opmask_t  ioMasks[numVecs - numVecs / 2 ];
82-     # pragma  GCC unroll 64 
84+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
8385    for  (int  i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
8486        int64_t  num_to_read
8587                = std::min ((int64_t )std::max (0 , N - i * vtype::numlanes),
@@ -88,19 +90,19 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
8890    }
8991
9092//  Unmasked part of the load
91- # pragma  GCC unroll 64 
93+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
9294    for  (int  i = 0 ; i < numVecs / 2 ; i++) {
9395        vecs[i] = vtype::loadu (arr + i * vtype::numlanes);
9496    }
9597//  Masked part of the load
96- # pragma  GCC unroll 64 
98+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
9799    for  (int  i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
98100        vecs[i] = vtype::mask_loadu (
99101                vtype::zmm_max (), ioMasks[j], arr + i * vtype::numlanes);
100102    }
101103
102104//  Sort each loaded vector
103- # pragma  GCC unroll 64 
105+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
104106    for  (int  i = 0 ; i < numVecs; i++) {
105107        vecs[i] = vtype::sort_vec (vecs[i]);
106108    }
@@ -109,12 +111,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
109111    bitonic_fullmerge_n_vec<vtype, numVecs>(&vecs[0 ]);
110112
111113//  Unmasked part of the store
112- # pragma  GCC unroll 64 
114+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
113115    for  (int  i = 0 ; i < numVecs / 2 ; i++) {
114116        vtype::storeu (arr + i * vtype::numlanes, vecs[i]);
115117    }
116118//  Masked part of the store
117- # pragma  GCC unroll 64 
119+ X86_SIMD_SORT_UNROLL_LOOP ( 64 ) 
118120    for  (int  i = numVecs / 2 , j = 0 ; i < numVecs; i++, j++) {
119121        vtype::mask_storeu (arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
120122    }
0 commit comments