@@ -60,7 +60,7 @@ struct avx2_vector<int64_t> {
6060#else 
6161    static  constexpr  int  network_sort_threshold = 64 ;
6262#endif 
63-     static  constexpr  int  partition_unroll_factor = 4 ;
63+     static  constexpr  int  partition_unroll_factor = 8 ;
6464
6565    using  swizzle_ops = avx2_64bit_swizzle_ops;
6666
@@ -89,12 +89,15 @@ struct avx2_vector<int64_t> {
8989    {
9090        return  _mm256_xor_si256 (x, y);
9191    }
92+     static  opmask_t  gt (reg_t  x, reg_t  y)
93+     {
            // Per-lane x > y for the int64_t specialization: the result of the
            // signed 64-bit greater-than intrinsic is returned unchanged as the
            // mask.
94+         return  _mm256_cmpgt_epi64 (x, y);
95+     }
9296    static  opmask_t  ge (reg_t  x, reg_t  y)
9397    {
            // Per-lane x >= y, composed as eq(x, y) OR (x > y).
            // eq() is the sibling member defined right after this function;
            // presumably lane-wise equality -- its body is not fully visible
            // in this hunk.
9498        opmask_t  equal = eq (x, y);
9599        opmask_t  greater = _mm256_cmpgt_epi64 (x, y);
            // Removed ('-') form: the OR was routed through casts to the
            // packed-double vector type and back.
96-         return  _mm256_castpd_si256 (_mm256_or_pd (_mm256_castsi256_pd (equal),
97-                                                 _mm256_castsi256_pd (greater)));
            // Added ('+') form: the same bitwise OR, done directly on the
            // integer vectors with no casts.
100+         return  _mm256_or_si256 (equal, greater);
98101    }
99102    static  opmask_t  eq (reg_t  x, reg_t  y)
100103    {
@@ -221,7 +224,7 @@ struct avx2_vector<uint64_t> {
221224#else 
222225    static  constexpr  int  network_sort_threshold = 64 ;
223226#endif 
224-     static  constexpr  int  partition_unroll_factor = 4 ;
227+     static  constexpr  int  partition_unroll_factor = 8 ;
225228
226229    using  swizzle_ops = avx2_64bit_swizzle_ops;
227230
@@ -258,17 +261,21 @@ struct avx2_vector<uint64_t> {
258261        return  _mm256_i64gather_epi64 (
259262                (long  long  int  const  *)base, index, scale);
260263    }
264+     static  opmask_t  gt (reg_t  x, reg_t  y)
265+     {
            // Per-lane x > y for the uint64_t specialization.
            // _mm256_cmpgt_epi64 compares lanes as signed values, so both
            // operands first have the top bit of every 64-bit lane flipped
            // (XOR with 0x8000000000000000). That maps unsigned order onto
            // signed order, so the signed compare gives the unsigned result.
266+         const  __m256i offset = _mm256_set1_epi64x (0x8000000000000000 );
267+         x = _mm256_xor_si256 (x, offset);
268+         y = _mm256_xor_si256 (y, offset);
269+         return  _mm256_cmpgt_epi64 (x, y);
270+     }
261271    static  opmask_t  ge (reg_t  x, reg_t  y)
262272    {
            // Per-lane x >= y for the uint64_t specialization, composed as
            // eq(x, y) OR (x > y). The equality mask is taken before x and y
            // are modified below.
            // eq() is the sibling member defined right after this function;
            // presumably lane-wise equality -- its body is not fully visible
            // in this hunk.
263273        opmask_t  equal = eq (x, y);
264- 
            // Flip the top bit of every 64-bit lane so the signed compare
            // below orders the operands as unsigned values.
265274        const  __m256i offset = _mm256_set1_epi64x (0x8000000000000000 );
            // Removed ('-') form: the bias was applied with a 64-bit add of the
            // same constant. Modulo 2^64 that also changes only the top bit.
266-         x = _mm256_add_epi64 (x, offset);
267-         y = _mm256_add_epi64 (y, offset);
268- 
            // Added ('+') form: the bias is applied with XOR instead.
275+         x = _mm256_xor_si256 (x, offset);
276+         y = _mm256_xor_si256 (y, offset);
269277        opmask_t  greater = _mm256_cmpgt_epi64 (x, y);
            // Removed ('-') form: the OR went through casts to the
            // packed-double vector type and back.
270-         return  _mm256_castpd_si256 (_mm256_or_pd (_mm256_castsi256_pd (equal),
271-                                                 _mm256_castsi256_pd (greater)));
            // Added ('+') form: the same bitwise OR, done directly on the
            // integer vectors.
278+         return  _mm256_or_si256 (equal, greater);
272279    }
273280    static  opmask_t  eq (reg_t  x, reg_t  y)
274281    {
@@ -380,7 +387,7 @@ struct avx2_vector<double> {
380387#else 
381388    static  constexpr  int  network_sort_threshold = 64 ;
382389#endif 
383-     static  constexpr  int  partition_unroll_factor = 4 ;
390+     static  constexpr  int  partition_unroll_factor = 8 ;
384391
385392    using  swizzle_ops = avx2_64bit_swizzle_ops;
386393
0 commit comments