@@ -241,23 +241,27 @@ X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask)
241241 */
242242template <typename vtype, typename type_t , typename reg_t >
243243X86_SIMD_SORT_INLINE int32_t partition_vec (type_t *arr,
244- arrsize_t left,
245- arrsize_t right ,
244+ arrsize_t & left,
245+ arrsize_t & unpartitioned ,
246246 const reg_t curr_vec,
247247 const reg_t pivot_vec,
248- reg_t * smallest_vec,
249- reg_t * biggest_vec)
248+ reg_t & smallest_vec,
249+ reg_t & biggest_vec)
250250{
251- /* which elements are larger than or equal to the pivot */
252251 typename vtype::opmask_t ge_mask = vtype::ge (curr_vec, pivot_vec);
253- int32_t amount_ge_pivot = _mm_popcnt_u32 (( int32_t ) ge_mask);
252+ uint64_t amount_ge_pivot = _mm_popcnt_u64 ( ge_mask);
254253 vtype::mask_compressstoreu (
255254 arr + left, vtype::knot_opmask (ge_mask), curr_vec);
255+
256+ left += (vtype::numlanes - amount_ge_pivot);
257+
256258 vtype::mask_compressstoreu (
257- arr + right - amount_ge_pivot, ge_mask, curr_vec);
258- *smallest_vec = vtype::min (curr_vec, *smallest_vec);
259- *biggest_vec = vtype::max (curr_vec, *biggest_vec);
260- return amount_ge_pivot;
259+ arr + left + unpartitioned, ge_mask, curr_vec);
260+
261+ unpartitioned -= vtype::numlanes;
262+
263+ smallest_vec = vtype::min (curr_vec, smallest_vec);
264+ biggest_vec = vtype::max (curr_vec, biggest_vec);
261265}
262266/*
263267 * Partition an array based on the pivot and returns the index of the
@@ -293,23 +297,19 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
293297
294298 if (right - left == vtype::numlanes) {
295299 reg_t vec = vtype::loadu (arr + left);
296- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
297- left,
298- left + vtype::numlanes,
299- vec,
300- pivot_vec,
301- &min_vec,
302- &max_vec);
303- *smallest = vtype::reducemin (min_vec);
304- *biggest = vtype::reducemax (max_vec);
305- return left + (vtype::numlanes - amount_ge_pivot);
300+ uint64_t unpartitioned = right - left - vtype::numlanes;
301+ uint64_t l_store = left;
302+
303+ partition_vec<vtype>(arr, l_store, unpartitioned, vec, pivot_vec, min_vec, max_vec);
304+
305+ return l_store;
306306 }
307307
308308 // first and last vtype::numlanes values are partitioned at the end
309309 reg_t vec_left = vtype::loadu (arr + left);
310310 reg_t vec_right = vtype::loadu (arr + (right - vtype::numlanes));
311311 // store points of the vectors
312- arrsize_t r_store = right - vtype::numlanes;
312+ arrsize_t unpartitioned = right - left - vtype::numlanes;
313313 arrsize_t l_store = left;
314314 // indices for loading the elements
315315 left += vtype::numlanes;
@@ -321,7 +321,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
321321 * then next elements are loaded from the right side,
322322 * otherwise from the left side
323323 */
324- if ((r_store + vtype::numlanes) - right < left - l_store) {
324+ if ((l_store + unpartitioned + vtype::numlanes) - right < left - l_store) {
325325 right -= vtype::numlanes;
326326 curr_vec = vtype::loadu (arr + right);
327327 }
@@ -330,36 +330,13 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
330330 left += vtype::numlanes;
331331 }
332332 // partition the current vector and save it on both sides of the array
333- int32_t amount_ge_pivot
334- = partition_vec<vtype>(arr,
335- l_store,
336- r_store + vtype::numlanes,
337- curr_vec,
338- pivot_vec,
339- &min_vec,
340- &max_vec);
341- ;
342- r_store -= amount_ge_pivot;
343- l_store += (vtype::numlanes - amount_ge_pivot);
333+ partition_vec<vtype>(arr, l_store, unpartitioned, curr_vec, pivot_vec, min_vec, max_vec);
344334 }
345335
346336 /* partition and save vec_left and vec_right */
347- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
348- l_store,
349- r_store + vtype::numlanes,
350- vec_left,
351- pivot_vec,
352- &min_vec,
353- &max_vec);
354- l_store += (vtype::numlanes - amount_ge_pivot);
355- amount_ge_pivot = partition_vec<vtype>(arr,
356- l_store,
357- l_store + vtype::numlanes,
358- vec_right,
359- pivot_vec,
360- &min_vec,
361- &max_vec);
362- l_store += (vtype::numlanes - amount_ge_pivot);
337+ partition_vec<vtype>(arr, l_store, unpartitioned, vec_left, pivot_vec, min_vec, max_vec);
338+ partition_vec<vtype>(arr, l_store, unpartitioned, vec_right, pivot_vec, min_vec, max_vec);
339+
363340 *smallest = vtype::reducemin (min_vec);
364341 *biggest = vtype::reducemax (max_vec);
365342 return l_store;
@@ -446,7 +423,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
446423 arr + (right - vtype::numlanes * (num_unroll - ii)));
447424 }
448425 // store points of the vectors
449- arrsize_t r_store = right - vtype::numlanes;
426+ arrsize_t unpartitioned = right - left - vtype::numlanes;
450427 arrsize_t l_store = left;
451428 // indices for loading the elements
452429 left += num_unroll * vtype::numlanes;
@@ -458,62 +435,37 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
458435 * then next elements are loaded from the right side,
459436 * otherwise from the left side
460437 */
461- if ((r_store + vtype::numlanes) - right < left - l_store) {
438+ if ((l_store + unpartitioned + vtype::numlanes) - right < left - l_store) {
462439 right -= num_unroll * vtype::numlanes;
463440 X86_SIMD_SORT_UNROLL_LOOP (8 )
464441 for (int ii = 0 ; ii < num_unroll; ++ii) {
465442 curr_vec[ii] = vtype::loadu (arr + right + ii * vtype::numlanes);
466443 }
444+ _mm_prefetch (arr + right - num_unroll * vtype::numlanes, _MM_HINT_T0);
467445 }
468446 else {
469447 X86_SIMD_SORT_UNROLL_LOOP (8 )
470448 for (int ii = 0 ; ii < num_unroll; ++ii) {
471449 curr_vec[ii] = vtype::loadu (arr + left + ii * vtype::numlanes);
472450 }
473451 left += num_unroll * vtype::numlanes;
452+ _mm_prefetch (arr + left, _MM_HINT_T0);
474453 }
475454 // partition the current vector and save it on both sides of the array
476455 X86_SIMD_SORT_UNROLL_LOOP (8 )
477456 for (int ii = 0 ; ii < num_unroll; ++ii) {
478- int32_t amount_ge_pivot
479- = partition_vec<vtype>(arr,
480- l_store,
481- r_store + vtype::numlanes,
482- curr_vec[ii],
483- pivot_vec,
484- &min_vec,
485- &max_vec);
486- l_store += (vtype::numlanes - amount_ge_pivot);
487- r_store -= amount_ge_pivot;
457+ partition_vec<vtype>(arr, l_store, unpartitioned, curr_vec[ii], pivot_vec, min_vec, max_vec);
488458 }
489459 }
490460
491461 /* partition and save vec_left[8] and vec_right[8] */
492462 X86_SIMD_SORT_UNROLL_LOOP (8 )
493463 for (int ii = 0 ; ii < num_unroll; ++ii) {
494- int32_t amount_ge_pivot
495- = partition_vec<vtype>(arr,
496- l_store,
497- r_store + vtype::numlanes,
498- vec_left[ii],
499- pivot_vec,
500- &min_vec,
501- &max_vec);
502- l_store += (vtype::numlanes - amount_ge_pivot);
503- r_store -= amount_ge_pivot;
464+ partition_vec<vtype>(arr, l_store, unpartitioned, vec_left[ii], pivot_vec, min_vec, max_vec);
504465 }
505466 X86_SIMD_SORT_UNROLL_LOOP (8 )
506467 for (int ii = 0 ; ii < num_unroll; ++ii) {
507- int32_t amount_ge_pivot
508- = partition_vec<vtype>(arr,
509- l_store,
510- r_store + vtype::numlanes,
511- vec_right[ii],
512- pivot_vec,
513- &min_vec,
514- &max_vec);
515- l_store += (vtype::numlanes - amount_ge_pivot);
516- r_store -= amount_ge_pivot;
468+ partition_vec<vtype>(arr, l_store, unpartitioned, vec_right[ii], pivot_vec, min_vec, max_vec);
517469 }
518470 *smallest = vtype::reducemin (min_vec);
519471 *biggest = vtype::reducemax (max_vec);
0 commit comments