Skip to content

Commit 16e35b0

Browse files
committed
Changed core partition logic
1 parent d617059 commit 16e35b0

File tree

1 file changed

+33
-81
lines changed

1 file changed

+33
-81
lines changed

src/avx512-common-qsort.h

Lines changed: 33 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -241,23 +241,27 @@ X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask)
241241
*/
242242
template <typename vtype, typename type_t, typename reg_t>
243243
X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arr,
244-
arrsize_t left,
245-
arrsize_t right,
244+
arrsize_t& left,
245+
arrsize_t& unpartitioned,
246246
const reg_t curr_vec,
247247
const reg_t pivot_vec,
248-
reg_t *smallest_vec,
249-
reg_t *biggest_vec)
248+
reg_t &smallest_vec,
249+
reg_t &biggest_vec)
250250
{
251-
/* which elements are larger than or equal to the pivot */
252251
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
253-
int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask);
252+
uint64_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
254253
vtype::mask_compressstoreu(
255254
arr + left, vtype::knot_opmask(ge_mask), curr_vec);
255+
256+
left += (vtype::numlanes - amount_ge_pivot);
257+
256258
vtype::mask_compressstoreu(
257-
arr + right - amount_ge_pivot, ge_mask, curr_vec);
258-
*smallest_vec = vtype::min(curr_vec, *smallest_vec);
259-
*biggest_vec = vtype::max(curr_vec, *biggest_vec);
260-
return amount_ge_pivot;
259+
arr + left + unpartitioned, ge_mask, curr_vec);
260+
261+
unpartitioned -= vtype::numlanes;
262+
263+
smallest_vec = vtype::min(curr_vec, smallest_vec);
264+
biggest_vec = vtype::max(curr_vec, biggest_vec);
261265
}
262266
/*
263267
* Partitions an array based on the pivot and returns the index of the
@@ -293,23 +297,19 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
293297

294298
if (right - left == vtype::numlanes) {
295299
reg_t vec = vtype::loadu(arr + left);
296-
int32_t amount_ge_pivot = partition_vec<vtype>(arr,
297-
left,
298-
left + vtype::numlanes,
299-
vec,
300-
pivot_vec,
301-
&min_vec,
302-
&max_vec);
303-
*smallest = vtype::reducemin(min_vec);
304-
*biggest = vtype::reducemax(max_vec);
305-
return left + (vtype::numlanes - amount_ge_pivot);
300+
uint64_t unpartitioned = right - left - vtype::numlanes;
301+
uint64_t l_store = left;
302+
303+
partition_vec<vtype>(arr, l_store, unpartitioned, vec, pivot_vec, min_vec, max_vec);
304+
305+
return l_store;
306306
}
307307

308308
// first and last vtype::numlanes values are partitioned at the end
309309
reg_t vec_left = vtype::loadu(arr + left);
310310
reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
311311
// store points of the vectors
312-
arrsize_t r_store = right - vtype::numlanes;
312+
arrsize_t unpartitioned = right - left - vtype::numlanes;
313313
arrsize_t l_store = left;
314314
// indices for loading the elements
315315
left += vtype::numlanes;
@@ -321,7 +321,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
321321
* then next elements are loaded from the right side,
322322
* otherwise from the left side
323323
*/
324-
if ((r_store + vtype::numlanes) - right < left - l_store) {
324+
if ((l_store + unpartitioned + vtype::numlanes) - right < left - l_store) {
325325
right -= vtype::numlanes;
326326
curr_vec = vtype::loadu(arr + right);
327327
}
@@ -330,36 +330,13 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
330330
left += vtype::numlanes;
331331
}
332332
// partition the current vector and save it on both sides of the array
333-
int32_t amount_ge_pivot
334-
= partition_vec<vtype>(arr,
335-
l_store,
336-
r_store + vtype::numlanes,
337-
curr_vec,
338-
pivot_vec,
339-
&min_vec,
340-
&max_vec);
341-
;
342-
r_store -= amount_ge_pivot;
343-
l_store += (vtype::numlanes - amount_ge_pivot);
333+
partition_vec<vtype>(arr, l_store, unpartitioned, curr_vec, pivot_vec, min_vec, max_vec);
344334
}
345335

346336
/* partition and save vec_left and vec_right */
347-
int32_t amount_ge_pivot = partition_vec<vtype>(arr,
348-
l_store,
349-
r_store + vtype::numlanes,
350-
vec_left,
351-
pivot_vec,
352-
&min_vec,
353-
&max_vec);
354-
l_store += (vtype::numlanes - amount_ge_pivot);
355-
amount_ge_pivot = partition_vec<vtype>(arr,
356-
l_store,
357-
l_store + vtype::numlanes,
358-
vec_right,
359-
pivot_vec,
360-
&min_vec,
361-
&max_vec);
362-
l_store += (vtype::numlanes - amount_ge_pivot);
337+
partition_vec<vtype>(arr, l_store, unpartitioned, vec_left, pivot_vec, min_vec, max_vec);
338+
partition_vec<vtype>(arr, l_store, unpartitioned, vec_right, pivot_vec, min_vec, max_vec);
339+
363340
*smallest = vtype::reducemin(min_vec);
364341
*biggest = vtype::reducemax(max_vec);
365342
return l_store;
@@ -446,7 +423,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
446423
arr + (right - vtype::numlanes * (num_unroll - ii)));
447424
}
448425
// store points of the vectors
449-
arrsize_t r_store = right - vtype::numlanes;
426+
arrsize_t unpartitioned = right - left - vtype::numlanes;
450427
arrsize_t l_store = left;
451428
// indices for loading the elements
452429
left += num_unroll * vtype::numlanes;
@@ -458,62 +435,37 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
458435
* then next elements are loaded from the right side,
459436
* otherwise from the left side
460437
*/
461-
if ((r_store + vtype::numlanes) - right < left - l_store) {
438+
if ((l_store + unpartitioned + vtype::numlanes) - right < left - l_store) {
462439
right -= num_unroll * vtype::numlanes;
463440
X86_SIMD_SORT_UNROLL_LOOP(8)
464441
for (int ii = 0; ii < num_unroll; ++ii) {
465442
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
466443
}
444+
_mm_prefetch(arr + right - num_unroll * vtype::numlanes, _MM_HINT_T0);
467445
}
468446
else {
469447
X86_SIMD_SORT_UNROLL_LOOP(8)
470448
for (int ii = 0; ii < num_unroll; ++ii) {
471449
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
472450
}
473451
left += num_unroll * vtype::numlanes;
452+
_mm_prefetch(arr + left, _MM_HINT_T0);
474453
}
475454
// partition the current vector and save it on both sides of the array
476455
X86_SIMD_SORT_UNROLL_LOOP(8)
477456
for (int ii = 0; ii < num_unroll; ++ii) {
478-
int32_t amount_ge_pivot
479-
= partition_vec<vtype>(arr,
480-
l_store,
481-
r_store + vtype::numlanes,
482-
curr_vec[ii],
483-
pivot_vec,
484-
&min_vec,
485-
&max_vec);
486-
l_store += (vtype::numlanes - amount_ge_pivot);
487-
r_store -= amount_ge_pivot;
457+
partition_vec<vtype>(arr, l_store, unpartitioned, curr_vec[ii], pivot_vec, min_vec, max_vec);
488458
}
489459
}
490460

491461
/* partition and save vec_left[8] and vec_right[8] */
492462
X86_SIMD_SORT_UNROLL_LOOP(8)
493463
for (int ii = 0; ii < num_unroll; ++ii) {
494-
int32_t amount_ge_pivot
495-
= partition_vec<vtype>(arr,
496-
l_store,
497-
r_store + vtype::numlanes,
498-
vec_left[ii],
499-
pivot_vec,
500-
&min_vec,
501-
&max_vec);
502-
l_store += (vtype::numlanes - amount_ge_pivot);
503-
r_store -= amount_ge_pivot;
464+
partition_vec<vtype>(arr, l_store, unpartitioned, vec_left[ii], pivot_vec, min_vec, max_vec);
504465
}
505466
X86_SIMD_SORT_UNROLL_LOOP(8)
506467
for (int ii = 0; ii < num_unroll; ++ii) {
507-
int32_t amount_ge_pivot
508-
= partition_vec<vtype>(arr,
509-
l_store,
510-
r_store + vtype::numlanes,
511-
vec_right[ii],
512-
pivot_vec,
513-
&min_vec,
514-
&max_vec);
515-
l_store += (vtype::numlanes - amount_ge_pivot);
516-
r_store -= amount_ge_pivot;
468+
partition_vec<vtype>(arr, l_store, unpartitioned, vec_right[ii], pivot_vec, min_vec, max_vec);
517469
}
518470
*smallest = vtype::reducemin(min_vec);
519471
*biggest = vtype::reducemax(max_vec);

0 commit comments

Comments
 (0)