@@ -380,12 +380,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
380380 arr, left, right, pivot, smallest, biggest);
381381 }
382382
383- if (right - left <= 2 * num_unroll * vtype::numlanes) {
384- return partition_avx512<vtype>(
385- arr, left, right, pivot, smallest, biggest);
386- }
387- /* make array length divisible by 8*vtype::numlanes , shortening the array */
388- for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0 ;
383+ /* make array length divisible by vtype::numlanes , shortening the array */
384+ for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0 ;
389385 --i) {
390386 *smallest = std::min (*smallest, arr[left], comparison_func<vtype>);
391387 *biggest = std::max (*biggest, arr[left], comparison_func<vtype>);
@@ -396,14 +392,49 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
396392 ++left;
397393 }
398394 }
399-
395+
400396 if (left == right)
401397 return left; /* less than vtype::numlanes elements in the array */
402-
398+
403399 using reg_t = typename vtype::reg_t ;
404400 reg_t pivot_vec = vtype::set1 (pivot);
405401 reg_t min_vec = vtype::set1 (*smallest);
406402 reg_t max_vec = vtype::set1 (*biggest);
403+
404+ int64_t vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll;
405+ type_t buffer[num_unroll * vtype::numlanes];
406+ int32_t bufferStored = 0 ;
407+ int64_t leftStore = left;
408+
409+ for (int32_t i = 0 ; i < vecsToPartition; i++){
410+ reg_t curr_vec = vtype::loadu (arr + left + i * vtype::numlanes);
411+ typename vtype::opmask_t ge_mask = vtype::ge (curr_vec, pivot_vec);
412+ int32_t amount_ge_pivot = _mm_popcnt_u64 ((int64_t )ge_mask);
413+ vtype::mask_compressstoreu (
414+ arr + leftStore, vtype::knot_opmask (ge_mask), curr_vec);
415+
416+ vtype::mask_compressstoreu (
417+ buffer + bufferStored, ge_mask, curr_vec);
418+
419+ min_vec = vtype::min (curr_vec, min_vec);
420+ max_vec = vtype::max (curr_vec, max_vec);
421+
422+ bufferStored += amount_ge_pivot;
423+ leftStore += vtype::numlanes - amount_ge_pivot;
424+ }
425+
426+ // We can't just store the buffer on the right, since this would overwrite data that has no copies elsewhere
427+ // Instead, copy the data that is currently on the right, and store it on the left side in the space between leftStore and left
428+ // Then we copy the buffer onto the right side
429+ std::memcpy (arr + leftStore, arr + right - bufferStored, bufferStored * sizeof (type_t ));
430+ std::memcpy (arr + right - bufferStored, buffer, bufferStored * sizeof (type_t ));
431+
432+ // The change to left depends only on numVecs, since we store the data replaced by the buffer on the left side
433+ left += vecsToPartition * vtype::numlanes - bufferStored;
434+ right -= bufferStored;
435+
436+ if (left == right)
437+ return left; /* less than vtype::numlanes elements in the array */
407438
408439 // We will now have at least 16 registers worth of data to process:
409440 // left and right vtype::numlanes values are partitioned at the end
0 commit comments