@@ -255,11 +255,13 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
255255 type_t *biggest)
256256{
257257 const int num_unroll = 8 ;
258- if (right - left <= 2 *num_unroll*vtype::numlanes) {
259- return partition_avx512<vtype>(arr, left, right, pivot, smallest, biggest);
258+ if (right - left <= 2 * num_unroll * vtype::numlanes) {
259+ return partition_avx512<vtype>(
260+ arr, left, right, pivot, smallest, biggest);
260261 }
261262 /* make array length divisible by 8*vtype::numlanes , shortening the array */
262- for (int32_t i = ((right - left) % (num_unroll*vtype::numlanes)); i > 0 ; --i) {
263+ for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0 ;
264+ --i) {
263265 *smallest = std::min (*smallest, arr[left], comparison_func<vtype>);
264266 *biggest = std::max (*biggest, arr[left], comparison_func<vtype>);
265267 if (!comparison_func<vtype>(arr[left], pivot)) {
@@ -281,17 +283,18 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
281283 // We will now have atleast 16 registers worth of data to process:
282284 // left and right vtype::numlanes values are partitioned at the end
283285 zmm_t vec_left[num_unroll], vec_right[num_unroll];
284- #pragma GCC unroll 8
286+ #pragma GCC unroll 8
285287 for (int ii = 0 ; ii < num_unroll; ++ii) {
286- vec_left[ii] = vtype::loadu (arr + left + vtype::numlanes*ii);
287- vec_right[ii] = vtype::loadu (arr + (right - vtype::numlanes*(num_unroll-ii)));
288+ vec_left[ii] = vtype::loadu (arr + left + vtype::numlanes * ii);
289+ vec_right[ii] = vtype::loadu (
290+ arr + (right - vtype::numlanes * (num_unroll - ii)));
288291 }
289292 // store points of the vectors
290293 int64_t r_store = right - vtype::numlanes;
291294 int64_t l_store = left;
292295 // indices for loading the elements
293- left += num_unroll* vtype::numlanes;
294- right -= num_unroll* vtype::numlanes;
296+ left += num_unroll * vtype::numlanes;
297+ right -= num_unroll * vtype::numlanes;
295298 while (right - left != 0 ) {
296299 zmm_t curr_vec[num_unroll];
297300 /*
@@ -300,57 +303,59 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
300303 * otherwise from the left side
301304 */
302305 if ((r_store + vtype::numlanes) - right < left - l_store) {
303- right -= num_unroll* vtype::numlanes;
304- #pragma GCC unroll 8
306+ right -= num_unroll * vtype::numlanes;
307+ #pragma GCC unroll 8
305308 for (int ii = 0 ; ii < num_unroll; ++ii) {
306- curr_vec[ii] = vtype::loadu (arr + right + ii* vtype::numlanes);
309+ curr_vec[ii] = vtype::loadu (arr + right + ii * vtype::numlanes);
307310 }
308311 }
309312 else {
310- #pragma GCC unroll 8
313+ #pragma GCC unroll 8
311314 for (int ii = 0 ; ii < num_unroll; ++ii) {
312- curr_vec[ii] = vtype::loadu (arr + left + ii* vtype::numlanes);
315+ curr_vec[ii] = vtype::loadu (arr + left + ii * vtype::numlanes);
313316 }
314- left += num_unroll* vtype::numlanes;
317+ left += num_unroll * vtype::numlanes;
315318 }
316- // partition the current vector and save it on both sides of the array
317- #pragma GCC unroll 8
319+ // partition the current vector and save it on both sides of the array
320+ #pragma GCC unroll 8
318321 for (int ii = 0 ; ii < num_unroll; ++ii) {
319322 int32_t amount_ge_pivot
320323 = partition_vec<vtype>(arr,
321324 l_store,
322325 r_store + vtype::numlanes,
323326 curr_vec[ii],
324327 pivot_vec,
325- &min_vec,pick
328+ &min_vec,
326329 &max_vec);
327330 l_store += (vtype::numlanes - amount_ge_pivot);
328331 r_store -= amount_ge_pivot;
329332 }
330333 }
331334
332- /* partition and save vec_left[8] and vec_right[8] */
333- #pragma GCC unroll 8
335+ /* partition and save vec_left[8] and vec_right[8] */
336+ #pragma GCC unroll 8
334337 for (int ii = 0 ; ii < num_unroll; ++ii) {
335- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
336- l_store,
337- r_store + vtype::numlanes,
338- vec_left[ii],
339- pivot_vec,
340- &min_vec,
341- &max_vec);
338+ int32_t amount_ge_pivot
339+ = partition_vec<vtype>(arr,
340+ l_store,
341+ r_store + vtype::numlanes,
342+ vec_left[ii],
343+ pivot_vec,
344+ &min_vec,
345+ &max_vec);
342346 l_store += (vtype::numlanes - amount_ge_pivot);
343347 r_store -= amount_ge_pivot;
344348 }
345- #pragma GCC unroll 8
349+ #pragma GCC unroll 8
346350 for (int ii = 0 ; ii < num_unroll; ++ii) {
347- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
348- l_store,
349- r_store + vtype::numlanes,
350- vec_right[ii],
351- pivot_vec,
352- &min_vec,
353- &max_vec);
351+ int32_t amount_ge_pivot
352+ = partition_vec<vtype>(arr,
353+ l_store,
354+ r_store + vtype::numlanes,
355+ vec_right[ii],
356+ pivot_vec,
357+ &min_vec,
358+ &max_vec);
354359 l_store += (vtype::numlanes - amount_ge_pivot);
355360 r_store -= amount_ge_pivot;
356361 }
0 commit comments