Skip to content

Commit 9461b11

Browse files
Author: Raghuveer Devulapalli
Merge pull request #99 from r-devulap/minor-fixes
Improve emulation of AVX2 min/max 64-bit
2 parents f96d286 + cffafff commit 9461b11

File tree

4 files changed: 22 additions and 15 deletions.

scripts/bench-compare.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
1111
fi
1212
compare=$(realpath .bench/google-benchmark/tools/compare.py)
1313

14-
meson setup --warnlevel 0 --buildtype release builddir-${branch}
14+
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch}
1515
cd builddir-${branch}
1616
ninja
1717
$compare filters ./benchexe $1 $2

scripts/branch-compare.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ build_branch() {
2727
fi
2828
fi
2929
cd $dir_name
30-
meson setup --warnlevel 0 --buildtype release builddir
30+
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
3131
cd builddir
3232
ninja
3333
cd ../../

src/avx2-64bit-qsort.hpp

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ struct avx2_vector<int64_t> {
6060
#else
6161
static constexpr int network_sort_threshold = 64;
6262
#endif
63-
static constexpr int partition_unroll_factor = 4;
63+
static constexpr int partition_unroll_factor = 8;
6464

6565
using swizzle_ops = avx2_64bit_swizzle_ops;
6666

@@ -89,12 +89,15 @@ struct avx2_vector<int64_t> {
8989
{
9090
return _mm256_xor_si256(x, y);
9191
}
92+
static opmask_t gt(reg_t x, reg_t y)
93+
{
94+
return _mm256_cmpgt_epi64(x, y);
95+
}
9296
static opmask_t ge(reg_t x, reg_t y)
9397
{
9498
opmask_t equal = eq(x, y);
9599
opmask_t greater = _mm256_cmpgt_epi64(x, y);
96-
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal),
97-
_mm256_castsi256_pd(greater)));
100+
return _mm256_or_si256(equal, greater);
98101
}
99102
static opmask_t eq(reg_t x, reg_t y)
100103
{
@@ -221,7 +224,7 @@ struct avx2_vector<uint64_t> {
221224
#else
222225
static constexpr int network_sort_threshold = 64;
223226
#endif
224-
static constexpr int partition_unroll_factor = 4;
227+
static constexpr int partition_unroll_factor = 8;
225228

226229
using swizzle_ops = avx2_64bit_swizzle_ops;
227230

@@ -258,17 +261,21 @@ struct avx2_vector<uint64_t> {
258261
return _mm256_i64gather_epi64(
259262
(long long int const *)base, index, scale);
260263
}
264+
static opmask_t gt(reg_t x, reg_t y)
265+
{
266+
const __m256i offset = _mm256_set1_epi64x(0x8000000000000000);
267+
x = _mm256_xor_si256(x, offset);
268+
y = _mm256_xor_si256(y, offset);
269+
return _mm256_cmpgt_epi64(x, y);
270+
}
261271
static opmask_t ge(reg_t x, reg_t y)
262272
{
263273
opmask_t equal = eq(x, y);
264-
265274
const __m256i offset = _mm256_set1_epi64x(0x8000000000000000);
266-
x = _mm256_add_epi64(x, offset);
267-
y = _mm256_add_epi64(y, offset);
268-
275+
x = _mm256_xor_si256(x, offset);
276+
y = _mm256_xor_si256(y, offset);
269277
opmask_t greater = _mm256_cmpgt_epi64(x, y);
270-
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal),
271-
_mm256_castsi256_pd(greater)));
278+
return _mm256_or_si256(equal, greater);
272279
}
273280
static opmask_t eq(reg_t x, reg_t y)
274281
{
@@ -380,7 +387,7 @@ struct avx2_vector<double> {
380387
#else
381388
static constexpr int network_sort_threshold = 64;
382389
#endif
383-
static constexpr int partition_unroll_factor = 4;
390+
static constexpr int partition_unroll_factor = 8;
384391

385392
using swizzle_ops = avx2_64bit_swizzle_ops;
386393

src/avx2-emu-funcs.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
273273
typename avx2_vector<T>::reg_t y)
274274
{
275275
using vtype = avx2_vector<T>;
276-
typename vtype::opmask_t nlt = vtype::ge(x, y);
276+
typename vtype::opmask_t nlt = vtype::gt(x, y);
277277
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y),
278278
_mm256_castsi256_pd(x),
279279
_mm256_castsi256_pd(nlt)));
@@ -284,7 +284,7 @@ typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
284284
typename avx2_vector<T>::reg_t y)
285285
{
286286
using vtype = avx2_vector<T>;
287-
typename vtype::opmask_t nlt = vtype::ge(x, y);
287+
typename vtype::opmask_t nlt = vtype::gt(x, y);
288288
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x),
289289
_mm256_castsi256_pd(y),
290290
_mm256_castsi256_pd(nlt)));

0 commit comments

Comments
 (0)