diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index 4cff8ed8..f992586a 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -89,6 +89,12 @@ jobs: sudo apt update sudo apt -y install g++-12 gcc-12 git + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/788820/sde-external-9.27.0-2023-09-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + - name: Checkout NumPy main uses: actions/checkout@v3 with: @@ -123,3 +129,21 @@ jobs: CC: gcc-12 run: | spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr + + - name: Run tests on TGL + working-directory: ${{ github.workspace }}/numpy + run: | + export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/) + export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE" + cd build-install && + sde -tgl -- python -c "import numpy; numpy.show_config()" && + sde -tgl -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py + + - name: Run tests on SPR + working-directory: ${{ github.workspace }}/numpy + run: | + export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/) + export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE" + cd build-install && + sde -spr -- python -c "import numpy; numpy.show_config()" && + sde -spr -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index a71281f4..be806f5f 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -499,8 +499,10 @@ replace_nan_with_inf>(uint16_t *arr, arrsize_t arrsize) { arrsize_t nan_count = 0; __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } + for (arrsize_t ii = 0; ii < arrsize; ii = ii + zmm_vector::numlanes / 2) { + if (arrsize - ii < 16) { + loadmask = (0x0001 << (arrsize-ii)) - 0x0001; + } __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr); __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm); __mmask16 nanmask = _mm512_cmp_ps_mask( @@ -508,7 +510,6 @@ replace_nan_with_inf>(uint16_t *arr, arrsize_t arrsize) nan_count += _mm_popcnt_u32((int32_t)nanmask); _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF); arr += 16; - arrsize -= 16; } return nan_count; }