diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml
index 4cff8ed8..f992586a 100644
--- a/.github/workflows/build-numpy.yml
+++ b/.github/workflows/build-numpy.yml
@@ -89,6 +89,12 @@ jobs:
         sudo apt update
         sudo apt -y install g++-12 gcc-12 git
 
+    - name: Install Intel SDE  # provides the `sde` command that the "Run tests" steps invoke
+      run: |  # curl -f fails on an HTTP error instead of saving the error page as the archive
+        curl -fsSL --retry 3 -o /tmp/sde.tar.xz https://downloadmirror.intel.com/788820/sde-external-9.27.0-2023-09-13-lin.tar.xz
+        mkdir -p /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
+        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+
     - name: Checkout NumPy main
       uses: actions/checkout@v3
       with:
@@ -123,3 +129,21 @@ jobs:
         CC: gcc-12
       run: |
         spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr
+
+    - name: Run tests on TGL
+      working-directory: ${{ github.workspace }}/numpy
+      run: |
+        # Assign separately: `export VAR=$(cmd)` hides a realpath failure behind export's exit status.
+        NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
+        export NUMPY_SITE PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$NUMPY_SITE"
+        cd build-install && sde -tgl -- python -c "import numpy; numpy.show_config()" &&
+        sde -tgl -- python -m pytest "$NUMPY_SITE/numpy/_core/tests/test_multiarray.py"
+
+    - name: Run tests on SPR
+      working-directory: ${{ github.workspace }}/numpy
+      run: |
+        # Assign separately: `export VAR=$(cmd)` hides a realpath failure behind export's exit status.
+        NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
+        export NUMPY_SITE PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$NUMPY_SITE"
+        cd build-install && sde -spr -- python -c "import numpy; numpy.show_config()" &&
+        sde -spr -- python -m pytest "$NUMPY_SITE/numpy/_core/tests/test_multiarray.py"
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index a71281f4..be806f5f 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -499,8 +499,10 @@ replace_nan_with_inf<zmm_vector<float16>>(uint16_t *arr, arrsize_t arrsize)
 {
     arrsize_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
-    while (arrsize > 0) {
-        if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; }
+    for (arrsize_t ii = 0; ii < arrsize; ii = ii + zmm_vector<float16>::numlanes / 2) {
+        if (arrsize - ii < 16) {
+            loadmask = (0x0001 << (arrsize-ii)) - 0x0001;
+        }
         __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr);
         __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm);
         __mmask16 nanmask = _mm512_cmp_ps_mask(
@@ -508,7 +510,6 @@ replace_nan_with_inf<zmm_vector<float16>>(uint16_t *arr, arrsize_t arrsize)
         nan_count += _mm_popcnt_u32((int32_t)nanmask);
         _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF);
         arr += 16;
-        arrsize -= 16;
     }
     return nan_count;
 }