@@ -286,11 +286,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps ((__m512*)in); \
-            __m512 vecB = _mm512_load_ps ((__m512*)out); \
+            __m512 vecA = _mm512_loadu_ps ((__m512*)in); \
+            __m512 vecB = _mm512_loadu_ps ((__m512*)out); \
             in += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps ((__m512*)out, res); \
+            _mm512_storeu_ps ((__m512*)out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -304,11 +304,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps (in); \
+            __m256 vecA = _mm256_loadu_ps (in); \
             in += types_per_step; \
-            __m256 vecB = _mm256_load_ps (out); \
+            __m256 vecB = _mm256_loadu_ps (out); \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps (out, res); \
+            _mm256_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -322,11 +322,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps (in); \
+            __m128 vecA = _mm_loadu_ps (in); \
             in += types_per_step; \
-            __m128 vecB = _mm_load_ps (out); \
+            __m128 vecB = _mm_loadu_ps (out); \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps (out, res); \
+            _mm_storeu_ps (out, res); \
             out += types_per_step; \
         } \
     }
@@ -367,11 +367,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd (in); \
+            __m512d vecA = _mm512_loadu_pd (in); \
             in += types_per_step; \
-            __m512d vecB = _mm512_load_pd (out); \
+            __m512d vecB = _mm512_loadu_pd (out); \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd ((out), res); \
+            _mm512_storeu_pd ((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -385,11 +385,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd (in); \
+            __m256d vecA = _mm256_loadu_pd (in); \
             in += types_per_step; \
-            __m256d vecB = _mm256_load_pd (out); \
+            __m256d vecB = _mm256_loadu_pd (out); \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd (out, res); \
+            _mm256_storeu_pd (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -403,11 +403,11 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd (in); \
+            __m128d vecA = _mm_loadu_pd (in); \
             in += types_per_step; \
-            __m128d vecB = _mm_load_pd (out); \
+            __m128d vecB = _mm_loadu_pd (out); \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd (out, res); \
+            _mm_storeu_pd (out, res); \
             out += types_per_step; \
         } \
     }
@@ -813,12 +813,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(float); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512 vecA = _mm512_load_ps (in1); \
-            __m512 vecB = _mm512_load_ps (in2); \
+            __m512 vecA = _mm512_loadu_ps (in1); \
+            __m512 vecB = _mm512_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512 res = _mm512_##op##_ps(vecA, vecB); \
-            _mm512_store_ps (out, res); \
+            _mm512_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -832,12 +832,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256 vecA = _mm256_load_ps (in1); \
-            __m256 vecB = _mm256_load_ps (in2); \
+            __m256 vecA = _mm256_loadu_ps (in1); \
+            __m256 vecB = _mm256_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256 res = _mm256_##op##_ps(vecA, vecB); \
-            _mm256_store_ps (out, res); \
+            _mm256_storeu_ps (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -851,12 +851,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(float); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128 vecA = _mm_load_ps (in1); \
-            __m128 vecB = _mm_load_ps (in2); \
+            __m128 vecA = _mm_loadu_ps (in1); \
+            __m128 vecB = _mm_loadu_ps (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128 res = _mm_##op##_ps(vecA, vecB); \
-            _mm_store_ps (out, res); \
+            _mm_storeu_ps (out, res); \
             out += types_per_step; \
         } \
     }
@@ -899,12 +899,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) { \
         types_per_step = (512 / 8) / sizeof(double); \
         for (; left_over >= types_per_step; left_over -= types_per_step) { \
-            __m512d vecA = _mm512_load_pd ((in1)); \
-            __m512d vecB = _mm512_load_pd ((in2)); \
+            __m512d vecA = _mm512_loadu_pd ((in1)); \
+            __m512d vecB = _mm512_loadu_pd ((in2)); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m512d res = _mm512_##op##_pd(vecA, vecB); \
-            _mm512_store_pd ((out), res); \
+            _mm512_storeu_pd ((out), res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -918,12 +918,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX_FLAG) ) { \
         types_per_step = (256 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m256d vecA = _mm256_load_pd (in1); \
-            __m256d vecB = _mm256_load_pd (in2); \
+            __m256d vecA = _mm256_loadu_pd (in1); \
+            __m256d vecB = _mm256_loadu_pd (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m256d res = _mm256_##op##_pd(vecA, vecB); \
-            _mm256_store_pd (out, res); \
+            _mm256_storeu_pd (out, res); \
             out += types_per_step; \
         } \
         if( 0 == left_over ) return; \
@@ -937,12 +937,12 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1,
     if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE2_FLAG) ) { \
         types_per_step = (128 / 8) / sizeof(double); \
         for( ; left_over >= types_per_step; left_over -= types_per_step ) { \
-            __m128d vecA = _mm_load_pd (in1); \
-            __m128d vecB = _mm_load_pd (in2); \
+            __m128d vecA = _mm_loadu_pd (in1); \
+            __m128d vecB = _mm_loadu_pd (in2); \
             in1 += types_per_step; \
             in2 += types_per_step; \
             __m128d res = _mm_##op##_pd(vecA, vecB); \
-            _mm_store_pd (out, res); \
+            _mm_storeu_pd (out, res); \
             out += types_per_step; \
         } \
     }
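
The change is mechanical throughout: every aligned SIMD load/store (_mm*_load_* / _mm*_store_*) is replaced by its unaligned counterpart (_mm*_loadu_* / _mm*_storeu_*). MPI only guarantees that reduction buffers are aligned to their element type (4 bytes for float, 8 for double), not to the 16/32/64-byte boundaries the aligned intrinsics require, so the aligned forms can fault on perfectly legal buffers. Below is a minimal standalone sketch of the resulting loop shape, shown for the 256-bit float case with a plain addition standing in for the ##op## expansion; the function name and scalar tail are illustrative only, not part of the patch.

/* Sketch only: illustrates the loadu/storeu pattern the patch adopts.
 * Compile with -mavx; sum_floats_avx and the fixed "add" operation are
 * placeholders for the macro-generated ompi_op_avx_* functions. */
#include <immintrin.h>
#include <stddef.h>

static void sum_floats_avx(const float *in, float *out, size_t left_over)
{
    size_t types_per_step = (256 / 8) / sizeof(float);   /* 8 floats per __m256 */
    for (; left_over >= types_per_step; left_over -= types_per_step) {
        /* loadu/storeu accept any address; _mm256_load_ps/_mm256_store_ps
         * require 32-byte alignment and may fault otherwise. */
        __m256 vecA = _mm256_loadu_ps(in);
        __m256 vecB = _mm256_loadu_ps(out);
        __m256 res  = _mm256_add_ps(vecA, vecB);
        _mm256_storeu_ps(out, res);
        in  += types_per_step;
        out += types_per_step;
    }
    for (; left_over > 0; left_over--)    /* scalar tail */
        *out++ += *in++;
}

On recent x86 CPUs the unaligned forms generally carry no penalty when the pointer happens to be aligned, so this trades essentially nothing for correctness on arbitrary user buffers.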