@@ -8013,7 +8013,7 @@ pub unsafe fn _mm_maskz_dbsad_epu8<const IMM8: i32>(
80138013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi16_mask&expand=3873)
80148014#[inline]
80158015#[target_feature(enable = "avx512bw")]
8016- #[cfg_attr(test, assert_instr(mov ))] // should be vpmovw2m but msvc does not generate it
8016+ #[cfg_attr(test, assert_instr(vpmovw2m ))]
80178017pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 {
80188018 let filter = _mm512_set1_epi16(1 << 15);
80198019 let a = _mm512_and_si512(a, filter);
@@ -8025,7 +8025,7 @@ pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 {
80258025/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi16_mask&expand=3872)
80268026#[inline]
80278027#[target_feature(enable = "avx512bw,avx512vl")]
8028- #[cfg_attr(test, assert_instr(mov ))] // should be vpmovw2m but msvc does not generate it
8028+ #[cfg_attr(test, assert_instr(vpmovw2m ))]
80298029pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 {
80308030 let filter = _mm256_set1_epi16(1 << 15);
80318031 let a = _mm256_and_si256(a, filter);
@@ -8037,7 +8037,7 @@ pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 {
80378037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi16_mask&expand=3871)
80388038#[inline]
80398039#[target_feature(enable = "avx512bw,avx512vl")]
8040- #[cfg_attr(test, assert_instr(mov ))] // should be vpmovw2m but msvc does not generate it
8040+ #[cfg_attr(test, assert_instr(vpmovw2m ))]
80418041pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 {
80428042 let filter = _mm_set1_epi16(1 << 15);
80438043 let a = _mm_and_si128(a, filter);
@@ -8049,7 +8049,7 @@ pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 {
80498049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movepi8_mask&expand=3883)
80508050#[inline]
80518051#[target_feature(enable = "avx512bw")]
8052- #[cfg_attr(test, assert_instr(mov ))] // should be vpmovb2m but msvc does not generate it
8052+ #[cfg_attr(test, assert_instr(vpmovb2m ))]
80538053pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 {
80548054 let filter = _mm512_set1_epi8(1 << 7);
80558055 let a = _mm512_and_si512(a, filter);
@@ -8061,7 +8061,8 @@ pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 {
80618061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movepi8_mask&expand=3882)
80628062#[inline]
80638063#[target_feature(enable = "avx512bw,avx512vl")]
8064- #[cfg_attr(test, assert_instr(mov))] // should be vpmovb2m but msvc does not generate it
8064+ #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m, but the test shim compiles to vpmovmskb because that takes fewer cycles
8065+ // than using vpmovb2m plus converting the mask register to a general-purpose register.
80658066pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 {
80668067 let filter = _mm256_set1_epi8(1 << 7);
80678068 let a = _mm256_and_si256(a, filter);
@@ -8073,7 +8074,8 @@ pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 {
80738074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi8_mask&expand=3881)
80748075#[inline]
80758076#[target_feature(enable = "avx512bw,avx512vl")]
8076- #[cfg_attr(test, assert_instr(mov))] // should be vpmovb2m but msvc does not generate it
8077+ #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m, but the test shim compiles to vpmovmskb because that takes fewer cycles
8078+ // than using vpmovb2m plus converting the mask register to a general-purpose register.
80778079pub unsafe fn _mm_movepi8_mask(a: __m128i) -> __mmask16 {
80788080 let filter = _mm_set1_epi8(1 << 7);
80798081 let a = _mm_and_si128(a, filter);
@@ -8216,8 +8218,9 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
82168218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask32&expand=3207)
82178219#[inline]
82188220#[target_feature(enable = "avx512bw")]
8219- #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kaddd
8220- //llvm.x86.avx512.kadd.d
8221+ #[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
8222+ #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddd
8223+ //llvm.x86.avx512.kadd.d
82218224pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
82228225 transmute(a + b)
82238226}
@@ -8227,7 +8230,9 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
82278230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kadd_mask64&expand=3208)
82288231#[inline]
82298232#[target_feature(enable = "avx512bw")]
8230- #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kaddq
8233+ #[cfg_attr(all(test, target_arch = "x86"), assert_instr(add))]
8234+ #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(lea))] // generate normal lea/add code instead of kaddd
8235+ //llvm.x86.avx512.kadd.d
82318236pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
82328237 transmute(a + b)
82338238}
0 commit comments