@@ -596,6 +596,25 @@ pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
596596 )
597597}
598598
599+ /// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
600+ /// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
601+ /// any instructions.
602+ ///
603+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
604+ #[inline]
605+ #[target_feature(enable = "avx512fp16")]
606+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
607+ pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
608+ simd_shuffle!(
609+ a,
610+ _mm256_setzero_ph(),
611+ [
612+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
613+ 16, 16, 16, 16, 16, 16, 16, 16
614+ ]
615+ )
616+ }
617+
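As a sanity check on the new intrinsic, here is a hedged usage sketch. It assumes `_mm256_set1_ph` and `_mm512_reduce_min_ph` from this module (neither appears in this diff, so treat both as assumptions); `_mm512_cvtsh_h` is added later in this same PR.

// Illustrative only: `simd_shuffle!` indexes the concatenation [a, zero],
// so index 16 selects the first lane of the zero vector; lanes 16..32 of
// the result are therefore all 0.0.
let a = _mm256_set1_ph(1.0);
let r = _mm512_zextph256_ph512(a);
assert_eq!(_mm512_cvtsh_h(r), 1.0); // lane 0 comes from `a`
assert_eq!(_mm512_reduce_min_ph(r), 0.0); // upper 16 lanes were zeroed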
599618/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
600619/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
601620/// any instructions.
@@ -615,10 +634,10 @@ pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
615634 )
616635}
617636
618- macro_rules! cmp_asm {
637+ macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
619638 ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
620639 let dst: $mask_type;
621- crate::arch:: asm!(
640+ asm!(
622641 "vcmpph {k}, {a}, {b}, {imm8}",
623642 k = lateout(kreg) dst,
624643 a = in($reg) $a,
@@ -630,7 +649,7 @@ macro_rules! cmp_asm {
630649 }};
631650 ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
632651 let dst: $mask_type;
633- crate::arch:: asm!(
652+ asm!(
634653 "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
635654 k = lateout(kreg) dst,
636655 mask = in(kreg) $mask,
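For readers unfamiliar with the macro, a call such as `cmp_asm!(__mmask32, zmm_reg, a, b)` inside a function with a `const IMM5: i32` parameter expands to roughly the following. This is a sketch: the `imm8 = const IMM5` binding and the `options(...)` line are cut off by this hunk and inferred from the round variant added below.

let dst: __mmask32;
asm!(
    "vcmpph {k}, {a}, {b}, {imm8}",
    k = lateout(kreg) dst,
    a = in(zmm_reg) a,
    b = in(zmm_reg) b,
    imm8 = const IMM5,
    options(pure, nomem, nostack)
);
dst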
@@ -736,6 +755,73 @@ pub unsafe fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
736755 cmp_asm!(__mmask32, k1, zmm_reg, a, b)
737756}
738757
758+ /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
759+ /// operand specified by imm8, and store the results in mask vector k.
760+ ///
761+ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
762+ ///
763+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
764+ #[inline]
765+ #[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
766+ #[rustc_legacy_const_generics(2, 3)]
767+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
768+ pub unsafe fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
769+ a: __m512h,
770+ b: __m512h,
771+ ) -> __mmask32 {
772+ static_assert_uimm_bits!(IMM5, 5);
773+ static_assert_sae!(SAE);
774+ if SAE == _MM_FROUND_NO_EXC {
775+ let dst: __mmask32;
776+ asm!(
777+ "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
778+ k = lateout(kreg) dst,
779+ a = in(zmm_reg) a,
780+ b = in(zmm_reg) b,
781+ imm8 = const IMM5,
782+ options(pure, nomem, nostack)
783+ );
784+ dst
785+ } else {
786+ cmp_asm!(__mmask32, zmm_reg, a, b)
787+ }
788+ }
789+
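A minimal usage sketch for the new intrinsic, assuming the `_CMP_EQ_OQ` predicate constant from this crate (values are illustrative, not from this PR's tests):

let a = _mm512_set1_ph(1.0);
let b = _mm512_set1_ph(-1.0);
// Exceptions suppressed via SAE; no lane compares equal, so the mask is 0.
let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
assert_eq!(r, 0);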
790+ /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
791+ /// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
792+ /// zeroed out when the corresponding mask bit is not set).
793+ ///
794+ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
795+ ///
796+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
797+ #[inline]
798+ #[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
799+ #[rustc_legacy_const_generics(3, 4)]
800+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
801+ pub unsafe fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
802+ k1: __mmask32,
803+ a: __m512h,
804+ b: __m512h,
805+ ) -> __mmask32 {
806+ static_assert_uimm_bits!(IMM5, 5);
807+ static_assert_sae!(SAE);
808+ if SAE == _MM_FROUND_NO_EXC {
809+ let dst: __mmask32;
810+ asm!(
811+ "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
812+ k = lateout(kreg) dst,
813+ k1 = in(kreg) k1,
814+ a = in(zmm_reg) a,
815+ b = in(zmm_reg) b,
816+ imm8 = const IMM5,
817+ options(pure, nomem, nostack)
818+ );
819+ dst
820+ } else {
821+ cmp_asm!(__mmask32, k1, zmm_reg, a, b)
822+ }
823+ }
824+
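The masked variant applies zeromask semantics: the comparison result is ANDed with `k1`. A hedged sketch:

let a = _mm512_set1_ph(2.0);
let b = _mm512_set1_ph(2.0);
// Every lane compares equal, but only bits set in `k1` survive.
let k1: __mmask32 = 0b0101;
let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(k1, a, b);
assert_eq!(r, 0b0101);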
739825/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
740826/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
741827/// passing _MM_FROUND_NO_EXC in the sae parameter.
@@ -803,25 +889,6 @@ pub unsafe fn _mm_mask_cmp_sh_mask<const IMM5: i32>(
803889 _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
804890}
805891
806- /// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
807- /// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
808- /// any instructions.
809- ///
810- /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
811- #[inline]
812- #[target_feature(enable = "avx512fp16")]
813- #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
814- pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
815- simd_shuffle!(
816- a,
817- _mm256_setzero_ph(),
818- [
819- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
820- 16, 16, 16, 16, 16, 16, 16, 16
821- ]
822- )
823- }
824-
825892/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
826893/// operand specified by imm8, and return the boolean result (0 or 1).
827894/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
@@ -10942,10 +11009,10 @@ pub unsafe fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
1094211009 _mm256_reduce_max_ph(_mm256_max_ph(p, q))
1094311010}
1094411011
10945- macro_rules! fpclass_asm {
11012+ macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
1094611013 ($mask_type: ty, $reg: ident, $a: expr) => {{
1094711014 let dst: $mask_type;
10948- crate::arch:: asm!(
11015+ asm!(
1094911016 "vfpclassph {k}, {src}, {imm8}",
1095011017 k = lateout(kreg) dst,
1095111018 src = in($reg) $a,
@@ -10956,7 +11023,7 @@ macro_rules! fpclass_asm {
1095611023 }};
1095711024 ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
1095811025 let dst: $mask_type;
10959- crate::arch:: asm!(
11026+ asm!(
1096011027 "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
1096111028 k = lateout(kreg) dst,
1096211029 mask = in(kreg) $mask,
@@ -15873,6 +15940,56 @@ pub unsafe fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(
1587315940 _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
1587415941}
1587515942
15943+ /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15944+ ///
15945+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
15946+ #[inline]
15947+ #[target_feature(enable = "avx512fp16")]
15948+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15949+ pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 {
15950+ simd_extract!(a, 0)
15951+ }
15952+
15953+ /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15954+ ///
15955+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
15956+ #[inline]
15957+ #[target_feature(enable = "avx512fp16")]
15958+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15959+ pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 {
15960+ simd_extract!(a, 0)
15961+ }
15962+
15963+ /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15964+ ///
15965+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
15966+ #[inline]
15967+ #[target_feature(enable = "avx512fp16")]
15968+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15969+ pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 {
15970+ simd_extract!(a, 0)
15971+ }
15972+
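All three `cvtsh_h` variants simply extract lane 0; for example (sketch):

let v = _mm512_set1_ph(3.5);
assert_eq!(_mm512_cvtsh_h(v), 3.5);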
15973+ /// Copy the lower 16-bit integer in a to dst.
15974+ ///
15975+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
15976+ #[inline]
15977+ #[target_feature(enable = "avx512fp16")]
15978+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15979+ pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
15980+ simd_extract!(a.as_i16x8(), 0)
15981+ }
15982+
15983+ /// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
15984+ ///
15985+ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
15986+ #[inline]
15987+ #[target_feature(enable = "avx512fp16")]
15988+ #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15989+ pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
15990+ transmute(simd_insert!(i16x8::splat(0), 0, a))
15991+ }
15992+
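The two integer helpers are inverses on lane 0; a round-trip sketch (illustrative values):

let x: i16 = -7;
let v = _mm_cvtsi16_si128(x); // lane 0 = -7, lanes 1..8 zeroed
assert_eq!(_mm_cvtsi128_si16(v), x);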
1587615993#[allow(improper_ctypes)]
1587715994extern "C" {
1587815995 #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
@@ -16693,6 +16810,42 @@ mod tests {
1669316810 assert_eq!(r, 0b01010000010100000101000001010000);
1669416811 }
1669516812
16813+ #[simd_test(enable = "avx512fp16")]
16814+ unsafe fn test_mm512_cmp_round_ph_mask() {
16815+ let a = _mm512_set_ph(
16816+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16817+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16818+ 31.0, 32.0,
16819+ );
16820+ let b = _mm512_set_ph(
16821+ 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
16822+ -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
16823+ -29.0, -30.0, -31.0, -32.0,
16824+ );
16825+ let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
16826+ assert_eq!(r, 0b11110000111100001111000011110000);
16827+ }
16828+
16829+ #[simd_test(enable = "avx512fp16")]
16830+ unsafe fn test_mm512_mask_cmp_round_ph_mask() {
16831+ let a = _mm512_set_ph(
16832+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16833+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16834+ 31.0, 32.0,
16835+ );
16836+ let b = _mm512_set_ph(
16837+ 1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
16838+ -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
16839+ -29.0, -30.0, -31.0, -32.0,
16840+ );
16841+ let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
16842+ 0b01010101010101010101010101010101,
16843+ a,
16844+ b,
16845+ );
16846+ assert_eq!(r, 0b01010000010100000101000001010000);
16847+ }
16848+
1669616849 #[simd_test(enable = "avx512fp16")]
1669716850 unsafe fn test_mm_cmp_round_sh_mask() {
1669816851 let a = _mm_set_sh(1.0);
@@ -26800,4 +26953,46 @@ mod tests {
2680026953 let e = _mm_setr_pd(1.0, 20.0);
2680126954 assert_eq_m128d(r, e);
2680226955 }
26956+
26957+ #[simd_test(enable = "avx512fp16")]
26958+ unsafe fn test_mm_cvtsh_h() {
26959+ let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
26960+ let r = _mm_cvtsh_h(a);
26961+ assert_eq!(r, 1.0);
26962+ }
26963+
26964+ #[simd_test(enable = "avx512fp16")]
26965+ unsafe fn test_mm256_cvtsh_h() {
26966+ let a = _mm256_setr_ph(
26967+ 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26968+ );
26969+ let r = _mm256_cvtsh_h(a);
26970+ assert_eq!(r, 1.0);
26971+ }
26972+
26973+ #[simd_test(enable = "avx512fp16")]
26974+ unsafe fn test_mm512_cvtsh_h() {
26975+ let a = _mm512_setr_ph(
26976+ 1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26977+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26978+ 31.0, 32.0,
26979+ );
26980+ let r = _mm512_cvtsh_h(a);
26981+ assert_eq!(r, 1.0);
26982+ }
26983+
26984+ #[simd_test(enable = "avx512fp16")]
26985+ unsafe fn test_mm_cvtsi128_si16() {
26986+ let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
26987+ let r = _mm_cvtsi128_si16(a);
26988+ assert_eq!(r, 1);
26989+ }
26990+
26991+ #[simd_test(enable = "avx512fp16")]
26992+ unsafe fn test_mm_cvtsi16_si128() {
26993+ let a = 1;
26994+ let r = _mm_cvtsi16_si128(a);
26995+ let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
26996+ assert_eq_m128i(r, e);
26997+ }
2680326998}