diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 88fe4cb085..44d83d0131 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -5021,7 +5021,7 @@ pub unsafe fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float6
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v1f64")]
         fn vfma_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
     }
-    vfma_f64_(a, b, c)
+    vfma_f64_(b, c, a)
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -5034,7 +5034,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f64")]
         fn vfmaq_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
     }
-    vfmaq_f64_(a, b, c)
+    vfmaq_f64_(b, c, a)
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -5042,8 +5042,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fmadd))]
 pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
-    let d: float64x1_t = transmute(f64x1::new(c));
-    vfma_f64(b, transmute(d), a)
+    vfma_f64(a, b, vdup_n_f64(c))
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -5051,8 +5050,301 @@ pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fmla))]
 pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
-    let d: float64x2_t = transmute(f64x2::new(c, c));
-    vfmaq_f64(b, d, a)
+    vfmaq_f64(a, b, vdupq_n_f64(c))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfma_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfma_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    static_assert_imm2!(LANE);
+    vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfma_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfma_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
+    static_assert_imm1!(LANE);
+    vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmaq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmaq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+    static_assert_imm1!(LANE);
+    vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmas_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
+        fn vfmas_lane_f32_(a: f32, b: f32, c: f32) -> f32;
+    }
+    static_assert_imm1!(LANE);
+    let c: f32 = simd_extract(c, LANE as u32);
+    vfmas_lane_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmas_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
+        fn vfmas_laneq_f32_(a: f32, b: f32, c: f32) -> f32;
+    }
+    static_assert_imm2!(LANE);
+    let c: f32 = simd_extract(c, LANE as u32);
+    vfmas_laneq_f32_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmad_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
+        fn vfmad_lane_f64_(a: f64, b: f64, c: f64) -> f64;
+    }
+    static_assert!(LANE : i32 where LANE == 0);
+    let c: f64 = simd_extract(c, LANE as u32);
+    vfmad_lane_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-add to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmad_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
+        fn vfmad_laneq_f64_(a: f64, b: f64, c: f64) -> f64;
+    }
+    static_assert_imm1!(LANE);
+    let c: f64 = simd_extract(c, LANE as u32);
+    vfmad_laneq_f64_(b, c, a)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub))]
+pub unsafe fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+    let b: float64x1_t = simd_neg(b);
+    vfma_f64(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls))]
+pub unsafe fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+    let b: float64x2_t = simd_neg(b);
+    vfmaq_f64(a, b, c)
+}
+
+/// Floating-point fused Multiply-subtract to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub))]
+pub unsafe fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
+    vfms_f64(a, b, vdup_n_f64(c))
+}
+
+/// Floating-point fused Multiply-subtract to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls))]
+pub unsafe fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
+    vfmsq_f64(a, b, vdupq_n_f64(c))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfms_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfms_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    static_assert_imm2!(LANE);
+    vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfms_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfms_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
+    static_assert_imm1!(LANE);
+    vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
+    static_assert_imm1!(LANE);
+    vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmss_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
+    vfmas_lane_f32::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmss_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
+    vfmas_laneq_f32::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsd_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
+    vfmad_lane_f64::<LANE>(a, -b, c)
+}
+
+/// Floating-point fused multiply-subtract to accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vfmsd_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
+    vfmad_laneq_f64::<LANE>(a, -b, c)
 }
 
 /// Divide
@@ -13006,9 +13298,9 @@ mod test {
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vfma_f64() {
-        let a: f64 = 2.0;
+        let a: f64 = 8.0;
         let b: f64 = 6.0;
-        let c: f64 = 8.0;
+        let c: f64 = 2.0;
         let e: f64 = 20.0;
         let r: f64 = transmute(vfma_f64(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
@@ -13016,9 +13308,9 @@ mod test {
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vfmaq_f64() {
-        let a: f64x2 = f64x2::new(2.0, 3.0);
+        let a: f64x2 = f64x2::new(8.0, 18.0);
         let b: f64x2 = f64x2::new(6.0, 4.0);
-        let c: f64x2 = f64x2::new(8.0, 18.0);
+        let c: f64x2 = f64x2::new(2.0, 3.0);
         let e: f64x2 = f64x2::new(20.0, 30.0);
         let r: f64x2 = transmute(vfmaq_f64(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
@@ -13044,6 +13336,286 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_lane_f32() {
+        let a: f32x2 = f32x2::new(2., 3.);
+        let b: f32x2 = f32x2::new(6., 4.);
+        let c: f32x2 = f32x2::new(2., 0.);
+        let e: f32x2 = f32x2::new(14., 11.);
+        let r: f32x2 = transmute(vfma_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_laneq_f32() {
+        let a: f32x2 = f32x2::new(2., 3.);
+        let b: f32x2 = f32x2::new(6., 4.);
+        let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32x2 = f32x2::new(14., 11.);
+        let r: f32x2 = transmute(vfma_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_lane_f32() {
+        let a: f32x4 = f32x4::new(2., 3., 4., 5.);
+        let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+        let c: f32x2 = f32x2::new(2., 0.);
+        let e: f32x4 = f32x4::new(14., 11., 18., 21.);
+        let r: f32x4 = transmute(vfmaq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_laneq_f32() {
+        let a: f32x4 = f32x4::new(2., 3., 4., 5.);
+        let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+        let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32x4 = f32x4::new(14., 11., 18., 21.);
+        let r: f32x4 = transmute(vfmaq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_lane_f64() {
+        let a: f64 = 2.;
+        let b: f64 = 6.;
+        let c: f64 = 2.;
+        let e: f64 = 14.;
+        let r: f64 = transmute(vfma_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_laneq_f64() {
+        let a: f64 = 2.;
+        let b: f64 = 6.;
+        let c: f64x2 = f64x2::new(2., 0.);
+        let e: f64 = 14.;
+        let r: f64 = transmute(vfma_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_lane_f64() {
+        let a: f64x2 = f64x2::new(2., 3.);
+        let b: f64x2 = f64x2::new(6., 4.);
+        let c: f64 = 2.;
+        let e: f64x2 = f64x2::new(14., 11.);
+        let r: f64x2 = transmute(vfmaq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_laneq_f64() {
+        let a: f64x2 = f64x2::new(2., 3.);
+        let b: f64x2 = f64x2::new(6., 4.);
+        let c: f64x2 = f64x2::new(2., 0.);
+        let e: f64x2 = f64x2::new(14., 11.);
+        let r: f64x2 = transmute(vfmaq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmas_lane_f32() {
+        let a: f32 = 2.;
+        let b: f32 = 6.;
+        let c: f32x2 = f32x2::new(3., 0.);
+        let e: f32 = 20.;
+        let r: f32 = transmute(vfmas_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmas_laneq_f32() {
+        let a: f32 = 2.;
+        let b: f32 = 6.;
+        let c: f32x4 = f32x4::new(3., 0., 0., 0.);
+        let e: f32 = 20.;
+        let r: f32 = transmute(vfmas_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmad_lane_f64() {
+        let a: f64 = 2.;
+        let b: f64 = 6.;
+        let c: f64 = 3.;
+        let e: f64 = 20.;
+        let r: f64 = transmute(vfmad_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmad_laneq_f64() {
+        let a: f64 = 2.;
+        let b: f64 = 6.;
+        let c: f64x2 = f64x2::new(3., 0.);
+        let e: f64 = 20.;
+        let r: f64 = transmute(vfmad_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_f64() {
+        let a: f64 = 20.0;
+        let b: f64 = 6.0;
+        let c: f64 = 2.0;
+        let e: f64 = 8.0;
+        let r: f64 = transmute(vfms_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_f64() {
+        let a: f64x2 = f64x2::new(20.0, 30.0);
+        let b: f64x2 = f64x2::new(6.0, 4.0);
+        let c: f64x2 = f64x2::new(2.0, 3.0);
+        let e: f64x2 = f64x2::new(8.0, 18.0);
+        let r: f64x2 = transmute(vfmsq_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_n_f64() {
+        let a: f64 = 50.0;
+        let b: f64 = 6.0;
+        let c: f64 = 8.0;
+        let e: f64 = 2.0;
+        let r: f64 = transmute(vfms_n_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_n_f64() {
+        let a: f64x2 = f64x2::new(50.0, 35.0);
+        let b: f64x2 = f64x2::new(6.0, 4.0);
+        let c: f64 = 8.0;
+        let e: f64x2 = f64x2::new(2.0, 3.0);
+        let r: f64x2 = transmute(vfmsq_n_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_lane_f32() {
+        let a: f32x2 = f32x2::new(14., 11.);
+        let b: f32x2 = f32x2::new(6., 4.);
+        let c: f32x2 = f32x2::new(2., 0.);
+        let e: f32x2 = f32x2::new(2., 3.);
+        let r: f32x2 = transmute(vfms_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_laneq_f32() {
+        let a: f32x2 = f32x2::new(14., 11.);
+        let b: f32x2 = f32x2::new(6., 4.);
+        let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32x2 = f32x2::new(2., 3.);
+        let r: f32x2 = transmute(vfms_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_lane_f32() {
+        let a: f32x4 = f32x4::new(14., 11., 18., 21.);
+        let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+        let c: f32x2 = f32x2::new(2., 0.);
+        let e: f32x4 = f32x4::new(2., 3., 4., 5.);
+        let r: f32x4 = transmute(vfmsq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_laneq_f32() {
+        let a: f32x4 = f32x4::new(14., 11., 18., 21.);
+        let b: f32x4 = f32x4::new(6., 4., 7., 8.);
+        let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32x4 = f32x4::new(2., 3., 4., 5.);
+        let r: f32x4 = transmute(vfmsq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_lane_f64() {
+        let a: f64 = 14.;
+        let b: f64 = 6.;
+        let c: f64 = 2.;
+        let e: f64 = 2.;
+        let r: f64 = transmute(vfms_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_laneq_f64() {
+        let a: f64 = 14.;
+        let b: f64 = 6.;
+        let c: f64x2 = f64x2::new(2., 0.);
+        let e: f64 = 2.;
+        let r: f64 = transmute(vfms_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_lane_f64() {
+        let a: f64x2 = f64x2::new(14., 11.);
+        let b: f64x2 = f64x2::new(6., 4.);
+        let c: f64 = 2.;
+        let e: f64x2 = f64x2::new(2., 3.);
+        let r: f64x2 = transmute(vfmsq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_laneq_f64() {
+        let a: f64x2 = f64x2::new(14., 11.);
+        let b: f64x2 = f64x2::new(6., 4.);
+        let c: f64x2 = f64x2::new(2., 0.);
+        let e: f64x2 = f64x2::new(2., 3.);
+        let r: f64x2 = transmute(vfmsq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmss_lane_f32() {
+        let a: f32 = 14.;
+        let b: f32 = 6.;
+        let c: f32x2 = f32x2::new(2., 0.);
+        let e: f32 = 2.;
+        let r: f32 = transmute(vfmss_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmss_laneq_f32() {
+        let a: f32 = 14.;
+        let b: f32 = 6.;
+        let c: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32 = 2.;
+        let r: f32 = transmute(vfmss_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsd_lane_f64() {
+        let a: f64 = 14.;
+        let b: f64 = 6.;
+        let c: f64 = 2.;
+        let e: f64 = 2.;
+        let r: f64 = transmute(vfmsd_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsd_laneq_f64() {
+        let a: f64 = 14.;
+        let b: f64 = 6.;
+        let c: f64x2 = f64x2::new(2., 0.);
+        let e: f64 = 2.;
+        let r: f64 = transmute(vfmsd_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vdiv_f32() {
         let a: f32x2 = f32x2::new(2.0, 6.0);
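A note on the reordered shims above: `llvm.fma.*` takes its operands as `fma(x, y, z) = x * y + z`, with the addend last, while the NEON `vfma` intrinsics take the accumulator first, so forwarding `(b, c, a)` yields `a + b * c`. Below is a minimal usage sketch of the new lane intrinsics. It is not part of the patch: the function name is hypothetical, and it assumes a nightly toolchain targeting aarch64 where these intrinsics are exposed.

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn lane_fma_demo() {
    use core::arch::aarch64::*;
    use core::mem::transmute;

    let acc: float32x4_t = vdupq_n_f32(2.0); // a: the accumulator
    let x: float32x4_t = vdupq_n_f32(6.0); // b: multiplied lane-wise
    let coef: float32x4_t = transmute([2.0f32, 3.0, 4.0, 5.0]); // c: coefficients

    // Each lane computes acc[i] + x[i] * coef[LANE]; the lane index is a
    // const generic validated at compile time (static_assert_imm2!).
    let r = vfmaq_laneq_f32::<1>(acc, x, coef);
    assert_eq!(vgetq_lane_f32::<0>(r), 2.0 + 6.0 * 3.0); // 20.0
}
```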
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 0387799f6f..296b86469a 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -6607,7 +6607,7 @@ pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float3
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
         fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
     }
-vfma_f32_(a, b, c)
+vfma_f32_(b, c, a)
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -6623,7 +6623,7 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
         fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
     }
-vfmaq_f32_(a, b, c)
+vfmaq_f32_(b, c, a)
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -6633,8 +6633,7 @@ vfmaq_f32_(a, b, c)
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
 pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
-    let d: float32x2_t = transmute(f32x2::new(c, c));
-    vfma_f32(b, d, a)
+    vfma_f32(a, b, vdup_n_f32(c))
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -6644,8 +6643,49 @@ pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
 pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
-    let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
-    vfmaq_f32(b, d, a)
+    vfmaq_f32(a, b, vdupq_n_f32(c))
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    let b: float32x2_t = simd_neg(b);
+    vfma_f32(a, b, c)
+}
+
+/// Floating-point fused multiply-subtract from accumulator
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    let b: float32x4_t = simd_neg(b);
+    vfmaq_f32(a, b, c)
+}
+
+/// Floating-point fused Multiply-subtract to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    vfms_f32(a, b, vdup_n_f32(c))
+}
+
+/// Floating-point fused Multiply-subtract to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
+pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    vfmsq_f32(a, b, vdupq_n_f32(c))
 }
 
 /// Subtract
@@ -19484,9 +19524,9 @@ mod test {
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vfma_f32() {
-        let a: f32x2 = f32x2::new(2.0, 3.0);
+        let a: f32x2 = f32x2::new(8.0, 18.0);
         let b: f32x2 = f32x2::new(6.0, 4.0);
-        let c: f32x2 = f32x2::new(8.0, 18.0);
+        let c: f32x2 = f32x2::new(2.0, 3.0);
         let e: f32x2 = f32x2::new(20.0, 30.0);
         let r: f32x2 = transmute(vfma_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
@@ -19494,9 +19534,9 @@ mod test {
 
     #[simd_test(enable = "neon")]
    unsafe fn test_vfmaq_f32() {
-        let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let a: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
         let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
-        let c: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
+        let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
         let e: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
         let r: f32x4 = transmute(vfmaq_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
@@ -19522,6 +19562,46 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_f32() {
+        let a: f32x2 = f32x2::new(20.0, 30.0);
+        let b: f32x2 = f32x2::new(6.0, 4.0);
+        let c: f32x2 = f32x2::new(2.0, 3.0);
+        let e: f32x2 = f32x2::new(8.0, 18.0);
+        let r: f32x2 = transmute(vfms_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_f32() {
+        let a: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
+        let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+        let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let e: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
+        let r: f32x4 = transmute(vfmsq_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfms_n_f32() {
+        let a: f32x2 = f32x2::new(50.0, 35.0);
+        let b: f32x2 = f32x2::new(6.0, 4.0);
+        let c: f32 = 8.0;
+        let e: f32x2 = f32x2::new(2.0, 3.0);
+        let r: f32x2 = transmute(vfms_n_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmsq_n_f32() {
+        let a: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
+        let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+        let c: f32 = 8.0;
+        let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let r: f32x4 = transmute(vfmsq_n_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vsub_s8() {
         let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
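The `vfms` family added here is built by negating `b` and reusing the fused add, so `vfms_f32(a, b, c)` computes `a - b * c` per lane in a single fused step. A hypothetical sketch mirroring `test_vfmsq_n_f32` above (the function name is invented; the same call also works on 32-bit ARM targets where the intrinsic is available, per the `target_feature` attributes in the patch):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn fms_demo() {
    use core::arch::aarch64::*;

    let a = vdupq_n_f32(50.0);
    let b = vdupq_n_f32(6.0);
    // Each lane computes a[i] - b[i] * 8.0 == 2.0 with a single rounding.
    let r = vfmsq_n_f32(a, b, 8.0);
    assert_eq!(vgetq_lane_f32::<0>(r), 2.0);
}
```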
diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
index f69de39255..5e5b6fa6d5 100644
--- a/crates/core_arch/src/lib.rs
+++ b/crates/core_arch/src/lib.rs
@@ -37,7 +37,6 @@
     external_doc,
     allow_internal_unstable,
     decl_macro,
-    extended_key_value_attributes,
     bench_black_box
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall))]
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 825ecf5115..159419285f 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2402,31 +2402,27 @@ generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:floa
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
 name = vfma
-a = 2.0, 3.0, 4.0, 5.0
+multi_fn = vfma-self-_, b, c, a
+a = 8.0, 18.0, 12.0, 10.0
 b = 6.0, 4.0, 7.0, 8.0
-c = 8.0, 18.0, 12.0, 10.0
+c = 2.0, 3.0, 4.0, 5.0
 validate 20.0, 30.0, 40.0, 50.0
 
-aarch64 = fmadd
 link-aarch64 = llvm.fma._EXT_
+aarch64 = fmadd
 generate float64x1_t
-
 aarch64 = fmla
-link-aarch64 = llvm.fma._EXT_
 generate float64x2_t
 
 target = fp-armv8
 arm = vfma
-aarch64 = fmla
 link-arm = llvm.fma._EXT_
-link-aarch64 = llvm.fma._EXT_
 generate float*_t
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
 name = vfma
 n-suffix
-multi_fn = transmute, d:in_t, {f64x1::new, c}
-multi_fn = vfma-self-noext, b, transmute(d), a
+multi_fn = vfma-self-noext, a, b, {vdup-nself-noext, c}
 a = 2.0, 3.0, 4.0, 5.0
 b = 6.0, 4.0, 7.0, 8.0
 c = 8.0
@@ -2434,49 +2430,126 @@ validate 50.0, 35.0, 60.0, 69.0
 
 aarch64 = fmadd
 generate float64x1_t:float64x1_t:f64:float64x1_t
+aarch64 = fmla
+generate float64x2_t:float64x2_t:f64:float64x2_t
 
-/// Floating-point fused Multiply-Add to accumulator(vector)
+target = fp-armv8
+arm = vfma
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Floating-point fused multiply-add to accumulator
 name = vfma
-n-suffix
-multi_fn = transmute, d:in_t, {f64x2::new, c, c}
-multi_fn = vfma-self-noext, b, d, a
-a = 2.0, 3.0, 4.0, 5.0
-b = 6.0, 4.0, 7.0, 8.0
-c = 8.0
-validate 50.0, 35.0, 60.0, 69.0
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+a = 2., 3., 4., 5.
+b = 6., 4., 7., 8.
+c = 2., 0., 0., 0.
+n = 0
+validate 14., 11., 18., 21.
 
 aarch64 = fmla
-generate float64x2_t:float64x2_t:f64:float64x2_t
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+aarch64 = fmadd
+generate float64x1_t
+aarch64 = fmla
+generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
 
-/// Floating-point fused Multiply-Add to accumulator(vector)
+/// Floating-point fused multiply-add to accumulator
 name = vfma
-n-suffix
-multi_fn = transmute, d:in_t, {f32x2::new, c, c}
-multi_fn = vfma-self-noext, b, d, a
-a = 2.0, 3.0, 4.0, 5.0
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = simd_extract, c:out_t, c, LANE as u32
+multi_fn = vfma-in2lane-_, b, c, a
+a = 2.
+b = 6.
+c = 3., 0., 0., 0.
+n = 0
+validate 20.
+
+aarch64 = fmla
+link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32
+generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
+link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64
+aarch64 = fmadd
+generate f64:f64:float64x1_t:f64
+aarch64 = fmla
+generate f64:f64:float64x2_t:f64
+
+/// Floating-point fused multiply-subtract from accumulator
+name = vfms
+multi_fn = simd_neg, b:in_t, b
+multi_fn = vfma-self-noext, a, b, c
+a = 20.0, 30.0, 40.0, 50.0
 b = 6.0, 4.0, 7.0, 8.0
-c = 8.0
-validate 50.0, 35.0, 60.0, 69.0
+c = 2.0, 3.0, 4.0, 5.0
+validate 8.0, 18.0, 12.0, 10.0
+
+aarch64 = fmsub
+generate float64x1_t
+aarch64 = fmls
+generate float64x2_t
 
 target = fp-armv8
-arm = vfma
-aarch64 = fmla
-generate float32x2_t:float32x2_t:f32:float32x2_t
+arm = vfms
+generate float*_t
 
-/// Floating-point fused Multiply-Add to accumulator(vector)
-name = vfma
+/// Floating-point fused Multiply-subtract to accumulator(vector)
+name = vfms
 n-suffix
-multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
-multi_fn = vfma-self-noext, b, d, a
-a = 2.0, 3.0, 4.0, 5.0
+multi_fn = vfms-self-noext, a, b, {vdup-nself-noext, c}
+a = 50.0, 35.0, 60.0, 69.0
 b = 6.0, 4.0, 7.0, 8.0
 c = 8.0
-validate 50.0, 35.0, 60.0, 69.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fmsub
+generate float64x1_t:float64x1_t:f64:float64x1_t
+aarch64 = fmls
+generate float64x2_t:float64x2_t:f64:float64x2_t
 
 target = fp-armv8
-arm = vfma
-aarch64 = fmla
-generate float32x4_t:float32x4_t:f32:float32x4_t
+arm = vfms
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Floating-point fused multiply-subtract to accumulator
+name = vfms
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+a = 14., 11., 18., 21.
+b = 6., 4., 7., 8.
+c = 2., 0., 0., 0.
+n = 0
+validate 2., 3., 4., 5.
+
+aarch64 = fmls
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+aarch64 = fmsub
+generate float64x1_t
+aarch64 = fmls
+generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
+
+/// Floating-point fused multiply-subtract to accumulator
+name = vfms
+in2-lane-suffixes
+constn = LANE
+multi_fn = vfma-in2lane-::<LANE>, a, -b, c
+a = 14.
+b = 6.
+c = 2., 0., 0., 0.
+n = 0
+validate 2.
+
+aarch64 = fmls
+generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
+aarch64 = fmsub
+generate f64:f64:float64x1_t:f64
+aarch64 = fmls
+generate f64:f64:float64x2_t:f64
 
 /// Divide
 name = vdiv
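For orientation: each entry in `neon.spec` is a template, and `stdarch-gen` expands every `generate` line into a concrete intrinsic in the `generated.rs` files patched earlier. For example, the `in2-lane-suffixes` entry for `vfms` yields functions of this shape (copied verbatim from the aarch64 hunk above):

```rust
/// Floating-point fused multiply-subtract to accumulator
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfms_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
    // LANE is range-checked at compile time, then the chosen lane of `c`
    // is splatted and fused-multiply-subtracted from the accumulator.
    static_assert_imm1!(LANE);
    vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
}
```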
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index cc48a65b25..8ea14f0f7f 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -122,7 +122,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
         "cvtpi2ps" => 25,
         // core_arch/src/arm_shared/simd32
         // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
-        "usad8" | "vfma" => 27,
+        "usad8" | "vfma" | "vfms" => 27,
         "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
         // Original limit was 20 instructions, but ARM DSP Intrinsics
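Finally, the scalar-lane subtract variants are thin wrappers that negate `b` and defer to their fused-add counterparts, so `vfmss_lane_f32::<LANE>(a, b, c)` computes `a - b * c[LANE]` with a single rounding. A hypothetical check mirroring `test_vfmss_lane_f32` (the function name is invented; assumes nightly on aarch64):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn scalar_fms_demo() {
    use core::arch::aarch64::*;

    let c: float32x2_t = vdup_n_f32(2.0);
    // 14.0 - 6.0 * c[0] == 2.0; per the assert_instr annotations this
    // should lower to a single fmls/fmsub-class instruction.
    let r: f32 = vfmss_lane_f32::<0>(14.0, 6.0, c);
    assert_eq!(r, 2.0);
}
```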