|
6 | 6 | rustc_attrs, |
7 | 7 | intrinsics, |
8 | 8 | core_intrinsics, |
9 | | - repr_simd |
| 9 | + repr_simd, |
| 10 | + f16, |
| 11 | + f128 |
10 | 12 | )] |
11 | | -#![allow(incomplete_features, internal_features)] |
| 13 | +#![allow(incomplete_features, internal_features, non_camel_case_types)] |
| 14 | +use std::fmt::{self, Debug, Formatter}; |
12 | 15 | use std::intrinsics::simd as intrinsics; |
13 | 16 | use std::ptr; |
14 | 17 | use std::simd::StdFloat; |
15 | 18 | use std::simd::prelude::*; |
16 | 19 |
|
| 20 | +#[repr(simd, packed)] |
| 21 | +#[derive(Copy)] |
| 22 | +struct PackedSimd<T, const N: usize>([T; N]); |
| 23 | + |
| 24 | +impl<T: Copy, const N: usize> Clone for PackedSimd<T, N> { |
| 25 | + fn clone(&self) -> Self { |
| 26 | + *self |
| 27 | + } |
| 28 | +} |
| 29 | + |
| 30 | +impl<T: PartialEq + Copy, const N: usize> PartialEq for PackedSimd<T, N> { |
| 31 | + fn eq(&self, other: &Self) -> bool { |
| 32 | + self.into_array() == other.into_array() |
| 33 | + } |
| 34 | +} |
| 35 | + |
| 36 | +impl<T: Debug + Copy, const N: usize> Debug for PackedSimd<T, N> { |
| 37 | + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 38 | + Debug::fmt(&self.into_array(), f) |
| 39 | + } |
| 40 | +} |
| 41 | + |
| 42 | +type f16x2 = PackedSimd<f16, 2>; |
| 43 | +type f16x4 = PackedSimd<f16, 4>; |
| 44 | + |
| 45 | +type f128x2 = PackedSimd<f128, 2>; |
| 46 | +type f128x4 = PackedSimd<f128, 4>; |
| 47 | + |
| 48 | +impl<T: Copy, const N: usize> PackedSimd<T, N> { |
| 49 | + fn splat(x: T) -> Self { |
| 50 | + Self([x; N]) |
| 51 | + } |
| 52 | + fn from_array(a: [T; N]) -> Self { |
| 53 | + Self(a) |
| 54 | + } |
| 55 | + fn into_array(self) -> [T; N] { |
| 56 | + // as we have `repr(packed)`, there shouldn't be any padding bytes |
| 57 | + unsafe { std::mem::transmute_copy(&self) } |
| 58 | + } |
| 59 | +} |
| 60 | + |
17 | 61 | #[rustc_intrinsic] |
18 | 62 | #[rustc_nounwind] |
19 | 63 | pub unsafe fn simd_shuffle_const_generic<T, U, const IDX: &'static [u32]>(x: T, y: T) -> U; |
20 | 64 |
|
| 65 | +pub fn simd_ops_f16() { |
| 66 | + use intrinsics::*; |
| 67 | + |
| 68 | + // small hack to make type inference better |
| 69 | + macro_rules! assert_eq { |
| 70 | + ($a:expr, $b:expr $(,$t:tt)*) => {{ |
| 71 | + let a = $a; |
| 72 | + let b = $b; |
| 73 | + if false { let _inference = b == a; } |
| 74 | + ::std::assert_eq!(a, b, $(,$t)*) |
| 75 | + }} |
| 76 | + } |
| 77 | + |
| 78 | + let a = f16x4::splat(10.0); |
| 79 | + let b = f16x4::from_array([1.0, 2.0, 3.0, -4.0]); |
| 80 | + |
| 81 | + unsafe { |
| 82 | + assert_eq!(simd_neg(b), f16x4::from_array([-1.0, -2.0, -3.0, 4.0])); |
| 83 | + assert_eq!(simd_add(a, b), f16x4::from_array([11.0, 12.0, 13.0, 6.0])); |
| 84 | + assert_eq!(simd_sub(a, b), f16x4::from_array([9.0, 8.0, 7.0, 14.0])); |
| 85 | + assert_eq!(simd_mul(a, b), f16x4::from_array([10.0, 20.0, 30.0, -40.0])); |
| 86 | + assert_eq!(simd_div(b, a), f16x4::from_array([0.1, 0.2, 0.3, -0.4])); |
| 87 | + assert_eq!(simd_div(a, f16x4::splat(2.0)), f16x4::splat(5.0)); |
| 88 | + assert_eq!(simd_rem(a, b), f16x4::from_array([0.0, 0.0, 1.0, 2.0])); |
| 89 | + assert_eq!(simd_fabs(b), f16x4::from_array([1.0, 2.0, 3.0, 4.0])); |
| 90 | + assert_eq!( |
| 91 | + simd_fmax(a, simd_mul(b, f16x4::splat(4.0))), |
| 92 | + f16x4::from_array([10.0, 10.0, 12.0, 10.0]) |
| 93 | + ); |
| 94 | + assert_eq!( |
| 95 | + simd_fmin(a, simd_mul(b, f16x4::splat(4.0))), |
| 96 | + f16x4::from_array([4.0, 8.0, 10.0, -16.0]) |
| 97 | + ); |
| 98 | + |
| 99 | + assert_eq!(simd_fma(a, b, a), simd_add(simd_mul(a, b), a)); |
| 100 | + assert_eq!(simd_fma(b, b, a), simd_add(simd_mul(b, b), a)); |
| 101 | + assert_eq!(simd_fma(a, b, b), simd_add(simd_mul(a, b), b)); |
| 102 | + assert_eq!( |
| 103 | + simd_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)), |
| 104 | + f16x4::splat(f16::NEG_INFINITY) |
| 105 | + ); |
| 106 | + |
| 107 | + assert_eq!(simd_relaxed_fma(a, b, a), simd_add(simd_mul(a, b), a)); |
| 108 | + assert_eq!(simd_relaxed_fma(b, b, a), simd_add(simd_mul(b, b), a)); |
| 109 | + assert_eq!(simd_relaxed_fma(a, b, b), simd_add(simd_mul(a, b), b)); |
| 110 | + assert_eq!( |
| 111 | + simd_relaxed_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)), |
| 112 | + f16x4::splat(f16::NEG_INFINITY) |
| 113 | + ); |
| 114 | + |
| 115 | + assert_eq!(simd_eq(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, 0, 0])); |
| 116 | + assert_eq!(simd_ne(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, !0, !0])); |
| 117 | + assert_eq!(simd_le(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, !0, 0])); |
| 118 | + assert_eq!(simd_lt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, 0, !0, 0])); |
| 119 | + assert_eq!(simd_ge(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, !0, 0, !0])); |
| 120 | + assert_eq!(simd_gt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, 0, !0])); |
| 121 | + |
| 122 | + assert_eq!(simd_reduce_add_ordered(a, 0.0), 40.0f16); |
| 123 | + assert_eq!(simd_reduce_add_ordered(b, 0.0), 2.0f16); |
| 124 | + assert_eq!(simd_reduce_mul_ordered(a, 1.0), 10000.0f16); |
| 125 | + assert_eq!(simd_reduce_mul_ordered(b, 1.0), -24.0f16); |
| 126 | + assert_eq!(simd_reduce_max(a), 10.0f16); |
| 127 | + assert_eq!(simd_reduce_max(b), 3.0f16); |
| 128 | + assert_eq!(simd_reduce_min(a), 10.0f16); |
| 129 | + assert_eq!(simd_reduce_min(b), -4.0f16); |
| 130 | + |
| 131 | + assert_eq!( |
| 132 | + simd_fmax(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])), |
| 133 | + f16x2::from_array([0.0, 0.0]) |
| 134 | + ); |
| 135 | + assert_eq!(simd_reduce_max(f16x2::from_array([0.0, f16::NAN])), 0.0f16); |
| 136 | + assert_eq!(simd_reduce_max(f16x2::from_array([f16::NAN, 0.0])), 0.0f16); |
| 137 | + assert_eq!( |
| 138 | + simd_fmin(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])), |
| 139 | + f16x2::from_array([0.0, 0.0]) |
| 140 | + ); |
| 141 | + assert_eq!(simd_reduce_min(f16x2::from_array([0.0, f16::NAN])), 0.0f16); |
| 142 | + assert_eq!(simd_reduce_min(f16x2::from_array([f16::NAN, 0.0])), 0.0f16); |
| 143 | + } |
| 144 | +} |
| 145 | + |
21 | 146 | fn simd_ops_f32() { |
22 | 147 | let a = f32x4::splat(10.0); |
23 | 148 | let b = f32x4::from_array([1.0, 2.0, 3.0, -4.0]); |
@@ -148,6 +273,87 @@ fn simd_ops_f64() { |
148 | 273 | assert_eq!(f64x2::from_array([f64::NAN, 0.0]).reduce_min(), 0.0); |
149 | 274 | } |
150 | 275 |
|
| 276 | +pub fn simd_ops_f128() { |
| 277 | + use intrinsics::*; |
| 278 | + |
| 279 | + // small hack to make type inference better |
| 280 | + macro_rules! assert_eq { |
| 281 | + ($a:expr, $b:expr $(,$t:tt)*) => {{ |
| 282 | + let a = $a; |
| 283 | + let b = $b; |
| 284 | + if false { let _inference = b == a; } |
| 285 | + ::std::assert_eq!(a, b, $(,$t)*) |
| 286 | + }} |
| 287 | + } |
| 288 | + |
| 289 | + let a = f128x4::splat(10.0); |
| 290 | + let b = f128x4::from_array([1.0, 2.0, 3.0, -4.0]); |
| 291 | + |
| 292 | + unsafe { |
| 293 | + assert_eq!(simd_neg(b), f128x4::from_array([-1.0, -2.0, -3.0, 4.0])); |
| 294 | + assert_eq!(simd_add(a, b), f128x4::from_array([11.0, 12.0, 13.0, 6.0])); |
| 295 | + assert_eq!(simd_sub(a, b), f128x4::from_array([9.0, 8.0, 7.0, 14.0])); |
| 296 | + assert_eq!(simd_mul(a, b), f128x4::from_array([10.0, 20.0, 30.0, -40.0])); |
| 297 | + assert_eq!(simd_div(b, a), f128x4::from_array([0.1, 0.2, 0.3, -0.4])); |
| 298 | + assert_eq!(simd_div(a, f128x4::splat(2.0)), f128x4::splat(5.0)); |
| 299 | + assert_eq!(simd_rem(a, b), f128x4::from_array([0.0, 0.0, 1.0, 2.0])); |
| 300 | + assert_eq!(simd_fabs(b), f128x4::from_array([1.0, 2.0, 3.0, 4.0])); |
| 301 | + assert_eq!( |
| 302 | + simd_fmax(a, simd_mul(b, f128x4::splat(4.0))), |
| 303 | + f128x4::from_array([10.0, 10.0, 12.0, 10.0]) |
| 304 | + ); |
| 305 | + assert_eq!( |
| 306 | + simd_fmin(a, simd_mul(b, f128x4::splat(4.0))), |
| 307 | + f128x4::from_array([4.0, 8.0, 10.0, -16.0]) |
| 308 | + ); |
| 309 | + |
| 310 | + assert_eq!(simd_fma(a, b, a), simd_add(simd_mul(a, b), a)); |
| 311 | + assert_eq!(simd_fma(b, b, a), simd_add(simd_mul(b, b), a)); |
| 312 | + assert_eq!(simd_fma(a, b, b), simd_add(simd_mul(a, b), b)); |
| 313 | + assert_eq!( |
| 314 | + simd_fma(f128x4::splat(-3.2), b, f128x4::splat(f128::NEG_INFINITY)), |
| 315 | + f128x4::splat(f128::NEG_INFINITY) |
| 316 | + ); |
| 317 | + |
| 318 | + assert_eq!(simd_relaxed_fma(a, b, a), simd_add(simd_mul(a, b), a)); |
| 319 | + assert_eq!(simd_relaxed_fma(b, b, a), simd_add(simd_mul(b, b), a)); |
| 320 | + assert_eq!(simd_relaxed_fma(a, b, b), simd_add(simd_mul(a, b), b)); |
| 321 | + assert_eq!( |
| 322 | + simd_relaxed_fma(f128x4::splat(-3.2), b, f128x4::splat(f128::NEG_INFINITY)), |
| 323 | + f128x4::splat(f128::NEG_INFINITY) |
| 324 | + ); |
| 325 | + |
| 326 | + assert_eq!(simd_eq(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, !0, 0, 0])); |
| 327 | + assert_eq!(simd_ne(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, 0, !0, !0])); |
| 328 | + assert_eq!(simd_le(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, !0, !0, 0])); |
| 329 | + assert_eq!(simd_lt(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, 0, !0, 0])); |
| 330 | + assert_eq!(simd_ge(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, !0, 0, !0])); |
| 331 | + assert_eq!(simd_gt(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, 0, 0, !0])); |
| 332 | + |
| 333 | + assert_eq!(simd_reduce_add_ordered(a, 0.0), 40.0f128); |
| 334 | + assert_eq!(simd_reduce_add_ordered(b, 0.0), 2.0f128); |
| 335 | + assert_eq!(simd_reduce_mul_ordered(a, 1.0), 10000.0f128); |
| 336 | + assert_eq!(simd_reduce_mul_ordered(b, 1.0), -24.0f128); |
| 337 | + assert_eq!(simd_reduce_max(a), 10.0f128); |
| 338 | + assert_eq!(simd_reduce_max(b), 3.0f128); |
| 339 | + assert_eq!(simd_reduce_min(a), 10.0f128); |
| 340 | + assert_eq!(simd_reduce_min(b), -4.0f128); |
| 341 | + |
| 342 | + assert_eq!( |
| 343 | + simd_fmax(f128x2::from_array([0.0, f128::NAN]), f128x2::from_array([f128::NAN, 0.0])), |
| 344 | + f128x2::from_array([0.0, 0.0]) |
| 345 | + ); |
| 346 | + assert_eq!(simd_reduce_max(f128x2::from_array([0.0, f128::NAN])), 0.0f128); |
| 347 | + assert_eq!(simd_reduce_max(f128x2::from_array([f128::NAN, 0.0])), 0.0f128); |
| 348 | + assert_eq!( |
| 349 | + simd_fmin(f128x2::from_array([0.0, f128::NAN]), f128x2::from_array([f128::NAN, 0.0])), |
| 350 | + f128x2::from_array([0.0, 0.0]) |
| 351 | + ); |
| 352 | + assert_eq!(simd_reduce_min(f128x2::from_array([0.0, f128::NAN])), 0.0f128); |
| 353 | + assert_eq!(simd_reduce_min(f128x2::from_array([f128::NAN, 0.0])), 0.0f128); |
| 354 | + } |
| 355 | +} |
| 356 | + |
151 | 357 | fn simd_ops_i32() { |
152 | 358 | let a = i32x4::splat(10); |
153 | 359 | let b = i32x4::from_array([1, 2, 3, -4]); |
@@ -563,6 +769,31 @@ fn simd_gather_scatter() { |
563 | 769 | } |
564 | 770 |
|
565 | 771 | fn simd_round() { |
| 772 | + unsafe { |
| 773 | + use intrinsics::*; |
| 774 | + |
| 775 | + assert_eq!( |
| 776 | + simd_ceil(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 777 | + f16x4::from_array([1.0, 2.0, 2.0, -4.0]) |
| 778 | + ); |
| 779 | + assert_eq!( |
| 780 | + simd_floor(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 781 | + f16x4::from_array([0.0, 1.0, 2.0, -5.0]) |
| 782 | + ); |
| 783 | + assert_eq!( |
| 784 | + simd_round(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 785 | + f16x4::from_array([1.0, 1.0, 2.0, -5.0]) |
| 786 | + ); |
| 787 | + assert_eq!( |
| 788 | + simd_round_ties_even(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 789 | + f16x4::from_array([1.0, 1.0, 2.0, -4.0]) |
| 790 | + ); |
| 791 | + assert_eq!( |
| 792 | + simd_trunc(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 793 | + f16x4::from_array([0.0, 1.0, 2.0, -4.0]) |
| 794 | + ); |
| 795 | + } |
| 796 | + |
566 | 797 | assert_eq!( |
567 | 798 | f32x4::from_array([0.9, 1.001, 2.0, -4.5]).ceil(), |
568 | 799 | f32x4::from_array([1.0, 2.0, 2.0, -4.0]) |
@@ -604,6 +835,31 @@ fn simd_round() { |
604 | 835 | f64x4::from_array([0.9, 1.001, 2.0, -4.5]).trunc(), |
605 | 836 | f64x4::from_array([0.0, 1.0, 2.0, -4.0]) |
606 | 837 | ); |
| 838 | + |
| 839 | + unsafe { |
| 840 | + use intrinsics::*; |
| 841 | + |
| 842 | + assert_eq!( |
| 843 | + simd_ceil(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 844 | + f128x4::from_array([1.0, 2.0, 2.0, -4.0]) |
| 845 | + ); |
| 846 | + assert_eq!( |
| 847 | + simd_floor(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 848 | + f128x4::from_array([0.0, 1.0, 2.0, -5.0]) |
| 849 | + ); |
| 850 | + assert_eq!( |
| 851 | + simd_round(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 852 | + f128x4::from_array([1.0, 1.0, 2.0, -5.0]) |
| 853 | + ); |
| 854 | + assert_eq!( |
| 855 | + simd_round_ties_even(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 856 | + f128x4::from_array([1.0, 1.0, 2.0, -4.0]) |
| 857 | + ); |
| 858 | + assert_eq!( |
| 859 | + simd_trunc(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), |
| 860 | + f128x4::from_array([0.0, 1.0, 2.0, -4.0]) |
| 861 | + ); |
| 862 | + } |
607 | 863 | } |
608 | 864 |
|
609 | 865 | fn simd_intrinsics() { |
@@ -724,8 +980,10 @@ fn simd_ops_non_pow2() { |
724 | 980 |
|
725 | 981 | fn main() { |
726 | 982 | simd_mask(); |
| 983 | + simd_ops_f16(); |
727 | 984 | simd_ops_f32(); |
728 | 985 | simd_ops_f64(); |
| 986 | + simd_ops_f128(); |
729 | 987 | simd_ops_i32(); |
730 | 988 | simd_ops_non_pow2(); |
731 | 989 | simd_cast(); |
|
0 commit comments