From 9c12ad3c6922f9835c94f61e89d7cbec8a3c7fea Mon Sep 17 00:00:00 2001 From: TheIronBorn Date: Wed, 18 Jul 2018 22:25:13 +0000 Subject: [PATCH] impl SIMD wmul, UniformInt --- Cargo.toml | 3 +- src/distributions/float.rs | 10 +- src/distributions/integer.rs | 2 +- src/distributions/uniform.rs | 292 ++++++++++++++++++++++++++++------- src/distributions/utils.rs | 134 +++++++++++++++- src/lib.rs | 2 + 6 files changed, 377 insertions(+), 66 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d8e9ea02dd8..7a998b81597 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ nightly = ["i128_support", "simd_support"] # enables all features requiring nigh std = ["rand_core/std", "alloc", "libc", "winapi", "cloudabi", "fuchsia-zircon"] alloc = ["rand_core/alloc"] # enables Vec and Box support (without std) i128_support = [] # enables i128 and u128 support -simd_support = [] # enables SIMD support +simd_support = ["packed_simd"] # enables SIMD support serde1 = ["serde", "serde_derive", "rand_core/serde1"] # enables serialization for PRNGs [workspace] @@ -34,6 +34,7 @@ rand_core = { path = "rand_core", version = "0.2", default-features = false } # only for deprecations and benches: rand_isaac = { path = "rand_isaac", version = "0.1", default-features = false } log = { version = "0.4", optional = true } +packed_simd = { version = "0.1", optional = true, features = ["into_bits"] } serde = { version = "1", optional = true } serde_derive = { version = "1", optional = true } diff --git a/src/distributions/float.rs b/src/distributions/float.rs index 0d418ebdc74..d7a5bb98bd3 100644 --- a/src/distributions/float.rs +++ b/src/distributions/float.rs @@ -15,7 +15,7 @@ use Rng; use distributions::{Distribution, Standard}; use distributions::utils::FloatSIMDUtils; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; /// A distribution to sample floating point numbers uniformly in the half-open /// interval `(0, 1]`, i.e. including 1 but not 0. @@ -106,7 +106,7 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; [0, 1) interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); @@ -121,7 +121,7 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; (0, 1] interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); @@ -138,7 +138,7 @@ macro_rules! float_impls { // We use the most significant bits because for simple RNGs // those are usually more random. 
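// Editor's illustrative aside (not part of this patch): the multiply-based
// conversion used above, written out for scalar `f32`. The helper name and
// the fixed 32-bit input are assumptions of this sketch. The high
// `precision = 23 + 1` bits of the random word are kept and scaled by 2^-24,
// giving [0, 1) for `Standard` and (0, 1] for `OpenClosed01`.
fn demo_bits_to_f32(bits: u32) -> (f32, f32) {
    let float_size = 32u32;
    let fraction_bits = 23u32;
    let precision = fraction_bits + 1;
    let scale = 1.0 / (1u32 << precision) as f32;   // 2^-24
    let value = bits >> (float_size - precision);   // keep the 24 high bits
    let standard = scale * value as f32;            // [0, 1)
    let open_closed01 = scale * (value + 1) as f32; // (0, 1]
    (standard, open_closed01)
}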
use core::$f_scalar::EPSILON; - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let value: $uty = rng.gen(); let fraction = value >> (float_size - $fraction_bits); @@ -174,7 +174,7 @@ mod tests { use distributions::{Open01, OpenClosed01}; use rngs::mock::StepRng; #[cfg(feature="simd_support")] - use core::simd::*; + use packed_simd::*; const EPSILON32: f32 = ::core::f32::EPSILON; const EPSILON64: f64 = ::core::f64::EPSILON; diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 82efd9ba795..0fce20f6de7 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -13,7 +13,7 @@ use {Rng}; use distributions::{Distribution, Standard}; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; impl Distribution for Standard { #[inline] diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index bf68e45f51a..b4e34873ec4 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -30,10 +30,10 @@ //! ``` //! use rand::{Rng, thread_rng}; //! use rand::distributions::Uniform; -//! +//! //! let mut rng = thread_rng(); //! let side = Uniform::new(-10.0, 10.0); -//! +//! //! // sample between 1 and 10 points //! for _ in 0..rng.gen_range(1, 11) { //! // sample a point from the square with sides -10 - 10 in two dimensions @@ -124,7 +124,7 @@ use distributions::utils::Float; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; /// Sample values uniformly between two bounds. /// @@ -475,6 +475,150 @@ uniform_int_impl! { usize, isize, usize, isize, usize } #[cfg(feature = "i128_support")] uniform_int_impl! { u128, u128, u128, i128, u128 } +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_impl { + ($ty:ident, $unsigned:ident, $u_scalar:ident) => { + // The "pick the largest zone that can fit in an `u32`" optimization + // is less useful here. Multiple lanes complicate things, we don't + // know the PRNG's minimal output size, and casting to a larger vector + // is generally a bad idea for SIMD performance. The user can still + // implement it manually. + + // TODO: look into `Uniform::::new(0u32, 100)` functionality + // perhaps `impl SampleUniform for $u_scalar`? + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), + "Uniform::new_inclusive called with `low > high`"); + let unsigned_max = ::core::$u_scalar::MAX; + + // NOTE: these may need to be replaced with explicitly + // wrapping operations if `packed_simd` changes + let range: $unsigned = ((high - low) + 1).cast(); + // `% 0` will panic at runtime. 
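// Editor's illustrative aside (not part of this patch): a scalar sketch of
// the zone/rejection scheme computed here and used in `sample` below. The
// helper name is hypothetical; `None` stands for "reject and draw a fresh
// word", which the SIMD code does lane-by-lane with `mask.select`.
fn demo_scalar_uniform(word: u32, low: u32, high_inclusive: u32) -> Option<u32> {
    let unsigned_max = u32::max_value();
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1);
    // Substitute `unsigned_max` for a zero range so the `%` below cannot
    // panic; this is the scalar equivalent of the `select` on `not_full_range`.
    let modulo = if range != 0 { range } else { unsigned_max };
    let ints_to_reject = (unsigned_max - range).wrapping_add(1) % modulo;
    let zone = unsigned_max - ints_to_reject;

    // The widening multiply maps the word into `range` buckets; `lo` decides
    // acceptance exactly like `lo.le(zone)` in the vector code.
    let wide = (word as u64) * (range as u64);
    let (hi, lo) = ((wide >> 32) as u32, wide as u32);
    if lo <= zone {
        // A zero range means the whole type is valid: return the raw word.
        Some(if range != 0 { low.wrapping_add(hi) } else { word })
    } else {
        None
    }
}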
+ let not_full_range = range.gt($unsigned::splat(0)); + // replacing 0 with `unsigned_max` allows a faster `select` + // with bitwise OR + let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); + // wrapping addition + let ints_to_reject = (unsigned_max - range + 1) % modulo; + // When `range` is 0, `lo` of `v.wmul(range)` will always be + // zero which means only one sample is needed. + let zone = unsigned_max - ints_to_reject; + + UniformInt { + low: low, + // These are really $unsigned values, but store as $ty: + range: range.cast(), + zone: zone.cast(), + } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let range: $unsigned = self.range.cast(); + let zone: $unsigned = self.zone.cast(); + + // This might seem very slow, generating a whole new + // SIMD vector for every sample rejection. For most uses + // though, the chance of rejection is small and provides good + // general performance. With multiple lanes, that chance is + // multiplied. To mitigate this, we replace only the lanes of + // the vector which fail, iteratively reducing the chance of + // rejection. The replacement method does however add a little + // overhead. Benchmarking or calculating probabilities might + // reveal contexts where this replacement method is slower. + let mut v: $unsigned = rng.gen(); + loop { + let (hi, lo) = v.wmul(range); + let mask = lo.le(zone); + if mask.all() { + let hi: $ty = hi.cast(); + // wrapping addition + let result = self.low + hi; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return range.gt($unsigned::splat(0)).select(result, v); + } + // Replace only the failing lanes + v = mask.select(v, rng.gen()); + } + } + } + }; + + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u64x2, i64x2), + (u64x4, i64x4), + (u64x8, i64x8), + u64 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u32x2, i32x2), + (u32x4, i32x4), + (u32x8, i32x8), + (u32x16, i32x16), + u32 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u16x2, i16x2), + (u16x4, i16x4), + (u16x8, i16x8), + (u16x16, i16x16), + (u16x32, i16x32), + u16 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u8x2, i8x2), + (u8x4, i8x4), + (u8x8, i8x8), + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + u8 +} /// The back-end implementing [`UniformSampler`] for floating-point types. @@ -571,7 +715,7 @@ macro_rules! uniform_float_impl { fn sample(&self, rng: &mut R) -> Self::X { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u8) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard) .into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid @@ -600,7 +744,7 @@ macro_rules! 
uniform_float_impl { loop { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u32) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard) .into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid @@ -785,7 +929,7 @@ mod tests { use rngs::mock::StepRng; use distributions::uniform::Uniform; use distributions::utils::FloatAsSIMD; - #[cfg(feature="simd_support")] use core::simd::*; + #[cfg(feature="simd_support")] use packed_simd::*; #[should_panic] #[test] @@ -810,50 +954,86 @@ mod tests { #[test] fn test_integers() { + use core::{i8, i16, i32, i64, isize}; + use core::{u8, u16, u32, u64, usize}; + #[cfg(feature = "i128_support")] + use core::{i128, u128}; + let mut rng = ::test::rng(251); macro_rules! t { - ($($ty:ident),*) => {{ - $( - let v: &[($ty, $ty)] = &[(0, 10), - (10, 127), - (::core::$ty::MIN, ::core::$ty::MAX)]; - for &(low, high) in v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v < high); - } + ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ + for &(low, high) in $v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } - let my_uniform = Uniform::new_inclusive(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v <= high); - } + let my_uniform = Uniform::new_inclusive(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } - let my_uniform = Uniform::new(&low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v < high); - } + let my_uniform = Uniform::new(&low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } - let my_uniform = Uniform::new_inclusive(&low, &high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v <= high); - } + let my_uniform = Uniform::new_inclusive(&low, &high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } - for _ in 0..1000 { - let v: $ty = rng.gen_range(low, high); - assert!(low <= v && v < high); - } + for _ in 0..1000 { + let v: $ty = rng.gen_range(low, high); + assert!($le(low, v) && $lt(v, high)); } - )* - }} + } + }}; + + // scalar bulk + ($($ty:ident),*) => {{ + $(t!( + $ty, + [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], + |x, y| x <= y, + |x, y| x < y + );)* + }}; + + // simd bulk + ($($ty:ident),* => $scalar:ident) => {{ + $(t!( + $ty, + [ + ($ty::splat(0), $ty::splat(10)), + ($ty::splat(10), $ty::splat(127)), + ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), + ], + |x: $ty, y| x.le(y).all(), + |x: $ty, y| x.lt(y).all() + );)* + }}; } t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize); #[cfg(feature = "i128_support")] - t!(i128, u128) + t!(i128, u128); + + #[cfg(feature = "simd_support")] + { + t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); + t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); + t!(u32x2, u32x4, u32x8, u32x16 => u32); + t!(i32x2, i32x4, i32x8, i32x16 => i32); + t!(u64x2, u64x4, u64x8 => u64); + t!(i64x2, i64x4, i64x8 => i64); + } } #[test] @@ -925,13 +1105,16 @@ mod tests { t!(f32, f32, 32 - 23); t!(f64, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f32x2, f32, 32 - 
23); - #[cfg(feature="simd_support")] t!(f32x4, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f32x8, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f32x16, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f64x2, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f64x4, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f64x8, f64, 64 - 52); + #[cfg(feature="simd_support")] + { + t!(f32x2, f32, 32 - 23); + t!(f32x4, f32, 32 - 23); + t!(f32x8, f32, 32 - 23); + t!(f32x16, f32, 32 - 23); + t!(f64x2, f64, 64 - 52); + t!(f64x4, f64, 64 - 52); + t!(f64x8, f64, 64 - 52); + } } #[test] @@ -978,13 +1161,16 @@ mod tests { t!(f32, f32); t!(f64, f64); - #[cfg(feature="simd_support")] t!(f32x2, f32); - #[cfg(feature="simd_support")] t!(f32x4, f32); - #[cfg(feature="simd_support")] t!(f32x8, f32); - #[cfg(feature="simd_support")] t!(f32x16, f32); - #[cfg(feature="simd_support")] t!(f64x2, f64); - #[cfg(feature="simd_support")] t!(f64x4, f64); - #[cfg(feature="simd_support")] t!(f64x8, f64); + #[cfg(feature="simd_support")] + { + t!(f32x2, f32); + t!(f32x4, f32); + t!(f32x8, f32); + t!(f32x16, f32); + t!(f64x2, f64); + t!(f64x4, f64); + t!(f64x8, f64); + } } diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 8ac2c66dcb6..6a7e401c5f5 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -11,7 +11,7 @@ //! Math helper functions #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; #[cfg(feature="std")] use distributions::ziggurat_tables; #[cfg(feature="std")] @@ -35,7 +35,30 @@ macro_rules! wmul_impl { ((tmp >> $shift) as $ty, tmp as $ty) } } - } + }; + + // simd bulk implementation + ($(($ty:ident, $wide:ident),)+, $shift:expr) => { + $( + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, x: $ty) -> Self::Output { + // For supported vectors, this should compile to a couple + // supported multiply & swizzle instructions (no actual + // casting). + // TODO: optimize + let y: $wide = self.cast(); + let x: $wide = x.cast(); + let tmp = y * x; + let hi: $ty = (tmp >> $shift).cast(); + let lo: $ty = tmp.cast(); + (hi, lo) + } + } + )+ + }; } wmul_impl! { u8, u16, 8 } wmul_impl! { u16, u32, 16 } @@ -73,7 +96,36 @@ macro_rules! wmul_impl_large { (high, low) } } - } + }; + + // simd bulk implementation + (($($ty:ty,)+) $scalar:ty, $half:expr) => { + $( + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, b: $ty) -> Self::Output { + // needs wrapping multiplication + const LOWER_MASK: $scalar = !0 >> $half; + let mut low = (self & LOWER_MASK) * (b & LOWER_MASK); + let mut t = low >> $half; + low &= LOWER_MASK; + t += (self >> $half) * (b & LOWER_MASK); + low += (t & LOWER_MASK) << $half; + let mut high = t >> $half; + t = low >> $half; + low &= LOWER_MASK; + t += (b >> $half) * (self & LOWER_MASK); + low += (t & LOWER_MASK) << $half; + high += t >> $half; + high += (self >> $half) * (b >> $half); + + (high, low) + } + } + )+ + }; } #[cfg(not(feature = "i128_support"))] wmul_impl_large! { u64, 32 } @@ -98,6 +150,76 @@ wmul_impl_usize! { u32 } #[cfg(target_pointer_width = "64")] wmul_impl_usize! { u64 } +#[cfg(feature = "simd_support")] +mod simd_wmul { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + use super::*; + + wmul_impl! { + (u8x2, u16x2), + (u8x4, u16x4), + (u8x8, u16x8), + (u8x16, u16x16), + (u8x32, u16x32),, + 8 + } + + wmul_impl! 
{ (u16x2, u32x2),, 16 } + #[cfg(not(target_feature = "sse2"))] + wmul_impl! { (u16x4, u32x4),, 16 } + #[cfg(not(target_feature = "sse4.2"))] + wmul_impl! { (u16x8, u32x8),, 16 } + #[cfg(not(target_feature = "avx2"))] + wmul_impl! { (u16x16, u32x16),, 16 } + + // 16-bit lane widths allow use of the x86 `mulhi` instructions, which + // means `wmul` can be implemented with only two instructions. + #[allow(unused_macros)] + macro_rules! wmul_impl_16 { + ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => { + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, x: $ty) -> Self::Output { + let b = $intrinsic::from_bits(x); + let a = $intrinsic::from_bits(self); + let hi = $ty::from_bits(unsafe { $mulhi(a, b) }); + let lo = $ty::from_bits(unsafe { $mullo(a, b) }); + (hi, lo) + } + } + }; + } + + #[cfg(target_feature = "sse2")] + wmul_impl_16! { u16x4, __m64, _mm_mulhi_pu16, _mm_mullo_pi16 } + #[cfg(target_feature = "sse4.2")] + wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 } + #[cfg(target_feature = "avx2")] + wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 } + // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::` + // cannot use the same implementation. + + wmul_impl! { + (u32x2, u64x2), + (u32x4, u64x4), + (u32x8, u64x8),, + 32 + } + + // TODO: optimize, this seems to seriously slow things down + wmul_impl_large! { (u8x64,) u8, 4 } + wmul_impl_large! { (u16x32,) u16, 8 } + wmul_impl_large! { (u32x16,) u32, 16 } + wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } +} +#[cfg(feature = "simd_support")] +pub use self::simd_wmul::*; + /// Helper trait when dealing with scalar and SIMD floating point types. pub(crate) trait FloatSIMDUtils { @@ -263,7 +385,7 @@ macro_rules! simd_impl { <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask)) } type UInt = $uty; - fn cast_from_int(i: Self::UInt) -> Self { $ty::from(i) } + fn cast_from_int(i: Self::UInt) -> Self { i.cast() } } } } @@ -271,10 +393,10 @@ macro_rules! simd_impl { #[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 } #[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 } #[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 } -#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m1x16, u32x16 } +#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 } #[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 } #[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 } -#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m1x8, u64x8 } +#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 } /// Calculates ln(gamma(x)) (natural logarithm of the gamma /// function) using the Lanczos approximation. diff --git a/src/lib.rs b/src/lib.rs index fb794d4b271..c23c12f6e50 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -237,6 +237,8 @@ #[cfg(feature="std")] extern crate std as core; #[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc; +#[cfg(feature="simd_support")] extern crate packed_simd; + #[cfg(test)] #[cfg(feature="serde1")] extern crate bincode; #[cfg(feature="serde1")] extern crate serde; #[cfg(feature="serde1")] #[macro_use] extern crate serde_derive;
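For reference, here is a rough scalar rendering of the two widening-multiply strategies the patch adds for vectors: casting to a wider type where one exists (`wmul_impl`), and the half-word schoolbook product used when none does (`wmul_impl_large`). The function names are invented for this sketch and are not part of the patch or of rand's API.

// Strategy 1: widen, multiply, split (mirrors `wmul_impl`).
fn wmul_via_wider(a: u32, b: u32) -> (u32, u32) {
    let t = (a as u64) * (b as u64);
    ((t >> 32) as u32, t as u32)
}

// Strategy 2: schoolbook product on half-width pieces (mirrors
// `wmul_impl_large`); used where no wider type is available, e.g. u64x8.
fn wmul_via_halves(a: u64, b: u64) -> (u64, u64) {
    const HALF: u32 = 32;
    const LOWER_MASK: u64 = !0 >> HALF;
    let mut low = (a & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
    let mut t = low >> HALF;
    low &= LOWER_MASK;
    t += (a >> HALF).wrapping_mul(b & LOWER_MASK);
    low += (t & LOWER_MASK) << HALF;
    let mut high = t >> HALF;
    t = low >> HALF;
    low &= LOWER_MASK;
    t += (b >> HALF).wrapping_mul(a & LOWER_MASK);
    low += (t & LOWER_MASK) << HALF;
    high += t >> HALF;
    high += (a >> HALF).wrapping_mul(b >> HALF);
    (high, low)
}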
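And a standalone illustration of the two-instruction `mulhi`/`mullo` pairing that `wmul_impl_16!` relies on, using the raw SSE2 intrinsics on plain arrays. This is only a sketch assuming an x86_64 target; the `transmute` reinterpretation plays the role of `packed_simd`'s `from_bits`.

#[cfg(target_arch = "x86_64")]
fn demo_wmul_u16x8(a: [u16; 8], b: [u16; 8]) -> ([u16; 8], [u16; 8]) {
    use core::arch::x86_64::{__m128i, _mm_mulhi_epu16, _mm_mullo_epi16};
    use core::mem::transmute;
    // SSE2 is part of the x86_64 baseline, so these intrinsics are always
    // callable on this target.
    unsafe {
        let (va, vb): (__m128i, __m128i) = (transmute(a), transmute(b));
        let hi = _mm_mulhi_epu16(va, vb); // high 16 bits of each 32-bit product
        let lo = _mm_mullo_epi16(va, vb); // low 16 bits of each 32-bit product
        (transmute(hi), transmute(lo))
    }
}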