From 9c12ad3c6922f9835c94f61e89d7cbec8a3c7fea Mon Sep 17 00:00:00 2001 From: TheIronBorn Date: Wed, 18 Jul 2018 22:25:13 +0000 Subject: [PATCH] impl SIMD wmul, UniformInt --- Cargo.toml | 3 +- src/distributions/float.rs | 10 +- src/distributions/integer.rs | 2 +- src/distributions/uniform.rs | 292 ++++++++++++++++++++++++++++------- src/distributions/utils.rs | 134 +++++++++++++++- src/lib.rs | 2 + 6 files changed, 377 insertions(+), 66 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d8e9ea02dd8..7a998b81597 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ nightly = ["i128_support", "simd_support"] # enables all features requiring nigh std = ["rand_core/std", "alloc", "libc", "winapi", "cloudabi", "fuchsia-zircon"] alloc = ["rand_core/alloc"] # enables Vec and Box support (without std) i128_support = [] # enables i128 and u128 support -simd_support = [] # enables SIMD support +simd_support = ["packed_simd"] # enables SIMD support serde1 = ["serde", "serde_derive", "rand_core/serde1"] # enables serialization for PRNGs [workspace] @@ -34,6 +34,7 @@ rand_core = { path = "rand_core", version = "0.2", default-features = false } # only for deprecations and benches: rand_isaac = { path = "rand_isaac", version = "0.1", default-features = false } log = { version = "0.4", optional = true } +packed_simd = { version = "0.1", optional = true, features = ["into_bits"] } serde = { version = "1", optional = true } serde_derive = { version = "1", optional = true } diff --git a/src/distributions/float.rs b/src/distributions/float.rs index 0d418ebdc74..d7a5bb98bd3 100644 --- a/src/distributions/float.rs +++ b/src/distributions/float.rs @@ -15,7 +15,7 @@ use Rng; use distributions::{Distribution, Standard}; use distributions::utils::FloatSIMDUtils; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; /// A distribution to sample floating point numbers uniformly in the half-open /// interval `(0, 1]`, i.e. including 1 but not 0. @@ -106,7 +106,7 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; [0, 1) interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); @@ -121,7 +121,7 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; (0, 1] interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); @@ -138,7 +138,7 @@ macro_rules! float_impls { // We use the most significant bits because for simple RNGs // those are usually more random. 
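// Editor's illustrative aside (not part of this patch): the multiply-based
// conversion used above, written out for scalar `f32`. The helper name and
// the fixed 32-bit input are assumptions of this sketch. The high
// `precision = 23 + 1` bits of the random word are kept and scaled by 2^-24,
// giving [0, 1) for `Standard` and (0, 1] for `OpenClosed01`.
fn demo_bits_to_f32(bits: u32) -> (f32, f32) {
    let float_size = 32u32;
    let fraction_bits = 23u32;
    let precision = fraction_bits + 1;
    let scale = 1.0 / (1u32 << precision) as f32;   // 2^-24
    let value = bits >> (float_size - precision);   // keep the 24 high bits
    let standard = scale * value as f32;            // [0, 1)
    let open_closed01 = scale * (value + 1) as f32; // (0, 1]
    (standard, open_closed01)
}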
use core::$f_scalar::EPSILON; - let float_size = mem::size_of::<$f_scalar>() * 8; + let float_size = mem::size_of::<$f_scalar>() as u32 * 8; let value: $uty = rng.gen(); let fraction = value >> (float_size - $fraction_bits); @@ -174,7 +174,7 @@ mod tests { use distributions::{Open01, OpenClosed01}; use rngs::mock::StepRng; #[cfg(feature="simd_support")] - use core::simd::*; + use packed_simd::*; const EPSILON32: f32 = ::core::f32::EPSILON; const EPSILON64: f64 = ::core::f64::EPSILON; diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 82efd9ba795..0fce20f6de7 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -13,7 +13,7 @@ use {Rng}; use distributions::{Distribution, Standard}; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; impl Distribution for Standard { #[inline] diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index bf68e45f51a..b4e34873ec4 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -30,10 +30,10 @@ //! ``` //! use rand::{Rng, thread_rng}; //! use rand::distributions::Uniform; -//! +//! //! let mut rng = thread_rng(); //! let side = Uniform::new(-10.0, 10.0); -//! +//! //! // sample between 1 and 10 points //! for _ in 0..rng.gen_range(1, 11) { //! // sample a point from the square with sides -10 - 10 in two dimensions @@ -124,7 +124,7 @@ use distributions::utils::Float; #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; /// Sample values uniformly between two bounds. /// @@ -475,6 +475,150 @@ uniform_int_impl! { usize, isize, usize, isize, usize } #[cfg(feature = "i128_support")] uniform_int_impl! { u128, u128, u128, i128, u128 } +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_impl { + ($ty:ident, $unsigned:ident, $u_scalar:ident) => { + // The "pick the largest zone that can fit in an `u32`" optimization + // is less useful here. Multiple lanes complicate things, we don't + // know the PRNG's minimal output size, and casting to a larger vector + // is generally a bad idea for SIMD performance. The user can still + // implement it manually. + + // TODO: look into `Uniform::::new(0u32, 100)` functionality + // perhaps `impl SampleUniform for $u_scalar`? + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), + "Uniform::new_inclusive called with `low > high`"); + let unsigned_max = ::core::$u_scalar::MAX; + + // NOTE: these may need to be replaced with explicitly + // wrapping operations if `packed_simd` changes + let range: $unsigned = ((high - low) + 1).cast(); + // `% 0` will panic at runtime. 
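// Editor's illustrative aside (not part of this patch): a scalar sketch of
// the zone/rejection scheme computed here and used in `sample` below. The
// helper name is hypothetical; `None` stands for "reject and draw a fresh
// word", which the SIMD code does lane-by-lane with `mask.select`.
fn demo_scalar_uniform(word: u32, low: u32, high_inclusive: u32) -> Option<u32> {
    let unsigned_max = u32::max_value();
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1);
    // Substitute `unsigned_max` for a zero range so the `%` below cannot
    // panic; this is the scalar equivalent of the `select` on `not_full_range`.
    let modulo = if range != 0 { range } else { unsigned_max };
    let ints_to_reject = (unsigned_max - range).wrapping_add(1) % modulo;
    let zone = unsigned_max - ints_to_reject;

    // The widening multiply maps the word into `range` buckets; `lo` decides
    // acceptance exactly like `lo.le(zone)` in the vector code.
    let wide = (word as u64) * (range as u64);
    let (hi, lo) = ((wide >> 32) as u32, wide as u32);
    if lo <= zone {
        // A zero range means the whole type is valid: return the raw word.
        Some(if range != 0 { low.wrapping_add(hi) } else { word })
    } else {
        None
    }
}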
+ let not_full_range = range.gt($unsigned::splat(0)); + // replacing 0 with `unsigned_max` allows a faster `select` + // with bitwise OR + let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); + // wrapping addition + let ints_to_reject = (unsigned_max - range + 1) % modulo; + // When `range` is 0, `lo` of `v.wmul(range)` will always be + // zero which means only one sample is needed. + let zone = unsigned_max - ints_to_reject; + + UniformInt { + low: low, + // These are really $unsigned values, but store as $ty: + range: range.cast(), + zone: zone.cast(), + } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let range: $unsigned = self.range.cast(); + let zone: $unsigned = self.zone.cast(); + + // This might seem very slow, generating a whole new + // SIMD vector for every sample rejection. For most uses + // though, the chance of rejection is small and provides good + // general performance. With multiple lanes, that chance is + // multiplied. To mitigate this, we replace only the lanes of + // the vector which fail, iteratively reducing the chance of + // rejection. The replacement method does however add a little + // overhead. Benchmarking or calculating probabilities might + // reveal contexts where this replacement method is slower. + let mut v: $unsigned = rng.gen(); + loop { + let (hi, lo) = v.wmul(range); + let mask = lo.le(zone); + if mask.all() { + let hi: $ty = hi.cast(); + // wrapping addition + let result = self.low + hi; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return range.gt($unsigned::splat(0)).select(result, v); + } + // Replace only the failing lanes + v = mask.select(v, rng.gen()); + } + } + } + }; + + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u64x2, i64x2), + (u64x4, i64x4), + (u64x8, i64x8), + u64 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u32x2, i32x2), + (u32x4, i32x4), + (u32x8, i32x8), + (u32x16, i32x16), + u32 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u16x2, i16x2), + (u16x4, i16x4), + (u16x8, i16x8), + (u16x16, i16x16), + (u16x32, i16x32), + u16 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u8x2, i8x2), + (u8x4, i8x4), + (u8x8, i8x8), + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + u8 +} /// The back-end implementing [`UniformSampler`] for floating-point types. @@ -571,7 +715,7 @@ macro_rules! uniform_float_impl { fn sample(&self, rng: &mut R) -> Self::X { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u8) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard) .into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid @@ -600,7 +744,7 @@ macro_rules! 
uniform_float_impl { loop { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u32) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard) .into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid @@ -785,7 +929,7 @@ mod tests { use rngs::mock::StepRng; use distributions::uniform::Uniform; use distributions::utils::FloatAsSIMD; - #[cfg(feature="simd_support")] use core::simd::*; + #[cfg(feature="simd_support")] use packed_simd::*; #[should_panic] #[test] @@ -810,50 +954,86 @@ mod tests { #[test] fn test_integers() { + use core::{i8, i16, i32, i64, isize}; + use core::{u8, u16, u32, u64, usize}; + #[cfg(feature = "i128_support")] + use core::{i128, u128}; + let mut rng = ::test::rng(251); macro_rules! t { - ($($ty:ident),*) => {{ - $( - let v: &[($ty, $ty)] = &[(0, 10), - (10, 127), - (::core::$ty::MIN, ::core::$ty::MAX)]; - for &(low, high) in v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v < high); - } + ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ + for &(low, high) in $v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } - let my_uniform = Uniform::new_inclusive(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v <= high); - } + let my_uniform = Uniform::new_inclusive(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } - let my_uniform = Uniform::new(&low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v < high); - } + let my_uniform = Uniform::new(&low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } - let my_uniform = Uniform::new_inclusive(&low, &high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!(low <= v && v <= high); - } + let my_uniform = Uniform::new_inclusive(&low, &high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } - for _ in 0..1000 { - let v: $ty = rng.gen_range(low, high); - assert!(low <= v && v < high); - } + for _ in 0..1000 { + let v: $ty = rng.gen_range(low, high); + assert!($le(low, v) && $lt(v, high)); } - )* - }} + } + }}; + + // scalar bulk + ($($ty:ident),*) => {{ + $(t!( + $ty, + [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], + |x, y| x <= y, + |x, y| x < y + );)* + }}; + + // simd bulk + ($($ty:ident),* => $scalar:ident) => {{ + $(t!( + $ty, + [ + ($ty::splat(0), $ty::splat(10)), + ($ty::splat(10), $ty::splat(127)), + ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), + ], + |x: $ty, y| x.le(y).all(), + |x: $ty, y| x.lt(y).all() + );)* + }}; } t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize); #[cfg(feature = "i128_support")] - t!(i128, u128) + t!(i128, u128); + + #[cfg(feature = "simd_support")] + { + t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); + t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); + t!(u32x2, u32x4, u32x8, u32x16 => u32); + t!(i32x2, i32x4, i32x8, i32x16 => i32); + t!(u64x2, u64x4, u64x8 => u64); + t!(i64x2, i64x4, i64x8 => i64); + } } #[test] @@ -925,13 +1105,16 @@ mod tests { t!(f32, f32, 32 - 23); t!(f64, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f32x2, f32, 32 - 
23); - #[cfg(feature="simd_support")] t!(f32x4, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f32x8, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f32x16, f32, 32 - 23); - #[cfg(feature="simd_support")] t!(f64x2, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f64x4, f64, 64 - 52); - #[cfg(feature="simd_support")] t!(f64x8, f64, 64 - 52); + #[cfg(feature="simd_support")] + { + t!(f32x2, f32, 32 - 23); + t!(f32x4, f32, 32 - 23); + t!(f32x8, f32, 32 - 23); + t!(f32x16, f32, 32 - 23); + t!(f64x2, f64, 64 - 52); + t!(f64x4, f64, 64 - 52); + t!(f64x8, f64, 64 - 52); + } } #[test] @@ -978,13 +1161,16 @@ mod tests { t!(f32, f32); t!(f64, f64); - #[cfg(feature="simd_support")] t!(f32x2, f32); - #[cfg(feature="simd_support")] t!(f32x4, f32); - #[cfg(feature="simd_support")] t!(f32x8, f32); - #[cfg(feature="simd_support")] t!(f32x16, f32); - #[cfg(feature="simd_support")] t!(f64x2, f64); - #[cfg(feature="simd_support")] t!(f64x4, f64); - #[cfg(feature="simd_support")] t!(f64x8, f64); + #[cfg(feature="simd_support")] + { + t!(f32x2, f32); + t!(f32x4, f32); + t!(f32x8, f32); + t!(f32x16, f32); + t!(f64x2, f64); + t!(f64x4, f64); + t!(f64x8, f64); + } } diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 8ac2c66dcb6..6a7e401c5f5 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -11,7 +11,7 @@ //! Math helper functions #[cfg(feature="simd_support")] -use core::simd::*; +use packed_simd::*; #[cfg(feature="std")] use distributions::ziggurat_tables; #[cfg(feature="std")] @@ -35,7 +35,30 @@ macro_rules! wmul_impl { ((tmp >> $shift) as $ty, tmp as $ty) } } - } + }; + + // simd bulk implementation + ($(($ty:ident, $wide:ident),)+, $shift:expr) => { + $( + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, x: $ty) -> Self::Output { + // For supported vectors, this should compile to a couple + // supported multiply & swizzle instructions (no actual + // casting). + // TODO: optimize + let y: $wide = self.cast(); + let x: $wide = x.cast(); + let tmp = y * x; + let hi: $ty = (tmp >> $shift).cast(); + let lo: $ty = tmp.cast(); + (hi, lo) + } + } + )+ + }; } wmul_impl! { u8, u16, 8 } wmul_impl! { u16, u32, 16 } @@ -73,7 +96,36 @@ macro_rules! wmul_impl_large { (high, low) } } - } + }; + + // simd bulk implementation + (($($ty:ty,)+) $scalar:ty, $half:expr) => { + $( + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, b: $ty) -> Self::Output { + // needs wrapping multiplication + const LOWER_MASK: $scalar = !0 >> $half; + let mut low = (self & LOWER_MASK) * (b & LOWER_MASK); + let mut t = low >> $half; + low &= LOWER_MASK; + t += (self >> $half) * (b & LOWER_MASK); + low += (t & LOWER_MASK) << $half; + let mut high = t >> $half; + t = low >> $half; + low &= LOWER_MASK; + t += (b >> $half) * (self & LOWER_MASK); + low += (t & LOWER_MASK) << $half; + high += t >> $half; + high += (self >> $half) * (b >> $half); + + (high, low) + } + } + )+ + }; } #[cfg(not(feature = "i128_support"))] wmul_impl_large! { u64, 32 } @@ -98,6 +150,76 @@ wmul_impl_usize! { u32 } #[cfg(target_pointer_width = "64")] wmul_impl_usize! { u64 } +#[cfg(feature = "simd_support")] +mod simd_wmul { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + use super::*; + + wmul_impl! { + (u8x2, u16x2), + (u8x4, u16x4), + (u8x8, u16x8), + (u8x16, u16x16), + (u8x32, u16x32),, + 8 + } + + wmul_impl! 
{ (u16x2, u32x2),, 16 } + #[cfg(not(target_feature = "sse2"))] + wmul_impl! { (u16x4, u32x4),, 16 } + #[cfg(not(target_feature = "sse4.2"))] + wmul_impl! { (u16x8, u32x8),, 16 } + #[cfg(not(target_feature = "avx2"))] + wmul_impl! { (u16x16, u32x16),, 16 } + + // 16-bit lane widths allow use of the x86 `mulhi` instructions, which + // means `wmul` can be implemented with only two instructions. + #[allow(unused_macros)] + macro_rules! wmul_impl_16 { + ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => { + impl WideningMultiply for $ty { + type Output = ($ty, $ty); + + #[inline(always)] + fn wmul(self, x: $ty) -> Self::Output { + let b = $intrinsic::from_bits(x); + let a = $intrinsic::from_bits(self); + let hi = $ty::from_bits(unsafe { $mulhi(a, b) }); + let lo = $ty::from_bits(unsafe { $mullo(a, b) }); + (hi, lo) + } + } + }; + } + + #[cfg(target_feature = "sse2")] + wmul_impl_16! { u16x4, __m64, _mm_mulhi_pu16, _mm_mullo_pi16 } + #[cfg(target_feature = "sse4.2")] + wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 } + #[cfg(target_feature = "avx2")] + wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 } + // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::` + // cannot use the same implementation. + + wmul_impl! { + (u32x2, u64x2), + (u32x4, u64x4), + (u32x8, u64x8),, + 32 + } + + // TODO: optimize, this seems to seriously slow things down + wmul_impl_large! { (u8x64,) u8, 4 } + wmul_impl_large! { (u16x32,) u16, 8 } + wmul_impl_large! { (u32x16,) u32, 16 } + wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } +} +#[cfg(feature = "simd_support")] +pub use self::simd_wmul::*; + /// Helper trait when dealing with scalar and SIMD floating point types. pub(crate) trait FloatSIMDUtils { @@ -263,7 +385,7 @@ macro_rules! simd_impl { <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask)) } type UInt = $uty; - fn cast_from_int(i: Self::UInt) -> Self { $ty::from(i) } + fn cast_from_int(i: Self::UInt) -> Self { i.cast() } } } } @@ -271,10 +393,10 @@ macro_rules! simd_impl { #[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 } #[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 } #[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 } -#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m1x16, u32x16 } +#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 } #[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 } #[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 } -#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m1x8, u64x8 } +#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 } /// Calculates ln(gamma(x)) (natural logarithm of the gamma /// function) using the Lanczos approximation. diff --git a/src/lib.rs b/src/lib.rs index fb794d4b271..c23c12f6e50 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -237,6 +237,8 @@ #[cfg(feature="std")] extern crate std as core; #[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc; +#[cfg(feature="simd_support")] extern crate packed_simd; + #[cfg(test)] #[cfg(feature="serde1")] extern crate bincode; #[cfg(feature="serde1")] extern crate serde; #[cfg(feature="serde1")] #[macro_use] extern crate serde_derive;
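For reference, here is a rough scalar rendering of the two widening-multiply strategies the patch adds for vectors: casting to a wider type where one exists (`wmul_impl`), and the half-word schoolbook product used when none does (`wmul_impl_large`). The function names are invented for this sketch and are not part of the patch or of rand's API.

// Strategy 1: widen, multiply, split (mirrors `wmul_impl`).
fn wmul_via_wider(a: u32, b: u32) -> (u32, u32) {
    let t = (a as u64) * (b as u64);
    ((t >> 32) as u32, t as u32)
}

// Strategy 2: schoolbook product on half-width pieces (mirrors
// `wmul_impl_large`); used where no wider type is available, e.g. u64x8.
fn wmul_via_halves(a: u64, b: u64) -> (u64, u64) {
    const HALF: u32 = 32;
    const LOWER_MASK: u64 = !0 >> HALF;
    let mut low = (a & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
    let mut t = low >> HALF;
    low &= LOWER_MASK;
    t += (a >> HALF).wrapping_mul(b & LOWER_MASK);
    low += (t & LOWER_MASK) << HALF;
    let mut high = t >> HALF;
    t = low >> HALF;
    low &= LOWER_MASK;
    t += (b >> HALF).wrapping_mul(a & LOWER_MASK);
    low += (t & LOWER_MASK) << HALF;
    high += t >> HALF;
    high += (a >> HALF).wrapping_mul(b >> HALF);
    (high, low)
}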
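And a standalone illustration of the two-instruction `mulhi`/`mullo` pairing that `wmul_impl_16!` relies on, using the raw SSE2 intrinsics on plain arrays. This is only a sketch assuming an x86_64 target; the `transmute` reinterpretation plays the role of `packed_simd`'s `from_bits`.

#[cfg(target_arch = "x86_64")]
fn demo_wmul_u16x8(a: [u16; 8], b: [u16; 8]) -> ([u16; 8], [u16; 8]) {
    use core::arch::x86_64::{__m128i, _mm_mulhi_epu16, _mm_mullo_epi16};
    use core::mem::transmute;
    // SSE2 is part of the x86_64 baseline, so these intrinsics are always
    // callable on this target.
    unsafe {
        let (va, vb): (__m128i, __m128i) = (transmute(a), transmute(b));
        let hi = _mm_mulhi_epu16(va, vb); // high 16 bits of each 32-bit product
        let lo = _mm_mullo_epi16(va, vb); // low 16 bits of each 32-bit product
        (transmute(hi), transmute(lo))
    }
}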