From 0dab6d1593e07a080b75d2091bc5f5a2ba90c53e Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 10 Aug 2018 14:27:04 -0700
Subject: [PATCH 01/29] Implemented AVX intrinsics

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 506 ++++++++++++++++++
 .../Microsoft.ML.CpuMath.csproj               |   4 +
 2 files changed, 510 insertions(+)
 create mode 100644 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
new file mode 100644
index 0000000000..00453e75f4
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -0,0 +1,506 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// The exported function names need to be unique (can't be disambiguated based on signature), hence
+// we introduce suffix letters to indicate the general patterns used.
+// * A suffix means aligned and padded for SSE operations; X suffix means the same for AVX operations.
+// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
+// * Tran means the matrix is transposed.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+    internal static class AvxIntrinsics
+    {
+        private const int CbAlign = 32;
+
+        private static bool Compat(AlignedArray a) // true when a was allocated with this class's CbAlign (32)
+        {
+            Contracts.AssertValue(a);
+            Contracts.Assert(a.Size > 0); // the kernels in this class pin &Items[0], so an empty array is unexpected
+            return a.CbAlign == CbAlign;
+        }
+
+        private static unsafe float* Ptr(AlignedArray a, float* p) // p is the pinned address of a.Items[0] at every call site here
+        {
+            Contracts.AssertValue(a);
+            float* q = p + a.GetBase((long)p); // presumably GetBase returns the float offset to the first aligned element - confirm in AlignedArray
+            Contracts.Assert(((long)q & (CbAlign - 1)) == 0); // the returned pointer must sit on a CbAlign-byte boundary
+            return q;
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector256<float> ToVector256(in Vector128<float> a, in Vector128<float> b)
+        {
+            // Puts b in the lower and a in the upper 128 bits. REVIEW NEEDED: confirm this is the correct port of:
+            // #ifndef _WIN32
+            // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
+            // #endif
+            return Avx.InsertVector128<float>(Avx.ExtendToVector256<float>(b), a, 1);
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void ZeroUpper()
+        {
+            // Intentionally a no-op: _mm256_zeroupper is not supported yet (ref: https://github.com/dotnet/coreclr/pull/16955).
+            // Kept as a placeholder so the callers need no change if the intrinsic is supported later on.
+            return;
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> GetLow(in Vector128<float> x)
+        {
+            return Avx.ExtractVector128<float>(x, 0);
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> GetHigh(in Vector128<float> x)
+        {
+            return Avx.ExtractVector128<float>(x, 1);
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> Rotate(in Vector128<float> x)
+        {
+            // Control byte 0x39 picks source slots (1, 2, 3, 0), i.e. a rotation by one 32-bit float: ABCD -> BCDA.
+            return Sse.Shuffle(x, x, 0x39);
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
+        {
+            // Scatters the four floats of x: slot k is written to dst[idx[k]].
+            // StoreScalar writes slot 0 only, so each Rotate brings the next slot into position 0.
+            Vector128<float> lane = x;
+            Sse.StoreScalar(dst + idx[0], lane);
+            Sse.StoreScalar(dst + idx[1], lane = Rotate(in lane));
+            Sse.StoreScalar(dst + idx[2], lane = Rotate(in lane));
+            Sse.StoreScalar(dst + idx[3], lane = Rotate(in lane));
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> VectorSum(in Vector256<float> vector)
+        {
+            Vector256<float> partialSum = Avx.HorizontalAdd(vector, vector); // adjacent pairs are summed within each 128-bit half
+            return Avx.HorizontalAdd(partialSum, partialSum); // each half now carries its own four-float total; callers still add the two halves
+        }
+
+        // Multiply matrix times vector into vector: dst = mat * src, added to the existing dst when add is true.
+        internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            // mat is walked as crow rows of ccol floats. NOTE(review): the loops step 4 rows and 8 columns at a time - confirm crow % 4 == 0 and ccol % 8 == 0 at call sites.
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    Vector256<float> res0 = Avx.SetZeroVector256();
+                    Vector256<float> res1 = res0;
+                    Vector256<float> res2 = res0;
+                    Vector256<float> res3 = res0;
+                    // res0..res3 accumulate the products for the four matrix rows handled in this pass.
+                    float* pSrcCurrent = psrc;
+
+                    while (pSrcCurrent < pSrcEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+                        // Load the same 8 columns from four consecutive rows (row stride is ccol floats).
+                        Vector256<float> x01 = Avx.LoadAlignedVector256(pMatTemp);
+                        Vector256<float> x11 = Avx.LoadAlignedVector256(pMatTemp += ccol);
+                        Vector256<float> x21 = Avx.LoadAlignedVector256(pMatTemp += ccol);
+                        Vector256<float> x31 = Avx.LoadAlignedVector256(pMatTemp += ccol);
+                        Vector256<float> x02 = Avx.LoadAlignedVector256(pSrcCurrent);
+
+                        res0 = Avx.Add(res0, Avx.Multiply(x01, x02));
+                        res1 = Avx.Add(res1, Avx.Multiply(x11, x02));
+                        res2 = Avx.Add(res2, Avx.Multiply(x21, x02));
+                        res3 = Avx.Add(res3, Avx.Multiply(x31, x02));
+
+                        pSrcCurrent += 8;
+                        pMatCurrent += 8;
+                    }
+
+                    // Add up the entries of each accumulator; res0 ends with the 4 row results split across its lower and upper halves.
+                    res0 = Avx.HorizontalAdd(res0, res1);
+                    res2 = Avx.HorizontalAdd(res2, res3);
+                    res0 = Avx.HorizontalAdd(res0, res2);
+
+                    Vector128<float> sum = Sse.Add(GetLow(in res0), GetHigh(in res0)); // combine the two halves: one total per row
+                    if (add)
+                    {
+                        sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, sum);
+
+                    pDstCurrent += 4;
+                    pMatCurrent += 3 * ccol; // the inner loop already advanced by one row of ccol floats; skip the other three
+                }
+
+                ZeroUpper();
+            }
+        }
+
+        // Partial sparse source vector: only the source positions listed in rgposSrc[iposMin..iposEnd) contribute to the product.
+        internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            
+            // REVIEW: For extremely sparse inputs, interchanging the loops would
+            // likely be more efficient.
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+                // Both base pointers below are shifted by posMin so they can be indexed directly with the positions read from rgposSrc.
+                int* pposMin = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+                float* pm0 = pmat - posMin;
+                float* pSrcCurrent = psrc - posMin;
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    float* pm1 = pm0 + ccol;
+                    float* pm2 = pm1 + ccol;
+                    float* pm3 = pm2 + ccol;
+                    Vector256<float> result = Avx.SetZeroVector256();
+
+                    int* ppos = pposMin;
+
+                    while (ppos < pposEnd)
+                    {
+                        int col1 = *ppos;
+                        int col2 = col1 + 4 * ccol; // same column, four rows further down
+                        Vector256<float> x1 = Avx.SetVector256(pm3[col2], pm2[col2], pm1[col2], pm0[col2],
+                                                                pm3[col1], pm2[col1], pm1[col1], pm0[col1]);
+                        Vector256<float> x2 = Avx.SetAllVector256(pSrcCurrent[col1]);
+                        x2 = Avx.Multiply(x2, x1);
+                        result = Avx.Add(result, x2);
+
+                        ppos++;
+                    }
+
+                    if (add)
+                    {
+                        result = Avx.Add(result, Avx.LoadAlignedVector256(pDstCurrent));
+                    }
+                    Avx.StoreAligned(pDstCurrent, result);
+
+                    pDstCurrent += 8;
+                    pm0 += 8 * ccol; // eight rows (eight dst entries) are produced per pass
+                }
+
+                ZeroUpper();
+            }
+        }
+
+        internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            // Transposed multiply: mat is walked as ccol rows of crow floats; every src entry scales one row, which is added into dst[0..crow).
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pSrcCurrent = psrc;
+                float* pMatCurrent = pmat;
+
+                // We do 4-way unrolling over src. When !add, the first group of four src entries overwrites dst instead of accumulating into it.
+                if (!add)
+                {
+                    Vector128<float> h01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of h01 (ABCD) into its own register.
+                    Vector128<float> h11 = Sse.Shuffle(h01, h01, 0x55); // B
+                    Vector128<float> h21 = Sse.Shuffle(h01, h01, 0xAA); // C
+                    Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
+                    h01 = Sse.Shuffle(h01, h01, 0x00); // A
+
+                    Vector256<float> x01 = ToVector256(h01, h01);
+                    Vector256<float> x11 = ToVector256(h11, h11);
+                    Vector256<float> x21 = ToVector256(h21, h21);
+                    Vector256<float> x31 = ToVector256(h31, h31);
+
+                    pSrcCurrent += 4;
+
+                    float* pDstCurrent = pdst;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+                        Vector256<float> x02 = Avx.LoadAlignedVector256(pMatTemp);
+                        Vector256<float> x12 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x22 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x32 = Avx.LoadAlignedVector256(pMatTemp += crow);
+
+                        x02 = Avx.Multiply(x01, x02);
+                        x12 = Avx.Multiply(x11, x12);
+                        x22 = Avx.Multiply(x21, x22);
+                        x32 = Avx.Multiply(x31, x32);
+
+                        x02 = Avx.Add(x02, x12);
+                        x22 = Avx.Add(x22, x32);
+                        x02 = Avx.Add(x02, x22);
+
+                        Avx.StoreAligned(pDstCurrent, x02);
+
+                        pDstCurrent += 8;
+                        pMatCurrent += 8;
+                    }
+
+                    pMatCurrent += 3 * crow; // the loop above advanced by one row of crow floats; skip the other three of this group
+                }
+                // Remaining groups of four src entries (all groups when add is true) accumulate into dst.
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    Vector128<float> h01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of h01 (ABCD) into its own register.
+                    Vector128<float> h11 = Sse.Shuffle(h01, h01, 0x55); // B
+                    Vector128<float> h21 = Sse.Shuffle(h01, h01, 0xAA); // C
+                    Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
+                    h01 = Sse.Shuffle(h01, h01, 0x00); // A
+
+                    Vector256<float> x01 = ToVector256(h01, h01);
+                    Vector256<float> x11 = ToVector256(h11, h11);
+                    Vector256<float> x21 = ToVector256(h21, h21);
+                    Vector256<float> x31 = ToVector256(h31, h31);
+
+                    float* pDstCurrent = pdst;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+
+                        Vector256<float> x02 = Avx.LoadAlignedVector256(pMatTemp);
+                        Vector256<float> x12 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x22 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x32 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x3 = Avx.LoadAlignedVector256(pDstCurrent);
+
+                        x02 = Avx.Multiply(x01, x02);
+                        x12 = Avx.Multiply(x11, x12);
+                        x22 = Avx.Multiply(x21, x22);
+                        x32 = Avx.Multiply(x31, x32);
+
+                        x02 = Avx.Add(x02, x12);
+                        x22 = Avx.Add(x22, x32);
+                        x02 = Avx.Add(x02, x22);
+                        x3 = Avx.Add(x02, x3);
+
+                        Avx.StoreAligned(pDstCurrent, x3);
+
+                        pDstCurrent += 8;
+                        pMatCurrent += 8;
+                    }
+
+                    pMatCurrent += 3 * crow;
+                    pSrcCurrent += 4;
+                }
+
+                ZeroUpper();
+            }
+        }
+
+        // Partial sparse source vector, transposed matrix: each listed source position scales one crow-float row of mat into dst.
+        internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                int* ppos = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+                // When !add, the first listed position initialises dst by plain store. NOTE(review): *ppos is read even if iposMin == iposEnd - confirm callers never pass an empty range.
+                if (!add)
+                {
+                    int col = *ppos - posMin;
+                    ppos++;
+
+                    Vector256<float> x0 = Avx.SetAllVector256(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow; // start of the row for this position (crow floats per row)
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector256<float> x1 = Avx.LoadAlignedVector256(pMatCurrent);
+                        x1 = Avx.Multiply(x1, x0);
+                        Avx.StoreAligned(pDstCurrent, x1);
+
+                        pDstCurrent += 8;
+                        pMatCurrent += 8;
+                    }
+                }
+
+                // REVIEW: Should we explore unrolling the outer loop?
+                while (ppos < pposEnd)
+                {
+                    int col = *ppos - posMin;
+
+                    Vector256<float> x0 = Avx.SetAllVector256(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector256<float> x1 = Avx.LoadAlignedVector256(pMatCurrent);
+                        Vector256<float> x2 = Avx.LoadAlignedVector256(pDstCurrent);
+                        x1 = Avx.Multiply(x1, x0);
+                        x2 = Avx.Add(x2, x1);
+                        Avx.StoreAligned(pDstCurrent, x2);
+
+                        pDstCurrent += 8;
+                        pMatCurrent += 8;
+                    }
+
+                    ppos++;
+                }
+
+                ZeroUpper();
+            }
+        }
+
+        internal static unsafe void ScaleX(float scale, Span<float> dst)
+        {
+            // Multiplies every element of dst by scale, in place. Safe for any alignment and any length (including 0).
+            Vector256<float> scaleVector = Avx.SetAllVector256(scale);
+            fixed (float* pdst = dst)
+            {
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+                // Unaligned load/store: a pinned Span<float> is not guaranteed to start on a 32-byte boundary.
+                while (pEnd - pDstCurrent >= 8)
+                {
+                    Avx.Store(pDstCurrent, Avx.Multiply(scaleVector, Avx.LoadVector256(pDstCurrent)));
+                    pDstCurrent += 8;
+                }
+                // Scalar tail for the last (Length % 8) elements, so nothing past the end of dst is read or written.
+                for (; pDstCurrent < pEnd; pDstCurrent++)
+                {
+                    *pDstCurrent *= scale;
+                }
+            }
+            ZeroUpper();
+        }
+
+        internal static unsafe void AddScaleX(float scale, Span<float> src, Span<float> dst)
+        {
+            // Computes dst[i] += scale * src[i] for every i in dst; src must hold at least dst.Length elements.
+            Vector256<float> scaleVector = Avx.SetAllVector256(scale);
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+                // Unaligned load/store: pinned spans are not guaranteed to start on a 32-byte boundary.
+                while (pEnd - pDstCurrent >= 8)
+                {
+                    Vector256<float> scaled = Avx.Multiply(Avx.LoadVector256(pSrcCurrent), scaleVector);
+                    Avx.Store(pDstCurrent, Avx.Add(Avx.LoadVector256(pDstCurrent), scaled));
+                    pSrcCurrent += 8;
+                    pDstCurrent += 8;
+                }
+                // Scalar tail for the last (Length % 8) elements, so nothing past the end of either span is touched.
+                for (; pDstCurrent < pEnd; pDstCurrent++, pSrcCurrent++)
+                {
+                    *pDstCurrent += *pSrcCurrent * scale;
+                }
+            }
+
+            ZeroUpper();
+        }
+
+        internal static unsafe void AddX(Span<float> src, Span<float> dst)
+        {
+            // Computes dst[i] += src[i] for every i in src; dst must hold at least src.Length elements.
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+                // Unaligned load/store: pinned spans are not guaranteed to start on a 32-byte boundary.
+                while (pEnd - pSrcCurrent >= 8)
+                {
+                    Avx.Store(pDstCurrent, Avx.Add(Avx.LoadVector256(pSrcCurrent), Avx.LoadVector256(pDstCurrent)));
+                    pSrcCurrent += 8;
+                    pDstCurrent += 8;
+                }
+                // Scalar tail for the last (Length % 8) elements, so nothing past the end of either span is touched.
+                for (; pSrcCurrent < pEnd; pSrcCurrent++, pDstCurrent++)
+                {
+                    *pDstCurrent += *pSrcCurrent;
+                }
+                ZeroUpper();
+            }
+        }
+
+        internal static unsafe float SumX(Span<float> src) // returns the sum of the floats in src
+        {
+            fixed (float* psrc = src)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+
+                Vector256<float> result = Avx.SetZeroVector256();
+
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    result = Avx.Add(result, Avx.LoadAlignedVector256(pSrcCurrent)); // NOTE(review): aligned 8-float load - assumes src is 32-byte aligned and its length is a multiple of 8; confirm
+                    pSrcCurrent += 8;
+                }
+                // Reduce the eight running sums: first within each 128-bit half, then across the two halves.
+                result = VectorSum(in result);
+                Vector128<float> result128 = Sse.AddScalar(GetLow(result), GetHigh(result)); // only slot 0 is meaningful after AddScalar
+
+                float sum = Sse.ConvertToSingle(result128);
+                ZeroUpper();
+                return sum;
+            }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
index b6c95b93f4..4c46db9c3c 100644
--- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
+++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
@@ -27,4 +27,8 @@
     <Compile Remove="CpuMathUtils.netcoreapp.cs" />
     <Compile Remove="SseIntrinsics.cs" />
   </ItemGroup>
+
+  <ItemGroup>
+    <Compile Remove="AvxIntrinsics.cs" />
+  </ItemGroup>
 </Project>
\ No newline at end of file

From 3d76fb19c738a637272025220b0d5dd11f9e8eaf Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 15 Aug 2018 16:24:50 -0700
Subject: [PATCH 02/29] Implemented performance tests for AVX intrinsics, with
 some fixes to the intrinsics. Note: Building perf tests succeeds, but running
 perf tests for AVX intrinsics ends without results.

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     |  12 +-
 .../CpuMathUtils.netcoreapp.cs                |  30 +++-
 .../Microsoft.ML.CpuMath.csproj               |   9 +-
 .../AvxPerformanceTests.cs                    | 150 ++++++++++++++++++
 .../CpuMathNativeUtils.cs                     |  12 ++
 .../SsePerformanceTests.cs                    |   3 +-
 6 files changed, 203 insertions(+), 13 deletions(-)
 create mode 100644 test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 00453e75f4..073a8b561a 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -53,13 +53,13 @@ private static unsafe void ZeroUpper()
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> GetLow(in Vector128<float> x)
+        private static unsafe Vector128<float> GetLow(in Vector256<float> x)
         {
             return Avx.ExtractVector128<float>(x, 0);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> GetHigh(in Vector128<float> x)
+        private static unsafe Vector128<float> GetHigh(in Vector256<float> x)
         {
             return Avx.ExtractVector128<float>(x, 1);
         }
@@ -112,7 +112,7 @@ internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src
 
                 while (pDstCurrent < pDstEnd)
                 {
-                    Vector256<float> res0 = Avx.SetZeroVector256();
+                    Vector256<float> res0 = Avx.SetZeroVector256<float>();
                     Vector256<float> res1 = res0;
                     Vector256<float> res2 = res0;
                     Vector256<float> res3 = res0;
@@ -165,7 +165,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc,
             Contracts.Assert(Compat(mat));
             Contracts.Assert(Compat(src));
             Contracts.Assert(Compat(dst));
-            
+
             // REVIEW: For extremely sparse inputs, interchanging the loops would
             // likely be more efficient.
             fixed (float* pSrcStart = &src.Items[0])
@@ -189,7 +189,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc,
                     float* pm1 = pm0 + ccol;
                     float* pm2 = pm1 + ccol;
                     float* pm3 = pm2 + ccol;
-                    Vector256<float> result = Avx.SetZeroVector256();
+                    Vector256<float> result = Avx.SetZeroVector256<float>();
 
                     int* ppos = pposMin;
 
@@ -486,7 +486,7 @@ internal static unsafe float SumX(Span<float> src)
                 float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
 
-                Vector256<float> result = Avx.SetZeroVector256();
+                Vector256<float> result = Avx.SetZeroVector256<float>();
 
                 while (pSrcCurrent < pSrcEnd)
                 {
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 81d7acf25a..4991c1fd86 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -17,7 +17,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             Contracts.Assert(mat.Size == dst.Size * src.Size);
             Contracts.Assert(crun >= 0);
 
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= src.Size);
+                    AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
+                }
+            }
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
@@ -96,7 +109,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             Contracts.AssertNonEmpty(rgposSrc);
             Contracts.Assert(crun >= 0);
 
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= srcValues.Size);
+                    AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
+                }
+            }
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
index 4c46db9c3c..ef24bf2762 100644
--- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
+++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
@@ -10,6 +10,11 @@
     <LangVersion>7.3</LangVersion>
   </PropertyGroup>
 
+  <PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release-Intrinsics|netstandard2.0|AnyCPU'">
+    <GenerateSerializationAssemblies>Auto</GenerateSerializationAssemblies>
+    <Optimize>true</Optimize>
+  </PropertyGroup>
+
   <ItemGroup>
     <Compile Include="..\Microsoft.ML.Core\Utilities\Contracts.cs" />
 
@@ -26,9 +31,7 @@
   <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
     <Compile Remove="CpuMathUtils.netcoreapp.cs" />
     <Compile Remove="SseIntrinsics.cs" />
-  </ItemGroup>
-
-  <ItemGroup>
     <Compile Remove="AvxIntrinsics.cs" />
   </ItemGroup>
+
 </Project>
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
new file mode 100644
index 0000000000..004333cd68
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -0,0 +1,150 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+    public class AvxPerformanceTests
+    {
+        private const int EXP_MAX = 127;
+        private const int EXP_MIN = 0; // not referenced in this class
+
+        private const int IDXLEN = 1000003; // length of the idx array
+        private const int LEN = 1000003; // length of every float buffer; NOTE(review): not a multiple of 8
+        private const int EXP_RANGE = EXP_MAX / 2; // NOTE(review): SsePerformanceTests switches to EXP_MAX / 8 in this same commit - confirm whether this should match
+        private const int DEFAULT_SEED = 253421; // used when CPUMATH_SEED is unset or unusable
+        private const float DEFAULT_SCALE = 1.11f;
+        private const int DEFAULT_CROW = 500; // not referenced in this class
+        private const int DEFAULT_CCOL = 2000; // not referenced in this class
+        private const bool ADD = true; // not referenced in this class
+
+        private float[] src, dst, original, src1, src2, result; // allocated and filled in Setup
+        private int[] idx; // random indices in [0, LEN), filled in Setup
+        private int seed = DEFAULT_SEED; // replaced by GetSeed() in Setup
+
+        private static float NextFloat(Random rand, int expRange)
+        {
+            double signedMantissa = 2.0 * rand.NextDouble() - 1.0; // uniform in [-1, 1); drawn before the exponent
+            int power = rand.Next(1 - expRange, 1 + expRange); // integer exponent in [1 - expRange, expRange]
+            return (float)(signedMantissa * Math.Pow(2.0, power));
+        }
+
+        private static int GetSeed()
+        {
+            // Seed selection from the CPUMATH_SEED environment variable:
+            //   unset -> DEFAULT_SEED; an integer -> that value;
+            //   "random" (any casing) -> a fresh random seed; anything else -> DEFAULT_SEED.
+            string requested = Environment.GetEnvironmentVariable("CPUMATH_SEED");
+            int seed = DEFAULT_SEED;
+
+            if (requested != null && !int.TryParse(requested, out seed))
+            {
+                // Not an integer: TryParse has overwritten seed, so the fallback is assigned explicitly.
+                if (string.Equals(requested, "random", StringComparison.OrdinalIgnoreCase))
+                {
+                    seed = new Random().Next();
+                }
+                else
+                {
+                    seed = DEFAULT_SEED;
+                }
+            }
+
+            Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
+
+            return seed;
+        }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            src = new float[LEN];
+            dst = new float[LEN];
+            src1 = new float[LEN];
+            src2 = new float[LEN];
+            original = new float[LEN];
+            result = new float[LEN];
+            idx = new int[IDXLEN];
+            // A single Random drives all data below, so a given seed reproduces every buffer.
+            seed = GetSeed();
+            Random rand = new Random(seed);
+            // Draw order per element is src, dst, src1, src2; original and result start as copies of dst.
+            for (int i = 0; i < LEN; i++)
+            {
+                src[i] = NextFloat(rand, EXP_RANGE);
+                dst[i] = NextFloat(rand, EXP_RANGE);
+                original[i] = dst[i];
+                result[i] = dst[i];
+                src1[i] = NextFloat(rand, EXP_RANGE);
+                src2[i] = NextFloat(rand, EXP_RANGE);
+            }
+            // Indices are drawn after all float data, each in [0, LEN).
+            for (int i = 0; i < IDXLEN; i++)
+            {
+                idx[i] = rand.Next(0, LEN);
+            }
+        }
+
+        [GlobalCleanup]
+        public void GlobalCleanup()
+        {
+            Array.Copy(original, dst, original.Length); // put back the dst values captured in Setup
+            Array.Copy(original, result, original.Length); // result started out as a copy of dst as well
+        }
+
+        [Benchmark]
+        public unsafe void NativeScaleXPerf() // times the native ScaleX entry point on dst, which it modifies in place
+        {
+            fixed (float* pdst = dst) // NOTE(review): if native ScaleX needs 32-byte-aligned, padded input, a pinned float[] of LEN elements may not qualify - confirm
+            {
+                CpuMathNativeUtils.ScaleX(DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleXPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); // managed counterpart of NativeScaleXPerf: same scale factor and buffer
+
+        [Benchmark]
+        public unsafe void NativeAddScaleXPerf() // times the native AddScaleX entry point with src as input and dst as in/out buffer
+        {
+            fixed (float* psrc = src) // NOTE(review): same alignment/padding question as NativeScaleXPerf - confirm
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.AddScaleX(DEFAULT_SCALE, psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddScaleXPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); // managed counterpart of NativeAddScaleXPerf
+
+        [Benchmark]
+        public unsafe void NativeAddXPerf() // times the native AddX entry point with src as input and dst as in/out buffer
+        {
+            fixed (float* psrc = src) // NOTE(review): same alignment/padding question as NativeScaleXPerf - confirm
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.AddX(psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddXPerf() => CpuMathUtils.Add(src, dst, LEN); // managed counterpart of NativeAddXPerf
+
+        [Benchmark]
+        public unsafe float NativeSumXPerf() // times the native SumX entry point over src and returns its result
+        {
+            fixed (float* psrc = src) // NOTE(review): same alignment/padding question as NativeScaleXPerf - confirm
+            {
+                return CpuMathNativeUtils.SumX(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumXPerf() => CpuMathUtils.Sum(src, LEN); // managed counterpart of NativeSumXPerf
+    }
+}
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 8df3352556..27f46022eb 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -85,5 +85,17 @@ internal static class CpuMathNativeUtils
 
         [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "ScaleX"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleX(float a, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleX"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleX(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddX"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddX(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "SumX"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumX(/*const*/ float* ps, int c);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index ade2ea6a0e..a6ddf56a36 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -16,7 +16,7 @@ public class SsePerformanceTests
 
         private const int IDXLEN = 1000003;
         private const int LEN = 1000003;
-        private const int EXP_RANGE = EXP_MAX / 2;
+        private const int EXP_RANGE = EXP_MAX / 8;
         private const int DEFAULT_SEED = 253421;
         private const float DEFAULT_SCALE = 1.11f;
         private const int DEFAULT_CROW = 500;
@@ -312,7 +312,6 @@ public unsafe float NativeMaxAbsDiffUPerf()
 
         [Benchmark]
         public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
-        // TODO: MaxAbsU!!!
 
         [Benchmark]
         public unsafe float NativeDotUPerf()

From 6a51bd865a3dbd1e15c15987201496e4882a7b64 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 16 Aug 2018 11:15:56 -0700
Subject: [PATCH 03/29] Changes to perf tests in response to feedback

---
 .../AvxPerformanceTests.cs                                | 8 +++-----
 .../SsePerformanceTests.cs                                | 6 ++----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
index 004333cd68..b178ca684d 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -16,7 +16,7 @@ public class AvxPerformanceTests
 
         private const int IDXLEN = 1000003;
         private const int LEN = 1000003;
-        private const int EXP_RANGE = EXP_MAX / 2;
+        private const int EXP_RANGE = EXP_MAX / 8;
         private const int DEFAULT_SEED = 253421;
         private const float DEFAULT_SCALE = 1.11f;
         private const int DEFAULT_CROW = 500;
@@ -37,11 +37,10 @@ private static float NextFloat(Random rand, int expRange)
         private static int GetSeed()
         {
             int seed = DEFAULT_SEED;
+            string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
 
-            if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null)
+            if (CPUMATH_SEED != null)
             {
-                string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
-
                 if (!int.TryParse(CPUMATH_SEED, out seed))
                 {
                     if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
@@ -56,7 +55,6 @@ private static int GetSeed()
             }
 
             Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
-
             return seed;
         }
 
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index a6ddf56a36..02f844f033 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -37,11 +37,10 @@ private static float NextFloat(Random rand, int expRange)
         private static int GetSeed()
         {
             int seed = DEFAULT_SEED;
+            string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
 
-            if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null)
+            if (CPUMATH_SEED != null)
             {
-                string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
-
                 if (!int.TryParse(CPUMATH_SEED, out seed))
                 {
                     if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
@@ -56,7 +55,6 @@ private static int GetSeed()
             }
 
             Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
-
             return seed;
         }
 

From 1b2cea94b5f9ba309aa3cc6d63c0812e21f1cc8a Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 16 Aug 2018 20:44:03 -0700
Subject: [PATCH 04/29] Fixes across multiple files to make unit tests and perf
 tests work for all used AVX intrinsics (note: except matrix operations)

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 178 +++++++++---
 .../CpuMathUtils.netcoreapp.cs                |  24 +-
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |   8 +-
 .../AvxPerformanceTests.cs                    |  46 +--
 .../SsePerformanceTests.cs                    |   4 +-
 .../UnitTests.cs                              | 262 +++++++++---------
 6 files changed, 303 insertions(+), 219 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 073a8b561a..94434b7ee5 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -35,17 +35,17 @@ private static bool Compat(AlignedArray a)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector256<float> ToVector256(in Vector128<float> a, in Vector128<float> b)
+        private static Vector256<float> ToVector256(in Vector128<float> a, in Vector128<float> b)
         {
             // REVIEW NEEDED: Is it the correct port of the following code?
             // #ifndef _WIN32
             // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
             // #endif
-            return Avx.InsertVector128<float>(Avx.ExtendToVector256<float>(b), a, 1);
+            return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void ZeroUpper()
+        private static void ZeroUpper()
         {
             // Currently no-op since _mm256_zeroupper is not supported (ref: https://github.com/dotnet/coreclr/pull/16955)
             // This is a placeholder in case the intrinsic is supported later on.
@@ -53,15 +53,15 @@ private static unsafe void ZeroUpper()
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> GetLow(in Vector256<float> x)
+        private static Vector128<float> GetLow(in Vector256<float> x)
         {
-            return Avx.ExtractVector128<float>(x, 0);
+            return Avx.ExtractVector128(x, 0);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> GetHigh(in Vector256<float> x)
+        private static Vector128<float> GetHigh(in Vector256<float> x)
         {
-            return Avx.ExtractVector128<float>(x, 1);
+            return Avx.ExtractVector128(x, 1);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
@@ -84,12 +84,28 @@ private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> VectorSum(in Vector256<float> vector)
+        private static Vector256<float> VectorSum256(in Vector256<float> vector)
         {
             Vector256<float> partialSum = Avx.HorizontalAdd(vector, vector);
             return Avx.HorizontalAdd(partialSum, partialSum);
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorSum128(in Vector128<float> vector)
+        {
+            if (Sse3.IsSupported)
+            {
+                Vector128<float> partialSum = Sse3.HorizontalAdd(vector, vector);
+                return Sse3.HorizontalAdd(partialSum, partialSum);
+            }
+            else
+            {
+                Vector128<float> partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
+                // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC.
+                return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1));
+            }
+        }
+
         // Multiply matrix times vector into vector.
         internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
@@ -403,33 +419,53 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos
             }
         }
 
-        internal static unsafe void ScaleX(float scale, Span<float> dst)
+        internal static unsafe void ScaleU(float scale, Span<float> dst)
         {
-            Vector256<float> scaleVector = Avx.SetAllVector256(scale);
-
             fixed (float* pdst = dst)
             {
                 float* pDstCurrent = pdst;
                 float* pEnd = pdst + dst.Length;
 
-                while (pDstCurrent < pEnd)
+                Vector256<float> scaleVector256 = Avx.SetAllVector256(scale);
+
+                while (pDstCurrent + 8 <= pEnd)
                 {
-                    Vector256<float> dstVector = Avx.LoadAlignedVector256(pDstCurrent);
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
 
-                    dstVector = Avx.Multiply(scaleVector, dstVector);
-                    Avx.StoreAligned(pDstCurrent, dstVector);
+                    dstVector = Avx.Multiply(scaleVector256, dstVector);
+                    Avx.Store(pDstCurrent, dstVector);
 
                     pDstCurrent += 8;
                 }
+
+                Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
+
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    dstVector = Sse.Multiply(scaleVector128, dstVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                }
+
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    dstVector = Sse.MultiplyScalar(scaleVector128, dstVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                }
             }
 
             ZeroUpper();
         }
 
-        internal static unsafe void AddScaleX(float scale, Span<float> src, Span<float> dst)
+        internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
-            Vector256<float> scaleVector = Avx.SetAllVector256(scale);
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
@@ -437,24 +473,54 @@ internal static unsafe void AddScaleX(float scale, Span<float> src, Span<float>
                 float* pDstCurrent = pdst;
                 float* pEnd = pdst + dst.Length;
 
-                while (pDstCurrent < pEnd)
+                Vector256<float> scaleVector256 = Avx.SetAllVector256(scale);
+
+                while (pDstCurrent + 8 <= pEnd)
                 {
-                    Vector256<float> srcVector = Avx.LoadAlignedVector256(pSrcCurrent);
-                    Vector256<float> dstVector = Avx.LoadAlignedVector256(pDstCurrent);
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
 
-                    srcVector = Avx.Multiply(srcVector, scaleVector);
+                    srcVector = Avx.Multiply(srcVector, scaleVector256);
                     dstVector = Avx.Add(dstVector, srcVector);
-                    Avx.StoreAligned(pDstCurrent, dstVector);
+                    Avx.Store(pDstCurrent, dstVector);
 
-                    pDstCurrent += 8;
                     pSrcCurrent += 8;
+                    pDstCurrent += 8;
+                }
+
+                Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
+
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    srcVector = Sse.Multiply(srcVector, scaleVector128);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector128);
+                    dstVector = Sse.AddScalar(dstVector, srcVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
                 }
             }
 
             ZeroUpper();
         }
 
-        internal static unsafe void AddX(Span<float> src, Span<float> dst)
+        internal static unsafe void AddU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -463,41 +529,81 @@ internal static unsafe void AddX(Span<float> src, Span<float> dst)
                 float* pDstCurrent = pdst;
                 float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent + 8 <= pEnd)
                 {
-                    Vector256<float> srcVector = Avx.LoadAlignedVector256(pSrcCurrent);
-                    Vector256<float> dstVector = Avx.LoadAlignedVector256(pDstCurrent);
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
 
                     Vector256<float> result = Avx.Add(srcVector, dstVector);
-                    Avx.StoreAligned(pDstCurrent, result);
+                    Avx.Store(pDstCurrent, result);
 
                     pSrcCurrent += 8;
                     pDstCurrent += 8;
                 }
 
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.Add(srcVector, dstVector);
+                    Sse.Store(pDstCurrent, result);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.AddScalar(srcVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+
                 ZeroUpper();
             }
         }
 
-        internal static unsafe float SumX(Span<float> src)
+        internal static unsafe float SumU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
                 float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
 
-                Vector256<float> result = Avx.SetZeroVector256<float>();
+                Vector256<float> result256 = Avx.SetZeroVector256<float>();
 
-                while (pSrcCurrent < pSrcEnd)
+                while (pSrcCurrent + 8 <= pSrcEnd)
                 {
-                    result = Avx.Add(result, Avx.LoadAlignedVector256(pSrcCurrent));
+                    result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent));
                     pSrcCurrent += 8;
                 }
 
-                result = VectorSum(in result);
-                Vector128<float> result128 = Sse.AddScalar(GetLow(result), GetHigh(result));
+                result256 = VectorSum256(in result256);
+                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+
+                Vector128<float> result128 = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
+                {
+                    result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent));
+                    pSrcCurrent += 4;
+                }
+
+                result128 = VectorSum128(in result128);
+
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    result128 = Sse.AddScalar(result128, Sse.LoadScalarVector128(pSrcCurrent));
+                    pSrcCurrent++;
+                }
 
-                float sum = Sse.ConvertToSingle(result128);
+                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
                 ZeroUpper();
                 return sum;
             }
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 4991c1fd86..c192052ca6 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -230,7 +230,11 @@ public static void Scale(float a, float[] dst, int offset, int count)
 
         private static void Scale(float a, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleU(a, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleU(a, dst);
             }
@@ -321,7 +325,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in
 
         private static void AddScale(float a, Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleU(a, src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleU(a, src, dst);
             }
@@ -420,7 +428,11 @@ public static void Add(float[] src, float[] dst, int count)
 
         private static void Add(Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddU(src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddU(src, dst);
             }
@@ -527,7 +539,11 @@ public static float Sum(float[] src, int offset, int count)
 
         private static float Sum(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumU(src);
             }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index bf7ad03e34..44157364a8 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -406,19 +406,19 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos
         }
 
         // dst[i] += scale
-        internal static unsafe void AddScalarU(float scale, Span<float> dst)
+        internal static unsafe void AddScalarU(float scalar, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
 
-                Vector128<float> x1 = Sse.SetAllVector128(scale);
+                Vector128<float> scalarVector = Sse.SetAllVector128(scalar);
 
                 while (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> x2 = Sse.LoadVector128(pDstCurrent);
-                    x2 = Sse.Add(x2, x1);
+                    x2 = Sse.Add(x2, scalarVector);
                     Sse.Store(pDstCurrent, x2);
 
                     pDstCurrent += 4;
@@ -427,7 +427,7 @@ internal static unsafe void AddScalarU(float scale, Span<float> dst)
                 while (pDstCurrent < pDstEnd)
                 {
                     Vector128<float> x2 = Sse.LoadScalarVector128(pDstCurrent);
-                    x2 = Sse.AddScalar(x2, x1);
+                    x2 = Sse.AddScalar(x2, scalarVector);
                     Sse.StoreScalar(pDstCurrent, x2);
 
                     pDstCurrent++;
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
index b178ca684d..fdb7140738 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -96,53 +96,15 @@ public void GlobalCleanup()
         }
 
         [Benchmark]
-        public unsafe void NativeScaleXPerf()
-        {
-            fixed (float* pdst = dst)
-            {
-                CpuMathNativeUtils.ScaleX(DEFAULT_SCALE, pdst, LEN);
-            }
-        }
-
-        [Benchmark]
-        public void ManagedScaleXPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
-
-        [Benchmark]
-        public unsafe void NativeAddScaleXPerf()
-        {
-            fixed (float* psrc = src)
-            fixed (float* pdst = dst)
-            {
-                CpuMathNativeUtils.AddScaleX(DEFAULT_SCALE, psrc, pdst, LEN);
-            }
-        }
+        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
 
         [Benchmark]
-        public void ManagedAddScaleXPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
 
         [Benchmark]
-        public unsafe void NativeAddXPerf()
-        {
-            fixed (float* psrc = src)
-            fixed (float* pdst = dst)
-            {
-                CpuMathNativeUtils.AddX(psrc, pdst, LEN);
-            }
-        }
-
-        [Benchmark]
-        public void ManagedAddXPerf() => CpuMathUtils.Add(src, dst, LEN);
-
-        [Benchmark]
-        public unsafe float NativeSumXPerf()
-        {
-            fixed (float* psrc = src)
-            {
-                return CpuMathNativeUtils.SumX(psrc, LEN);
-            }
-        }
+        public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
 
         [Benchmark]
-        public float ManagedSumXPerf() => CpuMathUtils.Sum(src, LEN);
+        public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index 02f844f033..ff1f451550 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -96,11 +96,11 @@ public void GlobalCleanup()
         }
 
         [Benchmark]
-        public unsafe float NativeAddScalarUPerf()
+        public unsafe void NativeAddScalarUPerf()
         {
             fixed (float* pdst = dst)
             {
-                return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN);
+                CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN);
             }
         }
 
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index d1d5955a8e..b57066be8c 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -70,137 +70,137 @@ public CpuMathUtilsUnitTests()
             testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
         }
 
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
-        [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
-        [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
-        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
-        [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
-        [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
-        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
-        [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
-        [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
-        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
-        [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
-        [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
-        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
-        [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
-        [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
-        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
-        [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
-        [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
-        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
-        [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
-        [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
-        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
-        [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
-        [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
-        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
-        }
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
+        //[InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
+        //[InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
+        //public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+
+        //    CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
+        //[InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
+        //[InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
+        //public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+
+        //    CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
+        //[InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
+        //[InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
+        //public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+
+        //    CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
+        //[InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
+        //[InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
+        //public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+
+        //    CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
+        //[InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
+        //[InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
+        //public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+        //    int[] idx = testIndexArray;
+
+        //    CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
+        //[InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
+        //[InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
+        //public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+        //    int[] idx = testIndexArray;
+
+        //    CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
+        //[InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
+        //[InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
+        //public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+        //    int[] idx = testIndexArray;
+
+        //    CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
+
+        //[Theory]
+        //[InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
+        //[InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
+        //[InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
+        //public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        //{
+        //    AlignedArray mat = testMatrices[matTest];
+        //    AlignedArray src = testSrcVectors[srcTest];
+        //    AlignedArray dst = testDstVectors[dstTest];
+        //    int[] idx = testIndexArray;
+
+        //    CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
+        //    float[] actual = new float[dst.Size];
+        //    dst.CopyTo(actual, 0, dst.Size);
+        //    Assert.Equal(expected, actual, comparer);
+        //}
 
         [Theory]
         [InlineData(0)]

From f471726416439540a21d0dcb3e8e0fba538688b7 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 16 Aug 2018 23:28:33 -0700
Subject: [PATCH 05/29] Implemented new AVX intrinsics that do not involve
 matrix operations, passing basic unit tests

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 1075 ++++++++++++++++-
 .../CpuMathUtils.netcoreapp.cs                |  108 +-
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |  196 +--
 3 files changed, 1246 insertions(+), 133 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 94434b7ee5..4c006ea868 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -64,6 +64,24 @@ private static Vector128<float> GetHigh(in Vector256<float> x)
             return Avx.ExtractVector128(x, 1);
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load1(float* src, int* idx) // indexed load of one float: only idx[0] is consulted
+        {
+            return Sse.SetScalarVector128(src[idx[0]]); // src[idx[0]] becomes the scalar (lowest) element of the result
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load4(float* src, int* idx) // indexed load (gather) of src[idx[0]] .. src[idx[3]]
+        {
+            return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); // operands run from highest element to lowest, so src[idx[0]] lands in element 0
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector256<float> Load8(float* src, int* idx) // indexed load (gather) of src[idx[0]] .. src[idx[7]]
+        {
+            return Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); // operands run from highest element to lowest, so src[idx[0]] lands in element 0
+        }
+
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> Rotate(in Vector128<float> x)
         {
@@ -84,10 +102,24 @@ private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> VectorSum256(in Vector256<float> vector)
+        private static unsafe void Store8(in Vector256<float> x, float* dst, int* idx) // indexed store (scatter): element i of x is written to dst[idx[i]]
         {
-            Vector256<float> partialSum = Avx.HorizontalAdd(vector, vector);
-            return Avx.HorizontalAdd(partialSum, partialSum);
+            Vector128<float> tmp = GetLow(in x); // lower half of x (elements 0-3)
+            Sse.StoreScalar(dst + idx[0], tmp); // StoreScalar writes only the lowest element of tmp
+            tmp = Rotate(in tmp); // Rotate is defined outside this hunk; presumably it brings the next element down to position 0 - confirm
+            Sse.StoreScalar(dst + idx[1], tmp);
+            tmp = Rotate(in tmp);
+            Sse.StoreScalar(dst + idx[2], tmp);
+            tmp = Rotate(in tmp);
+            Sse.StoreScalar(dst + idx[3], tmp);
+            tmp = GetHigh(in x); // upper half of x (elements 4-7), handled the same way
+            Sse.StoreScalar(dst + idx[4], tmp);
+            tmp = Rotate(in tmp);
+            Sse.StoreScalar(dst + idx[5], tmp);
+            tmp = Rotate(in tmp);
+            Sse.StoreScalar(dst + idx[6], tmp);
+            tmp = Rotate(in tmp);
+            Sse.StoreScalar(dst + idx[7], tmp);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
@@ -106,6 +138,68 @@ private static Vector128<float> VectorSum128(in Vector128<float> vector)
             }
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> VectorSum256(in Vector256<float> vector) // horizontal sum computed separately for each 128-bit half
+        {
+            Vector256<float> partialSum = Avx.HorizontalAdd(vector, vector); // adjacent pairs are added within each 128-bit half
+            return Avx.HorizontalAdd(partialSum, partialSum); // each half now holds its own four-element total; the two halves are NOT added together here
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorMax128(in Vector128<float> vector) // horizontal maximum; the answer is delivered in element 0
+        {
+            Vector128<float> x1 = Sse.Shuffle(vector, vector, 0xB1); // 0xB1 picks elements (1, 0, 3, 2): neighbours inside each pair are swapped
+            Vector128<float> partialMax = Sse.Max(vector, x1); // elements 0-1 = max(v0, v1), elements 2-3 = max(v2, v3)
+            x1 = Sse.Shuffle(partialMax, partialMax, 0x02); // 0x02 moves element 2 into element 0
+            return Sse.MaxScalar(partialMax, x1); // element 0 = max of all four inputs; upper elements are carried over from partialMax
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> VectorMax256(in Vector256<float> vector) // horizontal maximum computed separately for each 128-bit half
+        {
+            Vector256<float> x1 = Avx.Shuffle(vector, vector, 0xB1); // 0xB1 picks elements (1, 0, 3, 2) inside each 128-bit half
+            Vector256<float> partialMax = Avx.Max(vector, x1); // per half: elements 0-1 = max(v0, v1), elements 2-3 = max(v2, v3)
+            x1 = Avx.Shuffle(partialMax, partialMax, 0x02); // per half: element 2 is moved into element 0
+            return Avx.Max(partialMax, x1); // element 0 of each half = that half's maximum; the two halves are NOT combined here
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetAbsMask128() // every element has the bit pattern 0x7FFFFFFF: all bits set except the sign bit
+        {
+            return Sse2.IsSupported ? // the integer broadcast needs SSE2
+                Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) : // broadcast as int, then reinterpret the bits as float
+                Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); // SSE-only fallback: reinterpret the bits first, then broadcast the float
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> GetAbsMask256() // 256-bit counterpart of GetAbsMask128
+        {
+            return Avx.StaticCast<int, float>(Avx.SetAllVector256(0x7FFFFFFF)); // 0x7FFFFFFF in every element: all bits set except the sign bit
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vector128<float> signMask, in Vector128<float> xThreshold) // element-wise shrink toward zero by xThreshold
+        {
+            Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise (assumes signMask holds only the sign bit)
+            Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign); // clears the sign bit, i.e. |xDst1|
+            Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true
+            Vector128<float> x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise
+            return Sse.And(Sse.Subtract(xDst1, x2), xCond); // xDst1 moved toward zero by xThreshold where |xDst1| > xThreshold, 0 elsewhere
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vector256<float> signMask, in Vector256<float> xThreshold) // 256-bit counterpart of GetNewDst128
+        {
+            Vector256<float> xSign = Avx.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
+            Vector256<float> xDst1Abs = Avx.Xor(xDst1, xSign); // clears the sign bit, i.e. |xDst1|
+
+            // REVIEW NEEDED: Do we want Signaling or NonSignaling? The original functionality is NonSignaling, which does not raise an exception even when an operand is NaN.
+            // Signaling means that if an operand contains a NaN, an exception is raised (ref: https://stackoverflow.com/questions/16988199/how-to-choose-avx-compare-predicate-variants)
+            Vector256<float> xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedSignaling); // result = 0xFFFF FFFF if true
+            Vector256<float> x2 = Avx.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise
+            return Avx.And(Avx.Subtract(xDst1, x2), xCond); // xDst1 moved toward zero by xThreshold where |xDst1| > xThreshold, 0 elsewhere
+        }
+
         // Multiply matrix times vector into vector.
         internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
@@ -419,6 +513,49 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos
             }
         }
 
+        // dst[i] += scalar
+        internal static unsafe void AddScalarU(float scalar, Span<float> dst)
+        {
+            fixed (float* pdst = dst)
+            {
+                float* pDstEnd = pdst + dst.Length;
+                float* pDstCurrent = pdst;
+
+                Vector256<float> scalarVector256 = Avx.SetAllVector256(scalar); // scalar broadcast to all 8 elements
+
+                while (pDstCurrent + 8 <= pDstEnd) // main loop: 8 floats per iteration
+                {
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
+                    dstVector = Avx.Add(dstVector, scalarVector256);
+                    Avx.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 8;
+                }
+
+                Vector128<float> scalarVector128 = Sse.SetAllVector128(scalar); // scalar broadcast to all 4 elements
+
+                while (pDstCurrent + 4 <= pDstEnd) // fewer than 8 floats are left, so this runs at most once
+                {
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+                    dstVector = Sse.Add(dstVector, scalarVector128);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                }
+
+                while (pDstCurrent < pDstEnd) // tail: the last 0-3 floats, one per iteration
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+                    dstVector = Sse.AddScalar(dstVector, scalarVector128);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                }
+            }
+
+            ZeroUpper(); // defined outside this hunk; presumably clears the upper YMM halves after the AVX code - confirm
+        }
+
         internal static unsafe void ScaleU(float scale, Span<float> dst)
         {
             fixed (float* pdst = dst)
@@ -464,6 +601,101 @@ internal static unsafe void ScaleU(float scale, Span<float> dst)
             ZeroUpper();
         }
 
+        internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst) // dst[i] = scale * src[i]
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pDstEnd = pdst + dst.Length; // the element count comes from dst alone; src.Length is not checked in this method
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+
+                Vector256<float> scaleVector256 = Avx.SetAllVector256(scale); // scale broadcast to all 8 elements
+
+                while (pDstCurrent + 8 <= pDstEnd) // main loop: 8 floats per iteration
+                {
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
+                    srcVector = Avx.Multiply(srcVector, scaleVector256);
+                    Avx.Store(pDstCurrent, srcVector);
+
+                    pSrcCurrent += 8;
+                    pDstCurrent += 8;
+                }
+
+                Vector128<float> scaleVector128 = Sse.SetAllVector128(scale); // scale broadcast to all 4 elements
+
+                while (pDstCurrent + 4 <= pDstEnd) // fewer than 8 floats are left, so this runs at most once
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Multiply(srcVector, scaleVector128);
+                    Sse.Store(pDstCurrent, srcVector);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                while (pDstCurrent < pDstEnd) // tail: the last 0-3 floats, one per iteration
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector128);
+                    Sse.StoreScalar(pDstCurrent, srcVector);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+            }
+
+            ZeroUpper(); // defined outside this hunk; presumably clears the upper YMM halves after the AVX code - confirm
+        }
+
+        // dst[i] = a * (dst[i] + b)
+        internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
+        {
+            fixed (float* pdst = dst)
+            {
+                float* pDstEnd = pdst + dst.Length;
+                float* pDstCurrent = pdst;
+
+                Vector256<float> a256 = Avx.SetAllVector256(a); // a broadcast to all 8 elements
+                Vector256<float> b256 = Avx.SetAllVector256(b); // b broadcast to all 8 elements
+
+                while (pDstCurrent + 8 <= pDstEnd) // main loop: 8 floats per iteration
+                {
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
+                    dstVector = Avx.Add(dstVector, b256); // the addition comes first ...
+                    dstVector = Avx.Multiply(dstVector, a256); // ... then the sum is scaled
+                    Avx.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 8;
+                }
+
+                Vector128<float> a128 = Sse.SetAllVector128(a);
+                Vector128<float> b128 = Sse.SetAllVector128(b);
+
+                while (pDstCurrent + 4 <= pDstEnd) // fewer than 8 floats are left, so this runs at most once
+                {
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+                    dstVector = Sse.Add(dstVector, b128);
+                    dstVector = Sse.Multiply(dstVector, a128);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                }
+
+                while (pDstCurrent < pDstEnd) // tail: the last 0-3 floats, one per iteration
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+                    dstVector = Sse.AddScalar(dstVector, b128);
+                    dstVector = Sse.MultiplyScalar(dstVector, a128);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                }
+            }
+
+            ZeroUpper(); // defined outside this hunk; presumably clears the upper YMM halves after the AVX code - confirm
+        }
+
         internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
@@ -520,6 +752,117 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
             ZeroUpper();
         }
 
+        internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result) // result[i] = dst[i] + scale * src[i]; src and dst are only read
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
+            {
+                float* pResEnd = pres + result.Length; // the element count comes from result alone; src.Length and dst.Length are not checked in this method
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pResCurrent = pres;
+
+                Vector256<float> scaleVector256 = Avx.SetAllVector256(scale); // scale broadcast to all 8 elements
+
+                while (pResCurrent + 8 <= pResEnd) // main loop: 8 floats per iteration
+                {
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
+                    Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
+                    srcVector = Avx.Multiply(srcVector, scaleVector256);
+                    dstVector = Avx.Add(dstVector, srcVector);
+                    Avx.Store(pResCurrent, dstVector); // written to result, not back into dst
+
+                    pSrcCurrent += 8;
+                    pDstCurrent += 8;
+                    pResCurrent += 8;
+                }
+
+                Vector128<float> scaleVector128 = Sse.SetAllVector128(scale); // scale broadcast to all 4 elements
+
+                while (pResCurrent + 4 <= pResEnd) // fewer than 8 floats are left, so this runs at most once
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+                    srcVector = Sse.Multiply(srcVector, scaleVector128);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Sse.Store(pResCurrent, dstVector);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                    pResCurrent += 4;
+                }
+
+                while (pResCurrent < pResEnd) // tail: the last 0-3 floats, one per iteration
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector128);
+                    dstVector = Sse.AddScalar(dstVector, srcVector);
+                    Sse.StoreScalar(pResCurrent, dstVector);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                    pResCurrent++;
+                }
+            }
+
+            ZeroUpper(); // defined outside this hunk; presumably clears the upper YMM halves after the AVX code - confirm
+        }
+
+        internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst) // dst[idx[i]] += scale * src[i] for every i in idx
+        {
+            fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                float* pDstCurrent = pdst; // never advanced: dst is always addressed through the index values
+                int* pEnd = pidx + idx.Length; // the element count comes from idx; index values are not range-checked in this method
+
+                Vector256<float> scaleVector256 = Avx.SetAllVector256(scale); // scale broadcast to all 8 elements
+
+                while (pIdxCurrent + 8 <= pEnd) // main loop: 8 index/value pairs per iteration
+                {
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent); // src is read contiguously
+                    Vector256<float> dstVector = Load8(pDstCurrent, pIdxCurrent); // dst is read at the 8 indexed positions
+
+                    srcVector = Avx.Multiply(srcVector, scaleVector256);
+                    dstVector = Avx.Add(dstVector, srcVector);
+                    Store8(in dstVector, pDstCurrent, pIdxCurrent); // results go back to the same indexed positions
+
+                    pIdxCurrent += 8;
+                    pSrcCurrent += 8;
+                }
+
+                Vector128<float> scaleVector128 = Sse.SetAllVector128(scale); // scale broadcast to all 4 elements
+
+                while (pIdxCurrent + 4 <= pEnd) // fewer than 8 pairs are left, so this runs at most once
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+
+                    srcVector = Sse.Multiply(srcVector, scaleVector128);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
+
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+
+                while (pIdxCurrent < pEnd) // tail: the last 0-3 pairs, handled with plain scalar arithmetic
+                {
+                    pDstCurrent[*pIdxCurrent] += scale * (*pSrcCurrent);
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+
+            ZeroUpper(); // defined outside this hunk; presumably clears the upper YMM halves after the AVX code - confirm
+        }
+
         internal static unsafe void AddU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
@@ -564,34 +907,132 @@ internal static unsafe void AddU(Span<float> src, Span<float> dst)
                     pSrcCurrent++;
                     pDstCurrent++;
                 }
-
-                ZeroUpper();
             }
+
+            ZeroUpper();
         }
 
-        internal static unsafe float SumU(Span<float> src)
+        internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst) // Sparse add; the scalar tail does dst[idx[i]] += src[i].
         {
             fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
             {
-                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                float* pDstCurrent = pdst; // never advanced: dst is only addressed through the indices
+                int* pEnd = pidx + idx.Length; // idx.Length bounds every loop below
 
-                Vector256<float> result256 = Avx.SetZeroVector256<float>();
-
-                while (pSrcCurrent + 8 <= pSrcEnd)
+                while (pIdxCurrent + 8 <= pEnd)
                 {
-                    result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent));
+                    Vector256<float> srcVector = Load8(pDstCurrent, pIdxCurrent); // NOTE: despite the name, this is read from dst via the indices
+                    Vector256<float> dstVector = Avx.LoadVector256(pSrcCurrent); // ...and this holds the next 8 sequential src values
+
+                    srcVector = Avx.Add(srcVector, dstVector);
+                    Store8(in srcVector, pDstCurrent, pIdxCurrent); // written back to dst through the same 8 indices
+
+                    pIdxCurrent += 8;
                     pSrcCurrent += 8;
                 }
 
-                result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                while (pIdxCurrent + 4 <= pEnd) // fewer than 8 indices remain, so this runs at most once
+                {
+                    Vector128<float> srcVector = Load4(pDstCurrent, pIdxCurrent); // same naming swap as above: read from dst
+                    Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent); // next 4 sequential src values
 
-                Vector128<float> result128 = Sse.SetZeroVector128();
+                    srcVector = Sse.Add(srcVector, dstVector);
+                    Store4(in srcVector, pDstCurrent, pIdxCurrent);
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
-                {
-                    result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent));
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+
+                while (pIdxCurrent < pEnd) // scalar tail for the remaining (fewer than 4) indices
+                {
+                    pDstCurrent[*pIdxCurrent] += *pSrcCurrent;
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+
+            ZeroUpper();
+        }
+
+        // Element-wise product: dst[i] = src1[i] * src2[i] for i in [0, dst.Length).
+        //   src1, src2 - inputs, read sequentially; dst.Length elements are read from each.
+        //   dst        - receives the products; its length drives all three loops.
+        // The vector paths use the unaligned load/store intrinsics, so the spans need no
+        // particular alignment.
+        // NOTE(review): src1/src2 lengths are not checked here; callers must supply at least
+        // dst.Length elements in each.
+        internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
+        {
+            // Pin the spans themselves rather than &span[0]: indexing element 0 throws
+            // IndexOutOfRangeException on an empty span, while pinning an empty span is legal.
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
+            fixed (float* pdst = dst)
+            {
+                float* pSrc1Current = psrc1;
+                float* pSrc2Current = psrc2;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+
+                // 8 floats per step.
+                for (; pDstCurrent + 8 <= pEnd; pSrc1Current += 8, pSrc2Current += 8, pDstCurrent += 8)
+                {
+                    Vector256<float> src1Vector = Avx.LoadVector256(pSrc1Current);
+                    Vector256<float> src2Vector = Avx.LoadVector256(pSrc2Current);
+                    src2Vector = Avx.Multiply(src1Vector, src2Vector);
+                    Avx.Store(pDstCurrent, src2Vector);
+                }
+
+                // 4 floats per step; fewer than 8 remain, so this runs at most once.
+                for (; pDstCurrent + 4 <= pEnd; pSrc1Current += 4, pSrc2Current += 4, pDstCurrent += 4)
+                {
+                    Vector128<float> src1Vector = Sse.LoadVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadVector128(pSrc2Current);
+                    src2Vector = Sse.Multiply(src1Vector, src2Vector);
+                    Sse.Store(pDstCurrent, src2Vector);
+                }
+
+                // Scalar tail for the final 0-3 elements.
+                for (; pDstCurrent < pEnd; pSrc1Current++, pSrc2Current++, pDstCurrent++)
+                {
+                    Vector128<float> src1Vector = Sse.LoadScalarVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadScalarVector128(pSrc2Current);
+                    src2Vector = Sse.MultiplyScalar(src1Vector, src2Vector);
+                    Sse.StoreScalar(pDstCurrent, src2Vector);
+                }
+            }
+
+            ZeroUpper();
+        }
+
+        internal static unsafe float SumU(Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+
+                Vector256<float> result256 = Avx.SetZeroVector256<float>();
+
+                while (pSrcCurrent + 8 <= pSrcEnd)
+                {
+                    result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent));
+                    pSrcCurrent += 8;
+                }
+
+                result256 = VectorSum256(in result256);
+                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+
+                Vector128<float> result128 = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
+                {
+                    result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent));
                     pSrcCurrent += 4;
                 }
 
@@ -608,5 +1049,605 @@ internal static unsafe float SumU(Span<float> src)
                 return sum;
             }
         }
+
+        // Returns the sum of src[i] * src[i] over the whole span.
+        // Accumulates 8 lanes at a time, then 4, then one element at a time; the 256-bit and
+        // 128-bit partial results are added together at the end.
+        //   src - values to square and sum; not modified.
+        internal static unsafe float SumSqU(Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> x = Avx.LoadVector256(pCursor);
+                    acc256 = Avx.Add(acc256, Avx.Multiply(x, x));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> x = Sse.LoadVector128(pCursor);
+                    acc128 = Sse.Add(acc128, Sse.Multiply(x, x));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover elements are folded into element 0 of acc128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> x = Sse.LoadScalarVector128(pCursor);
+                    acc128 = Sse.AddScalar(acc128, Sse.MultiplyScalar(x, x));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Returns the sum of (src[i] - mean)^2 over the whole span.
+        // Accumulates 8 lanes at a time, then 4, then one element at a time; the 256-bit and
+        // 128-bit partial results are added together at the end.
+        //   mean - value subtracted from every element; src - input values, not modified.
+        internal static unsafe float SumSqDiffU(float mean, Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+                Vector256<float> mean256 = Avx.SetAllVector256(mean);
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> delta = Avx.LoadVector256(pCursor);
+                    delta = Avx.Subtract(delta, mean256);
+                    acc256 = Avx.Add(acc256, Avx.Multiply(delta, delta));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+                Vector128<float> mean128 = Sse.SetAllVector128(mean);
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> delta = Sse.LoadVector128(pCursor);
+                    delta = Sse.Subtract(delta, mean128);
+                    acc128 = Sse.Add(acc128, Sse.Multiply(delta, delta));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover elements are folded into element 0 of acc128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> delta = Sse.LoadScalarVector128(pCursor);
+                    delta = Sse.SubtractScalar(delta, mean128);
+                    acc128 = Sse.AddScalar(acc128, Sse.MultiplyScalar(delta, delta));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Returns the sum over the whole span of src[i] ANDed with the mask from GetAbsMask256 /
+        // GetAbsMask128 (presumably a sign-bit-clearing mask, i.e. the sum of |src[i]| - confirm).
+        // Accumulates 8 lanes at a time, then 4, then one element at a time.
+        //   src - input values, not modified.
+        internal static unsafe float SumAbsU(Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+                Vector256<float> absMask256 = GetAbsMask256();
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> x = Avx.LoadVector256(pCursor);
+                    acc256 = Avx.Add(acc256, Avx.And(x, absMask256));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+                Vector128<float> absMask128 = GetAbsMask128();
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> x = Sse.LoadVector128(pCursor);
+                    acc128 = Sse.Add(acc128, Sse.And(x, absMask128));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover elements are folded into element 0 of acc128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> x = Sse.LoadScalarVector128(pCursor);
+                    acc128 = Sse.AddScalar(acc128, Sse.And(x, absMask128));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Returns the sum over the whole span of (src[i] - mean) ANDed with the mask from
+        // GetAbsMask256 / GetAbsMask128 (presumably the sum of |src[i] - mean| - confirm).
+        // Accumulates 8 lanes at a time, then 4, then one element at a time.
+        //   mean - value subtracted from every element; src - input values, not modified.
+        internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+                Vector256<float> mean256 = Avx.SetAllVector256(mean);
+                Vector256<float> absMask256 = GetAbsMask256();
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> delta = Avx.LoadVector256(pCursor);
+                    delta = Avx.Subtract(delta, mean256);
+                    acc256 = Avx.Add(acc256, Avx.And(delta, absMask256));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+                Vector128<float> mean128 = Sse.SetAllVector128(mean);
+                Vector128<float> absMask128 = GetAbsMask128();
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> delta = Sse.LoadVector128(pCursor);
+                    delta = Sse.Subtract(delta, mean128);
+                    acc128 = Sse.Add(acc128, Sse.And(delta, absMask128));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover elements are folded into element 0 of acc128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> delta = Sse.LoadScalarVector128(pCursor);
+                    delta = Sse.SubtractScalar(delta, mean128);
+                    acc128 = Sse.AddScalar(acc128, Sse.And(delta, absMask128));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Returns the largest value over the whole span of src[i] ANDed with the mask from
+        // GetAbsMask256 / GetAbsMask128 (presumably max |src[i]| - confirm against those helpers).
+        // The running maxima start at zero; 8 lanes are compared at a time, then 4, then one.
+        //   src - input values, not modified.
+        internal static unsafe float MaxAbsU(Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> best256 = Avx.SetZeroVector256<float>();
+                Vector256<float> absMask256 = GetAbsMask256();
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> x = Avx.LoadVector256(pCursor);
+                    best256 = Avx.Max(best256, Avx.And(x, absMask256));
+                }
+
+                // Reduce the wide maximum; only element 0 of wideBest is consumed at the end.
+                best256 = VectorMax256(in best256);
+                Vector128<float> wideBest = Sse.MaxScalar(GetLow(best256), GetHigh(best256));
+
+                Vector128<float> best128 = Sse.SetZeroVector128();
+                Vector128<float> absMask128 = GetAbsMask128();
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> x = Sse.LoadVector128(pCursor);
+                    best128 = Sse.Max(best128, Sse.And(x, absMask128));
+                }
+
+                best128 = VectorMax128(in best128);
+
+                // Leftover elements are compared against element 0 of best128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> x = Sse.LoadScalarVector128(pCursor);
+                    best128 = Sse.MaxScalar(best128, Sse.And(x, absMask128));
+                }
+
+                float max = Sse.ConvertToSingle(Sse.MaxScalar(best128, wideBest));
+                ZeroUpper();
+                return max;
+            }
+        }
+
+        // Returns the largest value over the whole span of (src[i] - mean) ANDed with the mask from
+        // GetAbsMask256 / GetAbsMask128 (presumably max |src[i] - mean| - confirm against those helpers).
+        // The running maxima start at zero; 8 lanes are compared at a time, then 4, then one.
+        //   mean - value subtracted from every element; src - input values, not modified.
+        internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pCursor = psrc;
+                float* pLimit = psrc + src.Length;
+
+                Vector256<float> best256 = Avx.SetZeroVector256<float>();
+                Vector256<float> mean256 = Avx.SetAllVector256(mean);
+                Vector256<float> absMask256 = GetAbsMask256();
+
+                for (; pCursor + 8 <= pLimit; pCursor += 8)
+                {
+                    Vector256<float> delta = Avx.LoadVector256(pCursor);
+                    delta = Avx.Subtract(delta, mean256);
+                    best256 = Avx.Max(best256, Avx.And(delta, absMask256));
+                }
+
+                // Reduce the wide maximum; only element 0 of wideBest is consumed at the end.
+                best256 = VectorMax256(in best256);
+                Vector128<float> wideBest = Sse.MaxScalar(GetLow(best256), GetHigh(best256));
+
+                Vector128<float> best128 = Sse.SetZeroVector128();
+                Vector128<float> mean128 = Sse.SetAllVector128(mean);
+                Vector128<float> absMask128 = GetAbsMask128();
+
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> delta = Sse.LoadVector128(pCursor);
+                    delta = Sse.Subtract(delta, mean128);
+                    best128 = Sse.Max(best128, Sse.And(delta, absMask128));
+                }
+
+                best128 = VectorMax128(in best128);
+
+                // Leftover elements are compared against element 0 of best128.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> delta = Sse.LoadScalarVector128(pCursor);
+                    delta = Sse.SubtractScalar(delta, mean128);
+                    best128 = Sse.MaxScalar(best128, Sse.And(delta, absMask128));
+                }
+
+                float max = Sse.ConvertToSingle(Sse.MaxScalar(best128, wideBest));
+                ZeroUpper();
+                return max;
+            }
+        }
+
+        // Returns the dot product of src and dst: the sum of src[i] * dst[i] for i in [0, src.Length).
+        // Accumulates 8 lanes at a time, then 4, then one element at a time; the 256-bit and
+        // 128-bit partial results are added together at the end.
+        //   src - first operand; its length decides how many elements are processed.
+        //   dst - second operand, read only despite the name.
+        // NOTE(review): dst.Length is not checked here; callers must supply at least src.Length
+        // elements in dst.
+        internal static unsafe float DotU(Span<float> src, Span<float> dst)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pLeft = psrc;
+                float* pRight = pdst;
+                float* pLeftEnd = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+
+                for (; pLeft + 8 <= pLeftEnd; pLeft += 8, pRight += 8)
+                {
+                    Vector256<float> a = Avx.LoadVector256(pLeft);
+                    Vector256<float> b = Avx.LoadVector256(pRight);
+
+                    acc256 = Avx.Add(acc256, Avx.Multiply(a, b));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+
+                for (; pLeft + 4 <= pLeftEnd; pLeft += 4, pRight += 4)
+                {
+                    Vector128<float> a = Sse.LoadVector128(pLeft);
+                    Vector128<float> b = Sse.LoadVector128(pRight);
+
+                    acc128 = Sse.Add(acc128, Sse.Multiply(a, b));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover products are folded into element 0 of acc128.
+                for (; pLeft < pLeftEnd; pLeft++, pRight++)
+                {
+                    Vector128<float> a = Sse.LoadScalarVector128(pLeft);
+                    Vector128<float> b = Sse.LoadScalarVector128(pRight);
+
+                    acc128 = Sse.AddScalar(acc128, Sse.MultiplyScalar(a, b));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Sparse dot product driven by an index list.
+        // src is only ever read through Load8/Load4/Load1 with the current index pointer, while dst
+        // is read sequentially, one element per index.
+        // NOTE(review): presumably this computes the sum of src[idx[i]] * dst[i]; confirm against
+        // the Load* helpers.
+        //   src - values addressed through idx.
+        //   dst - sequential operand; at least idx.Length elements are read.
+        //   idx - index list; its length decides how many products are accumulated.
+        internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                float* pDense = pdst;
+                int* pIndex = pidx;
+                int* pIndexEnd = pidx + idx.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+
+                for (; pIndex + 8 <= pIndexEnd; pIndex += 8, pDense += 8)
+                {
+                    Vector256<float> picked = Load8(psrc, pIndex);
+                    Vector256<float> dense = Avx.LoadVector256(pDense);
+
+                    acc256 = Avx.Add(acc256, Avx.Multiply(picked, dense));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed at the end.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+
+                for (; pIndex + 4 <= pIndexEnd; pIndex += 4, pDense += 4)
+                {
+                    Vector128<float> picked = Load4(psrc, pIndex);
+                    Vector128<float> dense = Sse.LoadVector128(pDense);
+
+                    acc128 = Sse.Add(acc128, Sse.Multiply(picked, dense));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Leftover products are folded into element 0 of acc128.
+                for (; pIndex < pIndexEnd; pIndex++, pDense++)
+                {
+                    Vector128<float> picked = Load1(psrc, pIndex);
+                    Vector128<float> dense = Sse.LoadScalarVector128(pDense);
+
+                    acc128 = Sse.AddScalar(acc128, Sse.MultiplyScalar(picked, dense));
+                }
+
+                float total = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                ZeroUpper();
+                return total;
+            }
+        }
+
+        // Returns the squared Euclidean distance between src and dst: the sum of
+        // (src[i] - dst[i])^2 for i in [0, src.Length).
+        //   src - first vector; its length decides how many elements are processed.
+        //   dst - second vector, read only despite the name.
+        // NOTE(review): dst.Length is not checked here; it must be at least src.Length.
+        internal static unsafe float Dist2(Span<float> src, Span<float> dst)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pLeft = psrc;
+                float* pRight = pdst;
+                float* pLeftEnd = psrc + src.Length;
+
+                Vector256<float> acc256 = Avx.SetZeroVector256<float>();
+
+                // 8 squared differences per step.
+                for (; pLeft + 8 <= pLeftEnd; pLeft += 8, pRight += 8)
+                {
+                    Vector256<float> a = Avx.LoadVector256(pLeft);
+                    Vector256<float> b = Avx.LoadVector256(pRight);
+                    Vector256<float> delta = Avx.Subtract(a, b);
+                    acc256 = Avx.Add(acc256, Avx.Multiply(delta, delta));
+                }
+
+                // Reduce the wide accumulator; only element 0 of wideTotal is consumed below.
+                acc256 = VectorSum256(in acc256);
+                Vector128<float> wideTotal = Sse.AddScalar(GetLow(acc256), GetHigh(acc256));
+
+                Vector128<float> acc128 = Sse.SetZeroVector128();
+
+                // 4 squared differences per step; fewer than 8 remain, so this runs at most once.
+                for (; pLeft + 4 <= pLeftEnd; pLeft += 4, pRight += 4)
+                {
+                    Vector128<float> a = Sse.LoadVector128(pLeft);
+                    Vector128<float> b = Sse.LoadVector128(pRight);
+                    Vector128<float> delta = Sse.Subtract(a, b);
+                    acc128 = Sse.Add(acc128, Sse.Multiply(delta, delta));
+                }
+
+                acc128 = VectorSum128(in acc128);
+
+                // Combine both vector partials first, then finish the tail in plain float arithmetic.
+                float result = Sse.ConvertToSingle(Sse.AddScalar(acc128, wideTotal));
+                for (; pLeft < pLeftEnd; pLeft++, pRight++)
+                {
+                    float delta = (*pLeft) - (*pRight);
+                    result += delta * delta;
+                }
+
+                ZeroUpper();
+                return result;
+            }
+        }
+
+        // SDCA L1 update, dense form. For each i in [0, src.Length):
+        //   v[i] += src[i] * primalUpdate
+        //   w[i]  = 0 when |v[i]| <= threshold; otherwise v[i] - threshold if v[i] > 0, else v[i] + threshold
+        // (as spelled out in the scalar tail loop; the wide loops go through GetNewDst256/GetNewDst128).
+        // NOTE(review): v and w lengths are not checked; both must hold at least src.Length elements.
+        internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst1 = v)
+            fixed (float* pdst2 = w)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+                float* pDst1Current = pdst1;
+                float* pDst2Current = pdst2;
+
+                Vector256<float> xPrimal256 = Avx.SetAllVector256(primalUpdate);
+
+                Vector256<float> signMask256 = Avx.SetAllVector256(-0.0f); // 0x8000 0000
+                Vector256<float> xThreshold256 = Avx.SetAllVector256(threshold);
+
+                // 8 elements per step.
+                for (; pSrcCurrent + 8 <= pSrcEnd; pSrcCurrent += 8, pDst1Current += 8, pDst2Current += 8)
+                {
+                    Vector256<float> xSrc = Avx.LoadVector256(pSrcCurrent);
+
+                    Vector256<float> xDst1 = Avx.LoadVector256(pDst1Current);
+                    xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256));
+                    Vector256<float> xDst2 = GetNewDst256(xDst1, signMask256, xThreshold256);
+
+                    Avx.Store(pDst1Current, xDst1);
+                    Avx.Store(pDst2Current, xDst2);
+                }
+
+                Vector128<float> xPrimal128 = Sse.SetAllVector128(primalUpdate);
+
+                Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
+                Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
+
+                // 4 elements per step; fewer than 8 remain, so this runs at most once.
+                for (; pSrcCurrent + 4 <= pSrcEnd; pSrcCurrent += 4, pDst1Current += 4, pDst2Current += 4)
+                {
+                    Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
+
+                    Vector128<float> xDst1 = Sse.LoadVector128(pDst1Current);
+                    xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128);
+
+                    Sse.Store(pDst1Current, xDst1);
+                    Sse.Store(pDst2Current, xDst2);
+                }
+
+                // Scalar tail for the final 0-3 elements.
+                for (; pSrcCurrent < pSrcEnd; pSrcCurrent++, pDst1Current++, pDst2Current++)
+                {
+                    *pDst1Current += (*pSrcCurrent) * primalUpdate;
+                    float dst1 = *pDst1Current;
+                    *pDst2Current = Math.Abs(dst1) > threshold ? (dst1 > 0 ? dst1 - threshold : dst1 + threshold) : 0;
+                }
+            }
+
+            // This routine uses 256-bit registers, so finish with ZeroUpper() the same way the
+            // other AVX routines in this class do (e.g. AddScaleSU, DotU).
+            ZeroUpper();
+        }
+
+        // SDCA L1 update, sparse form. For each position index = indices[i]:
+        //   v[index] += src[i] * primalUpdate
+        //   w[index]  = 0 when |v[index]| <= threshold; otherwise v[index] -/+ threshold by its sign
+        // (as spelled out in the scalar tail loop; the wide loops go through GetNewDst256/GetNewDst128).
+        // NOTE(review): indices are not range-checked against v or w, and src must hold indices.Length values.
+        internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* psrc = src)
+            fixed (int* pidx = indices)
+            fixed (float* pdst1 = v)
+            fixed (float* pdst2 = w)
+            {
+                int* pIdxEnd = pidx + indices.Length;
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+
+                Vector256<float> xPrimal256 = Avx.SetAllVector256(primalUpdate);
+
+                Vector256<float> signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000
+                Vector256<float> xThreshold = Avx.SetAllVector256(threshold);
+
+                // v and w are never advanced: every access goes through the current indices.
+                for (; pIdxCurrent + 8 <= pIdxEnd; pIdxCurrent += 8, pSrcCurrent += 8)
+                {
+                    Vector256<float> xSrc = Avx.LoadVector256(pSrcCurrent);
+
+                    Vector256<float> xDst1 = Load8(pdst1, pIdxCurrent);
+                    xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256));
+                    Vector256<float> xDst2 = GetNewDst256(xDst1, signMask, xThreshold);
+
+                    Store8(in xDst1, pdst1, pIdxCurrent);
+                    Store8(in xDst2, pdst2, pIdxCurrent);
+                }
+
+                Vector128<float> xPrimal128 = Sse.SetAllVector128(primalUpdate);
+
+                Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
+                Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
+
+                for (; pIdxCurrent + 4 <= pIdxEnd; pIdxCurrent += 4, pSrcCurrent += 4)
+                {
+                    Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
+
+                    Vector128<float> xDst1 = Load4(pdst1, pIdxCurrent);
+                    xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128);
+
+                    Store4(in xDst1, pdst1, pIdxCurrent);
+                    Store4(in xDst2, pdst2, pIdxCurrent);
+                }
+
+                for (; pIdxCurrent < pIdxEnd; pIdxCurrent++, pSrcCurrent++)
+                {
+                    int index = *pIdxCurrent;
+                    pdst1[index] += (*pSrcCurrent) * primalUpdate;
+                    float dst1 = pdst1[index];
+                    pdst2[index] = Math.Abs(dst1) > threshold ? (dst1 > 0 ? dst1 - threshold : dst1 + threshold) : 0;
+                }
+            }
+
+            // Uses 256-bit registers, so finish with ZeroUpper() like the other AVX routines here.
+            ZeroUpper();
+        }
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index c192052ca6..1e944aea55 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -196,7 +196,11 @@ public static void Add(float a, float[] dst, int count)
 
         private static void Add(float a, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScalarU(a, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScalarU(a, dst);
             }
@@ -261,7 +265,11 @@ public static void Scale(float a, float[] src, float[] dst, int count)
 
         private static void Scale(float a, Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleSrcU(a, src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleSrcU(a, src, dst);
             }
@@ -286,7 +294,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count)
 
         private static void ScaleAdd(float a, float b, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleAddU(a, b, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleAddU(a, b, dst);
             }
@@ -373,7 +385,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
 
         private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleSU(a, src, indices, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleSU(a, src, indices, dst);
             }
@@ -402,7 +418,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res,
 
         private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleCopyU(a, src, dst, res);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
@@ -476,7 +496,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i
 
         private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddSU(src, indices, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddSU(src, indices, dst);
             }
@@ -505,7 +529,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c
 
         private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.MulElementWiseU(src1, src2, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.MulElementWiseU(src1, src2, dst);
             }
@@ -579,7 +607,11 @@ public static float SumSq(float[] src, int offset, int count)
 
         private static float SumSq(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumSqU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumSqU(src);
             }
@@ -606,7 +638,11 @@ public static float SumSq(float mean, float[] src, int offset, int count)
 
         private static float SumSq(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
             }
@@ -642,7 +678,11 @@ public static float SumAbs(float[] src, int offset, int count)
 
         private static float SumAbs(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumAbsU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumAbsU(src);
             }
@@ -669,7 +709,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count)
 
         private static float SumAbs(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
             }
@@ -705,7 +749,11 @@ public static float MaxAbs(float[] src, int offset, int count)
 
         private static float MaxAbs(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.MaxAbsU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsU(src);
             }
@@ -735,7 +783,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count)
 
         private static float MaxAbsDiff(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.MaxAbsDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsDiffU(mean, src);
             }
@@ -779,7 +831,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count)
 
         private static float DotProductDense(Span<float> a, Span<float> b)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.DotU(a, b);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotU(a, b);
             }
@@ -826,7 +882,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind
 
         private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.DotSU(a, b, indices);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotSU(a, b, indices);
             }
@@ -855,7 +915,11 @@ public static float L2DistSquared(float[] a, float[] b, int count)
 
         private static float L2DistSquared(Span<float> a, Span<float> b)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.Dist2(a, b);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.Dist2(a, b);
             }
@@ -951,7 +1015,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src
 
         private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
@@ -985,7 +1053,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr
 
         private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 44157364a8..02be0f7aaf 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -417,18 +417,18 @@ internal static unsafe void AddScalarU(float scalar, Span<float> dst)
 
                 while (pDstCurrent + 4 <= pDstEnd)
                 {
-                    Vector128<float> x2 = Sse.LoadVector128(pDstCurrent);
-                    x2 = Sse.Add(x2, scalarVector);
-                    Sse.Store(pDstCurrent, x2);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+                    dstVector = Sse.Add(dstVector, scalarVector);
+                    Sse.Store(pDstCurrent, dstVector);
 
                     pDstCurrent += 4;
                 }
 
                 while (pDstCurrent < pDstEnd)
                 {
-                    Vector128<float> x2 = Sse.LoadScalarVector128(pDstCurrent);
-                    x2 = Sse.AddScalar(x2, scalarVector);
-                    Sse.StoreScalar(pDstCurrent, x2);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+                    dstVector = Sse.AddScalar(dstVector, scalarVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
 
                     pDstCurrent++;
                 }
@@ -437,13 +437,13 @@ internal static unsafe void AddScalarU(float scalar, Span<float> dst)
 
         internal static unsafe void ScaleU(float scale, Span<float> dst)
         {
-            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
-
             fixed (float* pdst = dst)
             {
                 float* pDstCurrent = pdst;
                 float* pEnd = pdst + dst.Length;
 
+                Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
                 while (pDstCurrent + 4 <= pEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -468,8 +468,6 @@ internal static unsafe void ScaleU(float scale, Span<float> dst)
 
         internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
         {
-            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
@@ -477,6 +475,8 @@ internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float>
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
 
+                Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
                 while (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
@@ -502,19 +502,19 @@ internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float>
         // dst[i] = a * (dst[i] + b)
         internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
         {
-            Vector128<float> x1 = Sse.SetAllVector128(a);
-            Vector128<float> x2 = Sse.SetAllVector128(b);
-
             fixed (float* pdst = dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
 
+                Vector128<float> aVector = Sse.SetAllVector128(a);
+                Vector128<float> bVector = Sse.SetAllVector128(b);
+
                 while (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
-                    dstVector = Sse.Add(dstVector, x2);
-                    dstVector = Sse.Multiply(dstVector, x1);
+                    dstVector = Sse.Add(dstVector, bVector);
+                    dstVector = Sse.Multiply(dstVector, aVector);
                     Sse.Store(pDstCurrent, dstVector);
 
                     pDstCurrent += 4;
@@ -523,8 +523,8 @@ internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
                 while (pDstCurrent < pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
-                    dstVector = Sse.AddScalar(dstVector, x2);
-                    dstVector = Sse.MultiplyScalar(dstVector, x1);
+                    dstVector = Sse.AddScalar(dstVector, bVector);
+                    dstVector = Sse.MultiplyScalar(dstVector, aVector);
                     Sse.StoreScalar(pDstCurrent, dstVector);
 
                     pDstCurrent++;
@@ -534,8 +534,6 @@ internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
 
         internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
-            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
@@ -543,6 +541,8 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
                 float* pDstCurrent = pdst;
                 float* pEnd = pdst + dst.Length;
 
+                Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
                 while (pDstCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
@@ -582,15 +582,15 @@ internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<flo
                 float* pDstCurrent = pdst;
                 float* pResCurrent = pres;
 
-                Vector128<float> x1 = Sse.SetAllVector128(scale);
+                Vector128<float> scaleVector = Sse.SetAllVector128(scale);
 
                 while (pResCurrent + 4 <= pResEnd)
                 {
-                    Vector128<float> x2 = Sse.LoadVector128(pSrcCurrent);
-                    Vector128<float> x3 = Sse.LoadVector128(pDstCurrent);
-                    x2 = Sse.Multiply(x2, x1);
-                    x3 = Sse.Add(x3, x2);
-                    Sse.Store(pResCurrent, x3);
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Sse.Store(pResCurrent, dstVector);
 
                     pSrcCurrent += 4;
                     pDstCurrent += 4;
@@ -599,11 +599,11 @@ internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<flo
 
                 while (pResCurrent < pResEnd)
                 {
-                    Vector128<float> x2 = Sse.LoadScalarVector128(pSrcCurrent);
-                    Vector128<float> x3 = Sse.LoadScalarVector128(pDstCurrent);
-                    x2 = Sse.MultiplyScalar(x2, x1);
-                    x3 = Sse.AddScalar(x3, x2);
-                    Sse.StoreScalar(pResCurrent, x3);
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector);
+                    dstVector = Sse.AddScalar(dstVector, srcVector);
+                    Sse.StoreScalar(pResCurrent, dstVector);
 
                     pSrcCurrent++;
                     pDstCurrent++;
@@ -614,8 +614,6 @@ internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<flo
 
         internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
         {
-            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
-
             fixed (float* psrc = src)
             fixed (int* pidx = idx)
             fixed (float* pdst = dst)
@@ -625,6 +623,8 @@ internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> i
                 float* pDstCurrent = pdst;
                 int* pEnd = pidx + idx.Length;
 
+                Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
                 while (pIdxCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
@@ -782,14 +782,14 @@ internal static unsafe float SumU(Span<float> src)
 
         internal static unsafe float SumSqU(Span<float> src)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-
             fixed (float* psrc = src)
             {
+                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
-                float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result = Sse.Add(result, Sse.Multiply(srcVector, srcVector));
@@ -799,16 +799,16 @@ internal static unsafe float SumSqU(Span<float> src)
 
                 result = VectorSum(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector));
 
                     pSrcCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float SumSqDiffU(float mean, Span<float> src)
@@ -823,9 +823,9 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
-                    Vector128<float> x = Sse.LoadVector128(pSrcCurrent);
-                    x = Sse.Subtract(x, meanVector);
-                    result = Sse.Add(result, Sse.Multiply(x, x));
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Subtract(srcVector, meanVector);
+                    result = Sse.Add(result, Sse.Multiply(srcVector, srcVector));
 
                     pSrcCurrent += 4;
                 }
@@ -834,9 +834,9 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
 
                 while (pSrcCurrent < pSrcEnd)
                 {
-                    Vector128<float> x = Sse.LoadScalarVector128(pSrcCurrent);
-                    x = Sse.SubtractScalar(x, meanVector);
-                    result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x));
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.SubtractScalar(srcVector, meanVector);
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector));
 
                     pSrcCurrent++;
                 }
@@ -847,15 +847,15 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
 
         internal static unsafe float SumAbsU(Span<float> src)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> mask = GetAbsMask();
-
             fixed (float* psrc = src)
             {
+                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
-                float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+                Vector128<float> mask = GetAbsMask();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result = Sse.Add(result, Sse.And(srcVector, mask));
@@ -865,30 +865,30 @@ internal static unsafe float SumAbsU(Span<float> src)
 
                 result = VectorSum(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     result = Sse.AddScalar(result, Sse.And(srcVector, mask));
 
                     pSrcCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> meanVector = Sse.SetAllVector128(mean);
-            Vector128<float> mask = GetAbsMask();
-
             fixed (float* psrc = src)
             {
+                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
-                float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+                Vector128<float> meanVector = Sse.SetAllVector128(mean);
+                Vector128<float> mask = GetAbsMask();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector);
@@ -899,7 +899,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
 
                 result = VectorSum(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector);
@@ -907,22 +907,22 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
 
                     pSrcCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float MaxAbsU(Span<float> src)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> mask = GetAbsMask();
-
             fixed (float* psrc = src)
             {
+                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
-                float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+                Vector128<float> mask = GetAbsMask();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result = Sse.Max(result, Sse.And(srcVector, mask));
@@ -932,30 +932,30 @@ internal static unsafe float MaxAbsU(Span<float> src)
 
                 result = VectorMax(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     result = Sse.MaxScalar(result, Sse.And(srcVector, mask));
 
                     pSrcCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> meanVector = Sse.SetAllVector128(mean);
-            Vector128<float> mask = GetAbsMask();
-
             fixed (float* psrc = src)
             {
+                float* pSrcEnd = psrc + src.Length;
                 float* pSrcCurrent = psrc;
-                float* pEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+                Vector128<float> meanVector = Sse.SetAllVector128(mean);
+                Vector128<float> mask = GetAbsMask();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector);
@@ -966,7 +966,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
 
                 result = VectorMax(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector);
@@ -974,23 +974,23 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
 
                     pSrcCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float DotU(Span<float> src, Span<float> dst)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
-                float* pEnd = psrc + src.Length;
+                float* pSrcEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -1003,7 +1003,7 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
 
                 result = VectorSum(in result);
 
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
@@ -1013,15 +1013,13 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
                     pSrcCurrent++;
                     pDstCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
         {
-            Vector128<float> result = Sse.SetZeroVector128();
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             fixed (int* pidx = idx)
@@ -1029,9 +1027,11 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
                 int* pIdxCurrent = pidx;
-                int* pEnd = pidx + idx.Length;
+                int* pIdxEnd = pidx + idx.Length;
 
-                while (pIdxCurrent + 4 <= pEnd)
+                Vector128<float> result = Sse.SetZeroVector128();
+
+                while (pIdxCurrent + 4 <= pIdxEnd)
                 {
                     Vector128<float> srcVector = Load4(pSrcCurrent, pIdxCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -1044,7 +1044,7 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
 
                 result = VectorSum(in result);
 
-                while (pIdxCurrent < pEnd)
+                while (pIdxCurrent < pIdxEnd)
                 {
                     Vector128<float> srcVector = Load1(pSrcCurrent, pIdxCurrent);
                     Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
@@ -1054,23 +1054,23 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
                     pIdxCurrent++;
                     pDstCurrent++;
                 }
-            }
 
-            return Sse.ConvertToSingle(result);
+                return Sse.ConvertToSingle(result);
+            }
         }
 
         internal static unsafe float Dist2(Span<float> src, Span<float> dst)
         {
-            Vector128<float> sqDistanceVector = Sse.SetZeroVector128();
-
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
-                float* pEnd = psrc + src.Length;
+                float* pSrcEnd = psrc + src.Length;
 
-                while (pSrcCurrent + 4 <= pEnd)
+                Vector128<float> sqDistanceVector = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent),
                                                                     Sse.LoadVector128(pDstCurrent));
@@ -1084,7 +1084,7 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
                 sqDistanceVector = VectorSum(in sqDistanceVector);
 
                 float norm = Sse.ConvertToSingle(sqDistanceVector);
-                while (pSrcCurrent < pEnd)
+                while (pSrcCurrent < pSrcEnd)
                 {
                     float distance = (*pSrcCurrent) - (*pDstCurrent);
                     norm += distance * distance;

From 41d65f51ec031f57efc227ceef2519fbf351d01d Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 16 Aug 2018 23:35:25 -0700
Subject: [PATCH 06/29] Implemented perf tests for AVX via CpuMathUtils class

---
 .../AvxPerformanceTests.cs                    | 55 +++++++++++++++++++
 .../SsePerformanceTests.cs                    |  2 +-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
index fdb7140738..7625ce987d 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -95,16 +95,71 @@ public void GlobalCleanup()
             original.CopyTo(result, 0);
         }
 
+        [Benchmark]
+        public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN);
+
         [Benchmark]
         public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
 
+        [Benchmark]
+        public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN);
+
+        [Benchmark]
+        public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN);
+
         [Benchmark]
         public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
 
+        [Benchmark]
+        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+
+        [Benchmark]
+        public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN);
+
         [Benchmark]
         public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
 
+        [Benchmark]
+        public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
+
+
+        [Benchmark]
+        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+
         [Benchmark]
         public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
+
+        [Benchmark]
+        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+
+        [Benchmark]
+        public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN);
+
+        [Benchmark]
+        public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN);
+
+        [Benchmark]
+        public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
+
+        [Benchmark]
+        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+
+        [Benchmark]
+        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+
+        [Benchmark]
+        public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
+
+        [Benchmark]
+        public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result);
+
+        [Benchmark]
+        public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index ff1f451550..8893a2f877 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -9,7 +9,7 @@
 
 namespace Microsoft.ML.CpuMath.PerformanceTests
 {
-    public class SsePerformanceTests
+    public class AvxVSSseNativePerformanceTests
     {
         private const int EXP_MAX = 127;
         private const int EXP_MIN = 0;

From 8c34e8783c03f01cbb0d68906757b390281cb3a2 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 10:57:37 -0700
Subject: [PATCH 07/29] Implemented switching logic for Vector128/256Alignment
 between SSE and AVX support

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 60 +++++++++----------
 .../CpuAligenedMathUtils.cs                   |  4 +-
 .../CpuMathUtils.netcoreapp.cs                |  9 +++
 .../CpuMathUtils.netstandard.cs               |  8 +++
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     | 54 ++++++++---------
 src/Microsoft.ML.Transforms/RffTransform.cs   | 18 +++---
 6 files changed, 85 insertions(+), 68 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 4c006ea868..21ef1a16f7 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -15,22 +15,22 @@
 
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
-    internal static class AvxIntrinsics
+    public static class AvxIntrinsics
     {
-        private const int CbAlign = 32;
+        private const int Vector256Alignment = 32;
 
         private static bool Compat(AlignedArray a)
         {
             Contracts.AssertValue(a);
             Contracts.Assert(a.Size > 0);
-            return a.CbAlign == CbAlign;
+            return a.CbAlign == Vector256Alignment;
         }
 
         private static unsafe float* Ptr(AlignedArray a, float* p)
         {
             Contracts.AssertValue(a);
             float* q = p + a.GetBase((long)p);
-            Contracts.Assert(((long)q & (CbAlign - 1)) == 0);
+            Contracts.Assert(((long)q & (Vector256Alignment - 1)) == 0);
             return q;
         }
 
@@ -201,7 +201,7 @@ private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vecto
         }
 
         // Multiply matrix times vector into vector.
-        internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
             Contracts.Assert(Compat(src));
@@ -269,7 +269,7 @@ internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src
         }
 
         // Partial sparse source vector.
-        internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+        public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
@@ -330,7 +330,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc,
             }
         }
 
-        internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
             Contracts.Assert(Compat(src));
@@ -445,7 +445,7 @@ internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray
         }
 
         // Partial sparse source vector.
-        internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+        public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
         {
             Contracts.Assert(Compat(mat));
@@ -514,7 +514,7 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos
         }
 
         // dst[i] += scale
-        internal static unsafe void AddScalarU(float scalar, Span<float> dst)
+        public static unsafe void AddScalarU(float scalar, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -556,7 +556,7 @@ internal static unsafe void AddScalarU(float scalar, Span<float> dst)
             ZeroUpper();
         }
 
-        internal static unsafe void ScaleU(float scale, Span<float> dst)
+        public static unsafe void ScaleU(float scale, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -601,7 +601,7 @@ internal static unsafe void ScaleU(float scale, Span<float> dst)
             ZeroUpper();
         }
 
-        internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
+        public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -649,7 +649,7 @@ internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float>
         }
 
         // dst[i] = a * (dst[i] + b)
-        internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
+        public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -696,7 +696,7 @@ internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             ZeroUpper();
         }
 
-        internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
+        public static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -752,7 +752,7 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
             ZeroUpper();
         }
 
-        internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
+        public static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -810,7 +810,7 @@ internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<flo
             ZeroUpper();
         }
 
-        internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
+        public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = idx)
@@ -863,7 +863,7 @@ internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> i
             ZeroUpper();
         }
 
-        internal static unsafe void AddU(Span<float> src, Span<float> dst)
+        public static unsafe void AddU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -912,7 +912,7 @@ internal static unsafe void AddU(Span<float> src, Span<float> dst)
             ZeroUpper();
         }
 
-        internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
+        public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = idx)
@@ -959,7 +959,7 @@ internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> ds
             ZeroUpper();
         }
 
-        internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
+        public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
         {
             fixed (float* psrc1 = &src1[0])
             fixed (float* psrc2 = &src2[0])
@@ -1010,7 +1010,7 @@ internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2,
             ZeroUpper();
         }
 
-        internal static unsafe float SumU(Span<float> src)
+        public static unsafe float SumU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1050,7 +1050,7 @@ internal static unsafe float SumU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumSqU(Span<float> src)
+        public static unsafe float SumSqU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1096,7 +1096,7 @@ internal static unsafe float SumSqU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumSqDiffU(float mean, Span<float> src)
+        public static unsafe float SumSqDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1147,7 +1147,7 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float SumAbsU(Span<float> src)
+        public static unsafe float SumAbsU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1195,7 +1195,7 @@ internal static unsafe float SumAbsU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
+        public static unsafe float SumAbsDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1248,7 +1248,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float MaxAbsU(Span<float> src)
+        public static unsafe float MaxAbsU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1296,7 +1296,7 @@ internal static unsafe float MaxAbsU(Span<float> src)
             }
         }
 
-        internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
+        public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -1349,7 +1349,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float DotU(Span<float> src, Span<float> dst)
+        public static unsafe float DotU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1406,7 +1406,7 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
+        public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1465,7 +1465,7 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
             }
         }
 
-        internal static unsafe float Dist2(Span<float> src, Span<float> dst)
+        public static unsafe float Dist2(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1520,7 +1520,7 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
             fixed (float* psrc = src)
             fixed (float* pdst1 = v)
@@ -1586,7 +1586,7 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, f
             }
         }
 
-        internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = indices)
diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
index 9c7fa5ae1f..d217ccf6f9 100644
--- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
+++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
@@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values)
 #if DEBUG
             var mat = values as TMatrix;
             Contracts.AssertValue(mat);
-            Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.Vector128Alignment);
+            Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.GetVectorAlignment());
 #endif
         }
 
@@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values)
 #if DEBUG
             CpuAlignedVector vec = values as CpuAlignedVector;
             Contracts.AssertValue(vec);
-            Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.Vector128Alignment);
+            Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.GetVectorAlignment());
 #endif
         }
 
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 1e944aea55..807f7239d7 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -12,6 +12,15 @@ public static partial class CpuMathUtils
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         public const int Vector128Alignment = 16;
 
+        // The count of bytes in Vector256<T>, corresponding to _cbAlign in AlignedArray
+        public const int Vector256Alignment = 32;
+
+        public static int GetVectorAlignment()
+        {
+            // Assumes SSE support on machines that run ML.NET.
+            return Avx.IsSupported ? Vector256Alignment : Vector128Alignment;
+        }
+
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
         {
             Contracts.Assert(mat.Size == dst.Size * src.Size);
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index 6f480b0f25..db620dbbb6 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -6,9 +6,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE.  Is it okay?
+
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         public const int Vector128Alignment = 16;
 
+        public static int GetVectorAlignment()
+        {
+            // Assumes SSE support on machines that run ML.NET.
+            return Vector128Alignment;
+        }
+
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun);
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 02be0f7aaf..a555b8ba8c 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -20,7 +20,7 @@
 
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
-    internal static class SseIntrinsics
+    public static class SseIntrinsics
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
@@ -115,7 +115,7 @@ private static Vector128<float> GetNewDst(in Vector128<float> xDst1, in Vector12
         }
 
         // Multiply matrix times vector into vector.
-        internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
             Contracts.Assert(Compat(src));
@@ -180,7 +180,7 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src
         }
 
         // Partial sparse source vector.
-        internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+        public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
@@ -237,7 +237,7 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc,
             }
         }
 
-        internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
             Contracts.Assert(Compat(mat));
             Contracts.Assert(Compat(src));
@@ -339,7 +339,7 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray
         }
 
         // Partial sparse source vector.
-        internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+        public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
         {
             Contracts.Assert(Compat(mat));
@@ -406,7 +406,7 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos
         }
 
         // dst[i] += scale
-        internal static unsafe void AddScalarU(float scalar, Span<float> dst)
+        public static unsafe void AddScalarU(float scalar, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -435,7 +435,7 @@ internal static unsafe void AddScalarU(float scalar, Span<float> dst)
             }
         }
 
-        internal static unsafe void ScaleU(float scale, Span<float> dst)
+        public static unsafe void ScaleU(float scale, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -466,7 +466,7 @@ internal static unsafe void ScaleU(float scale, Span<float> dst)
             }
         }
 
-        internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
+        public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -500,7 +500,7 @@ internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float>
         }
 
         // dst[i] = a * (dst[i] + b)
-        internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
+        public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
         {
             fixed (float* pdst = dst)
             {
@@ -532,7 +532,7 @@ internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             }
         }
 
-        internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
+        public static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -571,7 +571,7 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
             }
         }
 
-        internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
+        public static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -612,7 +612,7 @@ internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<flo
             }
         }
 
-        internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
+        public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = idx)
@@ -648,7 +648,7 @@ internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> i
             }
         }
 
-        internal static unsafe void AddU(Span<float> src, Span<float> dst)
+        public static unsafe void AddU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -683,7 +683,7 @@ internal static unsafe void AddU(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
+        public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = idx)
@@ -716,7 +716,7 @@ internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> ds
             }
         }
 
-        internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
+        public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
         {
             fixed (float* psrc1 = &src1[0])
             fixed (float* psrc2 = &src2[0])
@@ -753,7 +753,7 @@ internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2,
             }
         }
 
-        internal static unsafe float SumU(Span<float> src)
+        public static unsafe float SumU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -780,7 +780,7 @@ internal static unsafe float SumU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumSqU(Span<float> src)
+        public static unsafe float SumSqU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -811,7 +811,7 @@ internal static unsafe float SumSqU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumSqDiffU(float mean, Span<float> src)
+        public static unsafe float SumSqDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -845,7 +845,7 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float SumAbsU(Span<float> src)
+        public static unsafe float SumAbsU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -877,7 +877,7 @@ internal static unsafe float SumAbsU(Span<float> src)
             }
         }
 
-        internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
+        public static unsafe float SumAbsDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -912,7 +912,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float MaxAbsU(Span<float> src)
+        public static unsafe float MaxAbsU(Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -944,7 +944,7 @@ internal static unsafe float MaxAbsU(Span<float> src)
             }
         }
 
-        internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
+        public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
         {
             fixed (float* psrc = src)
             {
@@ -979,7 +979,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
             }
         }
 
-        internal static unsafe float DotU(Span<float> src, Span<float> dst)
+        public static unsafe float DotU(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1018,7 +1018,7 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
+        public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1059,7 +1059,7 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
             }
         }
 
-        internal static unsafe float Dist2(Span<float> src, Span<float> dst)
+        public static unsafe float Dist2(Span<float> src, Span<float> dst)
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
@@ -1097,7 +1097,7 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
             fixed (float* psrc = src)
             fixed (float* pdst1 = v)
@@ -1142,7 +1142,7 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, f
             }
         }
 
-        internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
             fixed (float* psrc = src)
             fixed (int* pidx = indices)
diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs
index 6ad6ceec5f..e1f1d5f39d 100644
--- a/src/Microsoft.ML.Transforms/RffTransform.cs
+++ b/src/Microsoft.ML.Transforms/RffTransform.cs
@@ -122,8 +122,8 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi
 
                 int roundedUpD = RoundUp(NewDim, CfltAlign);
                 int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign);
-                RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment);
-                RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment);
+                RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment());
+                RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment());
 
                 InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD);
             }
@@ -158,8 +158,8 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou
                 // initialize the transform matrix
                 int roundedUpD = RoundUp(NewDim, CfltAlign);
                 int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign);
-                RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment);
-                RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment);
+                RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment());
+                RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment());
                 InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD);
             }
 
@@ -227,7 +227,7 @@ private static VersionInfo GetVersionInfo()
         private readonly TransformInfo[] _transformInfos;
 
         private const string RegistrationName = "Rff";
-        private const int CfltAlign = CpuMathUtils.Vector128Alignment / sizeof(float);
+        private static readonly int CfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); // Floats per aligned vector; not const because the alignment is resolved at run time.
 
         private static string TestColumnType(ColumnType type)
         {
@@ -496,8 +496,8 @@ private ValueGetter<VBuffer<Float>> GetterFromVectorType(IRow input, int iinfo)
             var getSrc = GetSrcGetter<VBuffer<Float>>(input, iinfo);
             var src = default(VBuffer<Float>);
 
-            var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.Vector128Alignment);
-            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment);
+            var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.GetVectorAlignment());
+            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment());
 
             return
                 (ref VBuffer<Float> dst) =>
@@ -512,8 +512,8 @@ private ValueGetter<VBuffer<Float>> GetterFromFloatType(IRow input, int iinfo)
             var getSrc = GetSrcGetter<Float>(input, iinfo);
             var src = default(Float);
 
-            var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.Vector128Alignment);
-            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment);
+            var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.GetVectorAlignment());
+            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment());
 
             var oneDimensionalVector = new VBuffer<Float>(1, new Float[] { 0 });
 

From ddeb6551145e6e5f438ad8b6de47f0591ca296c6 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 11:37:12 -0700
Subject: [PATCH 08/29] Changed perf tests to reveal SSE and AVX intrinsics
 perf separately

---
 .../AvxPerformanceTests.cs                    | 113 ++++++++++++++----
 .../CpuMathNativeUtils.cs                     |  12 --
 .../SsePerformanceTests.cs                    | 113 ++++++++++++++----
 3 files changed, 180 insertions(+), 58 deletions(-)

diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
index 7625ce987d..01058384f8 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -96,70 +96,137 @@ public void GlobalCleanup()
         }
 
         [Benchmark]
-        public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN);
+        public void ManagedAddScalarUPerf()
+        {
+            AvxIntrinsics.AddScalarU(DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+        public void ManagedScaleUPerf()
+        {
+            AvxIntrinsics.ScaleU(DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedScaleSrcUPerf()
+        {
+            AvxIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN);
+        public void ManagedScaleAddUPerf()
+        {
+            AvxIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedAddScaleUPerf()
+        {
+            AvxIntrinsics.AddScaleU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+        public void ManagedAddScaleSUPerf()
+        {
+            AvxIntrinsics.AddScaleSU(DEFAULT_SCALE, new Span<float>(src), new Span<int>(idx, 0, IDXLEN), new Span<float>(dst));
+        }
 
         [Benchmark]
-        public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN);
+        public void ManagedAddScaleCopyUPerf()
+        {
+            AvxIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN), new Span<float>(result, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
+        public void ManagedAddUPerf()
+        {
+            AvxIntrinsics.AddU(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
+        public void ManagedAddSUPerf()
+        {
+            AvxIntrinsics.AddSU(new Span<float>(src), new Span<int>(idx, 0, IDXLEN), new Span<float>(dst));
+        }
 
 
         [Benchmark]
-        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+        public void ManagedMulElementWiseUPerf()
+        {
+            AvxIntrinsics.MulElementWiseU(new Span<float>(src1, 0, LEN), new Span<float>(src2, 0, LEN),
+                                            new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
+        public float ManagedSumUPerf()
+        {
+            return AvxIntrinsics.SumU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+        public float ManagedSumSqUPerf()
+        {
+            return AvxIntrinsics.SumSqU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN);
+        public float ManagedSumSqDiffUPerf()
+        {
+            return AvxIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
-        [Benchmark]
-        public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN);
+       [Benchmark]
+        public float ManagedSumAbsUPerf()
+        {
+            return AvxIntrinsics.SumAbsU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN);
+        public float ManagedSumAbsDiffUPerf()
+        {
+            return AvxIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN);
+        public float ManagedMaxAbsUPerf()
+        {
+            return AvxIntrinsics.MaxAbsU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
+        public float ManagedMaxAbsDiffUPerf()
+        {
+            return AvxIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+        public float ManagedDotUPerf()
+        {
+            return AvxIntrinsics.DotU(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+        public float ManagedDotSUPerf()
+        {
+            return AvxIntrinsics.DotSU(new Span<float>(src), new Span<float>(dst), new Span<int>(idx, 0, IDXLEN));
+        }
 
         [Benchmark]
-        public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
+        public float ManagedDist2Perf()
+        {
+            return AvxIntrinsics.Dist2(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result);
+        public void ManagedSdcaL1UpdateUPerf()
+        {
+            AvxIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), DEFAULT_SCALE, new Span<float>(dst, 0, LEN), new Span<float>(result, 0, LEN));
+        }
 
         [Benchmark]
-        public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result);
+        public void ManagedSdcaL1UpdateSUPerf()
+        {
+            AvxIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span<float>(src, 0, IDXLEN), new Span<int>(idx, 0, IDXLEN), DEFAULT_SCALE, new Span<float>(dst), new Span<float>(result));
+        }
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 27f46022eb..8df3352556 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -85,17 +85,5 @@ internal static class CpuMathNativeUtils
 
         [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
-
-        [DllImport("CpuMathNative", EntryPoint = "ScaleX"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void ScaleX(float a, /*_Inout_*/ float* pd, int c);
-
-        [DllImport("CpuMathNative", EntryPoint = "AddScaleX"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddScaleX(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
-
-        [DllImport("CpuMathNative", EntryPoint = "AddX"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddX(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
-
-        [DllImport("CpuMathNative", EntryPoint = "SumX"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float SumX(/*const*/ float* ps, int c);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index 8893a2f877..c3869c63d9 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -9,7 +9,7 @@
 
 namespace Microsoft.ML.CpuMath.PerformanceTests
 {
-    public class AvxVSSseNativePerformanceTests
+    public class SsePerformanceTests
     {
         private const int EXP_MAX = 127;
         private const int EXP_MIN = 0;
@@ -105,7 +105,10 @@ public unsafe void NativeAddScalarUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN);
+        public void ManagedAddScalarUPerf()
+        {
+            SseIntrinsics.AddScalarU(DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeScaleUPerf()
@@ -117,7 +120,10 @@ public unsafe void NativeScaleUPerf()
         }
 
         [Benchmark]
-        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+        public void ManagedScaleUPerf()
+        {
+            SseIntrinsics.ScaleU(DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeScaleSrcUPerf()
@@ -130,7 +136,10 @@ public unsafe void NativeScaleSrcUPerf()
         }
 
         [Benchmark]
-        public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedScaleSrcUPerf()
+        {
+            SseIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeScaleAddUPerf()
@@ -142,7 +151,10 @@ public unsafe void NativeScaleAddUPerf()
         }
 
         [Benchmark]
-        public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN);
+        public void ManagedScaleAddUPerf()
+        {
+            SseIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeAddScaleUPerf()
@@ -155,7 +167,10 @@ public unsafe void NativeAddScaleUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedAddScaleUPerf()
+        {
+            SseIntrinsics.AddScaleU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeAddScaleSUPerf()
@@ -169,7 +184,10 @@ public unsafe void NativeAddScaleSUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+        public void ManagedAddScaleSUPerf()
+        {
+            SseIntrinsics.AddScaleSU(DEFAULT_SCALE, new Span<float>(src), new Span<int>(idx, 0, IDXLEN), new Span<float>(dst));
+        }
 
         [Benchmark]
         public unsafe void NativeAddScaleCopyUPerf()
@@ -183,7 +201,10 @@ public unsafe void NativeAddScaleCopyUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN);
+        public void ManagedAddScaleCopyUPerf()
+        {
+            SseIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN), new Span<float>(result, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeAddUPerf()
@@ -196,7 +217,10 @@ public unsafe void NativeAddUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
+        public void ManagedAddUPerf()
+        {
+            SseIntrinsics.AddU(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeAddSUPerf()
@@ -210,7 +234,10 @@ public unsafe void NativeAddSUPerf()
         }
 
         [Benchmark]
-        public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
+        public void ManagedAddSUPerf()
+        {
+            SseIntrinsics.AddSU(new Span<float>(src), new Span<int>(idx, 0, IDXLEN), new Span<float>(dst));
+        }
 
 
         [Benchmark]
@@ -225,7 +252,11 @@ public unsafe void NativeMulElementWiseUPerf()
         }
 
         [Benchmark]
-        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+        public void ManagedMulElementWiseUPerf()
+        {
+            SseIntrinsics.MulElementWiseU(new Span<float>(src1, 0, LEN), new Span<float>(src2, 0, LEN),
+                                            new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeSumUPerf()
@@ -237,7 +268,10 @@ public unsafe float NativeSumUPerf()
         }
 
         [Benchmark]
-        public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
+        public float ManagedSumUPerf()
+        {
+            return SseIntrinsics.SumU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeSumSqUPerf()
@@ -249,7 +283,10 @@ public unsafe float NativeSumSqUPerf()
         }
 
         [Benchmark]
-        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+        public float ManagedSumSqUPerf()
+        {
+            return SseIntrinsics.SumSqU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeSumSqDiffUPerf()
@@ -261,7 +298,10 @@ public unsafe float NativeSumSqDiffUPerf()
         }
 
         [Benchmark]
-        public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN);
+        public float ManagedSumSqDiffUPerf()
+        {
+            return SseIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeSumAbsUPerf()
@@ -273,7 +313,10 @@ public unsafe float NativeSumAbsUPerf()
         }
 
         [Benchmark]
-        public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN);
+        public float ManagedSumAbsUPerf()
+        {
+            return SseIntrinsics.SumAbsU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeSumAbsDiffUPerf()
@@ -285,7 +328,10 @@ public unsafe float NativeSumAbsDiffUPerf()
         }
 
         [Benchmark]
-        public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN);
+        public float ManagedSumAbsDiffUPerf()
+        {
+            return SseIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeMaxAbsUPerf()
@@ -297,7 +343,10 @@ public unsafe float NativeMaxAbsUPerf()
         }
 
         [Benchmark]
-        public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN);
+        public float ManagedMaxAbsUPerf()
+        {
+            return SseIntrinsics.MaxAbsU(new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeMaxAbsDiffUPerf()
@@ -309,7 +358,10 @@ public unsafe float NativeMaxAbsDiffUPerf()
         }
 
         [Benchmark]
-        public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
+        public float ManagedMaxAbsDiffUPerf()
+        {
+            return SseIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span<float>(src, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeDotUPerf()
@@ -322,7 +374,10 @@ public unsafe float NativeDotUPerf()
         }
 
         [Benchmark]
-        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+        public float ManagedDotUPerf()
+        {
+            return SseIntrinsics.DotU(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe float NativeDotSUPerf()
@@ -336,7 +391,10 @@ public unsafe float NativeDotSUPerf()
         }
 
         [Benchmark]
-        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+        public float ManagedDotSUPerf()
+        {
+            return SseIntrinsics.DotSU(new Span<float>(src), new Span<float>(dst), new Span<int>(idx, 0, IDXLEN));
+        }
 
         [Benchmark]
         public unsafe float NativeDist2Perf()
@@ -349,7 +407,10 @@ public unsafe float NativeDist2Perf()
         }
 
         [Benchmark]
-        public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
+        public float ManagedDist2Perf()
+        {
+            return SseIntrinsics.Dist2(new Span<float>(src, 0, LEN), new Span<float>(dst, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeSdcaL1UpdateUPerf()
@@ -363,7 +424,10 @@ public unsafe void NativeSdcaL1UpdateUPerf()
         }
 
         [Benchmark]
-        public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result);
+        public void ManagedSdcaL1UpdateUPerf()
+        {
+            SseIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span<float>(src, 0, LEN), DEFAULT_SCALE, new Span<float>(dst, 0, LEN), new Span<float>(result, 0, LEN));
+        }
 
         [Benchmark]
         public unsafe void NativeSdcaL1UpdateSUPerf()
@@ -378,6 +442,9 @@ public unsafe void NativeSdcaL1UpdateSUPerf()
         }
 
         [Benchmark]
-        public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result);
+        public void ManagedSdcaL1UpdateSUPerf()
+        {
+            SseIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span<float>(src, 0, IDXLEN), new Span<int>(idx, 0, IDXLEN), DEFAULT_SCALE, new Span<float>(dst), new Span<float>(result));
+        }
     }
 }

From c776fb098527b631bd1eebca26ec379e31916f75 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 11:40:30 -0700
Subject: [PATCH 09/29] Fixed access modifiers of private fields

---
 src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs  | 4 ++--
 src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 807f7239d7..6843cd4757 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -10,10 +10,10 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
     public static partial class CpuMathUtils
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
-        public const int Vector128Alignment = 16;
+        private const int Vector128Alignment = 16;
 
         // The count of bytes in Vector256<T>, corresponding to _cbAlign in AlignedArray
-        public const int Vector256Alignment = 32;
+        private const int Vector256Alignment = 32;
 
         public static int GetVectorAlignment()
         {
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index db620dbbb6..bbb7f3bd6a 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -9,7 +9,7 @@ public static partial class CpuMathUtils
         // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE.  Is it okay?
 
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
-        public const int Vector128Alignment = 16;
+        private const int Vector128Alignment = 16;
 
         public static int GetVectorAlignment()
         {

From df09fe3d6bd4875d4d4081f5792e942eedee09aa Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 15:18:47 -0700
Subject: [PATCH 10/29] Implemented all non-matrix unit tests for AVX
 intrinsics, using longer input arrays

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 20 +++----
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |  8 +--
 .../UnitTests.cs                              | 59 ++++++++++++-------
 3 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 21ef1a16f7..73981981b6 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -925,11 +925,11 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
 
                 while (pIdxCurrent + 8 <= pEnd)
                 {
-                    Vector256<float> srcVector = Load8(pDstCurrent, pIdxCurrent);
-                    Vector256<float> dstVector = Avx.LoadVector256(pSrcCurrent);
+                    Vector256<float> dstVector = Load8(pDstCurrent, pIdxCurrent);
+                    Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
 
-                    srcVector = Avx.Add(srcVector, dstVector);
-                    Store8(in srcVector, pDstCurrent, pIdxCurrent);
+                    dstVector = Avx.Add(dstVector, srcVector);
+                    Store8(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 8;
                     pSrcCurrent += 8;
@@ -937,11 +937,11 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
 
                 while (pIdxCurrent + 4 <= pEnd)
                 {
-                    Vector128<float> srcVector = Load4(pDstCurrent, pIdxCurrent);
-                    Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
 
-                    srcVector = Sse.Add(srcVector, dstVector);
-                    Store4(in srcVector, pDstCurrent, pIdxCurrent);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -961,8 +961,8 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
 
         public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
         {
-            fixed (float* psrc1 = &src1[0])
-            fixed (float* psrc2 = &src2[0])
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
             fixed (float* pdst = dst)
             {
                 float* pSrc1Current = psrc1;
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index a555b8ba8c..e6bc9d6dd4 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -696,11 +696,11 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
 
                 while (pIdxCurrent + 4 <= pEnd)
                 {
-                    Vector128<float> srcVector = Load4(pDstCurrent, pIdxCurrent);
-                    Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
 
-                    srcVector = Sse.Add(srcVector, dstVector);
-                    Store4(in srcVector, pDstCurrent, pIdxCurrent);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index b57066be8c..89424f9177 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -5,6 +5,7 @@
 using System;
 using System.Collections.Generic;
 using Xunit;
+using Xunit.Abstractions;
 using Microsoft.ML.Runtime.Internal.CpuMath;
 
 namespace Microsoft.ML.CpuMath.UnitTests
@@ -20,14 +21,18 @@ public class CpuMathUtilsUnitTests
         private const int SseCbAlign = 16;
         private FloatEqualityComparer comparer;
 
-        public CpuMathUtilsUnitTests()
+        private readonly ITestOutputHelper output;
+
+        public CpuMathUtilsUnitTests(ITestOutputHelper output)
         {
+            this.output = output;
+
             // Padded array whose length is a multiple of 4
-            float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testArray1 = new float[16] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
             // Unpadded array whose length is not a multiple of 4.
-            float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
+            float[] testArray2 = new float[15] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
             testArrays = new float[][] { testArray1, testArray2 };
-            testIndexArray = new int[4] { 0, 2, 5, 6 };
+            testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 };
             comparer = new FloatEqualityComparer();
 
             // Padded matrices whose dimensions are multiples of 4
@@ -308,6 +313,11 @@ public void AddScaleSUTest(int test)
             expected[2] = -13.806f;
             expected[5] = -43.522f;
             expected[6] = 55.978f;
+            expected[8] = -178.869f;
+            expected[11] = -31.941f;
+            expected[12] = -51.205f;
+            expected[13] = -21.337f;
+            expected[14] = 35.782f;
 
             CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length);
             var actual = dst;
@@ -373,6 +383,11 @@ public void AddSUTest(int test)
             expected[2] = -12.14f;
             expected[5] = -36.69f;
             expected[6] = 46.29f;
+            expected[8] = -104.41f;
+            expected[11] = -13.09f;
+            expected[12] = -73.92f;
+            expected[13] = -23.64f;
+            expected[14] = 34.41f;
 
             CpuMathUtils.Add(src, idx, dst, idx.Length);
             var actual = dst;
@@ -407,8 +422,8 @@ public void MulElementWiseUTest(int test)
         }
 
         [Theory]
-        [InlineData(0, -93.9f)]
-        [InlineData(1, -97.19f)]
+        [InlineData(0, -187.8f)]
+        [InlineData(1, -191.09f)]
         public void SumUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -417,8 +432,8 @@ public void SumUTest(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, 13399.9376f)]
-        [InlineData(1, 13389.1135f)]
+        [InlineData(0, 26799.8752f)]
+        [InlineData(1, 26789.0511f)]
         public void SumSqUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -427,8 +442,8 @@ public void SumSqUTest(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, 13742.3176f)]
-        [InlineData(1, 13739.7895f)]
+        [InlineData(0, 27484.6352f)]
+        [InlineData(1, 27482.1071f)]
         public void SumSqDiffUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -437,8 +452,8 @@ public void SumSqDiffUTest(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 193.69f)]
+        [InlineData(0, 393.96f)]
+        [InlineData(1, 390.67f)]
         public void SumAbsUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -447,8 +462,8 @@ public void SumAbsUTest(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 195.39f)]
+        [InlineData(0, 393.96f)]
+        [InlineData(1, 392.37f)]
         public void SumAbsDiffUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -477,8 +492,8 @@ public void MaxAbsDiffUTest(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, 13306.0376f)]
-        [InlineData(1, 13291.9235f)]
+        [InlineData(0, 26612.0752f)]
+        [InlineData(1, 26597.9611f)]
         public void DotUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -490,12 +505,12 @@ public void DotUTest(int test, float expected)
             }
 
             var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 2);
+            Assert.Equal(expected, actual, 1);
         }
 
         [Theory]
-        [InlineData(0, 736.7352f)]
-        [InlineData(1, 736.7352f)]
+        [InlineData(0, -3406.2154f)]
+        [InlineData(1, -3406.2154f)]
         public void DotSUTest(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();
@@ -509,12 +524,12 @@ public void DotSUTest(int test, float expected)
             }
 
             var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
-            Assert.Equal(expected, actual, 4);
+            Assert.Equal(expected, actual, 2);
         }
 
         [Theory]
-        [InlineData(0, 8.0f)]
-        [InlineData(1, 7.0f)]
+        [InlineData(0, 16.0f)]
+        [InlineData(1, 15.0f)]
         public void Dist2Test(int test, float expected)
         {
             float[] src = (float[])testArrays[test].Clone();

From a9c481f46c7693fbe409bed9f4584a36e64634e1 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 18:02:50 -0700
Subject: [PATCH 11/29] Implemented unit tests for AVX intrinsics

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     |   1 +
 .../UnitTests.cs                              | 314 +++++++++---------
 2 files changed, 162 insertions(+), 153 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 73981981b6..863a40b787 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -5,6 +5,7 @@
 // The exported function names need to be unique (can't be disambiguated based on signature), hence
 // we introduce suffix letters to indicate the general patterns used.
 // * A suffix means aligned and padded for SSE operations.
+// * U suffix means unaligned and unpadded.
 // * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
 // * Tran means the matrix is transposed.
 
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index 89424f9177..a155396448 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -18,7 +18,8 @@ public class CpuMathUtilsUnitTests
         private readonly AlignedArray[] testSrcVectors;
         private readonly AlignedArray[] testDstVectors;
         private const float DEFAULT_SCALE = 1.7f;
-        private const int SseCbAlign = 16;
+        private const int Vector128Assignment = 16;
+        private const int Vector256Assignment = 32;
         private FloatEqualityComparer comparer;
 
         private readonly ITestOutputHelper output;
@@ -35,177 +36,183 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output)
             testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 };
             comparer = new FloatEqualityComparer();
 
-            // Padded matrices whose dimensions are multiples of 4
-            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
-                1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
-            float[] testMatrix2 = new float[4 * 8];
+            // Padded matrices whose dimensions are multiples of 8
+            float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                                                        1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testMatrix2 = new float[8 * 16];
 
             for (int i = 0; i < testMatrix2.Length; i++)
             {
                 testMatrix2[i] = i + 1;
             }
 
-            AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign);
-            AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign);
+            AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, Vector256Assignment);
+            AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, Vector256Assignment);
             testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
             testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
 
             testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
 
-            // Padded source vectors whose dimensions are multiples of 4
-            float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f };
-            float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
+            // Padded source vectors whose dimensions are multiples of 8
+            float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
+            float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f };
 
-            AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign);
-            AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            AlignedArray testSrcVectorAligned1 = new AlignedArray(8, Vector256Assignment);
+            AlignedArray testSrcVectorAligned2 = new AlignedArray(16, Vector256Assignment);
             testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
             testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
 
             testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
 
-            // Padded destination vectors whose dimensions are multiples of 4
-            float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f };
-            float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
+            // Padded destination vectors whose dimensions are multiples of 8
+            float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
+            float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f };
 
-            AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign);
-            AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            AlignedArray testDstVectorAligned1 = new AlignedArray(8, Vector256Assignment);
+            AlignedArray testDstVectorAligned2 = new AlignedArray(16, Vector256Assignment);
             testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
             testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
 
             testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
         }
 
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
-        //[InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
-        //[InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
-        //public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-
-        //    CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
-        //[InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
-        //[InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
-        //public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-
-        //    CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
-        //[InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
-        //[InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
-        //public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-
-        //    CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
-        //[InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
-        //[InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
-        //public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-
-        //    CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
-        //[InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
-        //[InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
-        //public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-        //    int[] idx = testIndexArray;
-
-        //    CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
-        //[InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
-        //[InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
-        //public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-        //    int[] idx = testIndexArray;
-
-        //    CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
-        //[InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
-        //[InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
-        //public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-        //    int[] idx = testIndexArray;
-
-        //    CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
-
-        //[Theory]
-        //[InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
-        //[InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
-        //[InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
-        //public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        //{
-        //    AlignedArray mat = testMatrices[matTest];
-        //    AlignedArray src = testSrcVectors[srcTest];
-        //    AlignedArray dst = testDstVectors[dstTest];
-        //    int[] idx = testIndexArray;
-
-        //    CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-        //    float[] actual = new float[dst.Size];
-        //    dst.CopyTo(actual, 0, dst.Size);
-        //    Assert.Equal(expected, actual, comparer);
-        //}
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f })]
+        [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })]
+        [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })]
+        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -416.68f, -415.68f, -414.68f, -413.68f, -412.68f, -411.68f, -410.68f, -409.68f })]
+        [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })]
+        [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })]
+        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 70.56f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })]
+        [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })]
+        [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })]
+        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 70.56f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })]
+        [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })]
+        [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })]
+        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f })]
+        [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })]
+        [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })]
+        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 38.25f, 39.25f, 40.25f, 41.25f, 42.25f, 43.25f, 44.25f, 45.25f })]
+        [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })]
+        [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })]
+        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 33.32f, -40.46f, -165.92f, 235.28f, -1808.29f, -457.81f, 551.65f, 55.93f })]
+        [InlineData(1, 0, 1, new float[] { 1265f, 1282f, 1299f, 1316f, 1333f, 1350f, 1367f, 1384f, 1401f, 1418f, 1435f, 1452f, 1469f, 1486f, 1503f, 1520f })]
+        [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })]
+        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 33.32f, -39.46f, -163.92f, 238.28f, -1804.29f, -452.81f, 557.65f, 62.93f })]
+        [InlineData(1, 0, 1, new float[] { 1265f, 1283f, 1301f, 1319f, 1337f, 1355f, 1373f, 1391f, 1409f, 1427f, 1445f, 1463f, 1481f, 1499f, 1517f, 1535f })]
+        [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })]
+        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
 
         [Theory]
         [InlineData(0)]
@@ -546,11 +553,11 @@ public void Dist2Test(int test, float expected)
         }
 
         [Theory]
-        [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })]
-        [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
+        [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
+        [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })]
         public void ZeroItemsUTest(int test, int[] idx, float[] expected)
         {
-            AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign);
+            AlignedArray src = new AlignedArray(8 + 8 * test, Vector256Assignment);
             src.CopyFrom(testSrcVectors[test]);
 
             CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx);
@@ -564,7 +571,8 @@ public void ZeroItemsUTest(int test, int[] idx, float[] expected)
         [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })]
         public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
         {
-            AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign);
+            // Uses Vector128Assignment since the intrinsic does not use any SSE/AVX algorithm.
+            AlignedArray src = new AlignedArray(4 + 4 * test, Vector128Assignment);
             src.CopyFrom(testSrcVectors[test]);
 
             CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx);
@@ -622,7 +630,7 @@ internal class FloatEqualityComparer : IEqualityComparer<float>
     {
         public bool Equals(float a, float b)
         {
-            return Math.Abs(a - b) < 1e-5f;
+            return Math.Abs(a - b) < 1e-3f;
         }
 
         public int GetHashCode(float a)

From c692a6fe907213df1d58108c3a01be5565be23f7 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 18:26:50 -0700
Subject: [PATCH 12/29] Fixed errors on the RffTransform.CfltAlign
 const-expression requirement

---
 src/Microsoft.ML.Transforms/RffTransform.cs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs
index e1f1d5f39d..c118ce232a 100644
--- a/src/Microsoft.ML.Transforms/RffTransform.cs
+++ b/src/Microsoft.ML.Transforms/RffTransform.cs
@@ -227,7 +227,10 @@ private static VersionInfo GetVersionInfo()
         private readonly TransformInfo[] _transformInfos;
 
         private const string RegistrationName = "Rff";
-        private const int CfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float);
+
+        // REVIEW NEEDED: Used 32 (CpuMathUtils.Vector256Alignment) instead of CpuMathUtils.GetVectorAlignment()
+        // to silence the error that restricts the expression for CfltAlign to be constant.
+        private const int CfltAlign = 32 / sizeof(float);
 
         private static string TestColumnType(ColumnType type)
         {

From 40528e4dcc30134314219f21a808d9413c5f7c55 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 18:52:53 -0700
Subject: [PATCH 13/29] Fixed Debug errors by making RffTransform.CfltAlign
 read-only

---
 src/Microsoft.ML.Transforms/RffTransform.cs | 22 ++++++++++-----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs
index c118ce232a..79e6bdb01c 100644
--- a/src/Microsoft.ML.Transforms/RffTransform.cs
+++ b/src/Microsoft.ML.Transforms/RffTransform.cs
@@ -120,8 +120,8 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi
                     sub = args.MatrixGenerator;
                 _matrixGenerator = sub.CreateInstance(host, avgDist);
 
-                int roundedUpD = RoundUp(NewDim, CfltAlign);
-                int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign);
+                int roundedUpD = RoundUp(NewDim, _cfltAlign);
+                int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign);
                 RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment());
                 RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment());
 
@@ -156,8 +156,8 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou
                     ctx.LoadModelOrNull<IFourierDistributionSampler, SignatureLoadModel>(env, out _matrixGenerator, directoryName));
 
                 // initialize the transform matrix
-                int roundedUpD = RoundUp(NewDim, CfltAlign);
-                int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign);
+                int roundedUpD = RoundUp(NewDim, _cfltAlign);
+                int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign);
                 RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment());
                 RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment());
                 InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD);
@@ -227,10 +227,7 @@ private static VersionInfo GetVersionInfo()
         private readonly TransformInfo[] _transformInfos;
 
         private const string RegistrationName = "Rff";
-
-        // REVIEW NEEDED: Used 32 (CpuMathUtils.Vector256Alignment) instead of CpuMathUtils.GetVectorAlignment()
-        // to silence the error that restricts the expression for CfltAlign to be constant.
-        private const int CfltAlign = 32 / sizeof(float);
+        private readonly int _cfltAlign;
 
         private static string TestColumnType(ColumnType type)
         {
@@ -254,6 +251,7 @@ public RffTransform(IHostEnvironment env,
             string source = null)
             : this(env, new Arguments() { Column = new[] { new Column() { Source = source ?? name, Name = name } }, NewDim = newDim }, input)
         {
+            _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float);
         }
 
         /// <summary>
@@ -499,8 +497,8 @@ private ValueGetter<VBuffer<Float>> GetterFromVectorType(IRow input, int iinfo)
             var getSrc = GetSrcGetter<VBuffer<Float>>(input, iinfo);
             var src = default(VBuffer<Float>);
 
-            var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.GetVectorAlignment());
-            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment());
+            var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment());
+            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment());
 
             return
                 (ref VBuffer<Float> dst) =>
@@ -515,8 +513,8 @@ private ValueGetter<VBuffer<Float>> GetterFromFloatType(IRow input, int iinfo)
             var getSrc = GetSrcGetter<Float>(input, iinfo);
             var src = default(Float);
 
-            var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.GetVectorAlignment());
-            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment());
+            var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment());
+            var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment());
 
             var oneDimensionalVector = new VBuffer<Float>(1, new Float[] { 0 });
 

From 4d7d8effa7cc62eb02d0368c4a76b8031fce1abc Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 17 Aug 2018 19:05:03 -0700
Subject: [PATCH 14/29] Fixed errors by making CfltAlign static (and read-only)

---
 src/Microsoft.ML.Transforms/RffTransform.cs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs
index 79e6bdb01c..675235d6ef 100644
--- a/src/Microsoft.ML.Transforms/RffTransform.cs
+++ b/src/Microsoft.ML.Transforms/RffTransform.cs
@@ -227,7 +227,7 @@ private static VersionInfo GetVersionInfo()
         private readonly TransformInfo[] _transformInfos;
 
         private const string RegistrationName = "Rff";
-        private readonly int _cfltAlign;
+        private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float);
 
         private static string TestColumnType(ColumnType type)
         {
@@ -251,7 +251,6 @@ public RffTransform(IHostEnvironment env,
             string source = null)
             : this(env, new Arguments() { Column = new[] { new Column() { Source = source ?? name, Name = name } }, NewDim = newDim }, input)
         {
-            _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float);
         }
 
         /// <summary>

From 75e4cde97f470eafed114db075096e8fb29d95ee Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Sat, 18 Aug 2018 18:54:29 -0700
Subject: [PATCH 15/29] Developed two unit tests for netcoreapp and netstandard
 to deal with different alignments separately, with some style changes to
 readonly variables

---
 ...oft.ML.CpuMath.UnitTests.netcoreapp.csproj |   4 -
 .../UnitTests.netcoreapp.cs}                  | 216 +++---
 .../UnitTests.netstandard.cs                  | 619 ++++++++++++++++++
 3 files changed, 724 insertions(+), 115 deletions(-)
 rename test/{Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs => Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs} (77%)
 create mode 100644 test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs

diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
index e611b15032..44ad91ed90 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
@@ -8,9 +8,5 @@
   <ItemGroup>
     <ProjectReference Include="..\..\src\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj " />
   </ItemGroup>
-
-  <ItemGroup>
-    <Compile Include="..\Microsoft.ML.CpuMath.UnitTests.netstandard\UnitTests.cs" />
-  </ItemGroup>
   
 </Project>
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs
similarity index 77%
rename from test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
rename to test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs
index a155396448..2d59a2acf1 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs
@@ -1,40 +1,35 @@
-// Licensed to the .NET Foundation under one or more agreements.
+﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
 using System;
 using System.Collections.Generic;
 using Xunit;
-using Xunit.Abstractions;
 using Microsoft.ML.Runtime.Internal.CpuMath;
 
 namespace Microsoft.ML.CpuMath.UnitTests
 {
     public class CpuMathUtilsUnitTests
     {
-        private readonly float[][] testArrays;
-        private readonly int[] testIndexArray;
-        private readonly AlignedArray[] testMatrices;
-        private readonly AlignedArray[] testSrcVectors;
-        private readonly AlignedArray[] testDstVectors;
-        private const float DEFAULT_SCALE = 1.7f;
-        private const int Vector128Assignment = 16;
-        private const int Vector256Assignment = 32;
-        private FloatEqualityComparer comparer;
+        private readonly float[][] _testArrays;
+        private readonly int[] _testIndexArray;
+        private readonly AlignedArray[] _testMatrices;
+        private readonly AlignedArray[] _testSrcVectors;
+        private readonly AlignedArray[] _testDstVectors;
+        private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment();
+        private readonly FloatEqualityComparer _comparer;
 
-        private readonly ITestOutputHelper output;
+        private const float DEFAULT_SCALE = 1.7f;
 
-        public CpuMathUtilsUnitTests(ITestOutputHelper output)
+        public CpuMathUtilsUnitTests()
         {
-            this.output = output;
-
             // Padded array whose length is a multiple of 4
             float[] testArray1 = new float[16] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
             // Unpadded array whose length is not a multiple of 4.
             float[] testArray2 = new float[15] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
-            testArrays = new float[][] { testArray1, testArray2 };
-            testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 };
-            comparer = new FloatEqualityComparer();
+            _testArrays = new float[][] { testArray1, testArray2 };
+            _testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 };
+            _comparer = new FloatEqualityComparer();
 
             // Padded matrices whose dimensions are multiples of 8
             float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
@@ -52,34 +47,34 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output)
                 testMatrix2[i] = i + 1;
             }
 
-            AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, Vector256Assignment);
-            AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, Vector256Assignment);
+            AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment);
+            AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment);
             testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
             testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
 
-            testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
+            _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
 
             // Padded source vectors whose dimensions are multiples of 8
             float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
             float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f };
 
-            AlignedArray testSrcVectorAligned1 = new AlignedArray(8, Vector256Assignment);
-            AlignedArray testSrcVectorAligned2 = new AlignedArray(16, Vector256Assignment);
+            AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment);
+            AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment);
             testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
             testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
 
-            testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
+            _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
 
             // Padded destination vectors whose dimensions are multiples of 8
             float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
             float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f };
 
-            AlignedArray testDstVectorAligned1 = new AlignedArray(8, Vector256Assignment);
-            AlignedArray testDstVectorAligned2 = new AlignedArray(16, Vector256Assignment);
+            AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment);
+            AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment);
             testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
             testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
 
-            testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
+            _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
         }
 
         [Theory]
@@ -88,14 +83,14 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output)
         [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })]
         public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -104,14 +99,14 @@ public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
         [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })]
         public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -120,14 +115,14 @@ public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expect
         [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })]
         public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -136,14 +131,14 @@ public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expec
         [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })]
         public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -152,15 +147,15 @@ public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] ex
         [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })]
         public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
 
             CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -169,15 +164,15 @@ public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected
         [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })]
         public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
 
             CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -186,15 +181,15 @@ public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expec
         [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })]
         public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
 
             CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -203,15 +198,15 @@ public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expe
         [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })]
         public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[matTest];
-            AlignedArray src = testSrcVectors[srcTest];
-            AlignedArray dst = testDstVectors[dstTest];
-            int[] idx = testIndexArray;
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
 
             CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -219,7 +214,7 @@ public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] e
         [InlineData(1)]
         public void AddScalarUTest(int test)
         {
-            float[] dst = (float[])testArrays[test].Clone();
+            float[] dst = (float[])_testArrays[test].Clone();
             float[] expected = (float[])dst.Clone();
 
             for (int i = 0; i < expected.Length; i++)
@@ -229,7 +224,7 @@ public void AddScalarUTest(int test)
 
             CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -237,7 +232,7 @@ public void AddScalarUTest(int test)
         [InlineData(1)]
         public void ScaleUTest(int test)
         {
-            float[] dst = (float[])testArrays[test].Clone();
+            float[] dst = (float[])_testArrays[test].Clone();
             float[] expected = (float[])dst.Clone();
 
             for (int i = 0; i < expected.Length; i++)
@@ -247,7 +242,7 @@ public void ScaleUTest(int test)
 
             CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -255,7 +250,7 @@ public void ScaleUTest(int test)
         [InlineData(1)]
         public void ScaleSrcUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
             float[] expected = (float[])dst.Clone();
 
@@ -266,7 +261,7 @@ public void ScaleSrcUTest(int test)
 
             CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -274,7 +269,7 @@ public void ScaleSrcUTest(int test)
         [InlineData(1)]
         public void ScaleAddUTest(int test)
         {
-            float[] dst = (float[])testArrays[test].Clone();
+            float[] dst = (float[])_testArrays[test].Clone();
             float[] expected = (float[])dst.Clone();
 
             for (int i = 0; i < expected.Length; i++)
@@ -284,7 +279,7 @@ public void ScaleAddUTest(int test)
 
             CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -292,7 +287,7 @@ public void ScaleAddUTest(int test)
         [InlineData(1)]
         public void AddScaleUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
             float[] expected = (float[])dst.Clone();
 
@@ -303,7 +298,7 @@ public void AddScaleUTest(int test)
 
             CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -311,9 +306,9 @@ public void AddScaleUTest(int test)
         [InlineData(1)]
         public void AddScaleSUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
-            int[] idx = testIndexArray;
+            int[] idx = _testIndexArray;
             float[] expected = (float[])dst.Clone();
 
             expected[0] = 5.292f;
@@ -328,7 +323,7 @@ public void AddScaleSUTest(int test)
 
             CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -336,7 +331,7 @@ public void AddScaleSUTest(int test)
         [InlineData(1)]
         public void AddScaleCopyUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
             float[] result = (float[])dst.Clone();
             float[] expected = (float[])dst.Clone();
@@ -348,7 +343,7 @@ public void AddScaleCopyUTest(int test)
 
             CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length);
             var actual = result;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -356,7 +351,7 @@ public void AddScaleCopyUTest(int test)
         [InlineData(1)]
         public void AddUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
             float[] expected = (float[])src.Clone();
 
@@ -373,7 +368,7 @@ public void AddUTest(int test)
 
             CpuMathUtils.Add(src, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -381,9 +376,9 @@ public void AddUTest(int test)
         [InlineData(1)]
         public void AddSUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
-            int[] idx = testIndexArray;
+            int[] idx = _testIndexArray;
             float[] expected = (float[])dst.Clone();
 
             expected[0] = 3.92f;
@@ -398,7 +393,7 @@ public void AddSUTest(int test)
 
             CpuMathUtils.Add(src, idx, dst, idx.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -406,7 +401,7 @@ public void AddSUTest(int test)
         [InlineData(1)]
         public void MulElementWiseUTest(int test)
         {
-            float[] src1 = (float[])testArrays[test].Clone();
+            float[] src1 = (float[])_testArrays[test].Clone();
             float[] src2 = (float[])src1.Clone();
             float[] dst = (float[])src1.Clone();
 
@@ -425,7 +420,7 @@ public void MulElementWiseUTest(int test)
 
             CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length);
             var actual = dst;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -433,7 +428,7 @@ public void MulElementWiseUTest(int test)
         [InlineData(1, -191.09f)]
         public void SumUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.Sum(src, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -443,7 +438,7 @@ public void SumUTest(int test, float expected)
         [InlineData(1, 26789.0511f)]
         public void SumSqUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.SumSq(src, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -453,7 +448,7 @@ public void SumSqUTest(int test, float expected)
         [InlineData(1, 27482.1071f)]
         public void SumSqDiffUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -463,7 +458,7 @@ public void SumSqDiffUTest(int test, float expected)
         [InlineData(1, 390.67f)]
         public void SumAbsUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.SumAbs(src, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -473,7 +468,7 @@ public void SumAbsUTest(int test, float expected)
         [InlineData(1, 392.37f)]
         public void SumAbsDiffUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -483,7 +478,7 @@ public void SumAbsDiffUTest(int test, float expected)
         [InlineData(1, 106.37f)]
         public void MaxAbsUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.MaxAbs(src, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -493,7 +488,7 @@ public void MaxAbsUTest(int test, float expected)
         [InlineData(1, 108.07f)]
         public void MaxAbsDiffUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length);
             Assert.Equal(expected, actual, 2);
         }
@@ -503,7 +498,7 @@ public void MaxAbsDiffUTest(int test, float expected)
         [InlineData(1, 26597.9611f)]
         public void DotUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
 
             for (int i = 0; i < dst.Length; i++)
@@ -520,9 +515,9 @@ public void DotUTest(int test, float expected)
         [InlineData(1, -3406.2154f)]
         public void DotSUTest(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
-            int[] idx = testIndexArray;
+            int[] idx = _testIndexArray;
 
             // Ensures src and dst are different arrays
             for (int i = 0; i < dst.Length; i++)
@@ -539,7 +534,7 @@ public void DotSUTest(int test, float expected)
         [InlineData(1, 15.0f)]
         public void Dist2Test(int test, float expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
 
             // Ensures src and dst are different arrays
@@ -557,28 +552,27 @@ public void Dist2Test(int test, float expected)
         [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })]
         public void ZeroItemsUTest(int test, int[] idx, float[] expected)
         {
-            AlignedArray src = new AlignedArray(8 + 8 * test, Vector256Assignment);
-            src.CopyFrom(testSrcVectors[test]);
+            AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment);
+            src.CopyFrom(_testSrcVectors[test]);
 
             CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx);
             float[] actual = new float[src.Size];
             src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
-        [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })]
-        [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })]
+        [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })]
+        [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })]
         public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
         {
-            // Uses Vector128Assignment since the intrinsic does not use any SSE/AVX algorithm.
-            AlignedArray src = new AlignedArray(4 + 4 * test, Vector128Assignment);
-            src.CopyFrom(testSrcVectors[test]);
+            AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment);
+            src.CopyFrom(_testSrcVectors[test]);
 
             CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx);
             float[] actual = new float[src.Size];
             src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -586,7 +580,7 @@ public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
         [InlineData(1)]
         public void SdcaL1UpdateUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] v = (float[])src.Clone();
             float[] w = (float[])src.Clone();
             float[] expected = (float[])w.Clone();
@@ -599,7 +593,7 @@ public void SdcaL1UpdateUTest(int test)
 
             CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w);
             var actual = w;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
 
         [Theory]
@@ -607,10 +601,10 @@ public void SdcaL1UpdateUTest(int test)
         [InlineData(1)]
         public void SdcaL1UpdateSUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
+            float[] src = (float[])_testArrays[test].Clone();
             float[] v = (float[])src.Clone();
             float[] w = (float[])src.Clone();
-            int[] idx = testIndexArray;
+            int[] idx = _testIndexArray;
             float[] expected = (float[])w.Clone();
 
             for (int i = 0; i < idx.Length; i++)
@@ -622,7 +616,7 @@ public void SdcaL1UpdateSUTest(int test)
 
             CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w);
             var actual = w;
-            Assert.Equal(expected, actual, comparer);
+            Assert.Equal(expected, actual, _comparer);
         }
     }
 
@@ -638,4 +632,4 @@ public int GetHashCode(float a)
             throw new NotImplementedException();
         }
     }
-}
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs
new file mode 100644
index 0000000000..f453c0749d
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs
@@ -0,0 +1,619 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Xunit;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.UnitTests
+{
+    public class CpuMathUtilsUnitTests
+    {
+        private readonly float[][] _testArrays;
+        private readonly int[] _testIndexArray;
+        private readonly AlignedArray[] _testMatrices;
+        private readonly AlignedArray[] _testSrcVectors;
+        private readonly AlignedArray[] _testDstVectors;
+        private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment();
+        private readonly FloatEqualityComparer _comparer;
+
+        private const float DEFAULT_SCALE = 1.7f;
+
+        public CpuMathUtilsUnitTests()
+        {
+            // Padded array whose length is a multiple of 4
+            float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            // Unpadded array whose length is not a multiple of 4.
+            float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
+            _testArrays = new float[][] { testArray1, testArray2 };
+            _testIndexArray = new int[4] { 0, 2, 5, 6 };
+            _comparer = new FloatEqualityComparer();
+
+            // Padded matrices whose dimensions are multiples of 4
+            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testMatrix2 = new float[4 * 8];
+
+            for (int i = 0; i < testMatrix2.Length; i++)
+            {
+                testMatrix2[i] = i + 1;
+            }
+
+            AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, _vectorAlignment);
+            AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, _vectorAlignment);
+            testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
+            testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
+
+            _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
+
+            // Padded source vectors whose dimensions are multiples of 4
+            float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f };
+            float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
+
+            AlignedArray testSrcVectorAligned1 = new AlignedArray(4, _vectorAlignment);
+            AlignedArray testSrcVectorAligned2 = new AlignedArray(8, _vectorAlignment);
+            testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
+            testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
+
+            _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
+
+            // Padded destination vectors whose dimensions are multiples of 4
+            float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f };
+            float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
+
+            AlignedArray testDstVectorAligned1 = new AlignedArray(4, _vectorAlignment);
+            AlignedArray testDstVectorAligned2 = new AlignedArray(8, _vectorAlignment);
+            testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
+            testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
+
+            _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
+        [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
+        [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
+        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
+        [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
+        [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
+        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
+        [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
+        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
+        [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
+        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
+        [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
+        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
+        [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
+        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
+        [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
+        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
+        [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
+        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = _testMatrices[matTest];
+            AlignedArray src = _testSrcVectors[srcTest];
+            AlignedArray dst = _testDstVectors[dstTest];
+            int[] idx = _testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScalarUTest(int test)
+        {
+            float[] dst = (float[])_testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] += DEFAULT_SCALE;
+            }
+
+            CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleUTest(int test)
+        {
+            float[] dst = (float[])_testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= DEFAULT_SCALE;
+            }
+
+            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleSrcUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= DEFAULT_SCALE;
+            }
+
+            CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleAddUTest(int test)
+        {
+            float[] dst = (float[])_testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE);
+            }
+
+            CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScaleUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= (1 + DEFAULT_SCALE);
+            }
+
+            CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScaleSUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            int[] idx = _testIndexArray;
+            float[] expected = (float[])dst.Clone();
+
+            expected[0] = 5.292f;
+            expected[2] = -13.806f;
+            expected[5] = -43.522f;
+            expected[6] = 55.978f;
+
+            CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScaleCopyUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] result = (float[])dst.Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= (1 + DEFAULT_SCALE);
+            }
+
+            CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length);
+            var actual = result;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] expected = (float[])src.Clone();
+
+            // Ensures src and dst are different arrays
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1;
+            }
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = 2 * expected[i] + 1;
+            }
+
+            CpuMathUtils.Add(src, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddSUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            int[] idx = _testIndexArray;
+            float[] expected = (float[])dst.Clone();
+
+            expected[0] = 3.92f;
+            expected[2] = -12.14f;
+            expected[5] = -36.69f;
+            expected[6] = 46.29f;
+
+            CpuMathUtils.Add(src, idx, dst, idx.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void MulElementWiseUTest(int test)
+        {
+            float[] src1 = (float[])_testArrays[test].Clone();
+            float[] src2 = (float[])src1.Clone();
+            float[] dst = (float[])src1.Clone();
+
+            // Ensures src1 and src2 are different arrays
+            for (int i = 0; i < src2.Length; i++)
+            {
+                src2[i] += 1;
+            }
+
+            float[] expected = (float[])src1.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= (1 + expected[i]);
+            }
+
+            CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, -93.9f)]
+        [InlineData(1, -97.19f)]
+        public void SumUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.Sum(src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 13399.9376f)]
+        [InlineData(1, 13389.1135f)]
+        public void SumSqUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.SumSq(src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 13742.3176f)]
+        [InlineData(1, 13739.7895f)]
+        public void SumSqDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 193.69f)]
+        public void SumAbsUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.SumAbs(src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 195.39f)]
+        public void SumAbsDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 106.37f)]
+        [InlineData(1, 106.37f)]
+        public void MaxAbsUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.MaxAbs(src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 108.07f)]
+        [InlineData(1, 108.07f)]
+        public void MaxAbsDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 13306.0376f)]
+        [InlineData(1, 13291.9235f)]
+        public void DotUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1;
+            }
+
+            var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 736.7352f)]
+        [InlineData(1, 736.7352f)]
+        public void DotSUTest(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            int[] idx = _testIndexArray;
+
+            // Ensures src and dst are different arrays
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1;
+            }
+
+            var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
+            Assert.Equal(expected, actual, 4);
+        }
+
+        [Theory]
+        [InlineData(0, 8.0f)]
+        [InlineData(1, 7.0f)]
+        public void Dist2Test(int test, float expected)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+
+            // Ensures src and dst are different arrays
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1;
+            }
+
+            var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
+            Assert.Equal(expected, actual, 0);
+        }
+
+        [Theory]
+        [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })]
+        [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
+        public void ZeroItemsUTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment);
+            src.CopyFrom(_testSrcVectors[test]);
+
+            CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx);
+            float[] actual = new float[src.Size];
+            src.CopyTo(actual, 0, src.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })]
+        [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })]
+        public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment);
+            src.CopyFrom(_testSrcVectors[test]);
+
+            CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx);
+            float[] actual = new float[src.Size];
+            src.CopyTo(actual, 0, src.Size);
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] v = (float[])src.Clone();
+            float[] w = (float[])src.Clone();
+            float[] expected = (float[])w.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                float value = src[i] * (1 + DEFAULT_SCALE);
+                expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0;
+            }
+
+            CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w);
+            var actual = w;
+            Assert.Equal(expected, actual, _comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateSUTest(int test)
+        {
+            float[] src = (float[])_testArrays[test].Clone();
+            float[] v = (float[])src.Clone();
+            float[] w = (float[])src.Clone();
+            int[] idx = _testIndexArray;
+            float[] expected = (float[])w.Clone();
+
+            for (int i = 0; i < idx.Length; i++)
+            {
+                int index = idx[i];
+                float value = v[index] + src[i] * DEFAULT_SCALE;
+                expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0;
+            }
+
+            CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w);
+            var actual = w;
+            Assert.Equal(expected, actual, _comparer);
+        }
+    }
+
+    internal class FloatEqualityComparer : IEqualityComparer<float>
+    {
+        public bool Equals(float a, float b)
+        {
+            return Math.Abs(a - b) < 1e-5f;
+        }
+
+        public int GetHashCode(float a)
+        {
+            throw new NotImplementedException();
+        }
+    }
+}
\ No newline at end of file

From a763059a22000ab8cabfb08cb1fca4bfaa99aa5b Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Sat, 18 Aug 2018 19:03:00 -0700
Subject: [PATCH 16/29] Kept only the most recent unit tests which are
 sufficient for both netcoreapp and netstandard

---
 .../{UnitTests.netcoreapp.cs => UnitTests.cs} |   0
 ...ft.ML.CpuMath.UnitTests.netstandard.csproj |   6 +-
 .../UnitTests.netstandard.cs                  | 619 ------------------
 3 files changed, 5 insertions(+), 620 deletions(-)
 rename test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/{UnitTests.netcoreapp.cs => UnitTests.cs} (100%)
 delete mode 100644 test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs

diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
similarity index 100%
rename from test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs
rename to test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
index 9552f688a8..862c95ef90 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
@@ -12,5 +12,9 @@
   <ItemGroup>
     <NativeAssemblyReference Include="CpuMathNative" />
   </ItemGroup>
+
+  <ItemGroup>
+    <Compile Include="..\Microsoft.ML.CpuMath.UnitTests.netcoreapp\UnitTests.cs" />
+  </ItemGroup>
  
-</Project>
+</Project>
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs
deleted file mode 100644
index f453c0749d..0000000000
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs
+++ /dev/null
@@ -1,619 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System;
-using System.Collections.Generic;
-using Xunit;
-using Microsoft.ML.Runtime.Internal.CpuMath;
-
-namespace Microsoft.ML.CpuMath.UnitTests
-{
-    public class CpuMathUtilsUnitTests
-    {
-        private readonly float[][] _testArrays;
-        private readonly int[] _testIndexArray;
-        private readonly AlignedArray[] _testMatrices;
-        private readonly AlignedArray[] _testSrcVectors;
-        private readonly AlignedArray[] _testDstVectors;
-        private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment();
-        private readonly FloatEqualityComparer _comparer;
-
-        private const float DEFAULT_SCALE = 1.7f;
-
-        public CpuMathUtilsUnitTests()
-        {
-            // Padded array whose length is a multiple of 4
-            float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
-            // Unpadded array whose length is not a multiple of 4.
-            float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
-            _testArrays = new float[][] { testArray1, testArray2 };
-            _testIndexArray = new int[4] { 0, 2, 5, 6 };
-            _comparer = new FloatEqualityComparer();
-
-            // Padded matrices whose dimensions are multiples of 4
-            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
-                1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
-            float[] testMatrix2 = new float[4 * 8];
-
-            for (int i = 0; i < testMatrix2.Length; i++)
-            {
-                testMatrix2[i] = i + 1;
-            }
-
-            AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, _vectorAlignment);
-            AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, _vectorAlignment);
-            testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
-            testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
-
-            _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
-
-            // Padded source vectors whose dimensions are multiples of 4
-            float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f };
-            float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
-
-            AlignedArray testSrcVectorAligned1 = new AlignedArray(4, _vectorAlignment);
-            AlignedArray testSrcVectorAligned2 = new AlignedArray(8, _vectorAlignment);
-            testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
-            testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
-
-            _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
-
-            // Padded destination vectors whose dimensions are multiples of 4
-            float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f };
-            float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
-
-            AlignedArray testDstVectorAligned1 = new AlignedArray(4, _vectorAlignment);
-            AlignedArray testDstVectorAligned2 = new AlignedArray(8, _vectorAlignment);
-            testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
-            testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
-
-            _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
-        [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
-        [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
-        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
-        [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
-        [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
-        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
-        [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
-        [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
-        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
-        [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
-        [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
-        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-
-            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
-        [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
-        [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
-        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-            int[] idx = _testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
-        [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
-        [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
-        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-            int[] idx = _testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
-        [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
-        [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
-        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-            int[] idx = _testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
-        [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
-        [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
-        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
-        {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
-            int[] idx = _testIndexArray;
-
-            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddScalarUTest(int test)
-        {
-            float[] dst = (float[])_testArrays[test].Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] += DEFAULT_SCALE;
-            }
-
-            CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void ScaleUTest(int test)
-        {
-            float[] dst = (float[])_testArrays[test].Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] *= DEFAULT_SCALE;
-            }
-
-            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void ScaleSrcUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] *= DEFAULT_SCALE;
-            }
-
-            CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void ScaleAddUTest(int test)
-        {
-            float[] dst = (float[])_testArrays[test].Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE);
-            }
-
-            CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddScaleUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] *= (1 + DEFAULT_SCALE);
-            }
-
-            CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddScaleSUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            int[] idx = _testIndexArray;
-            float[] expected = (float[])dst.Clone();
-
-            expected[0] = 5.292f;
-            expected[2] = -13.806f;
-            expected[5] = -43.522f;
-            expected[6] = 55.978f;
-
-            CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddScaleCopyUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] result = (float[])dst.Clone();
-            float[] expected = (float[])dst.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] *= (1 + DEFAULT_SCALE);
-            }
-
-            CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length);
-            var actual = result;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] expected = (float[])src.Clone();
-
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] = 2 * expected[i] + 1;
-            }
-
-            CpuMathUtils.Add(src, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void AddSUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            int[] idx = _testIndexArray;
-            float[] expected = (float[])dst.Clone();
-
-            expected[0] = 3.92f;
-            expected[2] = -12.14f;
-            expected[5] = -36.69f;
-            expected[6] = 46.29f;
-
-            CpuMathUtils.Add(src, idx, dst, idx.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void MulElementWiseUTest(int test)
-        {
-            float[] src1 = (float[])_testArrays[test].Clone();
-            float[] src2 = (float[])src1.Clone();
-            float[] dst = (float[])src1.Clone();
-
-            // Ensures src1 and src2 are different arrays
-            for (int i = 0; i < src2.Length; i++)
-            {
-                src2[i] += 1;
-            }
-
-            float[] expected = (float[])src1.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] *= (1 + expected[i]);
-            }
-
-            CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length);
-            var actual = dst;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, -93.9f)]
-        [InlineData(1, -97.19f)]
-        public void SumUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.Sum(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 13399.9376f)]
-        [InlineData(1, 13389.1135f)]
-        public void SumSqUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.SumSq(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 13742.3176f)]
-        [InlineData(1, 13739.7895f)]
-        public void SumSqDiffUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 193.69f)]
-        public void SumAbsUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.SumAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 195.39f)]
-        public void SumAbsDiffUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 106.37f)]
-        [InlineData(1, 106.37f)]
-        public void MaxAbsUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.MaxAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 108.07f)]
-        [InlineData(1, 108.07f)]
-        public void MaxAbsDiffUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 13306.0376f)]
-        [InlineData(1, 13291.9235f)]
-        public void DotUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
-
-            var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
-        [Theory]
-        [InlineData(0, 736.7352f)]
-        [InlineData(1, 736.7352f)]
-        public void DotSUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            int[] idx = _testIndexArray;
-
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
-
-            var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
-            Assert.Equal(expected, actual, 4);
-        }
-
-        [Theory]
-        [InlineData(0, 8.0f)]
-        [InlineData(1, 7.0f)]
-        public void Dist2Test(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
-
-            var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 0);
-        }
-
-        [Theory]
-        [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })]
-        [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
-        public void ZeroItemsUTest(int test, int[] idx, float[] expected)
-        {
-            AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment);
-            src.CopyFrom(_testSrcVectors[test]);
-
-            CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx);
-            float[] actual = new float[src.Size];
-            src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })]
-        [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })]
-        public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
-        {
-            AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment);
-            src.CopyFrom(_testSrcVectors[test]);
-
-            CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx);
-            float[] actual = new float[src.Size];
-            src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void SdcaL1UpdateUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] v = (float[])src.Clone();
-            float[] w = (float[])src.Clone();
-            float[] expected = (float[])w.Clone();
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                float value = src[i] * (1 + DEFAULT_SCALE);
-                expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0;
-            }
-
-            CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w);
-            var actual = w;
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0)]
-        [InlineData(1)]
-        public void SdcaL1UpdateSUTest(int test)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            float[] v = (float[])src.Clone();
-            float[] w = (float[])src.Clone();
-            int[] idx = _testIndexArray;
-            float[] expected = (float[])w.Clone();
-
-            for (int i = 0; i < idx.Length; i++)
-            {
-                int index = idx[i];
-                float value = v[index] + src[i] * DEFAULT_SCALE;
-                expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0;
-            }
-
-            CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w);
-            var actual = w;
-            Assert.Equal(expected, actual, _comparer);
-        }
-    }
-
-    internal class FloatEqualityComparer : IEqualityComparer<float>
-    {
-        public bool Equals(float a, float b)
-        {
-            return Math.Abs(a - b) < 1e-5f;
-        }
-
-        public int GetHashCode(float a)
-        {
-            throw new NotImplementedException();
-        }
-    }
-}
\ No newline at end of file

From 8bc8cc83183befbf71912738381e8ea138be4e7c Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 13:28:50 -0700
Subject: [PATCH 17/29] Respond to PR feedback: Style changes

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 117 +++++-------------
 .../CpuMathUtils.netstandard.cs               |   2 +-
 2 files changed, 32 insertions(+), 87 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 863a40b787..db9335b4ed 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -45,14 +45,6 @@ private static Vector256<float> ToVector256(in Vector128<float> a, in Vector128<
             return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1);
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static void ZeroUpper()
-        {
-            // Currently no-op since _mm256_zeroupper is not supported (ref: https://github.com/dotnet/coreclr/pull/16955)
-            // This is a placeholder in case the intrinsic is supported later on.
-            return;
-        }
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetLow(in Vector256<float> x)
         {
@@ -264,8 +256,6 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
                     pDstCurrent += 4;
                     pMatCurrent += 3 * ccol;
                 }
-
-                ZeroUpper();
             }
         }
 
@@ -326,8 +316,6 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A
                     pDstCurrent += 8;
                     pm0 += 8 * ccol;
                 }
-
-                ZeroUpper();
             }
         }
 
@@ -440,8 +428,6 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                     pMatCurrent += 3 * crow;
                     pSrcCurrent += 4;
                 }
-
-                ZeroUpper();
             }
         }
 
@@ -509,8 +495,6 @@ public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSr
 
                     ppos++;
                 }
-
-                ZeroUpper();
             }
         }
 
@@ -535,7 +519,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
 
                 Vector128<float> scalarVector128 = Sse.SetAllVector128(scalar);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, scalarVector128);
@@ -553,8 +537,6 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void ScaleU(float scale, Span<float> dst)
@@ -578,7 +560,7 @@ public static unsafe void ScaleU(float scale, Span<float> dst)
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                while (pDstCurrent + 4 <= pEnd)
+                if (pDstCurrent + 4 <= pEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
 
@@ -598,8 +580,6 @@ public static unsafe void ScaleU(float scale, Span<float> dst)
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
@@ -625,7 +605,7 @@ public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> ds
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Multiply(srcVector, scaleVector128);
@@ -645,8 +625,6 @@ public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> ds
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         // dst[i] = a * (dst[i] + b)
@@ -673,7 +651,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
                 Vector128<float> a128 = Sse.SetAllVector128(a);
                 Vector128<float> b128 = Sse.SetAllVector128(b);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, b128);
@@ -693,8 +671,6 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
@@ -723,7 +699,7 @@ public static unsafe void AddScaleU(float scale, Span<float> src, Span<float> ds
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                while (pDstCurrent + 4 <= pEnd)
+                if (pDstCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -749,8 +725,6 @@ public static unsafe void AddScaleU(float scale, Span<float> src, Span<float> ds
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
@@ -781,7 +755,7 @@ public static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                while (pResCurrent + 4 <= pResEnd)
+                if (pResCurrent + 4 <= pResEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -807,8 +781,6 @@ public static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float
                     pResCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
@@ -839,7 +811,7 @@ public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                while (pIdxCurrent + 4 <= pEnd)
+                if (pIdxCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
@@ -860,8 +832,6 @@ public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx
                     pSrcCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void AddU(Span<float> src, Span<float> dst)
@@ -885,7 +855,7 @@ public static unsafe void AddU(Span<float> src, Span<float> dst)
                     pDstCurrent += 8;
                 }
 
-                while (pSrcCurrent + 4 <= pEnd)
+                if (pSrcCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -909,8 +879,6 @@ public static unsafe void AddU(Span<float> src, Span<float> dst)
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
@@ -936,7 +904,7 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
                     pSrcCurrent += 8;
                 }
 
-                while (pIdxCurrent + 4 <= pEnd)
+                if (pIdxCurrent + 4 <= pEnd)
                 {
                     Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
@@ -956,8 +924,6 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
                     pSrcCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
@@ -983,7 +949,7 @@ public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Sp
                     pDstCurrent += 8;
                 }
 
-                while (pDstCurrent + 4 <= pEnd)
+                if (pDstCurrent + 4 <= pEnd)
                 {
                     Vector128<float> src1Vector = Sse.LoadVector128(pSrc1Current);
                     Vector128<float> src2Vector = Sse.LoadVector128(pSrc2Current);
@@ -1007,8 +973,6 @@ public static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Sp
                     pDstCurrent++;
                 }
             }
-
-            ZeroUpper();
         }
 
         public static unsafe float SumU(Span<float> src)
@@ -1031,7 +995,7 @@ public static unsafe float SumU(Span<float> src)
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent));
                     pSrcCurrent += 4;
@@ -1045,9 +1009,7 @@ public static unsafe float SumU(Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1073,7 +1035,7 @@ public static unsafe float SumSqU(Span<float> src)
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector));
@@ -1091,9 +1053,7 @@ public static unsafe float SumSqU(Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1122,7 +1082,7 @@ public static unsafe float SumSqDiffU(float mean, Span<float> src)
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
@@ -1142,9 +1102,7 @@ public static unsafe float SumSqDiffU(float mean, Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1172,7 +1130,7 @@ public static unsafe float SumAbsU(Span<float> src)
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> mask128 = GetAbsMask128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result128 = Sse.Add(result128, Sse.And(srcVector, mask128));
@@ -1190,9 +1148,7 @@ public static unsafe float SumAbsU(Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1223,7 +1179,7 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
                 Vector128<float> mask128 = GetAbsMask128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
@@ -1243,9 +1199,7 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1273,7 +1227,7 @@ public static unsafe float MaxAbsU(Span<float> src)
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> mask128 = GetAbsMask128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     result128 = Sse.Max(result128, Sse.And(srcVector, mask128));
@@ -1291,9 +1245,7 @@ public static unsafe float MaxAbsU(Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded));
-                ZeroUpper();
-                return max;
+                return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded));
             }
         }
 
@@ -1324,7 +1276,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
                 Vector128<float> mask128 = GetAbsMask128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
@@ -1344,9 +1296,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                     pSrcCurrent++;
                 }
 
-                float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded));
-                ZeroUpper();
-                return max;
+                return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded));
             }
         }
 
@@ -1377,7 +1327,7 @@ public static unsafe float DotU(Span<float> src, Span<float> dst)
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -1401,9 +1351,7 @@ public static unsafe float DotU(Span<float> src, Span<float> dst)
                     pDstCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1436,7 +1384,7 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
-                while (pIdxCurrent + 4 <= pIdxEnd)
+                if (pIdxCurrent + 4 <= pIdxEnd)
                 {
                     Vector128<float> srcVector = Load4(pSrcCurrent, pIdxCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
@@ -1460,9 +1408,7 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
                     pDstCurrent++;
                 }
 
-                float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
-                ZeroUpper();
-                return sum;
+                return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded));
             }
         }
 
@@ -1493,7 +1439,7 @@ public static unsafe float Dist2(Span<float> src, Span<float> dst)
 
                 Vector128<float> sqDistanceVector128 = Sse.SetZeroVector128();
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent),
                                                                     Sse.LoadVector128(pDstCurrent));
@@ -1516,7 +1462,6 @@ public static unsafe float Dist2(Span<float> src, Span<float> dst)
                     pDstCurrent++;
                 }
 
-                ZeroUpper();
                 return norm;
             }
         }
@@ -1558,7 +1503,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
                 Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
 
-                while (pSrcCurrent + 4 <= pSrcEnd)
+                if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
 
@@ -1623,7 +1568,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
                 Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
 
-                while (pIdxCurrent + 4 <= pIdxEnd)
+                if (pIdxCurrent + 4 <= pIdxEnd)
                 {
                     Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
 
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index bbb7f3bd6a..706f4529bb 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -6,7 +6,7 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
-        // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE.  Is it okay?
+        // REVIEW NEEDED: AVX support cannot be checked in .NET Standard 2.0, so we assume Vector128 alignment for SSE.  Is it okay?
 
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;

From f1664faa061864e5137fa63bf4f0af5055147bea Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 16:52:26 -0700
Subject: [PATCH 18/29] Respond to PR feedback: More style changes

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs            | 2 +-
 src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index db9335b4ed..1bcc8e651e 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -48,7 +48,7 @@ private static Vector256<float> ToVector256(in Vector128<float> a, in Vector128<
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetLow(in Vector256<float> x)
         {
-            return Avx.ExtractVector128(x, 0);
+            return Avx.GetLowerHalf(x);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index 706f4529bb..497dd59003 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -6,8 +6,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
-        // REVIEW NEEDED: AVX support cannot be checked in .NET Standard 2.0, so we assume Vector128 alignment for SSE.  Is it okay?
-
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 

From 26ed88490570de6194cff4f327cd23f0797a225f Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 16:53:02 -0700
Subject: [PATCH 19/29] Implemented class inheritance in perf tests to reduce
 overlapping code

---
 .../AvxPerformanceTests.cs                    |  86 +--------------
 .../PerformanceTests.cs                       | 101 ++++++++++++++++++
 .../SsePerformanceTests.cs                    |  86 +--------------
 3 files changed, 103 insertions(+), 170 deletions(-)
 create mode 100644 test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs

diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
index 01058384f8..2e4b598540 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs
@@ -9,92 +9,8 @@
 
 namespace Microsoft.ML.CpuMath.PerformanceTests
 {
-    public class AvxPerformanceTests
+    public class AvxPerformanceTests : PerformanceTests
     {
-        private const int EXP_MAX = 127;
-        private const int EXP_MIN = 0;
-
-        private const int IDXLEN = 1000003;
-        private const int LEN = 1000003;
-        private const int EXP_RANGE = EXP_MAX / 8;
-        private const int DEFAULT_SEED = 253421;
-        private const float DEFAULT_SCALE = 1.11f;
-        private const int DEFAULT_CROW = 500;
-        private const int DEFAULT_CCOL = 2000;
-        private const bool ADD = true;
-
-        private float[] src, dst, original, src1, src2, result;
-        private int[] idx;
-        private int seed = DEFAULT_SEED;
-
-        private static float NextFloat(Random rand, int expRange)
-        {
-            double mantissa = (rand.NextDouble() * 2.0) - 1.0;
-            double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1));
-            return (float)(mantissa * exponent);
-        }
-
-        private static int GetSeed()
-        {
-            int seed = DEFAULT_SEED;
-            string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
-
-            if (CPUMATH_SEED != null)
-            {
-                if (!int.TryParse(CPUMATH_SEED, out seed))
-                {
-                    if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
-                    {
-                        seed = new Random().Next();
-                    }
-                    else
-                    {
-                        seed = DEFAULT_SEED;
-                    }
-                }
-            }
-
-            Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
-            return seed;
-        }
-
-        [GlobalSetup]
-        public void Setup()
-        {
-            src = new float[LEN];
-            dst = new float[LEN];
-            src1 = new float[LEN];
-            src2 = new float[LEN];
-            original = new float[LEN];
-            result = new float[LEN];
-            idx = new int[IDXLEN];
-
-            seed = GetSeed();
-            Random rand = new Random(seed);
-
-            for (int i = 0; i < LEN; i++)
-            {
-                src[i] = NextFloat(rand, EXP_RANGE);
-                dst[i] = NextFloat(rand, EXP_RANGE);
-                original[i] = dst[i];
-                result[i] = dst[i];
-                src1[i] = NextFloat(rand, EXP_RANGE);
-                src2[i] = NextFloat(rand, EXP_RANGE);
-            }
-
-            for (int i = 0; i < IDXLEN; i++)
-            {
-                idx[i] = rand.Next(0, LEN);
-            }
-        }
-
-        [GlobalCleanup]
-        public void GlobalCleanup()
-        {
-            original.CopyTo(dst, 0);
-            original.CopyTo(result, 0);
-        }
-
         [Benchmark]
         public void ManagedAddScalarUPerf()
         {
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs
new file mode 100644
index 0000000000..64278eaf21
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs
@@ -0,0 +1,101 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+    public class PerformanceTests // Shared base of AvxPerformanceTests and SsePerformanceTests: common constants, input buffers, and benchmark setup/cleanup.
+    {
+        private const int EXP_MAX = 127; // only used to derive EXP_RANGE
+        private const int EXP_MIN = 0; // not referenced anywhere in this class
+        private const int EXP_RANGE = EXP_MAX / 8; // integer division (15); passed to NextFloat as expRange
+
+        protected const int IDXLEN = 1000003; // length of idx
+        protected const int LEN = 1000003; // length of every float buffer; also the exclusive upper bound of idx values
+        
+        private const int DEFAULT_SEED = 253421; // used when CPUMATH_SEED is unset or is neither an int nor "random"
+        protected const float DEFAULT_SCALE = 1.11f; // not used in this class; presumably consumed by the derived benchmarks
+
+        protected const int DEFAULT_CROW = 500; // not used in this class; presumably consumed by the derived benchmarks
+        protected const int DEFAULT_CCOL = 2000; // not used in this class; presumably consumed by the derived benchmarks
+        protected const bool ADD = true; // not used in this class; presumably consumed by the derived benchmarks
+
+        protected float[] src, dst, original, src1, src2, result; // allocated and filled by Setup
+        protected int[] idx; // random indices in [0, LEN), filled by Setup
+
+        private int seed = DEFAULT_SEED; // overwritten by Setup with the value returned from GetSeed
+
+        private float NextFloat(Random rand, int expRange) // Returns mantissa * 2^k as a float.
+        {
+            double mantissa = (rand.NextDouble() * 2.0) - 1.0; // in [-1, 1)
+            double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); // k in [-expRange + 1, expRange]; upper bound of Next is exclusive
+            return (float)(mantissa * exponent);
+        }
+
+        private int GetSeed() // Picks the RNG seed from the CPUMATH_SEED environment variable, falling back to DEFAULT_SEED.
+        {
+            int seed = DEFAULT_SEED; // local; shadows the instance field of the same name
+            string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
+
+            if (CPUMATH_SEED != null)
+            {
+                if (!int.TryParse(CPUMATH_SEED, out seed)) // on failure TryParse has overwritten seed, so both branches below reassign it
+                {
+                    if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
+                    {
+                        seed = new Random().Next(); // "random" (any casing) requests a fresh seed
+                    }
+                    else
+                    {
+                        seed = DEFAULT_SEED; // any other unparsable value falls back to the default
+                    }
+                }
+            }
+
+            Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
+            return seed;
+        }
+
+        [GlobalSetup]
+        public void Setup() // Allocates all buffers and fills them from a Random seeded by GetSeed.
+        {
+            src = new float[LEN];
+            dst = new float[LEN];
+            src1 = new float[LEN];
+            src2 = new float[LEN];
+            original = new float[LEN];
+            result = new float[LEN];
+            idx = new int[IDXLEN];
+
+            seed = GetSeed();
+            Random rand = new Random(seed);
+
+            for (int i = 0; i < LEN; i++)
+            {
+                src[i] = NextFloat(rand, EXP_RANGE);
+                dst[i] = NextFloat(rand, EXP_RANGE);
+                original[i] = dst[i]; // snapshot of the initial dst, used by GlobalCleanup
+                result[i] = dst[i]; // result starts out equal to dst
+                src1[i] = NextFloat(rand, EXP_RANGE);
+                src2[i] = NextFloat(rand, EXP_RANGE);
+            }
+
+            for (int i = 0; i < IDXLEN; i++)
+            {
+                idx[i] = rand.Next(0, LEN); // always a valid index into the LEN-sized buffers
+            }
+        }
+
+        [GlobalCleanup]
+        public void GlobalCleanup() // Restores dst and result to the values recorded in original.
+        {
+            original.CopyTo(dst, 0);
+            original.CopyTo(result, 0);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index c3869c63d9..3188c64db9 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -9,92 +9,8 @@
 
 namespace Microsoft.ML.CpuMath.PerformanceTests
 {
-    public class SsePerformanceTests
+    public class SsePerformanceTests : PerformanceTests
     {
-        private const int EXP_MAX = 127;
-        private const int EXP_MIN = 0;
-
-        private const int IDXLEN = 1000003;
-        private const int LEN = 1000003;
-        private const int EXP_RANGE = EXP_MAX / 8;
-        private const int DEFAULT_SEED = 253421;
-        private const float DEFAULT_SCALE = 1.11f;
-        private const int DEFAULT_CROW = 500;
-        private const int DEFAULT_CCOL = 2000;
-        private const bool ADD = true;
-
-        private float[] src, dst, original, src1, src2, result;
-        private int[] idx;
-        private int seed = DEFAULT_SEED;
-
-        private static float NextFloat(Random rand, int expRange)
-        {
-            double mantissa = (rand.NextDouble() * 2.0) - 1.0;
-            double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1));
-            return (float)(mantissa * exponent);
-        }
-
-        private static int GetSeed()
-        {
-            int seed = DEFAULT_SEED;
-            string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
-
-            if (CPUMATH_SEED != null)
-            {
-                if (!int.TryParse(CPUMATH_SEED, out seed))
-                {
-                    if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
-                    {
-                        seed = new Random().Next();
-                    }
-                    else
-                    {
-                        seed = DEFAULT_SEED;
-                    }
-                }
-            }
-
-            Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
-            return seed;
-        }
-
-        [GlobalSetup]
-        public void Setup()
-        {
-            src = new float[LEN];
-            dst = new float[LEN];
-            src1 = new float[LEN];
-            src2 = new float[LEN];
-            original = new float[LEN];
-            result = new float[LEN];
-            idx = new int[IDXLEN];
-
-            seed = GetSeed();
-            Random rand = new Random(seed);
-
-            for (int i = 0; i < LEN; i++)
-            {
-                src[i] = NextFloat(rand, EXP_RANGE);
-                dst[i] = NextFloat(rand, EXP_RANGE);
-                original[i] = dst[i];
-                result[i] = dst[i];
-                src1[i] = NextFloat(rand, EXP_RANGE);
-                src2[i] = NextFloat(rand, EXP_RANGE);
-            }
-
-            for (int i = 0; i < IDXLEN; i++)
-            {
-                idx[i] = rand.Next(0, LEN);
-            }
-        }
-
-        [GlobalCleanup]
-        public void GlobalCleanup()
-        {
-            original.CopyTo(dst, 0);
-            original.CopyTo(result, 0);
-        }
-
         [Benchmark]
         public unsafe void NativeAddScalarUPerf()
         {

From 31de89587928ffeb4cbe0023c50854e9f9ab115d Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 17:21:57 -0700
Subject: [PATCH 20/29] Respond to PR feedback: Changed Sse/AvxIntrinsics from
 public to internal, adding InternalsVisibleTo attributes

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 7 ++++++-
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 1bcc8e651e..2419e319cb 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -14,9 +14,14 @@
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
+[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" +
+    "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" +
+    "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" +
+    "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")]
+
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
-    public static class AvxIntrinsics
+    internal static class AvxIntrinsics
     {
         private const int Vector256Alignment = 32;
 
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index e6bc9d6dd4..2ef3de95a0 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -18,9 +18,14 @@
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
+[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" +
+    "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" +
+    "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" +
+    "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")]
+
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
-    public static class SseIntrinsics
+    internal static class SseIntrinsics
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;

From f07afb2d9688956abd39a478402060a05224fa95 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 17:37:23 -0700
Subject: [PATCH 21/29] Respond to PR feedback: Used env vars to determine
 whether to use AVX/SSE

---
 .../CpuMathUtils.netcoreapp.cs                | 103 +++++++++---------
 1 file changed, 54 insertions(+), 49 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 6843cd4757..9eb094d4aa 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,6 +9,11 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        private static string _enableAvx = Environment.GetEnvironmentVariable("COMPlus_EnableAVX"); // raw value read once at type initialization; null when unset
+        private static string _featureSimd = Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD"); // raw value read once at type initialization; null when unset
+        private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase); // depends on _enableAvx being initialized first (textual order). NOTE(review): CLR config knobs are normally set to 0/1, so "false" may never match - confirm
+        private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase); // depends on _featureSimd being initialized first (textual order). NOTE(review): same 0/1 vs "false" concern - confirm
+
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 
@@ -18,7 +23,7 @@ public static partial class CpuMathUtils
         public static int GetVectorAlignment()
         {
             // Assumes SSE support on machines that run ML.NET.
-            return Avx.IsSupported ? Vector256Alignment : Vector128Alignment;
+            return _useAvx ? Vector256Alignment : Vector128Alignment;
         }
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
@@ -26,7 +31,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             Contracts.Assert(mat.Size == dst.Size * src.Size);
             Contracts.Assert(crun >= 0);
 
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 if (!tran)
                 {
@@ -39,7 +44,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
                     AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
                 }
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 if (!tran)
                 {
@@ -118,7 +123,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             Contracts.AssertNonEmpty(rgposSrc);
             Contracts.Assert(crun >= 0);
 
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 if (!tran)
                 {
@@ -131,7 +136,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
                     AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
                 }
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 if (!tran)
                 {
@@ -205,11 +210,11 @@ public static void Add(float a, float[] dst, int count)
 
         private static void Add(float a, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddScalarU(a, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddScalarU(a, dst);
             }
@@ -243,11 +248,11 @@ public static void Scale(float a, float[] dst, int offset, int count)
 
         private static void Scale(float a, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.ScaleU(a, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.ScaleU(a, dst);
             }
@@ -274,11 +279,11 @@ public static void Scale(float a, float[] src, float[] dst, int count)
 
         private static void Scale(float a, Span<float> src, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.ScaleSrcU(a, src, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.ScaleSrcU(a, src, dst);
             }
@@ -303,11 +308,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count)
 
         private static void ScaleAdd(float a, float b, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.ScaleAddU(a, b, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.ScaleAddU(a, b, dst);
             }
@@ -346,11 +351,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in
 
         private static void AddScale(float a, Span<float> src, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddScaleU(a, src, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddScaleU(a, src, dst);
             }
@@ -394,11 +399,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
 
         private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddScaleSU(a, src, indices, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddScaleSU(a, src, indices, dst);
             }
@@ -427,11 +432,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res,
 
         private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
@@ -457,11 +462,11 @@ public static void Add(float[] src, float[] dst, int count)
 
         private static void Add(Span<float> src, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddU(src, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddU(src, dst);
             }
@@ -505,11 +510,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i
 
         private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.AddSU(src, indices, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.AddSU(src, indices, dst);
             }
@@ -538,11 +543,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c
 
         private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.MulElementWiseU(src1, src2, dst);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.MulElementWiseU(src1, src2, dst);
             }
@@ -576,11 +581,11 @@ public static float Sum(float[] src, int offset, int count)
 
         private static float Sum(Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.SumU(src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.SumU(src);
             }
@@ -616,11 +621,11 @@ public static float SumSq(float[] src, int offset, int count)
 
         private static float SumSq(Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.SumSqU(src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.SumSqU(src);
             }
@@ -647,11 +652,11 @@ public static float SumSq(float mean, float[] src, int offset, int count)
 
         private static float SumSq(float mean, Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
             }
@@ -687,11 +692,11 @@ public static float SumAbs(float[] src, int offset, int count)
 
         private static float SumAbs(Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.SumAbsU(src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.SumAbsU(src);
             }
@@ -718,11 +723,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count)
 
         private static float SumAbs(float mean, Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
             }
@@ -758,11 +763,11 @@ public static float MaxAbs(float[] src, int offset, int count)
 
         private static float MaxAbs(Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.MaxAbsU(src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.MaxAbsU(src);
             }
@@ -792,11 +797,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count)
 
         private static float MaxAbsDiff(float mean, Span<float> src)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.MaxAbsDiffU(mean, src);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.MaxAbsDiffU(mean, src);
             }
@@ -840,11 +845,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count)
 
         private static float DotProductDense(Span<float> a, Span<float> b)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.DotU(a, b);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.DotU(a, b);
             }
@@ -891,11 +896,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind
 
         private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.DotSU(a, b, indices);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.DotSU(a, b, indices);
             }
@@ -924,11 +929,11 @@ public static float L2DistSquared(float[] a, float[] b, int count)
 
         private static float L2DistSquared(Span<float> a, Span<float> b)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 return AvxIntrinsics.Dist2(a, b);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 return SseIntrinsics.Dist2(a, b);
             }
@@ -1024,11 +1029,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src
 
         private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
@@ -1062,11 +1067,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr
 
         private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
-            if (Avx.IsSupported)
+            if (_useAvx)
             {
                 AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }
-            else if (Sse.IsSupported)
+            else if (_useSse)
             {
                 SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }

From 9a9d27216a5b913d8ad6bbf2184c957fca6f4841 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 20 Aug 2018 18:09:48 -0700
Subject: [PATCH 22/29] Respond to PR feedback: Included 0 into consideration
 for parsing env vars

---
 src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 9eb094d4aa..3ca679a74e 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -11,8 +11,8 @@ public static partial class CpuMathUtils
     {
         private static string _enableAvx = Environment.GetEnvironmentVariable("COMPlus_EnableAVX");
         private static string _featureSimd = Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD");
-        private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase);
-        private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase);
+        private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase);
+        private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase);
 
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;

From c249d8846d2870b0f43a99c3e3a7df8cda63b0c0 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Tue, 21 Aug 2018 10:47:29 -0700
Subject: [PATCH 23/29] Respond to PR feedback: env vars, InternalsVisibleTo,
 and abstract

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     |   5 -
 .../CpuMathUtils.netcoreapp.cs                | 103 +++++++++---------
 .../Microsoft.ML.CpuMath.csproj               |   7 +-
 .../Properties/AssemblyInfo.cs                |   7 ++
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |   5 -
 .../PerformanceTests.cs                       |   2 +-
 6 files changed, 60 insertions(+), 69 deletions(-)
 create mode 100644 src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 2419e319cb..5f44625bbe 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -14,11 +14,6 @@
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
-[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" +
-    "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" +
-    "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" +
-    "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")]
-
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class AvxIntrinsics
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 3ca679a74e..6843cd4757 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,11 +9,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
-        private static string _enableAvx = Environment.GetEnvironmentVariable("COMPlus_EnableAVX");
-        private static string _featureSimd = Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD");
-        private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase);
-        private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase);
-
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 
@@ -23,7 +18,7 @@ public static partial class CpuMathUtils
         public static int GetVectorAlignment()
         {
             // Assumes SSE support on machines that run ML.NET.
-            return _useAvx ? Vector256Alignment : Vector128Alignment;
+            return Avx.IsSupported ? Vector256Alignment : Vector128Alignment;
         }
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
@@ -31,7 +26,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             Contracts.Assert(mat.Size == dst.Size * src.Size);
             Contracts.Assert(crun >= 0);
 
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 if (!tran)
                 {
@@ -44,7 +39,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
                     AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
                 }
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
@@ -123,7 +118,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             Contracts.AssertNonEmpty(rgposSrc);
             Contracts.Assert(crun >= 0);
 
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 if (!tran)
                 {
@@ -136,7 +131,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
                     AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
                 }
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
@@ -210,11 +205,11 @@ public static void Add(float a, float[] dst, int count)
 
         private static void Add(float a, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddScalarU(a, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScalarU(a, dst);
             }
@@ -248,11 +243,11 @@ public static void Scale(float a, float[] dst, int offset, int count)
 
         private static void Scale(float a, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.ScaleU(a, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleU(a, dst);
             }
@@ -279,11 +274,11 @@ public static void Scale(float a, float[] src, float[] dst, int count)
 
         private static void Scale(float a, Span<float> src, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.ScaleSrcU(a, src, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleSrcU(a, src, dst);
             }
@@ -308,11 +303,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count)
 
         private static void ScaleAdd(float a, float b, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.ScaleAddU(a, b, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleAddU(a, b, dst);
             }
@@ -351,11 +346,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in
 
         private static void AddScale(float a, Span<float> src, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddScaleU(a, src, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleU(a, src, dst);
             }
@@ -399,11 +394,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
 
         private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddScaleSU(a, src, indices, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleSU(a, src, indices, dst);
             }
@@ -432,11 +427,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res,
 
         private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
@@ -462,11 +457,11 @@ public static void Add(float[] src, float[] dst, int count)
 
         private static void Add(Span<float> src, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddU(src, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddU(src, dst);
             }
@@ -510,11 +505,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i
 
         private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.AddSU(src, indices, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddSU(src, indices, dst);
             }
@@ -543,11 +538,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c
 
         private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.MulElementWiseU(src1, src2, dst);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.MulElementWiseU(src1, src2, dst);
             }
@@ -581,11 +576,11 @@ public static float Sum(float[] src, int offset, int count)
 
         private static float Sum(Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.SumU(src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumU(src);
             }
@@ -621,11 +616,11 @@ public static float SumSq(float[] src, int offset, int count)
 
         private static float SumSq(Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.SumSqU(src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumSqU(src);
             }
@@ -652,11 +647,11 @@ public static float SumSq(float mean, float[] src, int offset, int count)
 
         private static float SumSq(float mean, Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
             }
@@ -692,11 +687,11 @@ public static float SumAbs(float[] src, int offset, int count)
 
         private static float SumAbs(Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.SumAbsU(src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumAbsU(src);
             }
@@ -723,11 +718,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count)
 
         private static float SumAbs(float mean, Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
             }
@@ -763,11 +758,11 @@ public static float MaxAbs(float[] src, int offset, int count)
 
         private static float MaxAbs(Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.MaxAbsU(src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsU(src);
             }
@@ -797,11 +792,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count)
 
         private static float MaxAbsDiff(float mean, Span<float> src)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.MaxAbsDiffU(mean, src);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsDiffU(mean, src);
             }
@@ -845,11 +840,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count)
 
         private static float DotProductDense(Span<float> a, Span<float> b)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.DotU(a, b);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotU(a, b);
             }
@@ -896,11 +891,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind
 
         private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.DotSU(a, b, indices);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotSU(a, b, indices);
             }
@@ -929,11 +924,11 @@ public static float L2DistSquared(float[] a, float[] b, int count)
 
         private static float L2DistSquared(Span<float> a, Span<float> b)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 return AvxIntrinsics.Dist2(a, b);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.Dist2(a, b);
             }
@@ -1029,11 +1024,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src
 
         private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
@@ -1067,11 +1062,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr
 
         private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
-            if (_useAvx)
+            if (Avx.IsSupported)
             {
                 AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }
-            else if (_useSse)
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }
diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
index ef24bf2762..05f97d3040 100644
--- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
+++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
@@ -10,10 +10,9 @@
     <LangVersion>7.3</LangVersion>
   </PropertyGroup>
 
-  <PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release-Intrinsics|netstandard2.0|AnyCPU'">
-    <GenerateSerializationAssemblies>Auto</GenerateSerializationAssemblies>
-    <Optimize>true</Optimize>
-  </PropertyGroup>
+  <ItemGroup>
+    <Folder Include="Properties\" />
+  </ItemGroup>
 
   <ItemGroup>
     <Compile Include="..\Microsoft.ML.Core\Utilities\Contracts.cs" />
diff --git a/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000000..ab9968b399
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs
@@ -0,0 +1,7 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.CpuMath.PerformanceTests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")]
\ No newline at end of file
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 2ef3de95a0..faaa2a44de 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -18,11 +18,6 @@
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
-[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" +
-    "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" +
-    "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" +
-    "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")]
-
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class SseIntrinsics
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs
index 64278eaf21..1eb9157a2f 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs
@@ -9,7 +9,7 @@
 
 namespace Microsoft.ML.CpuMath.PerformanceTests
 {
-    public class PerformanceTests
+    public abstract class PerformanceTests
     {
         private const int EXP_MAX = 127;
         private const int EXP_MIN = 0;

From f606432b8b2f452d0e48ad1ec9de01a7f4c06af0 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Tue, 21 Aug 2018 14:06:29 -0700
Subject: [PATCH 24/29] Respond to PR feedback: Added new comparer class
 specifically for MatMul

---
 .../UnitTests.cs                              | 44 +++++++++++++------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
index 2d59a2acf1..1877ebe6b0 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
@@ -5,6 +5,7 @@
 using System;
 using System.Collections.Generic;
 using Xunit;
+using Xunit.Abstractions;
 using Microsoft.ML.Runtime.Internal.CpuMath;
 
 namespace Microsoft.ML.CpuMath.UnitTests
@@ -18,6 +19,7 @@ public class CpuMathUtilsUnitTests
         private readonly AlignedArray[] _testDstVectors;
         private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment();
         private readonly FloatEqualityComparer _comparer;
+        private readonly FloatEqualityComparerForMatMul _matMulComparer;
 
         private const float DEFAULT_SCALE = 1.7f;
 
@@ -30,6 +32,7 @@ public CpuMathUtilsUnitTests()
             _testArrays = new float[][] { testArray1, testArray2 };
             _testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 };
             _comparer = new FloatEqualityComparer();
+            _matMulComparer = new FloatEqualityComparerForMatMul();
 
             // Padded matrices whose dimensions are multiples of 8
             float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
@@ -78,7 +81,7 @@ public CpuMathUtilsUnitTests()
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f })]
+        [InlineData(0, 0, 0, new float[] { -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f })]
         [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })]
         [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })]
         public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -90,11 +93,11 @@ public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
             CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { -416.68f, -415.68f, -414.68f, -413.68f, -412.68f, -411.68f, -410.68f, -409.68f })]
+        [InlineData(0, 0, 0, new float[] { -416.6801f, -415.6801f, -414.6801f, -413.6801f, -412.6801f, -411.6801f, -410.6801f, -409.6801f })]
         [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })]
         [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })]
         public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -106,11 +109,11 @@ public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expect
             CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { 70.56f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })]
+        [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })]
         [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })]
         [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })]
         public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -122,11 +125,11 @@ public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expec
             CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { 70.56f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })]
+        [InlineData(0, 0, 0, new float[] { 70.56001f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })]
         [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })]
         [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })]
         public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -138,11 +141,11 @@ public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] ex
             CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f })]
+        [InlineData(0, 0, 0, new float[] { 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f })]
         [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })]
         [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })]
         public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -155,11 +158,11 @@ public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected
             CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
-        [InlineData(0, 0, 0, new float[] { 38.25f, 39.25f, 40.25f, 41.25f, 42.25f, 43.25f, 44.25f, 45.25f })]
+        [InlineData(0, 0, 0, new float[] { 38.25002f, 39.25002f, 40.25002f, 41.25002f, 42.25002f, 43.25002f, 44.25002f, 45.25002f })]
         [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })]
         [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })]
         public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
@@ -172,7 +175,7 @@ public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expec
             CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
@@ -189,7 +192,7 @@ public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expe
             CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
@@ -206,7 +209,7 @@ public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] e
             CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
-            Assert.Equal(expected, actual, _comparer);
+            Assert.Equal(expected, actual, _matMulComparer);
         }
 
         [Theory]
@@ -621,6 +624,19 @@ public void SdcaL1UpdateSUTest(int test)
     }
 
     internal class FloatEqualityComparer : IEqualityComparer<float>
+    {
+        public bool Equals(float a, float b)
+        {
+            return Math.Abs(a - b) < 1e-5f;
+        }
+
+        public int GetHashCode(float a)
+        {
+            throw new NotImplementedException();
+        }
+    }
+
+    internal class FloatEqualityComparerForMatMul : IEqualityComparer<float>
     {
         public bool Equals(float a, float b)
         {

From 27ad82930c1488dc3fe9bdd57697771df54e12d4 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 22 Aug 2018 17:35:54 -0700
Subject: [PATCH 25/29] Respond to PR feedback: Changes to intrinsics

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs     | 216 ++++++++----------
 .../CpuMathUtils.netcoreapp.cs                |  11 +-
 .../CpuMathUtils.netstandard.cs               |   8 +-
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |   2 +-
 4 files changed, 108 insertions(+), 129 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 5f44625bbe..09804f88ed 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -18,69 +18,57 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class AvxIntrinsics
     {
+        private static readonly Vector128<float> _absMask128 = Sse2.IsSupported ?
+            Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
+            Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+
+        private static readonly Vector256<float> _absMask256 = Avx.StaticCast<int, float>(Avx.SetAllVector256(0x7FFFFFFF));
+
         private const int Vector256Alignment = 32;
 
-        private static bool Compat(AlignedArray a)
+        private static bool HasCompatibleAlignment(AlignedArray alignedArray)
         {
-            Contracts.AssertValue(a);
-            Contracts.Assert(a.Size > 0);
-            return a.CbAlign == Vector256Alignment;
+            Contracts.AssertValue(alignedArray);
+            Contracts.Assert(alignedArray.Size > 0);
+            return (alignedArray.CbAlign % Vector256Alignment) == 0;
         }
 
-        private static unsafe float* Ptr(AlignedArray a, float* p)
+        private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase)
         {
-            Contracts.AssertValue(a);
-            float* q = p + a.GetBase((long)p);
-            Contracts.Assert(((long)q & (Vector256Alignment - 1)) == 0);
-            return q;
+            Contracts.AssertValue(alignedArray);
+            float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase);
+            Contracts.Assert(((long)alignedBase & (Vector256Alignment - 1)) == 0);
+            return alignedBase;
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> ToVector256(in Vector128<float> a, in Vector128<float> b)
-        {
-            // REVIEW NEEDED: Is it the correct port of the following code?
-            // #ifndef _WIN32
-            // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
-            // #endif
-            return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1);
-        }
+        private static Vector256<float> SetHighLow(in Vector128<float> a, in Vector128<float> b)
+            => Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetLow(in Vector256<float> x)
-        {
-            return Avx.GetLowerHalf(x);
-        }
+            => Avx.GetLowerHalf(x);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetHigh(in Vector256<float> x)
-        {
-            return Avx.ExtractVector128(x, 1);
-        }
+            => Avx.ExtractVector128(x, 1);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector128<float> Load1(float* src, int* idx)
-        {
-            return Sse.SetScalarVector128(src[idx[0]]);
-        }
+            => Sse.SetScalarVector128(src[idx[0]]);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector128<float> Load4(float* src, int* idx)
-        {
-            return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
-        }
+            => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector256<float> Load8(float* src, int* idx)
-        {
-            return Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
-        }
+            => Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
 
+        // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> Rotate(in Vector128<float> x)
-        {
-            // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
-            return Sse.Shuffle(x, x, 0x39);
-        }
+            => Sse.Shuffle(x, x, 0x39);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
@@ -141,38 +129,44 @@ private static Vector256<float> VectorSum256(in Vector256<float> vector)
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> VectorMax128(in Vector128<float> vector)
         {
+            // The control byte shuffles the four 32-bit floats of vector: ABCD -> BADC.
             Vector128<float> x1 = Sse.Shuffle(vector, vector, 0xB1);
+
+            // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become
+            // max(A, B) and max(C, D).
             Vector128<float> partialMax = Sse.Max(vector, x1);
+
+            // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA.
             x1 = Sse.Shuffle(partialMax, partialMax, 0x02);
+
+            // Performs a scalar maximum on the lowest 32-bit slot only: it becomes
+            // max(A, B, C, D), while the upper three slots are passed through from partialMax.
             return Sse.MaxScalar(partialMax, x1);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> VectorMax256(in Vector256<float> vector)
         {
+            // The control byte shuffles the eight 32-bit floats of vector: ABCD|EFGH -> BADC|FEHG.
             Vector256<float> x1 = Avx.Shuffle(vector, vector, 0xB1);
+
+            // Performs element-wise maximum operation: The 1st, 3rd, 5th, and 7th 32-bit slots become
+            // max(A, B), max(C, D), max(E, F), and max(G, H).
             Vector256<float> partialMax = Avx.Max(vector, x1);
-            x1 = Avx.Shuffle(partialMax, partialMax, 0x02);
-            return Avx.Max(partialMax, x1);
-        }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetAbsMask128()
-        {
-            return Sse2.IsSupported ?
-                Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
-                Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-        }
+            // The control byte shuffles the eight 32-bit floats of partialMax: ABCD|EFGH -> CAAA|GEEE.
+            x1 = Avx.Shuffle(partialMax, partialMax, 0x02);
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> GetAbsMask256()
-        {
-            return Avx.StaticCast<int, float>(Avx.SetAllVector256(0x7FFFFFFF));
+            // Performs element-wise maximum operation: The 1st and 5th 32-bit slots become
+            // max(max(A, B), max(C, D)) = max(A, B, C, D) and
+            // max(max(E, F), max(G, H)) = max(E, F, G, H).
+            return Avx.Max(partialMax, x1);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vector128<float> signMask, in Vector128<float> xThreshold)
+        private static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vector128<float> xThreshold)
         {
+            Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
             Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
             Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
             Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true
@@ -181,14 +175,12 @@ private static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vecto
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vector256<float> signMask, in Vector256<float> xThreshold)
+        private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vector256<float> xThreshold)
         {
+            Vector256<float> signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000
             Vector256<float> xSign = Avx.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
             Vector256<float> xDst1Abs = Avx.Xor(xDst1, xSign);
-
-            // REVIEW NEEDED: Do we want Signaling or NonSignaling?  The original functionality is NonSignaling, which does not throw an exception even when there is an NaN.
-            // Signaling means that if an operand contains an NaN, an exception is raised (ref: https://stackoverflow.com/questions/16988199/how-to-choose-avx-compare-predicate-variants)
-            Vector256<float> xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedSignaling); // result = 0xFFFF FFFF if true
+            Vector256<float> xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedNonSignaling); // result = 0xFFFF FFFF if true
             Vector256<float> x2 = Avx.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise
             return Avx.And(Avx.Subtract(xDst1, x2), xCond);
         }
@@ -196,17 +188,17 @@ private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vecto
         // Multiply matrix times vector into vector.
         public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -263,9 +255,9 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
         public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             // REVIEW: For extremely sparse inputs, interchanging the loops would
             // likely be more efficient.
@@ -274,9 +266,9 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 int* pposMin = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -321,17 +313,17 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A
 
         public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -348,10 +340,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                     Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
                     h01 = Sse.Shuffle(h01, h01, 0x00); // A
 
-                    Vector256<float> x01 = ToVector256(h01, h01);
-                    Vector256<float> x11 = ToVector256(h11, h11);
-                    Vector256<float> x21 = ToVector256(h21, h21);
-                    Vector256<float> x31 = ToVector256(h31, h31);
+                    Vector256<float> x01 = SetHighLow(h01, h01);
+                    Vector256<float> x11 = SetHighLow(h11, h11);
+                    Vector256<float> x21 = SetHighLow(h21, h21);
+                    Vector256<float> x31 = SetHighLow(h31, h31);
 
                     pSrcCurrent += 4;
 
@@ -392,10 +384,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                     Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
                     h01 = Sse.Shuffle(h01, h01, 0x00); // A
 
-                    Vector256<float> x01 = ToVector256(h01, h01);
-                    Vector256<float> x11 = ToVector256(h11, h11);
-                    Vector256<float> x21 = ToVector256(h21, h21);
-                    Vector256<float> x31 = ToVector256(h31, h31);
+                    Vector256<float> x01 = SetHighLow(h01, h01);
+                    Vector256<float> x11 = SetHighLow(h11, h11);
+                    Vector256<float> x21 = SetHighLow(h21, h21);
+                    Vector256<float> x31 = SetHighLow(h31, h31);
 
                     float* pDstCurrent = pdst;
 
@@ -435,18 +427,18 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
         public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 int* ppos = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -1114,12 +1106,11 @@ public static unsafe float SumAbsU(Span<float> src)
                 float* pSrcCurrent = psrc;
 
                 Vector256<float> result256 = Avx.SetZeroVector256<float>();
-                Vector256<float> mask256 = GetAbsMask256();
 
                 while (pSrcCurrent + 8 <= pSrcEnd)
                 {
                     Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
-                    result256 = Avx.Add(result256, Avx.And(srcVector, mask256));
+                    result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256));
 
                     pSrcCurrent += 8;
                 }
@@ -1128,12 +1119,11 @@ public static unsafe float SumAbsU(Span<float> src)
                 Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
-                Vector128<float> mask128 = GetAbsMask128();
 
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result128 = Sse.Add(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent += 4;
                 }
@@ -1143,7 +1133,7 @@ public static unsafe float SumAbsU(Span<float> src)
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent++;
                 }
@@ -1161,13 +1151,12 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
 
                 Vector256<float> result256 = Avx.SetZeroVector256<float>();
                 Vector256<float> meanVector256 = Avx.SetAllVector256(mean);
-                Vector256<float> mask256 = GetAbsMask256();
 
                 while (pSrcCurrent + 8 <= pSrcEnd)
                 {
                     Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
                     srcVector = Avx.Subtract(srcVector, meanVector256);
-                    result256 = Avx.Add(result256, Avx.And(srcVector, mask256));
+                    result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256));
 
                     pSrcCurrent += 8;
                 }
@@ -1177,13 +1166,12 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
-                Vector128<float> mask128 = GetAbsMask128();
 
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
-                    result128 = Sse.Add(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent += 4;
                 }
@@ -1194,7 +1182,7 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector128);
-                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent++;
                 }
@@ -1211,12 +1199,11 @@ public static unsafe float MaxAbsU(Span<float> src)
                 float* pSrcCurrent = psrc;
 
                 Vector256<float> result256 = Avx.SetZeroVector256<float>();
-                Vector256<float> mask256 = GetAbsMask256();
 
                 while (pSrcCurrent + 8 <= pSrcEnd)
                 {
                     Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
-                    result256 = Avx.Max(result256, Avx.And(srcVector, mask256));
+                    result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256));
 
                     pSrcCurrent += 8;
                 }
@@ -1225,12 +1212,11 @@ public static unsafe float MaxAbsU(Span<float> src)
                 Vector128<float> resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
-                Vector128<float> mask128 = GetAbsMask128();
 
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result128 = Sse.Max(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent += 4;
                 }
@@ -1240,7 +1226,7 @@ public static unsafe float MaxAbsU(Span<float> src)
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent++;
                 }
@@ -1258,13 +1244,12 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
 
                 Vector256<float> result256 = Avx.SetZeroVector256<float>();
                 Vector256<float> meanVector256 = Avx.SetAllVector256(mean);
-                Vector256<float> mask256 = GetAbsMask256();
 
                 while (pSrcCurrent + 8 <= pSrcEnd)
                 {
                     Vector256<float> srcVector = Avx.LoadVector256(pSrcCurrent);
                     srcVector = Avx.Subtract(srcVector, meanVector256);
-                    result256 = Avx.Max(result256, Avx.And(srcVector, mask256));
+                    result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256));
 
                     pSrcCurrent += 8;
                 }
@@ -1274,13 +1259,12 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
-                Vector128<float> mask128 = GetAbsMask128();
 
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
-                    result128 = Sse.Max(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent += 4;
                 }
@@ -1291,7 +1275,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector128);
-                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128));
+                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128));
 
                     pSrcCurrent++;
                 }
@@ -1478,8 +1462,6 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
                 float* pDst2Current = pdst2;
 
                 Vector256<float> xPrimal256 = Avx.SetAllVector256(primalUpdate);
-
-                Vector256<float> signMask256 = Avx.SetAllVector256(-0.0f); // 0x8000 0000
                 Vector256<float> xThreshold256 = Avx.SetAllVector256(threshold);
 
                 while (pSrcCurrent + 8 <= pSrcEnd)
@@ -1488,7 +1470,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
 
                     Vector256<float> xDst1 = Avx.LoadVector256(pDst1Current);
                     xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256));
-                    Vector256<float> xDst2 = GetNewDst256(xDst1, signMask256, xThreshold256);
+                    Vector256<float> xDst2 = GetNewDst256(xDst1, xThreshold256);
 
                     Avx.Store(pDst1Current, xDst1);
                     Avx.Store(pDst2Current, xDst2);
@@ -1499,8 +1481,6 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
                 }
 
                 Vector128<float> xPrimal128 = Sse.SetAllVector128(primalUpdate);
-
-                Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
 
                 if (pSrcCurrent + 4 <= pSrcEnd)
@@ -1509,7 +1489,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
 
                     Vector128<float> xDst1 = Sse.LoadVector128(pDst1Current);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
-                    Vector128<float> xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128);
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold128);
 
                     Sse.Store(pDst1Current, xDst1);
                     Sse.Store(pDst2Current, xDst2);
@@ -1544,8 +1524,6 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
                 int* pIdxCurrent = pidx;
 
                 Vector256<float> xPrimal256 = Avx.SetAllVector256(primalUpdate);
-
-                Vector256<float> signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000
                 Vector256<float> xThreshold = Avx.SetAllVector256(threshold);
 
                 while (pIdxCurrent + 8 <= pIdxEnd)
@@ -1554,7 +1532,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
 
                     Vector256<float> xDst1 = Load8(pdst1, pIdxCurrent);
                     xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256));
-                    Vector256<float> xDst2 = GetNewDst256(xDst1, signMask, xThreshold);
+                    Vector256<float> xDst2 = GetNewDst256(xDst1, xThreshold);
 
                     Store8(in xDst1, pdst1, pIdxCurrent);
                     Store8(in xDst2, pdst2, pIdxCurrent);
@@ -1564,8 +1542,6 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
                 }
 
                 Vector128<float> xPrimal128 = Sse.SetAllVector128(primalUpdate);
-
-                Vector128<float> signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold128 = Sse.SetAllVector128(threshold);
 
                 if (pIdxCurrent + 4 <= pIdxEnd)
@@ -1574,7 +1550,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
 
                     Vector128<float> xDst1 = Load4(pdst1, pIdxCurrent);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
-                    Vector128<float> xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128);
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold128);
 
                     Store4(in xDst1, pdst1, pIdxCurrent);
                     Store4(in xDst2, pdst2, pIdxCurrent);
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 6843cd4757..f15f5c3938 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics.X86;
 using System;
 
@@ -15,11 +16,13 @@ public static partial class CpuMathUtils
         // The count of bytes in Vector256<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector256Alignment = 32;
 
+        // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray
+        private const int FloatAlignment = 4;
+
+        // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float.
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         public static int GetVectorAlignment()
-        {
-            // Assumes SSE support on machines that run ML.NET.
-            return Avx.IsSupported ? Vector256Alignment : Vector128Alignment;
-        }
+            => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment);
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
         {
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index 497dd59003..b35f171388 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -2,6 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Runtime.CompilerServices;
+
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
@@ -9,11 +11,9 @@ public static partial class CpuMathUtils
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         public static int GetVectorAlignment()
-        {
-            // Assumes SSE support on machines that run ML.NET.
-            return Vector128Alignment;
-        }
+            => Vector128Alignment;
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun);
 
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index faaa2a44de..323300888c 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -29,7 +29,7 @@ private static bool Compat(AlignedArray a)
         {
             Contracts.AssertValue(a);
             Contracts.Assert(a.Size > 0);
-            return a.CbAlign == Vector128Alignment;
+            return (a.CbAlign % Vector128Alignment) == 0;
         }
 
         private static unsafe float* Ptr(AlignedArray a, float* p)

From 0fd78a63021fd850e6b9bef7e2042454dbf826aa Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 23 Aug 2018 12:11:15 -0700
Subject: [PATCH 26/29] Respond to PR comment: Makes alignment checking
 consistent in external and internal calls

---
 src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
index d217ccf6f9..30308f219d 100644
--- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
+++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
@@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values)
 #if DEBUG
             var mat = values as TMatrix;
             Contracts.AssertValue(mat);
-            Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.GetVectorAlignment());
+            Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
 #endif
         }
 
@@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values)
 #if DEBUG
             CpuAlignedVector vec = values as CpuAlignedVector;
             Contracts.AssertValue(vec);
-            Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.GetVectorAlignment());
+            Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
 #endif
         }
 

From 3380ded08b654fb4b228d25f05a5005a90a0b60e Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 23 Aug 2018 12:42:07 -0700
Subject: [PATCH 27/29] Respond to PR feedback: Refactored Sse/AvxIntrinsics
 helper functions

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 164 ++++++----------------
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 160 ++++++++++-----------
 2 files changed, 121 insertions(+), 203 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index 09804f88ed..eb4ac3b817 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -18,10 +18,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class AvxIntrinsics
     {
-        private static readonly Vector128<float> _absMask128 = Sse2.IsSupported ?
-            Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
-            Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-
         private static readonly Vector256<float> _absMask256 = Avx.StaticCast<int, float>(Avx.SetAllVector256(0x7FFFFFFF));
 
         private const int Vector256Alignment = 32;
@@ -41,10 +37,6 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray)
             return alignedBase;
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<float> SetHighLow(in Vector128<float> a, in Vector128<float> b)
-            => Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1);
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetLow(in Vector256<float> x)
             => Avx.GetLowerHalf(x);
@@ -53,72 +45,31 @@ private static Vector128<float> GetLow(in Vector256<float> x)
         private static Vector128<float> GetHigh(in Vector256<float> x)
             => Avx.ExtractVector128(x, 1);
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> Load1(float* src, int* idx)
-            => Sse.SetScalarVector128(src[idx[0]]);
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> Load4(float* src, int* idx)
-            => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector256<float> Load8(float* src, int* idx)
             => Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
 
-        // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> Rotate(in Vector128<float> x)
-            => Sse.Shuffle(x, x, 0x39);
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
-        {
-            Sse.StoreScalar(dst + idx[0], x);
-            Vector128<float> rotated = Rotate(in x);
-            Sse.StoreScalar(dst + idx[1], rotated);
-            rotated = Rotate(in rotated);
-            Sse.StoreScalar(dst + idx[2], rotated);
-            rotated = Rotate(in rotated);
-            Sse.StoreScalar(dst + idx[3], rotated);
-        }
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe void Store8(in Vector256<float> x, float* dst, int* idx)
         {
             Vector128<float> tmp = GetLow(in x);
             Sse.StoreScalar(dst + idx[0], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[1], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[2], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[3], tmp);
             tmp = GetHigh(in x);
             Sse.StoreScalar(dst + idx[4], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[5], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[6], tmp);
-            tmp = Rotate(in tmp);
+            tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[7], tmp);
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorSum128(in Vector128<float> vector)
-        {
-            if (Sse3.IsSupported)
-            {
-                Vector128<float> partialSum = Sse3.HorizontalAdd(vector, vector);
-                return Sse3.HorizontalAdd(partialSum, partialSum);
-            }
-            else
-            {
-                Vector128<float> partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
-                // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC.
-                return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1));
-            }
-        }
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> VectorSum256(in Vector256<float> vector)
         {
@@ -126,24 +77,6 @@ private static Vector256<float> VectorSum256(in Vector256<float> vector)
             return Avx.HorizontalAdd(partialSum, partialSum);
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorMax128(in Vector128<float> vector)
-        {
-            // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> BADC.
-            Vector128<float> x1 = Sse.Shuffle(vector, vector, 0xB1);
-
-            // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become
-            // max(A, B) and max(C, D).
-            Vector128<float> partialMax = Sse.Max(vector, x1);
-
-            // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA.
-            x1 = Sse.Shuffle(partialMax, partialMax, 0x02);
-
-            // Performs element-wise maximum operation: The 1st 32-bit slot becomes
-            // max(A, B, C, D).
-            return Sse.MaxScalar(partialMax, x1);
-        }
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> VectorMax256(in Vector256<float> vector)
         {
@@ -163,17 +96,6 @@ private static Vector256<float> VectorMax256(in Vector256<float> vector)
             return Avx.Max(partialMax, x1);
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vector128<float> xThreshold)
-        {
-            Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
-            Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
-            Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
-            Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true
-            Vector128<float> x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise
-            return Sse.And(Sse.Subtract(xDst1, x2), xCond);
-        }
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vector256<float> xThreshold)
         {
@@ -340,10 +262,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                     Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
                     h01 = Sse.Shuffle(h01, h01, 0x00); // A
 
-                    Vector256<float> x01 = SetHighLow(h01, h01);
-                    Vector256<float> x11 = SetHighLow(h11, h11);
-                    Vector256<float> x21 = SetHighLow(h21, h21);
-                    Vector256<float> x31 = SetHighLow(h31, h31);
+                    Vector256<float> x01 = Avx.SetHighLow(h01, h01);
+                    Vector256<float> x11 = Avx.SetHighLow(h11, h11);
+                    Vector256<float> x21 = Avx.SetHighLow(h21, h21);
+                    Vector256<float> x31 = Avx.SetHighLow(h31, h31);
 
                     pSrcCurrent += 4;
 
@@ -384,10 +306,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                     Vector128<float> h31 = Sse.Shuffle(h01, h01, 0xFF); // D
                     h01 = Sse.Shuffle(h01, h01, 0x00); // A
 
-                    Vector256<float> x01 = SetHighLow(h01, h01);
-                    Vector256<float> x11 = SetHighLow(h11, h11);
-                    Vector256<float> x21 = SetHighLow(h21, h21);
-                    Vector256<float> x31 = SetHighLow(h31, h31);
+                    Vector256<float> x01 = Avx.SetHighLow(h01, h01);
+                    Vector256<float> x11 = Avx.SetHighLow(h11, h11);
+                    Vector256<float> x21 = Avx.SetHighLow(h21, h21);
+                    Vector256<float> x31 = Avx.SetHighLow(h31, h31);
 
                     float* pDstCurrent = pdst;
 
@@ -806,11 +728,11 @@ public static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx
                 if (pIdxCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent);
 
                     srcVector = Sse.Multiply(srcVector, scaleVector128);
                     dstVector = Sse.Add(dstVector, srcVector);
-                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
+                    SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -898,11 +820,11 @@ public static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
 
                 if (pIdxCurrent + 4 <= pEnd)
                 {
-                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent);
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
 
                     dstVector = Sse.Add(dstVector, srcVector);
-                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
+                    SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -993,7 +915,7 @@ public static unsafe float SumU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -1035,7 +957,7 @@ public static unsafe float SumSqU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -1083,7 +1005,7 @@ public static unsafe float SumSqDiffU(float mean, Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -1123,17 +1045,17 @@ public static unsafe float SumAbsU(Span<float> src)
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -1171,18 +1093,18 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
-                    result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector128);
-                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -1216,17 +1138,17 @@ public static unsafe float MaxAbsU(Span<float> src)
                 if (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorMax128(in result128);
+                result128 = SseIntrinsics.VectorMax128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -1264,18 +1186,18 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector128);
-                    result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result128 = VectorMax128(in result128);
+                result128 = SseIntrinsics.VectorMax128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector128);
-                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128));
+                    result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -1322,7 +1244,7 @@ public static unsafe float DotU(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -1370,7 +1292,7 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
 
                 if (pIdxCurrent + 4 <= pIdxEnd)
                 {
-                    Vector128<float> srcVector = Load4(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> srcVector = SseIntrinsics.Load4(pSrcCurrent, pIdxCurrent);
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
 
                     result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector));
@@ -1379,11 +1301,11 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
                     pDstCurrent += 4;
                 }
 
-                result128 = VectorSum128(in result128);
+                result128 = SseIntrinsics.VectorSum128(in result128);
 
                 while (pIdxCurrent < pIdxEnd)
                 {
-                    Vector128<float> srcVector = Load1(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> srcVector = SseIntrinsics.Load1(pSrcCurrent, pIdxCurrent);
                     Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
 
                     result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector));
@@ -1434,7 +1356,7 @@ public static unsafe float Dist2(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                sqDistanceVector128 = VectorSum128(in sqDistanceVector128);
+                sqDistanceVector128 = SseIntrinsics.VectorSum128(in sqDistanceVector128);
 
                 float norm = Sse.ConvertToSingle(Sse.AddScalar(sqDistanceVector128, sqDistanceVectorPadded));
                 while (pSrcCurrent < pSrcEnd)
@@ -1489,7 +1411,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
 
                     Vector128<float> xDst1 = Sse.LoadVector128(pDst1Current);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
-                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold128);
+                    Vector128<float> xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128);
 
                     Sse.Store(pDst1Current, xDst1);
                     Sse.Store(pDst2Current, xDst2);
@@ -1548,12 +1470,12 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
                 {
                     Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
 
-                    Vector128<float> xDst1 = Load4(pdst1, pIdxCurrent);
+                    Vector128<float> xDst1 = SseIntrinsics.Load4(pdst1, pIdxCurrent);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128));
-                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold128);
+                    Vector128<float> xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128);
 
-                    Store4(in xDst1, pdst1, pIdxCurrent);
-                    Store4(in xDst2, pdst2, pIdxCurrent);
+                    SseIntrinsics.Store4(in xDst1, pdst1, pIdxCurrent);
+                    SseIntrinsics.Store4(in xDst2, pdst2, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 323300888c..ee326dc5ba 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -22,45 +22,43 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class SseIntrinsics
     {
+        internal static readonly Vector128<float> AbsMask128 = Sse2.IsSupported ?
+            Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
+            Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 
-        private static bool Compat(AlignedArray a)
+        private static bool HasCompatibleAlignment(AlignedArray alignedArray)
         {
-            Contracts.AssertValue(a);
-            Contracts.Assert(a.Size > 0);
-            return (a.CbAlign % Vector128Alignment) == 0;
+            Contracts.AssertValue(alignedArray);
+            Contracts.Assert(alignedArray.Size > 0);
+            return (alignedArray.CbAlign % Vector128Alignment) == 0;
         }
 
-        private static unsafe float* Ptr(AlignedArray a, float* p)
+        private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase)
         {
-            Contracts.AssertValue(a);
-            float* q = p + a.GetBase((long)p);
-            Contracts.Assert(((long)q & (Vector128Alignment - 1)) == 0);
-            return q;
+            Contracts.AssertValue(alignedArray);
+            float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase);
+            Contracts.Assert(((long)alignedBase & (Vector128Alignment - 1)) == 0);
+            return alignedBase;
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> Load1(float* src, int* idx)
-        {
-            return Sse.SetScalarVector128(src[idx[0]]);
-        }
+        internal static unsafe Vector128<float> Load1(float* src, int* idx)
+             => Sse.SetScalarVector128(src[idx[0]]);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> Load4(float* src, int* idx)
-        {
-            return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
-        }
+        internal static unsafe Vector128<float> Load4(float* src, int* idx)
+            => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
 
+        // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> Rotate(in Vector128<float> x)
-        {
-            // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
-            return Sse.Shuffle(x, x, 0x39);
-        }
+        internal static Vector128<float> Rotate(in Vector128<float> x)
+            => Sse.Shuffle(x, x, 0x39);
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
+        internal static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         {
             Sse.StoreScalar(dst + idx[0], x);
             Vector128<float> rotated = Rotate(in x);
@@ -72,7 +70,7 @@ private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorSum(in Vector128<float> vector)
+       internal static Vector128<float> VectorSum128(in Vector128<float> vector)
         {
             if (Sse3.IsSupported)
             {
@@ -88,25 +86,27 @@ private static Vector128<float> VectorSum(in Vector128<float> vector)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorMax(in Vector128<float> vector)
+        internal static Vector128<float> VectorMax128(in Vector128<float> vector)
         {
+            // The control byte shuffles the four 32-bit floats of vector: ABCD -> BADC.
             Vector128<float> x1 = Sse.Shuffle(vector, vector, 0xB1);
+
+            // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become
+            // max(A, B) and max(C, D).
             Vector128<float> partialMax = Sse.Max(vector, x1);
+
+            // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA.
             x1 = Sse.Shuffle(partialMax, partialMax, 0x02);
-            return Sse.MaxScalar(partialMax, x1);
-        }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetAbsMask()
-        {
-            return Sse2.IsSupported ?
-                Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
-                Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+            // Performs a scalar maximum operation on the lowest elements only: The 1st 32-bit slot
+            // becomes max(A, B, C, D).
+            return Sse.MaxScalar(partialMax, x1);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetNewDst(in Vector128<float> xDst1, in Vector128<float> signMask, in Vector128<float> xThreshold)
+        internal static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vector128<float> xThreshold)
         {
+            Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
             Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
             Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
             Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true
@@ -117,17 +117,17 @@ private static Vector128<float> GetNewDst(in Vector128<float> xDst1, in Vector12
         // Multiply matrix times vector into vector.
         public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -183,9 +183,9 @@ public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src,
         public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             // REVIEW: For extremely sparse inputs, interchanging the loops would
             // likely be more efficient.
@@ -194,9 +194,9 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 int* pposMin = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -239,17 +239,17 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A
 
         public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -342,18 +342,18 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s
         public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Contracts.Assert(HasCompatibleAlignment(mat));
+            Contracts.Assert(HasCompatibleAlignment(src));
+            Contracts.Assert(HasCompatibleAlignment(dst));
 
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-                float* pmat = Ptr(mat, pMatStart);
+                float* psrc = GetAlignedBase(src, pSrcStart);
+                float* pdst = GetAlignedBase(dst, pDstStart);
+                float* pmat = GetAlignedBase(mat, pMatStart);
 
                 int* ppos = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -768,7 +768,7 @@ public static unsafe float SumU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -797,7 +797,7 @@ public static unsafe float SumSqU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -830,7 +830,7 @@ public static unsafe float SumSqDiffU(float mean, Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -853,22 +853,21 @@ public static unsafe float SumAbsU(Span<float> src)
                 float* pSrcCurrent = psrc;
 
                 Vector128<float> result = Sse.SetZeroVector128();
-                Vector128<float> mask = GetAbsMask();
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result = Sse.Add(result, Sse.And(srcVector, mask));
+                    result = Sse.Add(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+                    result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -886,24 +885,23 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
 
                 Vector128<float> result = Sse.SetZeroVector128();
                 Vector128<float> meanVector = Sse.SetAllVector128(mean);
-                Vector128<float> mask = GetAbsMask();
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector);
-                    result = Sse.Add(result, Sse.And(srcVector, mask));
+                    result = Sse.Add(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector);
-                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+                    result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -920,22 +918,21 @@ public static unsafe float MaxAbsU(Span<float> src)
                 float* pSrcCurrent = psrc;
 
                 Vector128<float> result = Sse.SetZeroVector128();
-                Vector128<float> mask = GetAbsMask();
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result = Sse.Max(result, Sse.And(srcVector, mask));
+                    result = Sse.Max(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result = VectorMax(in result);
+                result = VectorMax128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
-                    result = Sse.MaxScalar(result, Sse.And(srcVector, mask));
+                    result = Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -953,24 +950,23 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
 
                 Vector128<float> result = Sse.SetZeroVector128();
                 Vector128<float> meanVector = Sse.SetAllVector128(mean);
-                Vector128<float> mask = GetAbsMask();
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Subtract(srcVector, meanVector);
-                    result = Sse.Max(result, Sse.And(srcVector, mask));
+                    result = Sse.Max(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent += 4;
                 }
 
-                result = VectorMax(in result);
+                result = VectorMax128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
                     srcVector = Sse.SubtractScalar(srcVector, meanVector);
-                    result = Sse.MaxScalar(result, Sse.And(srcVector, mask));
+                    result = Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128));
 
                     pSrcCurrent++;
                 }
@@ -1001,7 +997,7 @@ public static unsafe float DotU(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -1042,7 +1038,7 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum128(in result);
 
                 while (pIdxCurrent < pIdxEnd)
                 {
@@ -1081,7 +1077,7 @@ public static unsafe float Dist2(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                sqDistanceVector = VectorSum(in sqDistanceVector);
+                sqDistanceVector = VectorSum128(in sqDistanceVector);
 
                 float norm = Sse.ConvertToSingle(sqDistanceVector);
                 while (pSrcCurrent < pSrcEnd)
@@ -1119,7 +1115,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, flo
 
                     Vector128<float> xDst1 = Sse.LoadVector128(pDst1Current);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal));
-                    Vector128<float> xDst2 = GetNewDst(xDst1, signMask, xThreshold);
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold);
 
                     Sse.Store(pDst1Current, xDst1);
                     Sse.Store(pDst2Current, xDst2);
@@ -1164,7 +1160,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Sp
 
                     Vector128<float> xDst1 = Load4(pdst1, pIdxCurrent);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal));
-                    Vector128<float> xDst2 = GetNewDst(xDst1, signMask, xThreshold);
+                    Vector128<float> xDst2 = GetNewDst128(xDst1, xThreshold);
 
                     Store4(in xDst1, pdst1, pIdxCurrent);
                     Store4(in xDst2, pdst2, pIdxCurrent);

From b8d63cccbdca22dd1b24b5b524b4d25e6e65eb67 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 23 Aug 2018 12:44:53 -0700
Subject: [PATCH 28/29] Made two Sse/AvxIntrinsics helper functions about
 AlignedArray inline in hopes of improving perf

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 2 ++
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index eb4ac3b817..d5e2fc0e38 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -22,6 +22,7 @@ internal static class AvxIntrinsics
 
         private const int Vector256Alignment = 32;
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static bool HasCompatibleAlignment(AlignedArray alignedArray)
         {
             Contracts.AssertValue(alignedArray);
@@ -29,6 +30,7 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray)
             return (alignedArray.CbAlign % Vector256Alignment) == 0;
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase)
         {
             Contracts.AssertValue(alignedArray);
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index ee326dc5ba..0f4fb54d18 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -29,6 +29,7 @@ internal static class SseIntrinsics
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
         private const int Vector128Alignment = 16;
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static bool HasCompatibleAlignment(AlignedArray alignedArray)
         {
             Contracts.AssertValue(alignedArray);
@@ -36,6 +37,7 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray)
             return (alignedArray.CbAlign % Vector128Alignment) == 0;
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase)
         {
             Contracts.AssertValue(alignedArray);

From 32a3704748ad7bcb654c967cfcccf25990ac0207 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 29 Aug 2018 14:41:37 -0700
Subject: [PATCH 29/29] Respond to PR feedback: styles for Vector256Alignment
 and Avx.GetLowerHalf

---
 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 30 ++++++++++-------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index d5e2fc0e38..06f89d097e 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -35,14 +35,10 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray)
         {
             Contracts.AssertValue(alignedArray);
             float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase);
-            Contracts.Assert(((long)alignedBase & (Vector256Alignment - 1)) == 0);
+            Contracts.Assert(((long)alignedBase % Vector256Alignment) == 0);
             return alignedBase;
         }
 
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> GetLow(in Vector256<float> x)
-            => Avx.GetLowerHalf(x);
-
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetHigh(in Vector256<float> x)
             => Avx.ExtractVector128(x, 1);
@@ -54,7 +50,7 @@ private static unsafe Vector256<float> Load8(float* src, int* idx)
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe void Store8(in Vector256<float> x, float* dst, int* idx)
         {
-            Vector128<float> tmp = GetLow(in x);
+            Vector128<float> tmp = Avx.GetLowerHalf(in x);
             Sse.StoreScalar(dst + idx[0], tmp);
             tmp = SseIntrinsics.Rotate(in tmp);
             Sse.StoreScalar(dst + idx[1], tmp);
@@ -162,7 +158,7 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
                     res2 = Avx.HorizontalAdd(res2, res3);
                     res0 = Avx.HorizontalAdd(res0, res2);
 
-                    Vector128<float> sum = Sse.Add(GetLow(in res0), GetHigh(in res0));
+                    Vector128<float> sum = Sse.Add(Avx.GetLowerHalf(in res0), GetHigh(in res0));
                     if (add)
                     {
                         sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent));
@@ -907,7 +903,7 @@ public static unsafe float SumU(Span<float> src)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -947,7 +943,7 @@ public static unsafe float SumSqU(Span<float> src)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -993,7 +989,7 @@ public static unsafe float SumSqDiffU(float mean, Span<float> src)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
@@ -1040,7 +1036,7 @@ public static unsafe float SumAbsU(Span<float> src)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -1086,7 +1082,7 @@ public static unsafe float SumAbsDiffU(float mean, Span<float> src)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
@@ -1133,7 +1129,7 @@ public static unsafe float MaxAbsU(Span<float> src)
                 }
 
                 result256 = VectorMax256(in result256);
-                Vector128<float> resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -1179,7 +1175,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                 }
 
                 result256 = VectorMax256(in result256);
-                Vector128<float> resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
                 Vector128<float> meanVector128 = Sse.SetAllVector128(mean);
@@ -1231,7 +1227,7 @@ public static unsafe float DotU(Span<float> src, Span<float> dst)
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -1288,7 +1284,7 @@ public static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx
                 }
 
                 result256 = VectorSum256(in result256);
-                Vector128<float> resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256));
+                Vector128<float> resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256));
 
                 Vector128<float> result128 = Sse.SetZeroVector128();
 
@@ -1343,7 +1339,7 @@ public static unsafe float Dist2(Span<float> src, Span<float> dst)
                 }
 
                 sqDistanceVector256 = VectorSum256(in sqDistanceVector256);
-                Vector128<float> sqDistanceVectorPadded = Sse.AddScalar(GetLow(sqDistanceVector256), GetHigh(sqDistanceVector256));
+                Vector128<float> sqDistanceVectorPadded = Sse.AddScalar(Avx.GetLowerHalf(sqDistanceVector256), GetHigh(sqDistanceVector256));
 
                 Vector128<float> sqDistanceVector128 = Sse.SetZeroVector128();