diff --git a/Directory.Build.props b/Directory.Build.props
index 73144201c7..bdca231554 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -114,4 +114,11 @@
     <PublicSign Condition="'$(OS)' != 'Windows_NT'">true</PublicSign>
   </PropertyGroup>
 
+  <PropertyGroup> <!-- Unless UseIntrinsics is already set, derive it from whether the configuration name ends with '-Intrinsics'. -->
+    <UseIntrinsics Condition="'$(UseIntrinsics)' == ''">$(Configuration.EndsWith('-Intrinsics'))</UseIntrinsics>
+  </PropertyGroup>
+  <!-- Point the CustomAfterMicrosoftCommonTargets hook at build\AfterCommonTargets.targets so it is pulled into every project. -->
+  <PropertyGroup>
+    <CustomAfterMicrosoftCommonTargets>$(RepoRoot)build\AfterCommonTargets.targets</CustomAfterMicrosoftCommonTargets>
+  </PropertyGroup>
 </Project>
diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
index 140c93753c..18d9d3867e 100644
--- a/Microsoft.ML.sln
+++ b/Microsoft.ML.sln
@@ -97,6 +97,13 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeAnalyzer",
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeAnalyzer.Tests", "test\Microsoft.ML.CodeAnalyzer.Tests\Microsoft.ML.CodeAnalyzer.Tests.csproj", "{3E4ABF07-7970-4BE6-B45B-A13D3C397545}"
 EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.PerformanceTests", "test\Microsoft.ML.CpuMath.PerformanceTests\Microsoft.ML.CpuMath.PerformanceTests.csproj", "{7333EDEF-4144-405C-A5EC-6F42201857D8}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.UnitTests.netstandard", "test\Microsoft.ML.CpuMath.UnitTests.netstandard\Microsoft.ML.CpuMath.UnitTests.netstandard.csproj", "{A0E562A9-0E6D-470D-B180-6EB44BA84D60}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.UnitTests.netcoreapp", "test\Microsoft.ML.CpuMath.UnitTests.netcoreapp\Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj", "{5F81A2A4-73AD-494C-B387-07D605EC8826}"
+EndProject
+
 Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Microsoft.ML.FSharp.Tests", "test\Microsoft.ML.FSharp.Tests\Microsoft.ML.FSharp.Tests.fsproj", "{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}"
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.ImageAnalytics", "src\Microsoft.ML.ImageAnalytics\Microsoft.ML.ImageAnalytics.csproj", "{00E38F77-1E61-4CDF-8F97-1417D4E85053}"
@@ -335,6 +342,30 @@ Global
 		{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release|Any CPU.Build.0 = Release|Any CPU
 		{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
 		{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Release|Any CPU.Build.0 = Release|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+		{7333EDEF-4144-405C-A5EC-6F42201857D8}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release|Any CPU.Build.0 = Release|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Release|Any CPU.Build.0 = Release|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+		{5F81A2A4-73AD-494C-B387-07D605EC8826}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
 		{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
@@ -395,6 +426,9 @@ Global
 		{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{BF66A305-DF10-47E4-8D81-42049B149D2B} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
+		{7333EDEF-4144-405C-A5EC-6F42201857D8} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+		{A0E562A9-0E6D-470D-B180-6EB44BA84D60} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+		{5F81A2A4-73AD-494C-B387-07D605EC8826} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 		{B4E55B2D-2A92-46E7-B72F-E76D6FD83440} = {7F13E156-3EBA-4021-84A5-CD56BA72F99E}
 		{3E4ABF07-7970-4BE6-B45B-A13D3C397545} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 		{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
diff --git a/build.proj b/build.proj
index f8e4adaf5f..cb5c557cbe 100644
--- a/build.proj
+++ b/build.proj
@@ -41,7 +41,8 @@
   <Target Name="RestoreProjects" Condition="'$(RestoreDuringBuild)'=='true'">
     <Message Importance="High" Text="Restoring all projects..." />
     <MSBuild Projects="@(Project)"
-             Targets="Restore" />
+             Targets="Restore" 
+             Properties="MSBuildWarningsAsMessages=NU1503" /> <!-- Demotes NuGet warning NU1503 to a message during restore; presumably raised for projects skipped via build\Empty.targets (confirm). -->
   </Target>
 
   <Target Name="BuildNative"
diff --git a/build/AfterCommonTargets.targets b/build/AfterCommonTargets.targets
new file mode 100644
index 0000000000..cba4c80b5c
--- /dev/null
+++ b/build/AfterCommonTargets.targets
@@ -0,0 +1,13 @@
+<Project>
+  <PropertyGroup> 
+    <MSBuildAllProjects>$(MSBuildAllProjects);$(MSBuildThisFileFullPath)</MSBuildAllProjects>
+  </PropertyGroup> 
+  <!-- The property group above appends this file's own full path to MSBuildAllProjects. -->
+  <!-- 
+  We use netcoreapp3.0 for C# intrinsics, but 3.0 isn't supported in CI or in normal development 
+  environments yet. So when a project targets netcoreapp3.0 but UseIntrinsics is not 'true', we 
+  skip that project by importing Empty.targets instead of building it. 
+  --> 
+  <Import Condition="'$(UseIntrinsics)' != 'true' and '$(TargetFramework)' == 'netcoreapp3.0'" 
+          Project="$(RepoRoot)build\Empty.targets" />
+</Project>
\ No newline at end of file
diff --git a/build/Empty.targets b/build/Empty.targets
new file mode 100644
index 0000000000..72abf9cd60
--- /dev/null
+++ b/build/Empty.targets
@@ -0,0 +1,29 @@
+<Project> 
+  <PropertyGroup> 
+    <MSBuildAllProjects>$(MSBuildAllProjects);$(MSBuildThisFileFullPath)</MSBuildAllProjects>
+    <!--
+      The SDK's ImportAfter folder declares this property so that it points to Microsoft.TestPlatform.targets, the file containing the original VSTest target.
+      Because the files in the ImportAfter folder are imported after this file, that import would override the empty VSTest target declared below.
+      To keep our override in effect, we set the VSTestTargets property to a nonexistent file path, so nothing is imported and the empty VSTest
+      target from this file remains the one that is used.
+    -->
+    <VSTestTargets>ignore.targets</VSTestTargets>
+  </PropertyGroup>
+
+  <!-- 
+    Copied from https://github.com/dotnet/arcade/blob/master/src/Microsoft.DotNet.Arcade.Sdk/tools/Empty.targets 
+
+    Import this file to suppress all targets while allowing the project to participate in the build. 
+    Workaround for https://github.com/dotnet/sdk/issues/2071. 
+
+    The targets defined here are not sufficient for the project to be opened in Visual Studio without issues, though.
+  --> 
+
+  <Target Name="_IsProjectRestoreSupported"/>
+  <Target Name="_CheckForInvalidConfigurationAndPlatform"/>
+  <Target Name="Restore"/>
+  <Target Name="Build"/>
+  <Target Name="Test"/>
+  <Target Name="VSTest"/>
+  <Target Name="Pack"/>
+</Project>
\ No newline at end of file
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
new file mode 100644
index 0000000000..6c6c1fe6ad
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -0,0 +1,396 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.Intrinsics.X86;
+using System;
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+    public static partial class CpuMathUtils
+    {
+        /// <summary>Multiplies the first <paramref name="count"/> elements of <paramref name="dst"/> by <paramref name="a"/>, in place.</summary>
+        public static void Scale(float a, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0 && dst.Length >= count);
+            Scale(a, new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>Multiplies the <paramref name="count"/> elements of <paramref name="dst"/> starting at <paramref name="offset"/> by <paramref name="a"/>, in place.</summary>
+        public static void Scale(float a, float[] dst, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= dst.Length - count); // "<=": the range may end exactly at dst.Length
+            Scale(a, new Span<float>(dst, offset, count));
+        }
+
+        /// <summary>Multiplies every element of <paramref name="dst"/> by <paramref name="a"/>, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void Scale(float a, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleU(a, dst);
+                return;
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int pos = 0; pos < dst.Length; pos++)
+            {
+                dst[pos] *= a;
+            }
+        }
+
+        /// <summary>Adds a * src[i] to dst[i] for the first <paramref name="count"/> elements.</summary>
+        public static void AddScale(float a, float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dst.Length >= count);
+            AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>Adds a * src[i] to dst[dstOffset + i] for i in [0, count).</summary>
+        public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(src.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dstOffset >= 0 && dst.Length > dstOffset);
+            Contracts.Assert(count > 0 && dst.Length - dstOffset >= count);
+            AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, dstOffset, count));
+        }
+
+        /// <summary>Computes dst[i] += a * src[i] for every element of dst, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void AddScale(float a, Span<float> src, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleU(a, src, dst);
+                return;
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int pos = 0; pos < dst.Length; pos++)
+            {
+                dst[pos] += a * src[pos];
+            }
+        }
+
+        /// <summary>Sparse update: adds a * src[i] to dst[indices[i]] for i in [0, count).</summary>
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(indices.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dst.Length > count);
+            AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
+        }
+
+        /// <summary>Sparse update: adds a * src[i] to dst[dstOffset + indices[i]] for i in [0, count).</summary>
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(indices.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dstOffset >= 0 && dst.Length > dstOffset);
+            Contracts.Assert(dst.Length - dstOffset > count);
+            AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count),
+                     new Span<float>(dst, dstOffset, dst.Length - dstOffset));
+        }
+
+        /// <summary>Computes dst[indices[i]] += a * src[i] for every entry of indices, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleSU(a, src, indices, dst);
+                return;
+            }
+
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int k = 0; k < indices.Length; k++)
+            {
+                dst[indices[k]] += a * src[k];
+            }
+        }
+
+        /// <summary>Adds src[i] to dst[i] for the first <paramref name="count"/> elements.</summary>
+        public static void Add(float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dst.Length >= count);
+            Add(new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>Computes dst[i] += src[i] for every element of dst, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void Add(Span<float> src, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddU(src, dst);
+                return;
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int pos = 0; pos < dst.Length; pos++)
+            {
+                dst[pos] += src[pos];
+            }
+        }
+
+        /// <summary>Sparse update: adds src[i] to dst[indices[i]] for i in [0, count).</summary>
+        public static void Add(float[] src, int[] indices, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(indices.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dst.Length > count);
+            Add(new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
+        }
+
+        /// <summary>Sparse update: adds src[i] to dst[dstOffset + indices[i]] for i in [0, count).</summary>
+        public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(indices.Length >= count);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(dstOffset >= 0 && dst.Length > dstOffset);
+            Contracts.Assert(dst.Length - dstOffset >= count);
+            Add(new Span<float>(src), new Span<int>(indices, 0, count),
+                new Span<float>(dst, dstOffset, dst.Length - dstOffset));
+        }
+
+        /// <summary>Computes dst[indices[i]] += src[i] for every entry of indices, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddSU(src, indices, dst);
+                return;
+            }
+
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int k = 0; k < indices.Length; k++)
+            {
+                dst[indices[k]] += src[k];
+            }
+        }
+
+        /// <summary>Writes src1[i] * src2[i] into dst[i] for the first <paramref name="count"/> elements.</summary>
+        public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src1);
+            Contracts.Assert(0 < count && count <= src1.Length);
+            Contracts.AssertNonEmpty(src2);
+            Contracts.Assert(count <= src2.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count <= dst.Length); // dst must hold count results, same bound as src1/src2
+            MulElementWise(new Span<float>(src1, 0, count), new Span<float>(src2, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>Computes dst[i] = src1[i] * src2[i] for every element of dst, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.MulElementWiseU(src1, src2, dst);
+                return;
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            for (int pos = 0; pos < dst.Length; pos++)
+            {
+                dst[pos] = src1[pos] * src2[pos];
+            }
+        }
+
+        /// <summary>Returns the sum of squares of the first <paramref name="count"/> elements of <paramref name="src"/>.</summary>
+        public static float SumSq(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            return SumSq(new Span<float>(src, 0, count));
+        }
+
+        /// <summary>Returns the sum of squares of the <paramref name="count"/> elements of <paramref name="src"/> starting at <paramref name="offset"/>.</summary>
+        public static float SumSq(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0 && src.Length - count >= offset);
+            return SumSq(new Span<float>(src, offset, count));
+        }
+
+        /// <summary>Returns the sum of squares of every element of <paramref name="src"/>, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static float SumSq(Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumSqU(src);
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            float total = 0;
+            foreach (float value in src)
+            {
+                total += value * value;
+            }
+
+            return total;
+        }
+
+        /// <summary>Returns the sum of absolute values of the first <paramref name="count"/> elements of <paramref name="src"/>.</summary>
+        public static float SumAbs(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0 && src.Length >= count);
+            return SumAbs(new Span<float>(src, 0, count));
+        }
+
+        /// <summary>Returns the sum of absolute values of the <paramref name="count"/> elements of <paramref name="src"/> starting at <paramref name="offset"/>.</summary>
+        public static float SumAbs(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0 && src.Length - count >= offset);
+            return SumAbs(new Span<float>(src, offset, count));
+        }
+
+        /// <summary>Returns the sum of absolute values of every element of <paramref name="src"/>, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static float SumAbs(Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumAbsU(src);
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            float total = 0;
+            foreach (float value in src)
+            {
+                total += Math.Abs(value);
+            }
+
+            return total;
+        }
+
+        /// <summary>Returns the dot product of the first <paramref name="count"/> elements of <paramref name="a"/> and <paramref name="b"/>.</summary>
+        public static float DotProductDense(float[] a, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= a.Length);
+            Contracts.Assert(count <= b.Length);
+            return DotProductDense(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>Returns the sum of a[offset + i] * b[i] for i in [0, count).</summary>
+        public static float DotProductDense(float[] a, int offset, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0 && a.Length - count >= offset);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(count <= b.Length);
+            return DotProductDense(new Span<float>(a, offset, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>Returns the sum of a[i] * b[i] over the length of <paramref name="b"/>, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static float DotProductDense(Span<float> a, Span<float> b)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.DotU(a, b);
+            }
+            // Scalar fallback used when Sse.IsSupported is false.
+            float total = 0;
+            for (int pos = 0; pos < b.Length; pos++)
+            {
+                total += a[pos] * b[pos];
+            }
+
+            return total;
+        }
+
+        /// <summary>Returns the sum of a[indices[i]] * b[i] for i in [0, count).</summary>
+        public static float DotProductSparse(float[] a, float[] b, int[] indices, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(a.Length > count);
+            Contracts.Assert(b.Length >= count);
+            Contracts.Assert(indices.Length >= count);
+            return DotProductSparse(new Span<float>(a), new Span<float>(b),
+                                    new Span<int>(indices, 0, count));
+        }
+
+        /// <summary>Returns the sum of a[offset + indices[i]] * b[i] for i in [0, count).</summary>
+        public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0 && a.Length > offset);
+            Contracts.Assert(count < a.Length - offset);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(b.Length >= count);
+            Contracts.Assert(indices.Length >= count);
+            return DotProductSparse(new Span<float>(a, offset, a.Length - offset),
+                                    new Span<float>(b), new Span<int>(indices, 0, count));
+        }
+
+        /// <summary>Returns the sum of a[indices[i]] * b[i] over every entry of indices, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.DotSU(a, b, indices);
+            }
+
+            // Scalar fallback used when Sse.IsSupported is false.
+            float total = 0;
+            for (int k = 0; k < indices.Length; k++)
+            {
+                total += a[indices[k]] * b[k];
+            }
+
+            return total;
+        }
+
+        /// <summary>Returns the sum of (a[i] - b[i]) squared over the first <paramref name="count"/> elements.</summary>
+        public static float L2DistSquared(float[] a, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(count > 0 && a.Length >= count);
+            Contracts.Assert(b.Length >= count);
+            return L2DistSquared(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>Returns the sum of (a[i] - b[i]) squared over the length of <paramref name="b"/>, via SseIntrinsics when Sse.IsSupported.</summary>
+        private static float L2DistSquared(Span<float> a, Span<float> b)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.Dist2(a, b);
+            }
+
+            // Scalar fallback used when Sse.IsSupported is false.
+            float total = 0;
+            for (int pos = 0; pos < b.Length; pos++)
+            {
+                float delta = a[pos] - b[pos];
+                total += delta * delta;
+            }
+            return total;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
new file mode 100644
index 0000000000..501fc9082e
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -0,0 +1,47 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+    public static partial class CpuMathUtils
+    { // Every member below forwards its arguments unchanged to the SseUtils method of the same name.
+        public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count);
+        // Scale overload that also passes an offset.
+        public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count);
+        // AddScale: (a, src, dst, count).
+        public static void AddScale(float a, float[] src, float[] dst, int count) => SseUtils.AddScale(a, src, dst, count);
+        // AddScale overload that also passes dstOffset.
+        public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, dst, dstOffset, count);
+        // AddScale overload that also passes an indices array.
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count) => SseUtils.AddScale(a, src, indices, dst, count);
+        // AddScale overload that passes both an indices array and dstOffset.
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, indices, dst, dstOffset, count);
+        // Add: (src, dst, count).
+        public static void Add(float[] src, float[] dst, int count) => SseUtils.Add(src, dst, count);
+        // Add overload that also passes an indices array.
+        public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count);
+        // Add overload that passes both an indices array and dstOffset.
+        public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.Add(src, indices, dst, dstOffset, count);
+        // MulElementWise: (src1, src2, dst, count).
+        public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count);
+        // SumSq: returns whatever SseUtils.SumSq(src, count) returns.
+        public static float SumSq(float[] src, int count) => SseUtils.SumSq(src, count);
+        // SumSq overload that also passes an offset.
+        public static float SumSq(float[] src, int offset, int count) => SseUtils.SumSq(src, offset, count);
+        // SumAbs: returns whatever SseUtils.SumAbs(src, count) returns.
+        public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count);
+        // SumAbs overload that also passes an offset.
+        public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count);
+        // DotProductDense: (a, b, count).
+        public static float DotProductDense(float[] a, float[] b, int count) => SseUtils.DotProductDense(a, b, count);
+        // DotProductDense overload that also passes an offset (placed between a and b).
+        public static float DotProductDense(float[] a, int offset, float[] b, int count) => SseUtils.DotProductDense(a, offset, b, count);
+        // DotProductSparse: (a, b, indices, count).
+        public static float DotProductSparse(float[] a, float[] b, int[] indices, int count) => SseUtils.DotProductSparse(a, b, indices, count);
+        // DotProductSparse overload that also passes an offset (placed between a and b).
+        public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) => SseUtils.DotProductSparse(a, offset, b, indices, count);
+        // L2DistSquared: (a, b, count).
+        public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count);
+    }
+}
diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
index bde7ae89f5..b6c95b93f4 100644
--- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
+++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
@@ -2,21 +2,29 @@
 
   <PropertyGroup>
     <Configurations>Debug;Release;Debug-Intrinsics;Release-Intrinsics</Configurations>
-    <UseIntrinsics Condition="'$(UseIntrinsics)' == ''">$(Configuration.EndsWith('-Intrinsics'))</UseIntrinsics>
-    
     <TargetFramework Condition="'$(UseIntrinsics)' != 'true'">netstandard2.0</TargetFramework>
     <TargetFrameworks Condition="'$(UseIntrinsics)' == 'true'">netstandard2.0;netcoreapp3.0</TargetFrameworks>
     <IncludeInPackage>Microsoft.ML.CpuMath</IncludeInPackage>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);CORECLR;PRIVATE_CONTRACTS</DefineConstants>
+    <LangVersion>7.3</LangVersion>
   </PropertyGroup>
 
   <ItemGroup>
     <Compile Include="..\Microsoft.ML.Core\Utilities\Contracts.cs" />
+
+    <!-- Workaround https://github.com/dotnet/project-system/issues/935 -->
+    <None Include="**/*.cs" />
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetFramework)' == 'netcoreapp3.0'">
+    <Compile Remove="CpuMathUtils.netstandard.cs" />
     <!-- This is only needed until https://github.com/dotnet/corefx/issues/31064 is addressed. -->
     <PackageReference Include="System.Runtime.Intrinsics.Experimental" Version="4.6.0-preview1-26708-04" />
   </ItemGroup>
-</Project>
+
+  <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'"> <!-- Exclude the two intrinsics-based sources from the netstandard2.0 build. -->
+    <Compile Remove="CpuMathUtils.netcoreapp.cs" />
+    <Compile Remove="SseIntrinsics.cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs
index 68e6ee906b..13de22dd5b 100644
--- a/src/Microsoft.ML.CpuMath/Sse.cs
+++ b/src/Microsoft.ML.CpuMath/Sse.cs
@@ -2,8 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-using System;
-
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     /// <summary>
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
new file mode 100644
index 0000000000..d11676f283
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -0,0 +1,476 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// The exported function names need to be unique (can't be disambiguated based on signature), hence
+// we introduce suffix letters to indicate the general patterns used.
+// * U suffix means unaligned and unpadded.
+// * S suffix means sparse (unaligned) vector.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+    internal static class SseIntrinsics
+    {
+        // Loads the single element src[idx[0]] through Sse.SetScalarVector128.
+        // The callers in this class only consume the lowest lane of the result.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load1(float* src, int* idx)
+            => Sse.SetScalarVector128(src[*idx]);
+
+        // Gathers src[idx[0]] .. src[idx[3]] into one vector, with src[idx[0]] in the lowest lane.
+        // Sse.SetVector128 takes the highest lane first, hence the reversed argument order.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load4(float* src, int* idx)
+            => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
+
+        // Rotates the four 32-bit floats of x by one lane: ABCD -> BCDA (A is the lowest lane).
+        // The control byte 0x39 holds two bits per result lane and selects source lanes
+        // 1, 2, 3, 0 for result lanes 0, 1, 2, 3.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> Rotate(Vector128<float> x)
+            => Sse.Shuffle(x, x, 0x39);
+
+        // Rotates the four 32-bit floats of x by one lane the other way: ABCD -> DABC.
+        // The control byte 0x93 holds two bits per result lane and selects source lanes
+        // 3, 0, 1, 2 for result lanes 0, 1, 2, 3.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> RotateReverse(Vector128<float> x)
+            => Sse.Shuffle(x, x, 0x93);
+
+        // Scatters the four lanes of x to dst[idx[0]], dst[idx[1]], dst[idx[2]], dst[idx[3]].
+        // StoreScalar only writes the lowest lane, so x is rotated by one lane between
+        // stores to bring the next element down to position 0.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
+        {
+            Sse.StoreScalar(&dst[idx[0]], x);
+            Sse.StoreScalar(&dst[idx[1]], x = Rotate(x));
+            Sse.StoreScalar(&dst[idx[2]], x = Rotate(x));
+            Sse.StoreScalar(&dst[idx[3]], x = Rotate(x));
+        }
+
+        // Horizontal sum: the total of the four lanes ends up in lane 0 of the result,
+        // which is the only lane the callers in this class read back.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorSum(in Vector128<float> vector)
+        {
+            if (!Sse3.IsSupported)
+            {
+                // SSE-only fallback: fold the upper pair of lanes onto the lower pair.
+                Vector128<float> folded = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
+                // The control byte shuffles the four 32-bit floats of folded: ABCD -> BADC.
+                return Sse.Add(folded, Sse.Shuffle(folded, folded, 0xb1));
+            }
+
+            Vector128<float> pairSums = Sse3.HorizontalAdd(vector, vector);
+            return Sse3.HorizontalAdd(pairSums, pairSums);
+        }
+
+        internal static unsafe void ScaleU(float scale, Span<float> dst) // in place: dst[i] *= scale
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* pdst = dst)
+            {
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+                // Main loop: scale four floats per iteration while a full group of 4 remains.
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    dstVector = Sse.Multiply(scaleVector, dstVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                }
+                // Tail: the last 0-3 elements go through the scalar (lowest-lane) forms.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    dstVector = Sse.MultiplyScalar(scaleVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst) // dst[i] += scale * src[i]
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+                // Trip count comes from dst.Length; src is read in lock-step (assumes src.Length >= dst.Length - TODO confirm with callers).
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+                // Tail: the last 0-3 elements go through the scalar (lowest-lane) forms.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector);
+                    dstVector = Sse.AddScalar(dstVector, srcVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst) // dst[idx[i]] += scale * src[i]
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                float* pDstCurrent = pdst;
+                int* pEnd = pidx + idx.Length;
+                // One iteration per four indices: contiguous load from src, gather/scatter on dst via Load4/Store4.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    // NOTE(review): all four dst values are gathered before any is written back, so if an index repeats within a group only the last store survives - confirm callers pass distinct indices.
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(dstVector, pDstCurrent, pIdxCurrent);
+
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+                // Tail: the remaining 0-3 indices are handled with plain scalar arithmetic.
+                while (pIdxCurrent < pEnd)
+                {
+                    pDstCurrent[*pIdxCurrent] += scale * (*pSrcCurrent);
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe void AddU(Span<float> src, Span<float> dst) // dst[i] += src[i]
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+                // Trip count comes from src.Length; dst is written in lock-step (assumes dst.Length >= src.Length - TODO confirm with callers).
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.Add(srcVector, dstVector);
+                    Sse.Store(pDstCurrent, result);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+                // Tail: the last 0-3 elements go through the scalar (lowest-lane) forms.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.AddScalar(srcVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst) // dst[idx[i]] += src[i]
+        {
+            fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                float* pDstCurrent = pdst;
+                int* pEnd = pidx + idx.Length;
+                // NOTE: the local names in the loop are swapped relative to their contents - srcVector is gathered from dst and dstVector is loaded from src. The sum is scattered back into dst.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent);
+
+                    srcVector = Sse.Add(srcVector, dstVector);
+                    Store4(srcVector, pDstCurrent, pIdxCurrent);
+
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+                // Tail: the remaining 0-3 indices are handled with plain scalar arithmetic.
+                while (pIdxCurrent < pEnd)
+                {
+                    pDstCurrent[*pIdxCurrent] += *pSrcCurrent;
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst) // dst[i] = src1[i] * src2[i]
+        {
+            fixed (float* psrc1 = src1) // pin the span itself: &src1[0] would throw on an empty span
+            fixed (float* psrc2 = src2)
+            fixed (float* pdst = dst)
+            {
+                float* pSrc1Current = psrc1;
+                float* pSrc2Current = psrc2;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+                // Main loop: multiply four element pairs per iteration; the trip count comes from dst.Length.
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> src1Vector = Sse.LoadVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadVector128(pSrc2Current);
+                    src2Vector = Sse.Multiply(src1Vector, src2Vector);
+                    Sse.Store(pDstCurrent, src2Vector);
+
+                    pSrc1Current += 4;
+                    pSrc2Current += 4;
+                    pDstCurrent += 4;
+                }
+                // Tail: the last 0-3 elements go through the scalar (lowest-lane) forms.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> src1Vector = Sse.LoadScalarVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadScalarVector128(pSrc2Current);
+                    src2Vector = Sse.MultiplyScalar(src1Vector, src2Vector);
+                    Sse.StoreScalar(pDstCurrent, src2Vector);
+
+                    pSrc1Current++;
+                    pSrc2Current++;
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        internal static unsafe float SumSqU(Span<float> src) // returns the sum of src[i] * src[i]
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+                // Main loop: accumulate four squares per iteration, one partial sum per lane.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    result = Sse.Add(result, Sse.Multiply(srcVector, srcVector));
+
+                    pSrcCurrent += 4;
+                }
+                // Collapse the partial sums so the running total sits in lane 0.
+                result = VectorSum(in result);
+                // Tail: add the remaining 0-3 squares using the scalar (lowest-lane) forms.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        internal static unsafe float SumAbsU(Span<float> src) // returns the sum of |src[i]|
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> mask;
+            // The mask 0x7FFFFFFF keeps every bit except the sign bit, so And-ing a float with it yields its absolute value.
+            if (Sse2.IsSupported)
+            {
+                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
+            }
+            else
+            {
+                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+            }
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+                // Main loop: accumulate four absolute values per iteration, one partial sum per lane.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    result = Sse.Add(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent += 4;
+                }
+                // Collapse the partial sums so the running total sits in lane 0.
+                result = VectorSum(in result);
+                // Tail (0-3 elements): scalar load/add only, so nothing past pEnd is ever read.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        internal static unsafe float DotU(Span<float> src, Span<float> dst) // returns the sum of src[i] * dst[i]
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+                // Trip count comes from src.Length; dst is read in lock-step (assumes dst.Length >= src.Length - TODO confirm with callers).
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    result = Sse.Add(result, Sse.Multiply(srcVector, dstVector));
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+                // Collapse the per-lane partial sums so the running total sits in lane 0.
+                result = VectorSum(in result);
+                // Tail: add the remaining 0-3 products using the scalar (lowest-lane) forms.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, dstVector));
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx) // returns the sum of src[idx[i]] * dst[i]
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                int* pIdxCurrent = pidx;
+                int* pEnd = pidx + idx.Length;
+                // src is gathered through idx while dst is read contiguously; the trip count is idx.Length.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Load4(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    result = Sse.Add(result, Sse.Multiply(srcVector, dstVector));
+
+                    pIdxCurrent += 4;
+                    pDstCurrent += 4;
+                }
+                // Collapse the per-lane partial sums so the running total sits in lane 0.
+                result = VectorSum(in result);
+                // Tail: add the remaining 0-3 products using the scalar (lowest-lane) forms.
+                while (pIdxCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Load1(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, dstVector));
+
+                    pIdxCurrent++;
+                    pDstCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        internal static unsafe float Dist2(Span<float> src, Span<float> dst) // returns the sum of (src[i] - dst[i])^2
+        {
+            Vector128<float> sqDistanceVector = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+                // Trip count comes from src.Length; dst is read in lock-step (assumes dst.Length >= src.Length - TODO confirm with callers).
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent),
+                                                                    Sse.LoadVector128(pDstCurrent));
+                    sqDistanceVector = Sse.Add(sqDistanceVector,
+                                                Sse.Multiply(distanceVector, distanceVector));
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+                // Collapse the per-lane partial sums so the running total sits in lane 0.
+                sqDistanceVector = VectorSum(in sqDistanceVector);
+                // Tail: the last 0-3 elements are finished in plain scalar code, starting from the vector total.
+                float norm = Sse.ConvertToSingle(sqDistanceVector);
+                while (pSrcCurrent < pEnd)
+                {
+                    float distance = (*pSrcCurrent) - (*pDstCurrent);
+                    norm += distance * distance;
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+
+                return norm;
+            }
+        }
+
+    }
+}
diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs
index d7082c8313..1053f75b75 100644
--- a/src/Microsoft.ML.CpuMath/Thunk.cs
+++ b/src/Microsoft.ML.CpuMath/Thunk.cs
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-using System;
 using System.Runtime.InteropServices;
 using System.Runtime.CompilerServices;
 using System.Security;
diff --git a/src/Native/build.cmd b/src/Native/build.cmd
index e2bbc3a4dc..3f533dc353 100644
--- a/src/Native/build.cmd
+++ b/src/Native/build.cmd
@@ -17,7 +17,9 @@ set CMAKE_BUILD_TYPE=Debug
 :Arg_Loop
 if [%1] == [] goto :ToolsVersion
 if /i [%1] == [Release]     ( set CMAKE_BUILD_TYPE=Release&&shift&goto Arg_Loop)
+if /i [%1] == [Release-Intrinsics]     ( set CMAKE_BUILD_TYPE=Release-Intrinsics&&shift&goto Arg_Loop)
 if /i [%1] == [Debug]       ( set CMAKE_BUILD_TYPE=Debug&&shift&goto Arg_Loop)
+if /i [%1] == [Debug-Intrinsics]       ( set CMAKE_BUILD_TYPE=Debug-Intrinsics&&shift&goto Arg_Loop)
 
 if /i [%1] == [x86]         ( set __BuildArch=x86&&set __VCBuildArch=x86&&shift&goto Arg_Loop)
 if /i [%1] == [x64]         ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop)
@@ -86,6 +88,10 @@ if %__IntermediatesDir% == "" (
 set "__CMakeBinDir=%__CMakeBinDir:\=/%"
 set "__IntermediatesDir=%__IntermediatesDir:\=/%"
 
+:: Strip the "-Intrinsics" suffix from the build type
+if [%CMAKE_BUILD_TYPE:~-11%] == [-Intrinsics] (
+	set CMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE:~0,-11%
+)
 
 :: Check that the intermediate directory exists so we can place our cmake build tree there
 if not exist "%__IntermediatesDir%" md "%__IntermediatesDir%"
diff --git a/test/Directory.Build.props b/test/Directory.Build.props
index ee5d507566..2e16be2f2b 100644
--- a/test/Directory.Build.props
+++ b/test/Directory.Build.props
@@ -20,7 +20,7 @@
     <AssemblyOriginatorKeyFile>$(ToolsDir)Test.snk</AssemblyOriginatorKeyFile>
   </PropertyGroup>
   
-      <ItemGroup>
+  <ItemGroup>
     <PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.5.0" />
     <PackageReference Include="xunit" Version="2.3.1" />
     <PackageReference Include="xunit.runner.visualstudio" Version="2.3.1" />
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
new file mode 100644
index 0000000000..90f362de3e
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -0,0 +1,45 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.InteropServices;
+using System.Security;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+    internal static class CpuMathNativeUtils // P/Invoke bindings to the CpuMathNative library, used as the native baseline in the benchmarks
+    {
+        [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+        // In every binding, c is the element count; for the sparse (SU) variants the benchmarks pass the index count.
+        [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+        // Benchmarked against CpuMathUtils.SumSq.
+        [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
+        // Benchmarked against the dense CpuMathUtils.Add overload; pd is updated in place.
+        [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+        // Benchmarked against the indexed CpuMathUtils.Add overload; pi supplies the indices.
+        [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+        // Benchmarked against the dense CpuMathUtils.AddScale overload; a is the scale factor.
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+        // Benchmarked against the indexed CpuMathUtils.AddScale overload.
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+        // Benchmarked against CpuMathUtils.Scale; pd is updated in place.
+        [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
+        // Benchmarked against CpuMathUtils.L2DistSquared.
+        [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
+        // Benchmarked against CpuMathUtils.SumAbs.
+        [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c);
+        // Benchmarked against CpuMathUtils.MulElementWise.
+        [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
+    }
+}
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj b/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj
new file mode 100644
index 0000000000..61d22bbfbb
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj
@@ -0,0 +1,27 @@
+﻿<Project Sdk="Microsoft.NET.Sdk" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <LangVersion>7.2</LangVersion>
+    <StartupObject>Microsoft.ML.CpuMath.PerformanceTests.Program</StartupObject>
+    <TargetFramework>netcoreapp3.0</TargetFramework>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  
+  <ItemGroup>
+    <Compile Remove="BenchmarkDotNet.Artifacts\**" />
+    <EmbeddedResource Remove="BenchmarkDotNet.Artifacts\**" />
+    <None Remove="BenchmarkDotNet.Artifacts\**" />
+  </ItemGroup>
+  
+  <ItemGroup>
+    <PackageReference Include="BenchmarkDotNet" Version="0.10.14" />
+  </ItemGroup>
+  
+  <ItemGroup>
+    <ProjectReference Include="..\..\src\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" />
+  </ItemGroup>
+  
+  <ItemGroup>
+    <NativeAssemblyReference Include="CpuMathNative" />
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs
new file mode 100644
index 0000000000..cd731e8cf3
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs
@@ -0,0 +1,29 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Toolchains.InProcess;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+    class Program
+    {
+        // Entry point: forwards the command-line arguments to BenchmarkSwitcher so benchmarks can be selected from the CLI.
+        public static void Main(string[] args)
+        {
+            BenchmarkSwitcher
+                .FromAssembly(typeof(Program).Assembly)
+                .Run(args, CreateClrVsCoreConfig());
+        }
+
+        // Builds the default config plus a ShortRun job that executes through the in-process toolchain.
+        private static IConfig CreateClrVsCoreConfig()
+        {
+            return DefaultConfig.Instance.With(
+                Job.ShortRun.With(InProcessToolchain.Instance));
+        }
+    }
+}
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
new file mode 100644
index 0000000000..92752a0018
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -0,0 +1,238 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+    public class SsePerformanceTests // BenchmarkDotNet suite pairing each CpuMathNative export with its managed CpuMathUtils counterpart
+    {
+        private const int EXP_MAX = 127;
+        private const int EXP_MIN = 0;
+        // 1000003 is not a multiple of 4, so each kernel's 0-3 element tail path also runs (presumably intentional).
+        private const int IDXLEN = 1000003;
+        private const int LEN = 1000003;
+        private const int EXP_RANGE = EXP_MAX / 2;
+        private const int DEFAULT_SEED = 253421;
+        private const float DEFAULT_SCALE = 1.11f;
+
+        private float[] src, dst, original, src1, src2;
+        private int[] idx;
+        private int seed = DEFAULT_SEED;
+
+        private static float NextFloat(Random rand, int expRange) // mantissa in [-1, 1) times 2^e, with e in [-expRange + 1, expRange]
+        {
+            double mantissa = (rand.NextDouble() * 2.0) - 1.0;
+            double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1));
+            return (float)(mantissa * exponent);
+        }
+
+        private static int GetSeed()
+        {
+            int seed = DEFAULT_SEED;
+            // CPUMATH_SEED may hold an integer seed or the word "random"; anything else falls back to DEFAULT_SEED.
+            if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null)
+            {
+                string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED");
+
+                if (!int.TryParse(CPUMATH_SEED, out seed))
+                {
+                    if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase))
+                    {
+                        seed = new Random().Next();
+                    }
+                    else
+                    {
+                        seed = DEFAULT_SEED;
+                    }
+                }
+            }
+
+            Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
+
+            return seed;
+        }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            src = new float[LEN];
+            dst = new float[LEN];
+            src1 = new float[LEN];
+            src2 = new float[LEN];
+            original = new float[LEN];
+            idx = new int[IDXLEN];
+            // A single Random drives all buffers, so the draw order below determines the generated data.
+            seed = GetSeed();
+            Random rand = new Random(seed);
+
+            for (int i = 0; i < LEN; i++)
+            {
+                src[i] = NextFloat(rand, EXP_RANGE);
+                dst[i] = NextFloat(rand, EXP_RANGE);
+                original[i] = dst[i];
+                src1[i] = NextFloat(rand, EXP_RANGE);
+                src2[i] = NextFloat(rand, EXP_RANGE);
+            }
+            // Sparse indices are drawn from [0, LEN), so the same index may appear more than once.
+            for (int i = 0; i < IDXLEN; i++)
+            {
+                idx[i] = rand.Next(0, LEN);
+            }
+        }
+        // NOTE(review): restores dst from the pristine copy, but as a GlobalCleanup this presumably does not run between iterations, so the in-place benchmarks keep mutating dst - confirm this is intended.
+        [GlobalCleanup]
+        public void GlobalCleanup()
+        {
+            original.CopyTo(dst, 0);
+        }
+        // Each NativeXxxPerf pins the arrays and calls the CpuMathNative export; the matching ManagedXxxPerf calls CpuMathUtils.
+        [Benchmark]
+        public unsafe float NativeDotUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+
+        [Benchmark]
+        public unsafe float NativeDotSUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+
+        [Benchmark]
+        public unsafe float NativeSumSqUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumSqU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.AddU(psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddSUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                CpuMathNativeUtils.AddSU(psrc, pidx, pdst, IDXLEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
+
+        [Benchmark]
+        public unsafe void NativeAddScaleUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddScaleSUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe float NativeDist2Perf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                return CpuMathNativeUtils.Dist2(psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumAbsUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+            }
+        }
+        // NOTE(review): "SumAbsq" below looks like a typo for "SumAbs"; the name is left as is because it appears in benchmark reports.
+        [Benchmark]
+        public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN);
+
+        [Benchmark]
+        public unsafe void NativeMulElementWiseUPerf()
+        {
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+    }
+}
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
new file mode 100644
index 0000000000..e611b15032
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
@@ -0,0 +1,16 @@
+﻿<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp3.0</TargetFramework> <!-- NOTE(review): presumably chosen so the CpuMath reference resolves to its netcoreapp3.0 build; confirm. -->
+    <IsPackable>false</IsPackable> <!-- Test project: not meant to be packed. -->
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\src\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" /> <!-- Project whose CpuMathUtils API the linked tests call. -->
+  </ItemGroup>
+
+  <ItemGroup>
+    <Compile Include="..\Microsoft.ML.CpuMath.UnitTests.netstandard\UnitTests.cs" /> <!-- Compiles the test source that lives in the netstandard test project, so both projects build the same tests. -->
+  </ItemGroup>
+  
+</Project>
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
new file mode 100644
index 0000000000..9552f688a8
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
@@ -0,0 +1,16 @@
+﻿<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp2.0</TargetFramework> <!-- NOTE(review): the project is named "netstandard" but targets netcoreapp2.0; presumably so the CpuMath reference resolves to its netstandard build. Confirm. -->
+    <IsPackable>false</IsPackable> <!-- Test project: not meant to be packed. -->
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\src\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" /> <!-- Project whose CpuMathUtils API the tests call. -->
+  </ItemGroup>
+
+  <ItemGroup>
+    <NativeAssemblyReference Include="CpuMathNative" /> <!-- NOTE(review): custom item handled by build targets outside this project; presumably makes the CpuMathNative binary available to the test run. Confirm. -->
+  </ItemGroup>
+ 
+</Project>
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
new file mode 100644
index 0000000000..6fc2596ef7
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -0,0 +1,246 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Xunit;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.UnitTests
+{
+    public class CpuMathUtilsUnitTests
+    {
+        private readonly float[][] testArrays;
+        private readonly int[] testIndexArray;
+        private const float DEFAULT_SCALE = 1.7f;
+        private readonly FloatEqualityComparer comparer;
+
+        /// <summary>Creates the input vectors, the index array and the float comparer that the tests in this class share.</summary>
+        public CpuMathUtilsUnitTests()
+        {
+            // Length 8 is a multiple of 4; the 7-element array holds the same values minus the last one, so its length is not.
+            float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f };
+            testArrays = new float[][] { testArray1, testArray2 };
+            testIndexArray = new int[4] { 0, 2, 5, 6 };
+            comparer = new FloatEqualityComparer();
+        }
+
+        [Theory]
+        [InlineData(0, 13306.0376f)]
+        [InlineData(1, 13291.9235f)]
+        public void DotUTest(int test, float expected)
+        {
+            // Arrange: copy the selected fixture and derive the second operand by
+            // adding one to every element, so the two inputs are different arrays.
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = Array.ConvertAll(left, value => value + 1);
+
+            // Act: DotProductDense over both arrays and their full length.
+            var actual = CpuMathUtils.DotProductDense(left, right, right.Length);
+
+            // Assert: compared with 2 passed as the precision argument.
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 736.7352f)]
+        [InlineData(1, 736.7352f)]
+        public void DotSUTest(int test, float expected)
+        {
+            // Arrange: the second operand is the fixture with one added to every
+            // element, which ensures the two inputs are different arrays. The
+            // shared index array is forwarded together with its length.
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = Array.ConvertAll(left, value => value + 1);
+            int[] indices = testIndexArray;
+
+            // Act: DotProductSparse with the index array and its length.
+            var actual = CpuMathUtils.DotProductSparse(left, right, indices, indices.Length);
+
+            // Assert: compared with 4 passed as the precision argument.
+            Assert.Equal(expected, actual, 4);
+        }
+
+        [Theory]
+        [InlineData(0, 13399.9376f)]
+        [InlineData(1, 13389.1135f)]
+        public void SumSqUTest(int test, float expected)
+        {
+            // SumSq over a private copy of the selected fixture, compared with 2 passed as the precision argument.
+            float[] input = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumSq(input, input.Length), 2);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddUTest(int test)
+        {
+            // Arrange: the destination starts as the fixture with one added to
+            // every element, so source and destination are different arrays.
+            float[] source = (float[])testArrays[test].Clone();
+            float[] destination = Array.ConvertAll(source, value => value + 1);
+
+            // The test expects 2 * source[i] + 1 at every position, i.e. the sum
+            // of the source element and the shifted destination element.
+            float[] expected = new float[source.Length];
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = 2 * source[i] + 1;
+            }
+
+            // Act: the result is read back from the destination array.
+            CpuMathUtils.Add(source, destination, destination.Length);
+
+            // Assert: element-wise comparison through the tolerance-based comparer.
+            Assert.Equal(expected, destination, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddSUTest(int test)
+        {
+            float[] source = (float[])testArrays[test].Clone();
+            float[] destination = (float[])source.Clone();
+
+            // Only the positions listed in testIndexArray (0, 2, 5, 6) get new values. Each hard-coded value is the
+            // original element at that index plus source[i] for the i-th index (e.g. -9.76 + -2.38 = -12.14).
+            float[] expected = (float[])destination.Clone();
+            expected[0] = 3.92f;
+            expected[2] = -12.14f;
+            expected[5] = -36.69f;
+            expected[6] = 46.29f;
+
+            CpuMathUtils.Add(source, testIndexArray, destination, testIndexArray.Length);
+            Assert.Equal(expected, destination, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScaleUTest(int test)
+        {
+            // Arrange: source and destination start with identical contents, so the
+            // test expects every element to end up as value * (1 + DEFAULT_SCALE),
+            // i.e. value + DEFAULT_SCALE * value.
+            float[] source = (float[])testArrays[test].Clone();
+            float[] destination = (float[])source.Clone();
+            float[] expected = Array.ConvertAll(source, value => value * (1 + DEFAULT_SCALE));
+
+            // Act: the result is read back from the destination array.
+            CpuMathUtils.AddScale(DEFAULT_SCALE, source, destination, destination.Length);
+
+            // Assert: element-wise comparison through the tolerance-based comparer.
+            Assert.Equal(expected, destination, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddScaleSUTest(int test)
+        {
+            float[] source = (float[])testArrays[test].Clone();
+            float[] destination = (float[])source.Clone();
+
+            // Only the positions listed in testIndexArray (0, 2, 5, 6) get new values. Each hard-coded value is the original
+            // element at that index plus DEFAULT_SCALE * source[i] for the i-th index (e.g. -9.76 + 1.7 * -2.38 = -13.806).
+            float[] expected = (float[])destination.Clone();
+            expected[0] = 5.292f;
+            expected[2] = -13.806f;
+            expected[5] = -43.522f;
+            expected[6] = 55.978f;
+
+            CpuMathUtils.AddScale(DEFAULT_SCALE, source, testIndexArray, destination, testIndexArray.Length);
+            Assert.Equal(expected, destination, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleUTest(int test)
+        {
+            // Arrange: the expected values are computed up front, before the
+            // array is handed to Scale, by multiplying each element by
+            // DEFAULT_SCALE.
+            float[] values = (float[])testArrays[test].Clone();
+            float[] expected = Array.ConvertAll(values, value => value * DEFAULT_SCALE);
+
+            // Act: the result is read back from the same array.
+            CpuMathUtils.Scale(DEFAULT_SCALE, values, values.Length);
+
+            // Assert: element-wise comparison through the tolerance-based comparer.
+            Assert.Equal(expected, values, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 8.0f)]
+        [InlineData(1, 7.0f)]
+        public void Dist2Test(int test, float expected)
+        {
+            // Arrange: the second operand is the fixture with one added to every
+            // element, so the two inputs are different arrays. The expected
+            // values (8 and 7) equal the element counts of the two fixtures.
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = Array.ConvertAll(left, value => value + 1);
+
+            // Act: L2DistSquared over both arrays and their full length.
+            var actual = CpuMathUtils.L2DistSquared(left, right, right.Length);
+
+            // Assert: compared with 0 passed as the precision argument.
+            Assert.Equal(expected, actual, 0);
+        }
+
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 193.69f)]
+        public void SumAbsUTest(int test, float expected)
+        {
+            // SumAbs over a private copy of the selected fixture, compared with 2 passed as the precision argument.
+            float[] input = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumAbs(input, input.Length), 2);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void MulElementWiseUTest(int test)
+        {
+            // Arrange: the second factor is the fixture with one added to every
+            // element, so the two factors are different arrays.
+            float[] first = (float[])testArrays[test].Clone();
+            float[] second = Array.ConvertAll(first, value => value + 1);
+
+            // The output buffer starts out as a copy of the first factor.
+            float[] output = (float[])first.Clone();
+
+            // The test expects first[i] * (1 + first[i]) at every position.
+            float[] expected = new float[first.Length];
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = first[i] * (1 + first[i]);
+            }
+
+            // Act: the result is read back from the output buffer.
+            CpuMathUtils.MulElementWise(first, second, output, output.Length);
+
+            // Assert: element-wise comparison through the tolerance-based comparer.
+            Assert.Equal(expected, output, comparer);
+        }
+    }
+
+    /// <summary>Float comparer for test assertions: values match when identical or less than 1e-5 apart (absolute, not relative).</summary>
+    internal class FloatEqualityComparer : IEqualityComparer<float>
+    {
+        // a.Equals(b) keeps the comparer reflexive for NaN and infinities, where a - b is NaN and never passes the tolerance check.
+        public bool Equals(float a, float b) => a.Equals(b) || Math.Abs(a - b) < 1e-5f;
+
+        // No hash can be consistent with a tolerance-based Equals in general, so this comparer must not be used for hashing.
+        public int GetHashCode(float a)
+        {
+            throw new NotImplementedException();
+        }
+    }
+}