diff --git a/Directory.Build.props b/Directory.Build.props
index 73144201c7..bdca231554 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -114,4 +114,11 @@
true
+
+ $(Configuration.EndsWith('-Intrinsics'))
+
+
+
+ $(RepoRoot)build\AfterCommonTargets.targets
+
diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
index 140c93753c..18d9d3867e 100644
--- a/Microsoft.ML.sln
+++ b/Microsoft.ML.sln
@@ -97,6 +97,13 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeAnalyzer",
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeAnalyzer.Tests", "test\Microsoft.ML.CodeAnalyzer.Tests\Microsoft.ML.CodeAnalyzer.Tests.csproj", "{3E4ABF07-7970-4BE6-B45B-A13D3C397545}"
EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.PerformanceTests", "test\Microsoft.ML.CpuMath.PerformanceTests\Microsoft.ML.CpuMath.PerformanceTests.csproj", "{7333EDEF-4144-405C-A5EC-6F42201857D8}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.UnitTests.netstandard", "test\Microsoft.ML.CpuMath.UnitTests.netstandard\Microsoft.ML.CpuMath.UnitTests.netstandard.csproj", "{A0E562A9-0E6D-470D-B180-6EB44BA84D60}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CpuMath.UnitTests.netcoreapp", "test\Microsoft.ML.CpuMath.UnitTests.netcoreapp\Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj", "{5F81A2A4-73AD-494C-B387-07D605EC8826}"
+EndProject
+
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Microsoft.ML.FSharp.Tests", "test\Microsoft.ML.FSharp.Tests\Microsoft.ML.FSharp.Tests.fsproj", "{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.ImageAnalytics", "src\Microsoft.ML.ImageAnalytics\Microsoft.ML.ImageAnalytics.csproj", "{00E38F77-1E61-4CDF-8F97-1417D4E85053}"
@@ -335,6 +342,30 @@ Global
{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release|Any CPU.Build.0 = Release|Any CPU
{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
{3E4ABF07-7970-4BE6-B45B-A13D3C397545}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Release|Any CPU.Build.0 = Release|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+ {7333EDEF-4144-405C-A5EC-6F42201857D8}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release|Any CPU.Build.0 = Release|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Release|Any CPU.Build.0 = Release|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU
+ {5F81A2A4-73AD-494C-B387-07D605EC8826}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU
{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU
@@ -395,6 +426,9 @@ Global
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{BF66A305-DF10-47E4-8D81-42049B149D2B} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
+ {7333EDEF-4144-405C-A5EC-6F42201857D8} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+ {A0E562A9-0E6D-470D-B180-6EB44BA84D60} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+ {5F81A2A4-73AD-494C-B387-07D605EC8826} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{B4E55B2D-2A92-46E7-B72F-E76D6FD83440} = {7F13E156-3EBA-4021-84A5-CD56BA72F99E}
{3E4ABF07-7970-4BE6-B45B-A13D3C397545} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{802233D6-8CC0-46AD-9F23-FEE1E9AED9B3} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
diff --git a/build.proj b/build.proj
index f8e4adaf5f..cb5c557cbe 100644
--- a/build.proj
+++ b/build.proj
@@ -41,7 +41,8 @@
+ Targets="Restore"
+ Properties="MSBuildWarningsAsMessages=NU1503" />
+
+ $(MSBuildAllProjects);$(MSBuildThisFileFullPath)
+
+
+
+
+
\ No newline at end of file
diff --git a/build/Empty.targets b/build/Empty.targets
new file mode 100644
index 0000000000..72abf9cd60
--- /dev/null
+++ b/build/Empty.targets
@@ -0,0 +1,29 @@
+
+
+ $(MSBuildAllProjects);$(MSBuildThisFileFullPath)
+
+ ignore.targets
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
new file mode 100644
index 0000000000..6c6c1fe6ad
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -0,0 +1,396 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.Intrinsics.X86;
+using System;
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+ public static partial class CpuMathUtils
+ {
+        /// <summary>
+        /// Multiplies the first <paramref name="count"/> elements of <paramref name="dst"/> by
+        /// <paramref name="a"/>, in place.
+        /// </summary>
+        public static void Scale(float a, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count && count <= dst.Length);
+
+            Scale(a, new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>
+        /// Multiplies <paramref name="count"/> elements of <paramref name="dst"/>, starting at index
+        /// <paramref name="offset"/>, by <paramref name="a"/>, in place.
+        /// </summary>
+        public static void Scale(float a, float[] dst, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count);
+            // A slice that ends exactly at the end of dst is valid, so the bound is inclusive
+            // (same check as the offset overloads of SumSq and SumAbs).
+            Contracts.Assert(0 <= offset && offset <= dst.Length - count);
+
+            Scale(a, new Span<float>(dst, offset, count));
+        }
+
+        /// <summary>
+        /// Scales every element of <paramref name="dst"/> by <paramref name="a"/>: the SSE kernel is
+        /// used when <c>Sse.IsSupported</c>, otherwise a plain scalar loop.
+        /// </summary>
+        private static void Scale(float a, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleU(a, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] *= a;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] += a * src[i]</c> for the first <paramref name="count"/> elements.
+        /// </summary>
+        public static void AddScale(float a, float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count <= dst.Length);
+
+            AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>
+        /// Computes <c>dst[dstOffset + i] += a * src[i]</c> for <paramref name="count"/> elements.
+        /// </summary>
+        public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count <= src.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
+            Contracts.Assert(0 < count && count <= dst.Length - dstOffset);
+
+            AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, dstOffset, count));
+        }
+
+        /// <summary>
+        /// Dense worker: adds <paramref name="a"/> times <paramref name="src"/> into
+        /// <paramref name="dst"/>, element by element. The iteration length is taken from
+        /// <paramref name="dst"/>; the public overloads pass spans of equal length.
+        /// </summary>
+        private static void AddScale(float a, Span<float> src, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleU(a, src, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] += a * src[i];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Sparse scaled add: computes <c>dst[indices[i]] += a * src[i]</c> for the first
+        /// <paramref name="count"/> entries of <paramref name="src"/> and <paramref name="indices"/>.
+        /// </summary>
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count < dst.Length);
+
+            // Only the index span is trimmed to count; it alone drives the iteration.
+            AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
+        }
+
+        /// <summary>
+        /// Sparse scaled add into a window of <paramref name="dst"/>: computes
+        /// <c>dst[dstOffset + indices[i]] += a * src[i]</c> for the first <paramref name="count"/> entries.
+        /// </summary>
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
+            Contracts.Assert(count < dst.Length - dstOffset);
+
+            // The destination span starts at dstOffset, so the indices are relative to that position.
+            AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count),
+                new Span<float>(dst, dstOffset, dst.Length - dstOffset));
+        }
+
+        /// <summary>
+        /// Sparse worker: for each entry of <paramref name="indices"/>, adds <paramref name="a"/> times
+        /// the matching <paramref name="src"/> value into <paramref name="dst"/> at that index.
+        /// </summary>
+        private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleSU(a, src, indices, dst);
+            }
+            else
+            {
+                for (int i = 0; i < indices.Length; i++)
+                {
+                    int index = indices[i];
+                    dst[index] += a * src[i];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] += src[i]</c> for the first <paramref name="count"/> elements.
+        /// </summary>
+        public static void Add(float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count <= dst.Length);
+
+            Add(new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>
+        /// Dense worker: adds <paramref name="src"/> into <paramref name="dst"/> element by element,
+        /// using the SSE kernel when <c>Sse.IsSupported</c> and a scalar loop otherwise.
+        /// </summary>
+        private static void Add(Span<float> src, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddU(src, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] += src[i];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Sparse add: computes <c>dst[indices[i]] += src[i]</c> for the first
+        /// <paramref name="count"/> entries of <paramref name="src"/> and <paramref name="indices"/>.
+        /// </summary>
+        public static void Add(float[] src, int[] indices, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count < dst.Length);
+
+            // Only the index span is trimmed to count; it alone drives the iteration.
+            Add(new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
+        }
+
+        /// <summary>
+        /// Sparse add into a window of <paramref name="dst"/>: computes
+        /// <c>dst[dstOffset + indices[i]] += src[i]</c> for the first <paramref name="count"/> entries.
+        /// </summary>
+        public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
+            // NOTE(review): the sparse AddScale overload with dstOffset asserts a strict "<" here;
+            // confirm which bound is intended.
+            Contracts.Assert(count <= dst.Length - dstOffset);
+
+            // The destination span starts at dstOffset, so the indices are relative to that position.
+            Add(new Span<float>(src), new Span<int>(indices, 0, count),
+                new Span<float>(dst, dstOffset, dst.Length - dstOffset));
+        }
+
+        /// <summary>
+        /// Sparse worker: for each entry of <paramref name="indices"/>, adds the matching
+        /// <paramref name="src"/> value into <paramref name="dst"/> at that index.
+        /// </summary>
+        private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddSU(src, indices, dst);
+            }
+            else
+            {
+                for (int i = 0; i < indices.Length; i++)
+                {
+                    int index = indices[i];
+                    dst[index] += src[i];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] = src1[i] * src2[i]</c> for the first <paramref name="count"/> elements.
+        /// </summary>
+        public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src1);
+            Contracts.Assert(0 < count && count <= src1.Length);
+            Contracts.AssertNonEmpty(src2);
+            Contracts.Assert(0 < count && count <= src2.Length);
+            Contracts.AssertNonEmpty(dst);
+            // Validate the destination length like every other dense operation in this class does.
+            Contracts.Assert(count <= dst.Length);
+
+            MulElementWise(new Span<float>(src1, 0, count), new Span<float>(src2, 0, count),
+                new Span<float>(dst, 0, count));
+        }
+
+        /// <summary>
+        /// Worker: writes the element-wise product of <paramref name="src1"/> and
+        /// <paramref name="src2"/> into <paramref name="dst"/> (overwriting it), using the SSE kernel
+        /// when <c>Sse.IsSupported</c> and a scalar loop otherwise.
+        /// </summary>
+        private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.MulElementWiseU(src1, src2, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] = src1[i] * src2[i];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns the sum of squares of the first <paramref name="count"/> elements of <paramref name="src"/>.
+        /// </summary>
+        public static float SumSq(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+
+            return SumSq(new Span<float>(src, 0, count));
+        }
+
+        /// <summary>
+        /// Returns the sum of squares of <paramref name="count"/> elements of <paramref name="src"/>,
+        /// starting at index <paramref name="offset"/>.
+        /// </summary>
+        public static float SumSq(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+
+            return SumSq(new Span<float>(src, offset, count));
+        }
+
+        /// <summary>
+        /// Worker: sums <c>src[i] * src[i]</c> over the whole span, using the SSE kernel when
+        /// <c>Sse.IsSupported</c> and a scalar loop otherwise.
+        /// </summary>
+        private static float SumSq(Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumSqU(src);
+            }
+            else
+            {
+                float result = 0;
+                for (int i = 0; i < src.Length; i++)
+                {
+                    result += src[i] * src[i];
+                }
+                return result;
+            }
+        }
+
+        /// <summary>
+        /// Returns the sum of absolute values of the first <paramref name="count"/> elements of
+        /// <paramref name="src"/>.
+        /// </summary>
+        public static float SumAbs(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+
+            return SumAbs(new Span<float>(src, 0, count));
+        }
+
+        /// <summary>
+        /// Returns the sum of absolute values of <paramref name="count"/> elements of
+        /// <paramref name="src"/>, starting at index <paramref name="offset"/>.
+        /// </summary>
+        public static float SumAbs(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+
+            return SumAbs(new Span<float>(src, offset, count));
+        }
+
+        /// <summary>
+        /// Worker: sums <c>Math.Abs(src[i])</c> over the whole span, using the SSE kernel when
+        /// <c>Sse.IsSupported</c> and a scalar loop otherwise.
+        /// </summary>
+        private static float SumAbs(Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumAbsU(src);
+            }
+            else
+            {
+                float sum = 0;
+                for (int i = 0; i < src.Length; i++)
+                {
+                    sum += Math.Abs(src[i]);
+                }
+                return sum;
+            }
+        }
+
+        /// <summary>
+        /// Returns the dot product of the first <paramref name="count"/> elements of
+        /// <paramref name="a"/> and <paramref name="b"/>.
+        /// </summary>
+        public static float DotProductDense(float[] a, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(a.Length >= count);
+            Contracts.Assert(b.Length >= count);
+
+            return DotProductDense(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>
+        /// Returns the dot product of <paramref name="count"/> elements of <paramref name="a"/>,
+        /// starting at index <paramref name="offset"/>, with the first <paramref name="count"/>
+        /// elements of <paramref name="b"/>.
+        /// </summary>
+        public static float DotProductDense(float[] a, int offset, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= a.Length - count);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(b.Length >= count);
+
+            return DotProductDense(new Span<float>(a, offset, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>
+        /// Worker: sums <c>a[i] * b[i]</c>, using the SSE kernel when <c>Sse.IsSupported</c> and a
+        /// scalar loop otherwise. The public overloads pass spans of equal length.
+        /// </summary>
+        private static float DotProductDense(Span<float> a, Span<float> b)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.DotU(a, b);
+            }
+            else
+            {
+                float result = 0;
+                for (int i = 0; i < b.Length; i++)
+                {
+                    result += a[i] * b[i];
+                }
+                return result;
+            }
+        }
+
+        /// <summary>
+        /// Returns the dot product of a dense vector <paramref name="a"/> with a sparse vector whose
+        /// values are in <paramref name="b"/> and whose positions are in <paramref name="indices"/>:
+        /// the sum of <c>a[indices[i]] * b[i]</c> over the first <paramref name="count"/> entries.
+        /// </summary>
+        public static float DotProductSparse(float[] a, float[] b, int[] indices, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(count < a.Length);
+            Contracts.Assert(count <= b.Length);
+            Contracts.Assert(count <= indices.Length);
+
+            // Only the index span is trimmed to count; it alone drives the iteration.
+            return DotProductSparse(new Span<float>(a), new Span<float>(b),
+                new Span<int>(indices, 0, count));
+        }
+
+        /// <summary>
+        /// Same as the overload without <paramref name="offset"/>, except that the indices are
+        /// relative to position <paramref name="offset"/> of <paramref name="a"/>: the sum of
+        /// <c>a[offset + indices[i]] * b[i]</c> over the first <paramref name="count"/> entries.
+        /// </summary>
+        public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset < a.Length);
+            Contracts.Assert(a.Length - offset > count);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(count <= b.Length);
+            Contracts.Assert(count <= indices.Length);
+
+            return DotProductSparse(new Span<float>(a, offset, a.Length - offset),
+                new Span<float>(b), new Span<int>(indices, 0, count));
+        }
+
+        /// <summary>
+        /// Sparse worker: sums <c>a[indices[i]] * b[i]</c> over every entry of
+        /// <paramref name="indices"/>, using the SSE kernel when <c>Sse.IsSupported</c> and a scalar
+        /// loop otherwise.
+        /// </summary>
+        private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.DotSU(a, b, indices);
+            }
+            else
+            {
+                float result = 0;
+                for (int i = 0; i < indices.Length; i++)
+                {
+                    int index = indices[i];
+                    result += a[index] * b[i];
+                }
+                return result;
+            }
+        }
+
+        /// <summary>
+        /// Returns the squared Euclidean distance between the first <paramref name="count"/>
+        /// elements of <paramref name="a"/> and <paramref name="b"/>.
+        /// </summary>
+        public static float L2DistSquared(float[] a, float[] b, int count)
+        {
+            Contracts.AssertNonEmpty(a);
+            Contracts.AssertNonEmpty(b);
+            Contracts.Assert(0 < count && count <= a.Length);
+            Contracts.Assert(count <= b.Length);
+
+            return L2DistSquared(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
+        }
+
+        /// <summary>
+        /// Worker: sums <c>(a[i] - b[i])^2</c>, using the SSE kernel when <c>Sse.IsSupported</c> and
+        /// a scalar loop otherwise. The public overload passes spans of equal length.
+        /// </summary>
+        private static float L2DistSquared(Span<float> a, Span<float> b)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.Dist2(a, b);
+            }
+            else
+            {
+                float norm = 0;
+                for (int i = 0; i < b.Length; i++)
+                {
+                    float distance = a[i] - b[i];
+                    norm += distance * distance;
+                }
+                return norm;
+            }
+        }
+ }
+}
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
new file mode 100644
index 0000000000..501fc9082e
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -0,0 +1,47 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+    /// <summary>
+    /// Variant of <c>CpuMathUtils</c> from the <c>.netstandard.cs</c> file: every operation forwards,
+    /// argument for argument, to the same-named method on <c>SseUtils</c>.
+    /// </summary>
+    public static partial class CpuMathUtils
+    {
+        // Scaling.
+        public static void Scale(float a, float[] dst, int count)
+            => SseUtils.Scale(a, dst, count);
+
+        public static void Scale(float a, float[] dst, int offset, int count)
+            => SseUtils.Scale(a, dst, offset, count);
+
+        // Scaled addition, dense and sparse (index-driven).
+        public static void AddScale(float a, float[] src, float[] dst, int count)
+            => SseUtils.AddScale(a, src, dst, count);
+
+        public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count)
+            => SseUtils.AddScale(a, src, dst, dstOffset, count);
+
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count)
+            => SseUtils.AddScale(a, src, indices, dst, count);
+
+        public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count)
+            => SseUtils.AddScale(a, src, indices, dst, dstOffset, count);
+
+        // Addition, dense and sparse (index-driven).
+        public static void Add(float[] src, float[] dst, int count)
+            => SseUtils.Add(src, dst, count);
+
+        public static void Add(float[] src, int[] indices, float[] dst, int count)
+            => SseUtils.Add(src, indices, dst, count);
+
+        public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count)
+            => SseUtils.Add(src, indices, dst, dstOffset, count);
+
+        // Element-wise product.
+        public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count)
+            => SseUtils.MulElementWise(src1, src2, dst, count);
+
+        // Reductions.
+        public static float SumSq(float[] src, int count)
+            => SseUtils.SumSq(src, count);
+
+        public static float SumSq(float[] src, int offset, int count)
+            => SseUtils.SumSq(src, offset, count);
+
+        public static float SumAbs(float[] src, int count)
+            => SseUtils.SumAbs(src, count);
+
+        public static float SumAbs(float[] src, int offset, int count)
+            => SseUtils.SumAbs(src, offset, count);
+
+        // Dot products and distance.
+        public static float DotProductDense(float[] a, float[] b, int count)
+            => SseUtils.DotProductDense(a, b, count);
+
+        public static float DotProductDense(float[] a, int offset, float[] b, int count)
+            => SseUtils.DotProductDense(a, offset, b, count);
+
+        public static float DotProductSparse(float[] a, float[] b, int[] indices, int count)
+            => SseUtils.DotProductSparse(a, b, indices, count);
+
+        public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count)
+            => SseUtils.DotProductSparse(a, offset, b, indices, count);
+
+        public static float L2DistSquared(float[] a, float[] b, int count)
+            => SseUtils.L2DistSquared(a, b, count);
+    }
+}
diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
index bde7ae89f5..b6c95b93f4 100644
--- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
+++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj
@@ -2,21 +2,29 @@
Debug;Release;Debug-Intrinsics;Release-Intrinsics
- $(Configuration.EndsWith('-Intrinsics'))
-
netstandard2.0
netstandard2.0;netcoreapp3.0
Microsoft.ML.CpuMath
true
$(DefineConstants);CORECLR;PRIVATE_CONTRACTS
+ 7.3
+
+
+
+
-
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs
index 68e6ee906b..13de22dd5b 100644
--- a/src/Microsoft.ML.CpuMath/Sse.cs
+++ b/src/Microsoft.ML.CpuMath/Sse.cs
@@ -2,8 +2,6 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
-using System;
-
namespace Microsoft.ML.Runtime.Internal.CpuMath
{
///
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
new file mode 100644
index 0000000000..d11676f283
--- /dev/null
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -0,0 +1,476 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// The exported function names need to be unique (can't be disambiguated based on signature), hence
+// we introduce suffix letters to indicate the general patterns used.
+// * U suffix means unaligned and unpadded.
+// * S suffix means sparse (unaligned) vector.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Microsoft.ML.Runtime.Internal.CpuMath
+{
+ internal static class SseIntrinsics
+ {
+        /// <summary>
+        /// Builds a scalar vector from the single element <c>src[idx[0]]</c>.
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load1(float* src, int* idx)
+        {
+            return Sse.SetScalarVector128(src[idx[0]]);
+        }
+
+        /// <summary>
+        /// Gathers the four elements <c>src[idx[0]]</c> .. <c>src[idx[3]]</c> into one vector.
+        /// The arguments are passed in reverse so that <c>src[idx[0]]</c> is the element that
+        /// <c>Store4</c> writes back first.
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe Vector128<float> Load4(float* src, int* idx)
+        {
+            return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]);
+        }
+
+        /// <summary>
+        /// Rotates the four floats of <paramref name="x"/> by one position (ABCD -> BCDA).
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> Rotate(Vector128<float> x)
+        {
+            // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
+            return Sse.Shuffle(x, x, 0x39);
+        }
+
+        /// <summary>
+        /// Rotates the four floats of <paramref name="x"/> by one position in the direction
+        /// opposite to <c>Rotate</c> (ABCD -> DABC).
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> RotateReverse(Vector128<float> x)
+        {
+            // The control byte shuffles the four 32-bit floats of x: ABCD -> DABC.
+            return Sse.Shuffle(x, x, 0x93);
+        }
+
+        /// <summary>
+        /// Scatters the four floats of <paramref name="x"/> to <c>dst[idx[0]]</c> .. <c>dst[idx[3]]</c>.
+        /// Each step stores the scalar element and then rotates the next element into that position.
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
+        {
+            Sse.StoreScalar(dst + idx[0], x);
+            x = Rotate(x);
+            Sse.StoreScalar(dst + idx[1], x);
+            x = Rotate(x);
+            Sse.StoreScalar(dst + idx[2], x);
+            x = Rotate(x);
+            Sse.StoreScalar(dst + idx[3], x);
+        }
+
+        /// <summary>
+        /// Horizontally sums the four floats of <paramref name="vector"/>. Callers in this class
+        /// read the total from the scalar (first) element of the returned vector.
+        /// </summary>
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorSum(in Vector128<float> vector)
+        {
+            if (Sse3.IsSupported)
+            {
+                Vector128<float> tmp = Sse3.HorizontalAdd(vector, vector);
+                return Sse3.HorizontalAdd(tmp, tmp);
+            }
+            else
+            {
+                // SSE3 is not supported.
+                Vector128<float> tmp = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
+                // The control byte shuffles the four 32-bit floats of tmp: ABCD -> BADC.
+                return Sse.Add(tmp, Sse.Shuffle(tmp, tmp, 0xb1));
+            }
+        }
+
+        /// <summary>
+        /// Multiplies every element of <paramref name="dst"/> by <paramref name="scale"/>, in place.
+        /// </summary>
+        internal static unsafe void ScaleU(float scale, Span<float> dst)
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* pdst = dst)
+            {
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+
+                // Main loop: four floats per iteration.
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    dstVector = Sse.Multiply(scaleVector, dstVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    dstVector = Sse.MultiplyScalar(scaleVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] += scale * src[i]</c> for every element of <paramref name="dst"/>.
+        /// The loop bound comes from <paramref name="dst"/>, so <paramref name="src"/> must be at
+        /// least as long.
+        /// </summary>
+        internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+
+                // Main loop: four floats per iteration.
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Sse.Store(pDstCurrent, dstVector);
+
+                    pDstCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector);
+                    dstVector = Sse.AddScalar(dstVector, srcVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
+
+                    pDstCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Sparse scaled add: computes <c>dst[idx[i]] += scale * src[i]</c> for every entry of
+        /// <paramref name="idx"/>. No bounds checks are performed on the index values.
+        /// </summary>
+        internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
+        {
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+
+            fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                // The destination pointer never advances; elements are addressed through the indices.
+                float* pDstCurrent = pdst;
+                int* pEnd = pidx + idx.Length;
+
+                // Main loop: gather four destination values, update them, scatter them back.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(dstVector, pDstCurrent, pIdxCurrent);
+
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) entries, one at a time.
+                while (pIdxCurrent < pEnd)
+                {
+                    pDstCurrent[*pIdxCurrent] += scale * (*pSrcCurrent);
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] += src[i]</c> for every element of <paramref name="src"/>.
+        /// The loop bound comes from <paramref name="src"/>, so <paramref name="dst"/> must be at
+        /// least as long.
+        /// </summary>
+        internal static unsafe void AddU(Span<float> src, Span<float> dst)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+
+                // Main loop: four floats per iteration.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.Add(srcVector, dstVector);
+                    Sse.Store(pDstCurrent, result);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    Vector128<float> result = Sse.AddScalar(srcVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Sparse add: computes <c>dst[idx[i]] += src[i]</c> for every entry of
+        /// <paramref name="idx"/>. No bounds checks are performed on the index values.
+        /// </summary>
+        internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> dst)
+        {
+            fixed (float* psrc = src)
+            fixed (int* pidx = idx)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                int* pIdxCurrent = pidx;
+                // The destination pointer never advances; elements are addressed through the indices.
+                float* pDstCurrent = pdst;
+                int* pEnd = pidx + idx.Length;
+
+                // Main loop: gather four destination values, add the source values, scatter back.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> dstVector = Load4(pDstCurrent, pIdxCurrent);
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+
+                    dstVector = Sse.Add(dstVector, srcVector);
+                    Store4(dstVector, pDstCurrent, pIdxCurrent);
+
+                    pIdxCurrent += 4;
+                    pSrcCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) entries, one at a time.
+                while (pIdxCurrent < pEnd)
+                {
+                    pDstCurrent[*pIdxCurrent] += *pSrcCurrent;
+
+                    pIdxCurrent++;
+                    pSrcCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes <c>dst[i] = src1[i] * src2[i]</c> for every element of <paramref name="dst"/>.
+        /// The loop bound comes from <paramref name="dst"/>, so both sources must be at least as long.
+        /// </summary>
+        internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2, Span<float> dst)
+        {
+            // Pin the spans directly (as the other kernels do) rather than via &span[0], which
+            // throws when a span is empty.
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
+            fixed (float* pdst = dst)
+            {
+                float* pSrc1Current = psrc1;
+                float* pSrc2Current = psrc2;
+                float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
+
+                // Main loop: four floats per iteration.
+                while (pDstCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> src1Vector = Sse.LoadVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadVector128(pSrc2Current);
+                    src2Vector = Sse.Multiply(src1Vector, src2Vector);
+                    Sse.Store(pDstCurrent, src2Vector);
+
+                    pSrc1Current += 4;
+                    pSrc2Current += 4;
+                    pDstCurrent += 4;
+                }
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pDstCurrent < pEnd)
+                {
+                    Vector128<float> src1Vector = Sse.LoadScalarVector128(pSrc1Current);
+                    Vector128<float> src2Vector = Sse.LoadScalarVector128(pSrc2Current);
+                    src2Vector = Sse.MultiplyScalar(src1Vector, src2Vector);
+                    Sse.StoreScalar(pDstCurrent, src2Vector);
+
+                    pSrc1Current++;
+                    pSrc2Current++;
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns the sum of squares of every element of <paramref name="src"/>.
+        /// </summary>
+        internal static unsafe float SumSqU(Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+
+                // Main loop: accumulate four partial sums in parallel.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    result = Sse.Add(result, Sse.Multiply(srcVector, srcVector));
+
+                    pSrcCurrent += 4;
+                }
+
+                // Collapse the partial sums so the tail can keep accumulating in the scalar element.
+                result = VectorSum(in result);
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        /// <summary>
+        /// Returns the sum of absolute values of every element of <paramref name="src"/>.
+        /// </summary>
+        internal static unsafe float SumAbsU(Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> mask;
+
+            // 0x7FFFFFFF has every bit set except the sign bit, so AND-ing a float with it
+            // yields the absolute value.
+            if (Sse2.IsSupported)
+            {
+                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
+            }
+            else
+            {
+                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+            }
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+
+                // Main loop: accumulate four partial sums in parallel.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    result = Sse.Add(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent += 4;
+                }
+
+                // Collapse the partial sums so the tail can keep accumulating in the scalar element.
+                result = VectorSum(in result);
+
+                // Tail: fewer than four floats remain, so each one must be loaded and added as a
+                // scalar. A full 128-bit load here would read past the end of the buffer.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        /// <summary>
+        /// Returns the dot product of <paramref name="src"/> and <paramref name="dst"/>.
+        /// The loop bound comes from <paramref name="src"/>, so <paramref name="dst"/> must be at
+        /// least as long.
+        /// </summary>
+        internal static unsafe float DotU(Span<float> src, Span<float> dst)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+
+                // Main loop: accumulate four partial products in parallel.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    result = Sse.Add(result, Sse.Multiply(srcVector, dstVector));
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                // Collapse the partial sums so the tail can keep accumulating in the scalar element.
+                result = VectorSum(in result);
+
+                // Tail: the remaining (fewer than four) floats, one at a time.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, dstVector));
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        /// <summary>
+        /// Sparse dot product: returns the sum of <c>src[idx[i]] * dst[i]</c> over every entry of
+        /// <paramref name="idx"/>. No bounds checks are performed on the index values.
+        /// </summary>
+        internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> idx)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
+            {
+                // The source pointer never advances; elements are addressed through the indices.
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                int* pIdxCurrent = pidx;
+                int* pEnd = pidx + idx.Length;
+
+                // Main loop: gather four source values and multiply by four consecutive dst values.
+                while (pIdxCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Load4(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
+
+                    result = Sse.Add(result, Sse.Multiply(srcVector, dstVector));
+
+                    pIdxCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                // Collapse the partial sums so the tail can keep accumulating in the scalar element.
+                result = VectorSum(in result);
+
+                // Tail: the remaining (fewer than four) entries, one at a time.
+                while (pIdxCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Load1(pSrcCurrent, pIdxCurrent);
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
+
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, dstVector));
+
+                    pIdxCurrent++;
+                    pDstCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        /// <summary>
+        /// Returns the squared Euclidean distance between <paramref name="src"/> and
+        /// <paramref name="dst"/>. The loop bound comes from <paramref name="src"/>, so
+        /// <paramref name="dst"/> must be at least as long.
+        /// </summary>
+        internal static unsafe float Dist2(Span<float> src, Span<float> dst)
+        {
+            Vector128<float> sqDistanceVector = Sse.SetZeroVector128();
+
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pEnd = psrc + src.Length;
+
+                // Main loop: accumulate four squared differences in parallel.
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent),
+                                                                    Sse.LoadVector128(pDstCurrent));
+                    sqDistanceVector = Sse.Add(sqDistanceVector,
+                                                Sse.Multiply(distanceVector, distanceVector));
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
+
+                sqDistanceVector = VectorSum(in sqDistanceVector);
+
+                // Tail: finish the remaining (fewer than four) floats in plain scalar arithmetic.
+                float norm = Sse.ConvertToSingle(sqDistanceVector);
+                while (pSrcCurrent < pEnd)
+                {
+                    float distance = (*pSrcCurrent) - (*pDstCurrent);
+                    norm += distance * distance;
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                }
+
+                return norm;
+            }
+        }
+
+ }
+}
diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs
index d7082c8313..1053f75b75 100644
--- a/src/Microsoft.ML.CpuMath/Thunk.cs
+++ b/src/Microsoft.ML.CpuMath/Thunk.cs
@@ -2,7 +2,6 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
-using System;
using System.Runtime.InteropServices;
using System.Runtime.CompilerServices;
using System.Security;
diff --git a/src/Native/build.cmd b/src/Native/build.cmd
index e2bbc3a4dc..3f533dc353 100644
--- a/src/Native/build.cmd
+++ b/src/Native/build.cmd
@@ -17,7 +17,9 @@ set CMAKE_BUILD_TYPE=Debug
:Arg_Loop
if [%1] == [] goto :ToolsVersion
if /i [%1] == [Release] ( set CMAKE_BUILD_TYPE=Release&&shift&goto Arg_Loop)
+if /i [%1] == [Release-Intrinsics] ( set CMAKE_BUILD_TYPE=Release-Intrinsics&&shift&goto Arg_Loop)
if /i [%1] == [Debug] ( set CMAKE_BUILD_TYPE=Debug&&shift&goto Arg_Loop)
+if /i [%1] == [Debug-Intrinsics] ( set CMAKE_BUILD_TYPE=Debug-Intrinsics&&shift&goto Arg_Loop)
if /i [%1] == [x86] ( set __BuildArch=x86&&set __VCBuildArch=x86&&shift&goto Arg_Loop)
if /i [%1] == [x64] ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop)
@@ -86,6 +88,10 @@ if %__IntermediatesDir% == "" (
set "__CMakeBinDir=%__CMakeBinDir:\=/%"
set "__IntermediatesDir=%__IntermediatesDir:\=/%"
+:: Strip the "-Intrinsics" suffix from the build type
+if [%CMAKE_BUILD_TYPE:~-11%] == [-Intrinsics] (
+ set CMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE:~0,-11%
+)
:: Check that the intermediate directory exists so we can place our cmake build tree there
if not exist "%__IntermediatesDir%" md "%__IntermediatesDir%"
diff --git a/test/Directory.Build.props b/test/Directory.Build.props
index ee5d507566..2e16be2f2b 100644
--- a/test/Directory.Build.props
+++ b/test/Directory.Build.props
@@ -20,7 +20,7 @@
$(ToolsDir)Test.snk
-
+
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
new file mode 100644
index 0000000000..90f362de3e
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -0,0 +1,45 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.InteropServices;
+using System.Security;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+ internal static class CpuMathNativeUtils // P/Invoke bindings to the "CpuMathNative" library, used by the Native*Perf benchmarks.
+ {
+ [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.DotProductDense; callers pass the array length as c.
+ internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.DotProductSparse; callers pass the index count as c.
+ internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.SumSq.
+ internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity] // Benchmarked against the dense CpuMathUtils.Add overload.
+ internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity] // Benchmarked against the indexed CpuMathUtils.Add overload.
+ internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity] // Benchmarked against the dense CpuMathUtils.AddScale overload.
+ internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity] // Benchmarked against the indexed CpuMathUtils.AddScale overload.
+ internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.Scale.
+ internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.L2DistSquared.
+ internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.SumAbs.
+ internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c);
+
+ [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] // Benchmarked against CpuMathUtils.MulElementWise.
+ internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
+ }
+}
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj b/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj
new file mode 100644
index 0000000000..61d22bbfbb
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/Microsoft.ML.CpuMath.PerformanceTests.csproj
@@ -0,0 +1,27 @@
+
+
+ Exe
+ 7.2
+ Microsoft.ML.CpuMath.PerformanceTests.Program
+ netcoreapp3.0
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs
new file mode 100644
index 0000000000..cd731e8cf3
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/Program.cs
@@ -0,0 +1,29 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Toolchains.InProcess;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+ // Console entry point that hands control to BenchmarkDotNet's BenchmarkSwitcher.
+ internal class Program
+ {
+     // Forwards the command-line arguments so benchmark selection/filter options reach the switcher.
+     public static void Main(string[] args)
+     {
+         BenchmarkSwitcher
+             .FromAssembly(typeof(Program).Assembly)
+             .Run(args, CreateClrVsCoreConfig());
+     }
+
+     // Default config plus a ShortRun job executed with the in-process toolchain.
+     private static IConfig CreateClrVsCoreConfig()
+     {
+         return DefaultConfig.Instance.With(Job.ShortRun.With(InProcessToolchain.Instance));
+     }
+ }
+}
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
new file mode 100644
index 0000000000..92752a0018
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -0,0 +1,238 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.PerformanceTests
+{
+ public class SsePerformanceTests
+ {
+ private const int EXP_MAX = 127;
+ private const int EXP_MIN = 0;
+
+ private const int IDXLEN = 1000003;
+ private const int LEN = 1000003;
+ private const int EXP_RANGE = EXP_MAX / 2;
+ private const int DEFAULT_SEED = 253421;
+ private const float DEFAULT_SCALE = 1.11f;
+
+ private float[] src, dst, original, src1, src2;
+ private int[] idx;
+ private int seed = DEFAULT_SEED;
+
+ // Produces m * 2^e with m drawn from [-1, 1) and the integer e from [1 - expRange, expRange].
+ private static float NextFloat(Random rand, int expRange)
+ {
+     double signedFraction = (2.0 * rand.NextDouble()) - 1.0;
+     int power = rand.Next(1 - expRange, 1 + expRange);
+     return (float)(signedFraction * Math.Pow(2.0, power));
+ }
+
+ // Resolves the seed for the benchmark data from the CPUMATH_SEED environment variable:
+ //   - an integer value is used as-is;
+ //   - "random" (any casing) picks a fresh seed;
+ //   - unset or anything else falls back to DEFAULT_SEED.
+ // The chosen seed is echoed to the console so a run can be reproduced.
+ private static int GetSeed()
+ {
+     // Read the variable once so the parse and the "random" check see the same value.
+     string requested = Environment.GetEnvironmentVariable("CPUMATH_SEED");
+
+     int seed;
+     if (requested == null)
+     {
+         seed = DEFAULT_SEED;
+     }
+     else if (!int.TryParse(requested, out seed))
+     {
+         bool wantsRandom = string.Equals(requested, "random", StringComparison.OrdinalIgnoreCase);
+         seed = wantsRandom ? new Random().Next() : DEFAULT_SEED;
+     }
+
+     Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results");
+
+     return seed;
+ }
+
+ // Allocates the benchmark buffers and fills them with pseudo-random data derived from the seed.
+ [GlobalSetup]
+ public void Setup()
+ {
+     src = new float[LEN];
+     dst = new float[LEN];
+     src1 = new float[LEN];
+     src2 = new float[LEN];
+     original = new float[LEN];
+     idx = new int[IDXLEN];
+     seed = GetSeed();
+     var generator = new Random(seed);
+
+     // Draw order (src, dst, src1, src2 per element, then idx) fixes the data for a given seed.
+     for (int element = 0; element < src.Length; ++element)
+     {
+         src[element] = NextFloat(generator, EXP_RANGE);
+         original[element] = dst[element] = NextFloat(generator, EXP_RANGE);
+         src1[element] = NextFloat(generator, EXP_RANGE);
+         src2[element] = NextFloat(generator, EXP_RANGE);
+     }
+
+     for (int slot = 0; slot < idx.Length; ++slot)
+     {
+         idx[slot] = generator.Next(0, LEN);
+     }
+ }
+
+ // Writes the values saved in 'original' back over dst.
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+     Array.Copy(original, dst, original.Length);
+ }
+
+ [Benchmark] // Native baseline: P/Invoke into DotU with src and dst pinned, over all LEN elements.
+ public unsafe float NativeDotUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ {
+ return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through CpuMathUtils.DotProductDense.
+ public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into DotSU with src, dst and idx pinned; the count is IDXLEN (number of indices).
+ public unsafe float NativeDotSUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ fixed (int* pidx = idx)
+ {
+ return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through CpuMathUtils.DotProductSparse.
+ public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+
+ [Benchmark] // Native baseline: P/Invoke into SumSqU with src pinned, over all LEN elements.
+ public unsafe float NativeSumSqUPerf()
+ {
+ fixed (float* psrc = src)
+ {
+ return CpuMathNativeUtils.SumSqU(psrc, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same array through CpuMathUtils.SumSq.
+ public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into AddU with src and dst pinned, over all LEN elements.
+ public unsafe void NativeAddUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ {
+ CpuMathNativeUtils.AddU(psrc, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through the dense CpuMathUtils.Add overload.
+ public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into AddSU with src, dst and idx pinned; the count is IDXLEN (number of indices).
+ public unsafe void NativeAddSUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ fixed (int* pidx = idx)
+ {
+ CpuMathNativeUtils.AddSU(psrc, pidx, pdst, IDXLEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through the indexed CpuMathUtils.Add overload.
+ public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
+
+ [Benchmark] // Native baseline: P/Invoke into AddScaleU with scale DEFAULT_SCALE, over all LEN elements.
+ public unsafe void NativeAddScaleUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ {
+ CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arguments through the dense CpuMathUtils.AddScale overload.
+ public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into AddScaleSU with scale DEFAULT_SCALE; the count is IDXLEN (number of indices).
+ public unsafe void NativeAddScaleSUPerf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ fixed (int* pidx = idx)
+ {
+ CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arguments through the indexed CpuMathUtils.AddScale overload.
+ public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+
+ [Benchmark] // Native baseline: P/Invoke into ScaleU with scale DEFAULT_SCALE and dst pinned, over all LEN elements.
+ public unsafe void NativeScaleUPerf()
+ {
+ fixed (float* pdst = dst)
+ {
+ CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arguments through CpuMathUtils.Scale.
+ public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into Dist2 with src and dst pinned, over all LEN elements.
+ public unsafe float NativeDist2Perf()
+ {
+ fixed (float* psrc = src)
+ fixed (float* pdst = dst)
+ {
+ return CpuMathNativeUtils.Dist2(psrc, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through CpuMathUtils.L2DistSquared.
+ public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into SumAbsU with src pinned, over all LEN elements.
+ public unsafe float NativeSumAbsUPerf()
+ {
+ fixed (float* psrc = src)
+ {
+ return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path via CpuMathUtils.SumAbs. NOTE(review): "SumAbsq" looks like a typo for "SumAbsU"; renaming would change the reported benchmark name.
+ public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN);
+
+ [Benchmark] // Native baseline: P/Invoke into MulElementWiseU with src1, src2 and dst pinned, over all LEN elements.
+ public unsafe void NativeMulElementWiseUPerf()
+ {
+ fixed (float* psrc1 = src1)
+ fixed (float* psrc2 = src2)
+ fixed (float* pdst = dst)
+ {
+ CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
+ }
+ }
+
+ [Benchmark] // Managed path: the same arrays through CpuMathUtils.MulElementWise.
+ public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+ }
+}
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
new file mode 100644
index 0000000000..e611b15032
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj
@@ -0,0 +1,16 @@
+
+
+
+ netcoreapp3.0
+ false
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
new file mode 100644
index 0000000000..9552f688a8
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj
@@ -0,0 +1,16 @@
+
+
+
+ netcoreapp2.0
+ false
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
new file mode 100644
index 0000000000..6fc2596ef7
--- /dev/null
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -0,0 +1,246 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Xunit;
+using Microsoft.ML.Runtime.Internal.CpuMath;
+
+namespace Microsoft.ML.CpuMath.UnitTests
+{
+ public class CpuMathUtilsUnitTests
+ {
+ private readonly float[][] testArrays;
+ private readonly int[] testIndexArray;
+ private const float DEFAULT_SCALE = 1.7f;
+ private FloatEqualityComparer comparer;
+
+ // Builds the shared fixtures: two input arrays, the sparse index list and the tolerance comparer.
+ public CpuMathUtilsUnitTests()
+ {
+     testArrays = new[]
+     {
+         new[] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }, // [0] padded: length 8, a multiple of 4
+         new[] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }, // [1] unpadded: length 7, not a multiple of 4
+     };
+     testIndexArray = new[] { 0, 2, 5, 6 }; // positions exercised by the index-based tests
+     comparer = new FloatEqualityComparer();
+ }
+
+ // Dense dot product of a test array with a copy of itself shifted by +1.
+ [Theory]
+ [InlineData(0, 13306.0376f)]
+ [InlineData(1, 13291.9235f)]
+ public void DotUTest(int test, float expected)
+ {
+     // Arrange: dst[i] = src[i] + 1, so the two operands are different arrays.
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = Array.ConvertAll(src, value => value + 1);
+
+     // Act
+     var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
+
+     // Assert: agree to two decimal places.
+     Assert.Equal(expected, actual, 2);
+ }
+
+ // Index-based dot product driven by testIndexArray. Both inputs expect the same value because
+ // the indices and the first four dst entries all lie in the seven elements the arrays share.
+ [Theory]
+ [InlineData(0, 736.7352f)]
+ [InlineData(1, 736.7352f)]
+ public void DotSUTest(int test, float expected)
+ {
+     // Arrange: dst[i] = src[i] + 1, so the two operands are different arrays.
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = Array.ConvertAll(src, value => value + 1);
+     int[] idx = testIndexArray;
+
+     // Act: the count argument is the number of indices, not the array length.
+     var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
+
+     // Assert: agree to four decimal places.
+     Assert.Equal(expected, actual, 4);
+ }
+
+ // Sum of squares over the whole array, compared to two decimal places.
+ [Theory]
+ [InlineData(0, 13399.9376f)]
+ [InlineData(1, 13389.1135f)]
+ public void SumSqUTest(int test, float expected)
+ {
+     float[] values = (float[])testArrays[test].Clone();
+
+     var sumOfSquares = CpuMathUtils.SumSq(values, values.Length);
+
+     Assert.Equal(expected, sumOfSquares, 2);
+ }
+
+ // Dense add: starting from dst[i] = src[i] + 1, dst is expected to end up as 2 * src[i] + 1.
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void AddUTest(int test)
+ {
+     // Arrange
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = Array.ConvertAll(src, value => value + 1);
+     float[] expected = Array.ConvertAll(src, value => 2 * value + 1);
+
+     // Act: the result is written into dst.
+     CpuMathUtils.Add(src, dst, dst.Length);
+
+     // Assert
+     Assert.Equal(expected, dst, comparer);
+ }
+
+ // Index-based add: only the positions listed in testIndexArray are expected to change.
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void AddSUTest(int test)
+ {
+     // Arrange: dst starts as a copy of src.
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = (float[])src.Clone();
+     float[] expected = (float[])src.Clone();
+
+     // Hand-computed: expected[idx[k]] = src[idx[k]] + src[k] for k = 0..3.
+     expected[0] = 3.92f;
+     expected[2] = -12.14f;
+     expected[5] = -36.69f;
+     expected[6] = 46.29f;
+
+     // Act: the count argument is the number of indices.
+     CpuMathUtils.Add(src, testIndexArray, dst, testIndexArray.Length);
+
+     // Assert
+     Assert.Equal(expected, dst, comparer);
+ }
+
+ // Dense scaled add: with dst starting as a copy of src, dst is expected to become
+ // src[i] * (1 + DEFAULT_SCALE).
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void AddScaleUTest(int test)
+ {
+     // Arrange
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = (float[])src.Clone();
+     float[] expected = Array.ConvertAll(src, value => value * (1 + DEFAULT_SCALE));
+
+     // Act: the result is written into dst.
+     CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length);
+
+     // Assert
+     Assert.Equal(expected, dst, comparer);
+ }
+
+ // Index-based scaled add: only the positions listed in testIndexArray are expected to change.
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void AddScaleSUTest(int test)
+ {
+     // Arrange: dst starts as a copy of src.
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = (float[])src.Clone();
+     float[] expected = (float[])src.Clone();
+
+     // Hand-computed: expected[idx[k]] = src[idx[k]] + DEFAULT_SCALE * src[k] for k = 0..3.
+     expected[0] = 5.292f;
+     expected[2] = -13.806f;
+     expected[5] = -43.522f;
+     expected[6] = 55.978f;
+
+     // Act: the count argument is the number of indices.
+     CpuMathUtils.AddScale(DEFAULT_SCALE, src, testIndexArray, dst, testIndexArray.Length);
+
+     // Assert
+     Assert.Equal(expected, dst, comparer);
+ }
+
+ // In-place scaling: every element of dst is expected to be multiplied by DEFAULT_SCALE.
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void ScaleUTest(int test)
+ {
+     // Arrange: compute the expectation from the untouched input first.
+     float[] dst = (float[])testArrays[test].Clone();
+     float[] expectedOutput = Array.ConvertAll(dst, value => value * DEFAULT_SCALE);
+
+     // Act: the result is written into dst.
+     CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
+
+     // Assert
+     Assert.Equal(expectedOutput, dst, comparer);
+ }
+
+ // Squared L2 distance between an array and itself shifted by +1: each term is about 1, so the
+ // total is about the element count (8 or 7).
+ [Theory]
+ [InlineData(0, 8.0f)]
+ [InlineData(1, 7.0f)]
+ public void Dist2Test(int test, float expected)
+ {
+     // Arrange: dst[i] = src[i] + 1, so the two operands are different arrays.
+     float[] src = (float[])testArrays[test].Clone();
+     float[] dst = Array.ConvertAll(src, value => value + 1);
+
+     // Act
+     var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
+
+     // Assert: compared at zero decimal places.
+     Assert.Equal(expected, actual, 0);
+ }
+
+ // Sum of absolute values over the whole array, compared to two decimal places.
+ [Theory]
+ [InlineData(0, 196.98f)]
+ [InlineData(1, 193.69f)]
+ public void SumAbsUTest(int test, float expected)
+ {
+     float[] values = (float[])testArrays[test].Clone();
+
+     var sumOfMagnitudes = CpuMathUtils.SumAbs(values, values.Length);
+
+     Assert.Equal(expected, sumOfMagnitudes, 2);
+ }
+
+ // Element-wise product of src1 and src2 = src1 + 1, written into dst; the expected value per
+ // element is therefore src1[i] * (1 + src1[i]).
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ public void MulElementWiseUTest(int test)
+ {
+     // Arrange: dst starts as a copy of src1.
+     float[] src1 = (float[])testArrays[test].Clone();
+     float[] src2 = Array.ConvertAll(src1, value => value + 1);
+     float[] dst = (float[])src1.Clone();
+     float[] expected = Array.ConvertAll(src1, value => value * (1 + value));
+
+     // Act
+     CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length);
+
+     // Assert
+     Assert.Equal(expected, dst, comparer);
+ }
+ }
+
+ // Compares floats with an absolute tolerance of 1e-5, for use with xUnit collection asserts.
+ internal class FloatEqualityComparer : IEqualityComparer<float>
+ {
+     // True when the values are identical (this covers infinities and NaN, whose difference
+     // is NaN and would otherwise never pass the tolerance check) or differ by less than 1e-5.
+     public bool Equals(float a, float b)
+     {
+         return a.Equals(b) || Math.Abs(a - b) < 1e-5f;
+     }
+
+     // Equality is tolerance-based, so a constant is the only hash code that can never
+     // disagree with Equals for two values considered equal.
+     public int GetHashCode(float a)
+     {
+         return 0;
+     }
+ }
+}