From 0dab6d1593e07a080b75d2091bc5f5a2ba90c53e Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 10 Aug 2018 14:27:04 -0700 Subject: [PATCH 01/29] Implemented AVX intrinsics --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 506 ++++++++++++++++++ .../Microsoft.ML.CpuMath.csproj | 4 + 2 files changed, 510 insertions(+) create mode 100644 src/Microsoft.ML.CpuMath/AvxIntrinsics.cs diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs new file mode 100644 index 0000000000..00453e75f4 --- /dev/null +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -0,0 +1,506 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +// The exported function names need to be unique (can't be disambiguated based on signature), hence +// we introduce suffix letters to indicate the general patterns used. +// * A suffix means aligned and padded for SSE operations. +// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. +// * Tran means the matrix is transposed. 
+ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + internal static class AvxIntrinsics + { + private const int CbAlign = 32; + + private static bool Compat(AlignedArray a) + { + Contracts.AssertValue(a); + Contracts.Assert(a.Size > 0); + return a.CbAlign == CbAlign; + } + + private static unsafe float* Ptr(AlignedArray a, float* p) + { + Contracts.AssertValue(a); + float* q = p + a.GetBase((long)p); + Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + return q; + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector256 ToVector256(in Vector128 a, in Vector128 b) + { + // REVIEW NEEDED: Is it the correct port of the following code? + // #ifndef _WIN32 + // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) + // #endif + return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe void ZeroUpper() + { + // Currently no-op since _mm256_zeroupper is not supported (ref: https://github.com/dotnet/coreclr/pull/16955) + // This is a placeholder in case the intrinsic is supported later on. + return; + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 GetLow(in Vector128 x) + { + return Avx.ExtractVector128(x, 0); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 GetHigh(in Vector128 x) + { + return Avx.ExtractVector128(x, 1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 Rotate(in Vector128 x) + { + // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. 
+ return Sse.Shuffle(x, x, 0x39); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe void Store4(in Vector128 x, float* dst, int* idx) + { + Sse.StoreScalar(dst + idx[0], x); + Vector128 rotated = Rotate(in x); + Sse.StoreScalar(dst + idx[1], rotated); + rotated = Rotate(in rotated); + Sse.StoreScalar(dst + idx[2], rotated); + rotated = Rotate(in rotated); + Sse.StoreScalar(dst + idx[3], rotated); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 VectorSum(in Vector256 vector) + { + Vector256 partialSum = Avx.HorizontalAdd(vector, vector); + return Avx.HorizontalAdd(partialSum, partialSum); + } + + // Multiply matrix times vector into vector. + internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + { + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); + + float* pSrcEnd = psrc + ccol; + float* pDstEnd = pdst + crow; + float* pDstCurrent = pdst; + float* pMatCurrent = pmat; + + while (pDstCurrent < pDstEnd) + { + Vector256 res0 = Avx.SetZeroVector256(); + Vector256 res1 = res0; + Vector256 res2 = res0; + Vector256 res3 = res0; + + float* pSrcCurrent = psrc; + + while (pSrcCurrent < pSrcEnd) + { + float* pMatTemp = pMatCurrent; + + Vector256 x01 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x11 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x21 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x31 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x02 = Avx.LoadAlignedVector256(pSrcCurrent); + + res0 = Avx.Add(res0, Avx.Multiply(x01, x02)); + res1 = Avx.Add(res1, Avx.Multiply(x11, x02)); + res2 = 
Avx.Add(res2, Avx.Multiply(x21, x02)); + res3 = Avx.Add(res3, Avx.Multiply(x31, x02)); + + pSrcCurrent += 8; + pMatCurrent += 8; + } + + // Add up the entries of each, with the 4 results in res0 + res0 = Avx.HorizontalAdd(res0, res1); + res2 = Avx.HorizontalAdd(res2, res3); + res0 = Avx.HorizontalAdd(res0, res2); + + Vector128 sum = Sse.Add(GetLow(in res0), GetHigh(in res0)); + if (add) + { + sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent)); + } + Sse.StoreAligned(pDstCurrent, sum); + + pDstCurrent += 4; + pMatCurrent += 3 * ccol; + } + + ZeroUpper(); + } + } + + // Partial sparse source vector. + internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + + // REVIEW: For extremely sparse inputs, interchanging the loops would + // likely be more efficient. + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + fixed (int* pposSrc = &rgposSrc[0]) + { + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); + + int* pposMin = pposSrc + iposMin; + int* pposEnd = pposSrc + iposEnd; + float* pDstEnd = pdst + crow; + float* pm0 = pmat - posMin; + float* pSrcCurrent = psrc - posMin; + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pm1 = pm0 + ccol; + float* pm2 = pm1 + ccol; + float* pm3 = pm2 + ccol; + Vector256 result = Avx.SetZeroVector256(); + + int* ppos = pposMin; + + while (ppos < pposEnd) + { + int col1 = *ppos; + int col2 = col1 + 4 * ccol; + Vector256 x1 = Avx.SetVector256(pm3[col2], pm2[col2], pm1[col2], pm0[col2], + pm3[col1], pm2[col1], pm1[col1], pm0[col1]); + Vector256 x2 = Avx.SetAllVector256(pSrcCurrent[col1]); + x2 = Avx.Multiply(x2, x1); + result = Avx.Add(result, x2); + 
+ ppos++; + } + + if (add) + { + result = Avx.Add(result, Avx.LoadAlignedVector256(pDstCurrent)); + } + Avx.StoreAligned(pDstCurrent, result); + + pDstCurrent += 8; + pm0 += 8 * ccol; + } + + ZeroUpper(); + } + } + + internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + { + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); + + float* pSrcEnd = psrc + ccol; + float* pDstEnd = pdst + crow; + float* pSrcCurrent = psrc; + float* pMatCurrent = pmat; + + // We do 4-way unrolling + if (!add) + { + Vector128 h01 = Sse.LoadAlignedVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. + Vector128 h11 = Sse.Shuffle(h01, h01, 0x55); // B + Vector128 h21 = Sse.Shuffle(h01, h01, 0xAA); // C + Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D + h01 = Sse.Shuffle(h01, h01, 0x00); // A + + Vector256 x01 = ToVector256(h01, h01); + Vector256 x11 = ToVector256(h11, h11); + Vector256 x21 = ToVector256(h21, h21); + Vector256 x31 = ToVector256(h31, h31); + + pSrcCurrent += 4; + + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x12 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x22 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x32 = Avx.LoadAlignedVector256(pMatTemp += crow); + + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + + Avx.StoreAligned(pDstCurrent, x02); + + pDstCurrent += 8; + pMatCurrent += 
8; + } + + pMatCurrent += 3 * crow; + } + + while (pSrcCurrent < pSrcEnd) + { + Vector128 h01 = Sse.LoadAlignedVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. + Vector128 h11 = Sse.Shuffle(h01, h01, 0x55); // B + Vector128 h21 = Sse.Shuffle(h01, h01, 0xAA); // C + Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D + h01 = Sse.Shuffle(h01, h01, 0x00); // A + + Vector256 x01 = ToVector256(h01, h01); + Vector256 x11 = ToVector256(h11, h11); + Vector256 x21 = ToVector256(h21, h21); + Vector256 x31 = ToVector256(h31, h31); + + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatTemp = pMatCurrent; + + Vector256 x02 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x12 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x22 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x32 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x3 = Avx.LoadAlignedVector256(pDstCurrent); + + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + x3 = Avx.Add(x02, x3); + + Avx.StoreAligned(pDstCurrent, x3); + + pDstCurrent += 8; + pMatCurrent += 8; + } + + pMatCurrent += 3 * crow; + pSrcCurrent += 4; + } + + ZeroUpper(); + } + } + + // Partial sparse source vector. 
+ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + fixed (int* pposSrc = &rgposSrc[0]) + { + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); + + int* ppos = pposSrc + iposMin; + int* pposEnd = pposSrc + iposEnd; + float* pDstEnd = pdst + crow; + + if (!add) + { + int col = *ppos - posMin; + ppos++; + + Vector256 x0 = Avx.SetAllVector256(psrc[col]); + float* pDstCurrent = pdst; + float* pMatCurrent = pmat + col * crow; + + while (pDstCurrent < pDstEnd) + { + Vector256 x1 = Avx.LoadAlignedVector256(pMatCurrent); + x1 = Avx.Multiply(x1, x0); + Avx.StoreAligned(pDstCurrent, x1); + + pDstCurrent += 8; + pMatCurrent += 8; + } + } + + // REVIEW: Should we explore unrolling the outer loop? 
+ while (ppos < pposEnd) + { + int col = *ppos - posMin; + + Vector256 x0 = Avx.SetAllVector256(psrc[col]); + float* pDstCurrent = pdst; + float* pMatCurrent = pmat + col * crow; + + while (pDstCurrent < pDstEnd) + { + Vector256 x1 = Avx.LoadAlignedVector256(pMatCurrent); + Vector256 x2 = Avx.LoadAlignedVector256(pDstCurrent); + x1 = Avx.Multiply(x1, x0); + x2 = Avx.Add(x2, x1); + Avx.StoreAligned(pDstCurrent, x2); + + pDstCurrent += 8; + pMatCurrent += 8; + } + + ppos++; + } + + ZeroUpper(); + } + } + + internal static unsafe void ScaleX(float scale, Span dst) + { + Vector256 scaleVector = Avx.SetAllVector256(scale); + + fixed (float* pdst = dst) + { + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + while (pDstCurrent < pEnd) + { + Vector256 dstVector = Avx.LoadAlignedVector256(pDstCurrent); + + dstVector = Avx.Multiply(scaleVector, dstVector); + Avx.StoreAligned(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + } + + ZeroUpper(); + } + + internal static unsafe void AddScaleX(float scale, Span src, Span dst) + { + Vector256 scaleVector = Avx.SetAllVector256(scale); + + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + while (pDstCurrent < pEnd) + { + Vector256 srcVector = Avx.LoadAlignedVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadAlignedVector256(pDstCurrent); + + srcVector = Avx.Multiply(srcVector, scaleVector); + dstVector = Avx.Add(dstVector, srcVector); + Avx.StoreAligned(pDstCurrent, dstVector); + + pDstCurrent += 8; + pSrcCurrent += 8; + } + } + + ZeroUpper(); + } + + internal static unsafe void AddX(Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent < pEnd) + { + Vector256 srcVector = Avx.LoadAlignedVector256(pSrcCurrent); + Vector256 dstVector = 
Avx.LoadAlignedVector256(pDstCurrent); + + Vector256 result = Avx.Add(srcVector, dstVector); + Avx.StoreAligned(pDstCurrent, result); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + ZeroUpper(); + } + } + + internal static unsafe float SumX(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result = Avx.SetZeroVector256(); + + while (pSrcCurrent < pSrcEnd) + { + result = Avx.Add(result, Avx.LoadAlignedVector256(pSrcCurrent)); + pSrcCurrent += 8; + } + + result = VectorSum(in result); + Vector128 result128 = Sse.AddScalar(GetLow(result), GetHigh(result)); + + float sum = Sse.ConvertToSingle(result128); + ZeroUpper(); + return sum; + } + } + } +} diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj index b6c95b93f4..4c46db9c3c 100644 --- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj +++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj @@ -27,4 +27,8 @@ + + + + \ No newline at end of file From 3d76fb19c738a637272025220b0d5dd11f9e8eaf Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 15 Aug 2018 16:24:50 -0700 Subject: [PATCH 02/29] Implemented performance tests for AVX intrinsics, with some fixes to the intrinsics Note: Building perf tests succeed, but running perf tests for AVX intrinsics ends without results. 
--- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 12 +- .../CpuMathUtils.netcoreapp.cs | 30 +++- .../Microsoft.ML.CpuMath.csproj | 9 +- .../AvxPerformanceTests.cs | 150 ++++++++++++++++++ .../CpuMathNativeUtils.cs | 12 ++ .../SsePerformanceTests.cs | 3 +- 6 files changed, 203 insertions(+), 13 deletions(-) create mode 100644 test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 00453e75f4..073a8b561a 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -53,13 +53,13 @@ private static unsafe void ZeroUpper() } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 GetLow(in Vector128 x) + private static unsafe Vector128 GetLow(in Vector256 x) { return Avx.ExtractVector128(x, 0); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 GetHigh(in Vector128 x) + private static unsafe Vector128 GetHigh(in Vector256 x) { return Avx.ExtractVector128(x, 1); } @@ -112,7 +112,7 @@ internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src while (pDstCurrent < pDstEnd) { - Vector256 res0 = Avx.SetZeroVector256(); + Vector256 res0 = Avx.SetZeroVector256(); Vector256 res1 = res0; Vector256 res2 = res0; Vector256 res3 = res0; @@ -165,7 +165,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(src)); Contracts.Assert(Compat(dst)); - + // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient. 
fixed (float* pSrcStart = &src.Items[0]) @@ -189,7 +189,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, float* pm1 = pm0 + ccol; float* pm2 = pm1 + ccol; float* pm3 = pm2 + ccol; - Vector256 result = Avx.SetZeroVector256(); + Vector256 result = Avx.SetZeroVector256(); int* ppos = pposMin; @@ -486,7 +486,7 @@ internal static unsafe float SumX(Span src) float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - Vector256 result = Avx.SetZeroVector256(); + Vector256 result = Avx.SetZeroVector256(); while (pSrcCurrent < pSrcEnd) { diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 81d7acf25a..4991c1fd86 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -17,7 +17,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr Contracts.Assert(mat.Size == dst.Size * src.Size); Contracts.Assert(crun >= 0); - if (Sse.IsSupported) + if (Avx.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size); + } + else + { + Contracts.Assert(crun <= src.Size); + AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun); + } + } + else if (Sse.IsSupported) { if (!tran) { @@ -96,7 +109,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo Contracts.AssertNonEmpty(rgposSrc); Contracts.Assert(crun >= 0); - if (Sse.IsSupported) + if (Avx.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); + } + else + { + Contracts.Assert(crun <= srcValues.Size); + AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); + } + } + else if (Sse.IsSupported) { if (!tran) { diff --git 
a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj index 4c46db9c3c..ef24bf2762 100644 --- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj +++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj @@ -10,6 +10,11 @@ 7.3 + + Auto + true + + @@ -26,9 +31,7 @@ - - - + \ No newline at end of file diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs new file mode 100644 index 0000000000..004333cd68 --- /dev/null +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -0,0 +1,150 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.ML.Runtime.Internal.CpuMath; + +namespace Microsoft.ML.CpuMath.PerformanceTests +{ + public class AvxPerformanceTests + { + private const int EXP_MAX = 127; + private const int EXP_MIN = 0; + + private const int IDXLEN = 1000003; + private const int LEN = 1000003; + private const int EXP_RANGE = EXP_MAX / 2; + private const int DEFAULT_SEED = 253421; + private const float DEFAULT_SCALE = 1.11f; + private const int DEFAULT_CROW = 500; + private const int DEFAULT_CCOL = 2000; + private const bool ADD = true; + + private float[] src, dst, original, src1, src2, result; + private int[] idx; + private int seed = DEFAULT_SEED; + + private static float NextFloat(Random rand, int expRange) + { + double mantissa = (rand.NextDouble() * 2.0) - 1.0; + double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); + return (float)(mantissa * exponent); + } + + private static int GetSeed() + { + int seed = DEFAULT_SEED; + + if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null) + { + string CPUMATH_SEED = 
Environment.GetEnvironmentVariable("CPUMATH_SEED"); + + if (!int.TryParse(CPUMATH_SEED, out seed)) + { + if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) + { + seed = new Random().Next(); + } + else + { + seed = DEFAULT_SEED; + } + } + } + + Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); + + return seed; + } + + [GlobalSetup] + public void Setup() + { + src = new float[LEN]; + dst = new float[LEN]; + src1 = new float[LEN]; + src2 = new float[LEN]; + original = new float[LEN]; + result = new float[LEN]; + idx = new int[IDXLEN]; + + seed = GetSeed(); + Random rand = new Random(seed); + + for (int i = 0; i < LEN; i++) + { + src[i] = NextFloat(rand, EXP_RANGE); + dst[i] = NextFloat(rand, EXP_RANGE); + original[i] = dst[i]; + result[i] = dst[i]; + src1[i] = NextFloat(rand, EXP_RANGE); + src2[i] = NextFloat(rand, EXP_RANGE); + } + + for (int i = 0; i < IDXLEN; i++) + { + idx[i] = rand.Next(0, LEN); + } + } + + [GlobalCleanup] + public void GlobalCleanup() + { + original.CopyTo(dst, 0); + original.CopyTo(result, 0); + } + + [Benchmark] + public unsafe void NativeScaleXPerf() + { + fixed (float* pdst = dst) + { + CpuMathNativeUtils.ScaleX(DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + public void ManagedScaleXPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeAddScaleXPerf() + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + CpuMathNativeUtils.AddScaleX(DEFAULT_SCALE, psrc, pdst, LEN); + } + } + + [Benchmark] + public void ManagedAddScaleXPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + + [Benchmark] + public unsafe void NativeAddXPerf() + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + CpuMathNativeUtils.AddX(psrc, pdst, LEN); + } + } + + [Benchmark] + public void ManagedAddXPerf() => CpuMathUtils.Add(src, dst, LEN); + + [Benchmark] + public unsafe float NativeSumXPerf() 
+ { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumX(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumXPerf() => CpuMathUtils.Sum(src, LEN); + } +} diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 8df3352556..27f46022eb 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -85,5 +85,17 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); + + [DllImport("CpuMathNative", EntryPoint = "ScaleX"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleX(float a, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddScaleX"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleX(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddX"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddX(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "SumX"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumX(/*const*/ float* ps, int c); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index ade2ea6a0e..a6ddf56a36 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -16,7 +16,7 @@ public class SsePerformanceTests private const int IDXLEN = 1000003; private const int LEN = 
1000003; - private const int EXP_RANGE = EXP_MAX / 2; + private const int EXP_RANGE = EXP_MAX / 8; private const int DEFAULT_SEED = 253421; private const float DEFAULT_SCALE = 1.11f; private const int DEFAULT_CROW = 500; @@ -312,7 +312,6 @@ public unsafe float NativeMaxAbsDiffUPerf() [Benchmark] public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); - // TODO: MaxAbsU!!! [Benchmark] public unsafe float NativeDotUPerf() From 6a51bd865a3dbd1e15c15987201496e4882a7b64 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 16 Aug 2018 11:15:56 -0700 Subject: [PATCH 03/29] Changes to perf tests in response to feedback --- .../AvxPerformanceTests.cs | 8 +++----- .../SsePerformanceTests.cs | 6 ++---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 004333cd68..b178ca684d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -16,7 +16,7 @@ public class AvxPerformanceTests private const int IDXLEN = 1000003; private const int LEN = 1000003; - private const int EXP_RANGE = EXP_MAX / 2; + private const int EXP_RANGE = EXP_MAX / 8; private const int DEFAULT_SEED = 253421; private const float DEFAULT_SCALE = 1.11f; private const int DEFAULT_CROW = 500; @@ -37,11 +37,10 @@ private static float NextFloat(Random rand, int expRange) private static int GetSeed() { int seed = DEFAULT_SEED; + string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null) + if (CPUMATH_SEED != null) { - string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - if (!int.TryParse(CPUMATH_SEED, out seed)) { if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) @@ -56,7 +55,6 @@ private static int GetSeed() } 
Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); - return seed; } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index a6ddf56a36..02f844f033 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -37,11 +37,10 @@ private static float NextFloat(Random rand, int expRange) private static int GetSeed() { int seed = DEFAULT_SEED; + string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null) + if (CPUMATH_SEED != null) { - string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - if (!int.TryParse(CPUMATH_SEED, out seed)) { if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) @@ -56,7 +55,6 @@ private static int GetSeed() } Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); - return seed; } From 1b2cea94b5f9ba309aa3cc6d63c0812e21f1cc8a Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 16 Aug 2018 20:44:03 -0700 Subject: [PATCH 04/29] Fixes across multiple files to make unit tests and perf tests work for all used AVX intrinsics Note: Except matrix operations --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 178 +++++++++--- .../CpuMathUtils.netcoreapp.cs | 24 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 8 +- .../AvxPerformanceTests.cs | 46 +-- .../SsePerformanceTests.cs | 4 +- .../UnitTests.cs | 262 +++++++++--------- 6 files changed, 303 insertions(+), 219 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 073a8b561a..94434b7ee5 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -35,17 
+35,17 @@ private static bool Compat(AlignedArray a) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector256 ToVector256(in Vector128 a, in Vector128 b) + private static Vector256 ToVector256(in Vector128 a, in Vector128 b) { // REVIEW NEEDED: Is it the correct port of the following code? // #ifndef _WIN32 // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) // #endif - return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); + return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe void ZeroUpper() + private static void ZeroUpper() { // Currently no-op since _mm256_zeroupper is not supported (ref: https://github.com/dotnet/coreclr/pull/16955) // This is a placeholder in case the intrinsic is supported later on. @@ -53,15 +53,15 @@ private static unsafe void ZeroUpper() } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 GetLow(in Vector256 x) + private static Vector128 GetLow(in Vector256 x) { - return Avx.ExtractVector128(x, 0); + return Avx.ExtractVector128(x, 0); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 GetHigh(in Vector256 x) + private static Vector128 GetHigh(in Vector256 x) { - return Avx.ExtractVector128(x, 1); + return Avx.ExtractVector128(x, 1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] @@ -84,12 +84,28 @@ private static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 VectorSum(in Vector256 vector) + private static Vector256 VectorSum256(in Vector256 vector) { Vector256 partialSum = Avx.HorizontalAdd(vector, vector); return Avx.HorizontalAdd(partialSum, partialSum); } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 VectorSum128(in 
Vector128 vector) + { + if (Sse3.IsSupported) + { + Vector128 partialSum = Sse3.HorizontalAdd(vector, vector); + return Sse3.HorizontalAdd(partialSum, partialSum); + } + else + { + Vector128 partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector)); + // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC. + return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1)); + } + } + // Multiply matrix times vector into vector. internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { @@ -403,33 +419,53 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos } } - internal static unsafe void ScaleX(float scale, Span dst) + internal static unsafe void ScaleU(float scale, Span dst) { - Vector256 scaleVector = Avx.SetAllVector256(scale); - fixed (float* pdst = dst) { float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; - while (pDstCurrent < pEnd) + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pEnd) { - Vector256 dstVector = Avx.LoadAlignedVector256(pDstCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); - dstVector = Avx.Multiply(scaleVector, dstVector); - Avx.StoreAligned(pDstCurrent, dstVector); + dstVector = Avx.Multiply(scaleVector256, dstVector); + Avx.Store(pDstCurrent, dstVector); pDstCurrent += 8; } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + while (pDstCurrent + 4 <= pEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + dstVector = Sse.Multiply(scaleVector128, dstVector); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + while (pDstCurrent < pEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); + Sse.StoreScalar(pDstCurrent, dstVector); + + pDstCurrent++; + } } ZeroUpper(); } - internal static unsafe void AddScaleX(float scale, Span src, Span 
dst) + internal static unsafe void AddScaleU(float scale, Span src, Span dst) { - Vector256 scaleVector = Avx.SetAllVector256(scale); - fixed (float* psrc = src) fixed (float* pdst = dst) { @@ -437,24 +473,54 @@ internal static unsafe void AddScaleX(float scale, Span src, Span float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; - while (pDstCurrent < pEnd) + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pEnd) { - Vector256 srcVector = Avx.LoadAlignedVector256(pSrcCurrent); - Vector256 dstVector = Avx.LoadAlignedVector256(pDstCurrent); + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); - srcVector = Avx.Multiply(srcVector, scaleVector); + srcVector = Avx.Multiply(srcVector, scaleVector256); dstVector = Avx.Add(dstVector, srcVector); - Avx.StoreAligned(pDstCurrent, dstVector); + Avx.Store(pDstCurrent, dstVector); - pDstCurrent += 8; pSrcCurrent += 8; + pDstCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + while (pDstCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pDstCurrent, dstVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pDstCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + dstVector = Sse.AddScalar(dstVector, srcVector); + Sse.StoreScalar(pDstCurrent, dstVector); + + pSrcCurrent++; + pDstCurrent++; } } ZeroUpper(); } - internal static unsafe void AddX(Span src, Span dst) + internal static unsafe void AddU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -463,41 +529,81 @@ internal static unsafe void AddX(Span src, Span dst) 
float* pDstCurrent = pdst; float* pEnd = psrc + src.Length; - while (pSrcCurrent < pEnd) + while (pSrcCurrent + 8 <= pEnd) { - Vector256 srcVector = Avx.LoadAlignedVector256(pSrcCurrent); - Vector256 dstVector = Avx.LoadAlignedVector256(pDstCurrent); + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); Vector256 result = Avx.Add(srcVector, dstVector); - Avx.StoreAligned(pDstCurrent, result); + Avx.Store(pDstCurrent, result); pSrcCurrent += 8; pDstCurrent += 8; } + while (pSrcCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + Vector128 result = Sse.Add(srcVector, dstVector); + Sse.Store(pDstCurrent, result); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + Vector128 result = Sse.AddScalar(srcVector, dstVector); + Sse.StoreScalar(pDstCurrent, result); + + pSrcCurrent++; + pDstCurrent++; + } + ZeroUpper(); } } - internal static unsafe float SumX(Span src) + internal static unsafe float SumU(Span src) { fixed (float* psrc = src) { float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - Vector256 result = Avx.SetZeroVector256(); + Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent < pSrcEnd) + while (pSrcCurrent + 8 <= pSrcEnd) { - result = Avx.Add(result, Avx.LoadAlignedVector256(pSrcCurrent)); + result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent)); pSrcCurrent += 8; } - result = VectorSum(in result); - Vector128 result128 = Sse.AddScalar(GetLow(result), GetHigh(result)); + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + result128 = Sse.Add(result128, 
Sse.LoadVector128(pSrcCurrent)); + pSrcCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + result128 = Sse.AddScalar(result128, Sse.LoadScalarVector128(pSrcCurrent)); + pSrcCurrent++; + } - float sum = Sse.ConvertToSingle(result128); + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); ZeroUpper(); return sum; } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 4991c1fd86..c192052ca6 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -230,7 +230,11 @@ public static void Scale(float a, float[] dst, int offset, int count) private static void Scale(float a, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleU(a, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleU(a, dst); } @@ -321,7 +325,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in private static void AddScale(float a, Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleU(a, src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleU(a, src, dst); } @@ -420,7 +428,11 @@ public static void Add(float[] src, float[] dst, int count) private static void Add(Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddU(src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddU(src, dst); } @@ -527,7 +539,11 @@ public static float Sum(float[] src, int offset, int count) private static float Sum(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumU(src); } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index bf7ad03e34..44157364a8 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs 
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -406,19 +406,19 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos } // dst[i] += scale - internal static unsafe void AddScalarU(float scale, Span dst) + internal static unsafe void AddScalarU(float scalar, Span dst) { fixed (float* pdst = dst) { float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; - Vector128 x1 = Sse.SetAllVector128(scale); + Vector128 scalarVector = Sse.SetAllVector128(scalar); while (pDstCurrent + 4 <= pDstEnd) { Vector128 x2 = Sse.LoadVector128(pDstCurrent); - x2 = Sse.Add(x2, x1); + x2 = Sse.Add(x2, scalarVector); Sse.Store(pDstCurrent, x2); pDstCurrent += 4; @@ -427,7 +427,7 @@ internal static unsafe void AddScalarU(float scale, Span dst) while (pDstCurrent < pDstEnd) { Vector128 x2 = Sse.LoadScalarVector128(pDstCurrent); - x2 = Sse.AddScalar(x2, x1); + x2 = Sse.AddScalar(x2, scalarVector); Sse.StoreScalar(pDstCurrent, x2); pDstCurrent++; diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index b178ca684d..fdb7140738 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -96,53 +96,15 @@ public void GlobalCleanup() } [Benchmark] - public unsafe void NativeScaleXPerf() - { - fixed (float* pdst = dst) - { - CpuMathNativeUtils.ScaleX(DEFAULT_SCALE, pdst, LEN); - } - } - - [Benchmark] - public void ManagedScaleXPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); - - [Benchmark] - public unsafe void NativeAddScaleXPerf() - { - fixed (float* psrc = src) - fixed (float* pdst = dst) - { - CpuMathNativeUtils.AddScaleX(DEFAULT_SCALE, psrc, pdst, LEN); - } - } + public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); [Benchmark] - public void ManagedAddScaleXPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void 
ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); [Benchmark] - public unsafe void NativeAddXPerf() - { - fixed (float* psrc = src) - fixed (float* pdst = dst) - { - CpuMathNativeUtils.AddX(psrc, pdst, LEN); - } - } - - [Benchmark] - public void ManagedAddXPerf() => CpuMathUtils.Add(src, dst, LEN); - - [Benchmark] - public unsafe float NativeSumXPerf() - { - fixed (float* psrc = src) - { - return CpuMathNativeUtils.SumX(psrc, LEN); - } - } + public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN); [Benchmark] - public float ManagedSumXPerf() => CpuMathUtils.Sum(src, LEN); + public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 02f844f033..ff1f451550 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -96,11 +96,11 @@ public void GlobalCleanup() } [Benchmark] - public unsafe float NativeAddScalarUPerf() + public unsafe void NativeAddScalarUPerf() { fixed (float* pdst = dst) { - return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); + CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); } } diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index d1d5955a8e..b57066be8c 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -70,137 +70,137 @@ public CpuMathUtilsUnitTests() testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } - [Theory] - [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] - [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 
310f })] - public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] - [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] - public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] - [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] - public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] - [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] - public void 
MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] - [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] - public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; - - CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] - [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] - public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; - - CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] - 
[InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] - public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; - - CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] - [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] - public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; - - CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); - } + //[Theory] + //[InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] + //[InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] + //[InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] + //public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + + // CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { 23.28f, 
-48.72f, 25.28f, -46.72f })] + //[InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] + //[InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] + //public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + + // CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] + //[InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] + //[InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] + //public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + + // CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] + //[InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] + //[InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] + //public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + + // CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, 
comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] + //[InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] + //[InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] + //public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + // int[] idx = testIndexArray; + + // CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] + //[InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] + //[InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] + //public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + // int[] idx = testIndexArray; + + // CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] + //[InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] + //[InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] + //public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + // int[] idx 
= testIndexArray; + + // CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} + + //[Theory] + //[InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] + //[InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] + //[InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] + //public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + //{ + // AlignedArray mat = testMatrices[matTest]; + // AlignedArray src = testSrcVectors[srcTest]; + // AlignedArray dst = testDstVectors[dstTest]; + // int[] idx = testIndexArray; + + // CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + // float[] actual = new float[dst.Size]; + // dst.CopyTo(actual, 0, dst.Size); + // Assert.Equal(expected, actual, comparer); + //} [Theory] [InlineData(0)] From f471726416439540a21d0dcb3e8e0fba538688b7 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 16 Aug 2018 23:28:33 -0700 Subject: [PATCH 05/29] Implemented new AVX intrinsics that do not involve matrix operations, passing basic unit tests --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 1075 ++++++++++++++++- .../CpuMathUtils.netcoreapp.cs | 108 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 196 +-- 3 files changed, 1246 insertions(+), 133 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 94434b7ee5..4c006ea868 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -64,6 +64,24 @@ private static Vector128 GetHigh(in Vector256 x) return Avx.ExtractVector128(x, 1); } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 Load1(float* src, int* idx) + { + return Sse.SetScalarVector128(src[idx[0]]); + } + + 
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 Load4(float* src, int* idx) + { + return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector256 Load8(float* src, int* idx) + { + return Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 Rotate(in Vector128 x) { @@ -84,10 +102,24 @@ private static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 VectorSum256(in Vector256 vector) + private static unsafe void Store8(in Vector256 x, float* dst, int* idx) { - Vector256 partialSum = Avx.HorizontalAdd(vector, vector); - return Avx.HorizontalAdd(partialSum, partialSum); + Vector128 tmp = GetLow(in x); + Sse.StoreScalar(dst + idx[0], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[1], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[2], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[3], tmp); + tmp = GetHigh(in x); + Sse.StoreScalar(dst + idx[4], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[5], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[6], tmp); + tmp = Rotate(in tmp); + Sse.StoreScalar(dst + idx[7], tmp); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] @@ -106,6 +138,68 @@ private static Vector128 VectorSum128(in Vector128 vector) } } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 VectorSum256(in Vector256 vector) + { + Vector256 partialSum = Avx.HorizontalAdd(vector, vector); + return Avx.HorizontalAdd(partialSum, partialSum); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 VectorMax128(in Vector128 
vector) + { + Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); + Vector128 partialMax = Sse.Max(vector, x1); + x1 = Sse.Shuffle(partialMax, partialMax, 0x02); + return Sse.MaxScalar(partialMax, x1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 VectorMax256(in Vector256 vector) + { + Vector256 x1 = Avx.Shuffle(vector, vector, 0xB1); + Vector256 partialMax = Avx.Max(vector, x1); + x1 = Avx.Shuffle(partialMax, partialMax, 0x02); + return Avx.Max(partialMax, x1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 GetAbsMask128() + { + return Sse2.IsSupported ? + Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : + Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 GetAbsMask256() + { + return Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 signMask, in Vector128 xThreshold) + { + Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise + Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); + Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true + Vector128 x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise + return Sse.And(Sse.Subtract(xDst1, x2), xCond); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 GetNewDst256(in Vector256 xDst1, in Vector256 signMask, in Vector256 xThreshold) + { + Vector256 xSign = Avx.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise + Vector256 xDst1Abs = Avx.Xor(xDst1, xSign); + + // REVIEW NEEDED: Do we want Signaling or NonSignaling? 
The original functionality is NonSignaling, which does not throw an exception even when there is an NaN. + // Signaling means that if an operand contains an NaN, an exception is raised (ref: https://stackoverflow.com/questions/16988199/how-to-choose-avx-compare-predicate-variants) + Vector256 xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedSignaling); // result = 0xFFFF FFFF if true + Vector256 x2 = Avx.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise + return Avx.And(Avx.Subtract(xDst1, x2), xCond); + } + // Multiply matrix times vector into vector. internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { @@ -419,6 +513,49 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos } } + // dst[i] += scale + internal static unsafe void AddScalarU(float scalar, Span dst) + { + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pDstCurrent = pdst; + + Vector256 scalarVector256 = Avx.SetAllVector256(scalar); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + dstVector = Avx.Add(dstVector, scalarVector256); + Avx.Store(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + + Vector128 scalarVector128 = Sse.SetAllVector128(scalar); + + while (pDstCurrent + 4 <= pDstEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, scalarVector128); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + while (pDstCurrent < pDstEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, scalarVector128); + Sse.StoreScalar(pDstCurrent, dstVector); + + pDstCurrent++; + } + } + + ZeroUpper(); + } + internal static unsafe void ScaleU(float scale, Span dst) { fixed (float* pdst = dst) @@ -464,6 +601,101 @@ internal static unsafe void ScaleU(float scale, 
Span dst) ZeroUpper(); } + internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Multiply(srcVector, scaleVector256); + Avx.Store(pDstCurrent, srcVector); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + while (pDstCurrent + 4 <= pDstEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector128); + Sse.Store(pDstCurrent, srcVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pDstCurrent < pDstEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + Sse.StoreScalar(pDstCurrent, srcVector); + + pSrcCurrent++; + pDstCurrent++; + } + } + + ZeroUpper(); + } + + // dst[i] = a * (dst[i] + b) + internal static unsafe void ScaleAddU(float a, float b, Span dst) + { + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pDstCurrent = pdst; + + Vector256 a256 = Avx.SetAllVector256(a); + Vector256 b256 = Avx.SetAllVector256(b); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + dstVector = Avx.Add(dstVector, b256); + dstVector = Avx.Multiply(dstVector, a256); + Avx.Store(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + + Vector128 a128 = Sse.SetAllVector128(a); + Vector128 b128 = Sse.SetAllVector128(b); + + while (pDstCurrent + 4 <= pDstEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, b128); + dstVector = Sse.Multiply(dstVector, a128); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + 
while (pDstCurrent < pDstEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, b128); + dstVector = Sse.MultiplyScalar(dstVector, a128); + Sse.StoreScalar(pDstCurrent, dstVector); + + pDstCurrent++; + } + } + + ZeroUpper(); + } + internal static unsafe void AddScaleU(float scale, Span src, Span dst) { fixed (float* psrc = src) @@ -520,6 +752,117 @@ internal static unsafe void AddScaleU(float scale, Span src, Span ZeroUpper(); } + internal static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) + { + float* pResEnd = pres + result.Length; + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pResCurrent = pres; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pResCurrent + 8 <= pResEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + srcVector = Avx.Multiply(srcVector, scaleVector256); + dstVector = Avx.Add(dstVector, srcVector); + Avx.Store(pResCurrent, dstVector); + + pSrcCurrent += 8; + pDstCurrent += 8; + pResCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + while (pResCurrent + 4 <= pResEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pResCurrent, dstVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + pResCurrent += 4; + } + + while (pResCurrent < pResEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + dstVector = Sse.AddScalar(dstVector, srcVector); + Sse.StoreScalar(pResCurrent, dstVector); + + pSrcCurrent++; + pDstCurrent++; + 
pResCurrent++; + } + } + + ZeroUpper(); + } + + internal static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) + { + fixed (float* psrc = src) + fixed (int* pidx = idx) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + float* pDstCurrent = pdst; + int* pEnd = pidx + idx.Length; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pIdxCurrent + 8 <= pEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); + + srcVector = Avx.Multiply(srcVector, scaleVector256); + dstVector = Avx.Add(dstVector, srcVector); + Store8(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 8; + pSrcCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + while (pIdxCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + Store4(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pEnd) + { + pDstCurrent[*pIdxCurrent] += scale * (*pSrcCurrent); + + pIdxCurrent++; + pSrcCurrent++; + } + } + + ZeroUpper(); + } + internal static unsafe void AddU(Span src, Span dst) { fixed (float* psrc = src) @@ -564,34 +907,132 @@ internal static unsafe void AddU(Span src, Span dst) pSrcCurrent++; pDstCurrent++; } - - ZeroUpper(); } + + ZeroUpper(); } - internal static unsafe float SumU(Span src) + internal static unsafe void AddSU(Span src, Span idx, Span dst) { fixed (float* psrc = src) + fixed (int* pidx = idx) + fixed (float* pdst = dst) { - float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + float* pDstCurrent = pdst; + int* pEnd = pidx + idx.Length; - Vector256 result256 = Avx.SetZeroVector256(); - - while (pSrcCurrent + 8 <= pSrcEnd) + while (pIdxCurrent 
+ 8 <= pEnd) { - result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent)); + Vector256 srcVector = Load8(pDstCurrent, pIdxCurrent); + Vector256 dstVector = Avx.LoadVector256(pSrcCurrent); + + srcVector = Avx.Add(srcVector, dstVector); + Store8(in srcVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 8; pSrcCurrent += 8; } - result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + while (pIdxCurrent + 4 <= pEnd) + { + Vector128 srcVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 dstVector = Sse.LoadVector128(pSrcCurrent); - Vector128 result128 = Sse.SetZeroVector128(); + srcVector = Sse.Add(srcVector, dstVector); + Store4(in srcVector, pDstCurrent, pIdxCurrent); - while (pSrcCurrent + 4 <= pSrcEnd) - { - result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent)); + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pEnd) + { + pDstCurrent[*pIdxCurrent] += *pSrcCurrent; + + pIdxCurrent++; + pSrcCurrent++; + } + } + + ZeroUpper(); + } + + internal static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) + { + fixed (float* psrc1 = &src1[0]) + fixed (float* psrc2 = &src2[0]) + fixed (float* pdst = dst) + { + float* pSrc1Current = psrc1; + float* pSrc2Current = psrc2; + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + while (pDstCurrent + 8 <= pEnd) + { + Vector256 src1Vector = Avx.LoadVector256(pSrc1Current); + Vector256 src2Vector = Avx.LoadVector256(pSrc2Current); + src2Vector = Avx.Multiply(src1Vector, src2Vector); + Avx.Store(pDstCurrent, src2Vector); + + pSrc1Current += 8; + pSrc2Current += 8; + pDstCurrent += 8; + } + + while (pDstCurrent + 4 <= pEnd) + { + Vector128 src1Vector = Sse.LoadVector128(pSrc1Current); + Vector128 src2Vector = Sse.LoadVector128(pSrc2Current); + src2Vector = Sse.Multiply(src1Vector, src2Vector); + Sse.Store(pDstCurrent, src2Vector); + + pSrc1Current += 4; + pSrc2Current += 4; + pDstCurrent += 4; + } + + while 
(pDstCurrent < pEnd) + { + Vector128 src1Vector = Sse.LoadScalarVector128(pSrc1Current); + Vector128 src2Vector = Sse.LoadScalarVector128(pSrc2Current); + src2Vector = Sse.MultiplyScalar(src1Vector, src2Vector); + Sse.StoreScalar(pDstCurrent, src2Vector); + + pSrc1Current++; + pSrc2Current++; + pDstCurrent++; + } + } + + ZeroUpper(); + } + + internal static unsafe float SumU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent)); + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent)); pSrcCurrent += 4; } @@ -608,5 +1049,605 @@ internal static unsafe float SumU(Span src) return sum; } } + + internal static unsafe float SumSqU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = Avx.Add(result256, Avx.Multiply(srcVector, srcVector)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector)); + + pSrcCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = 
Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, srcVector)); + + pSrcCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float SumSqDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Add(result256, Avx.Multiply(srcVector, srcVector)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector)); + + pSrcCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, srcVector)); + + pSrcCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float SumAbsU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 mask256 = GetAbsMask256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = 
Avx.Add(result256, Avx.And(srcVector, mask256)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 mask128 = GetAbsMask128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Add(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float SumAbsDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + Vector256 mask256 = GetAbsMask256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Add(result256, Avx.And(srcVector, mask256)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + Vector128 mask128 = GetAbsMask128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Add(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = 
Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float MaxAbsU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 mask256 = GetAbsMask256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = Avx.Max(result256, Avx.And(srcVector, mask256)); + + pSrcCurrent += 8; + } + + result256 = VectorMax256(in result256); + Vector128 resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 mask128 = GetAbsMask128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Max(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent += 4; + } + + result128 = VectorMax128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent++; + } + + float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); + ZeroUpper(); + return max; + } + } + + internal static unsafe float MaxAbsDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + Vector256 mask256 = GetAbsMask256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Max(result256, 
Avx.And(srcVector, mask256)); + + pSrcCurrent += 8; + } + + result256 = VectorMax256(in result256); + Vector128 resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + Vector128 mask128 = GetAbsMask128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Max(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent += 4; + } + + result128 = VectorMax128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128)); + + pSrcCurrent++; + } + + float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); + ZeroUpper(); + return max; + } + } + + internal static unsafe float DotU(Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pSrcEnd = psrc + src.Length; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + result256 = Avx.Add(result256, Avx.Multiply(srcVector, dstVector)); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector)); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + result128 = VectorSum128(in 
result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector)); + + pSrcCurrent++; + pDstCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float DotSU(Span src, Span dst, Span idx) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (int* pidx = idx) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + int* pIdxCurrent = pidx; + int* pIdxEnd = pidx + idx.Length; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pIdxCurrent + 8 <= pIdxEnd) + { + Vector256 srcVector = Load8(pSrcCurrent, pIdxCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + result256 = Avx.Add(result256, Avx.Multiply(srcVector, dstVector)); + + pIdxCurrent += 8; + pDstCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector)); + + pIdxCurrent += 4; + pDstCurrent += 4; + } + + result128 = VectorSum128(in result128); + + while (pIdxCurrent < pIdxEnd) + { + Vector128 srcVector = Load1(pSrcCurrent, pIdxCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector)); + + pIdxCurrent++; + pDstCurrent++; + } + + float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + ZeroUpper(); + return sum; + } + } + + internal static unsafe float Dist2(Span src, Span dst) + { + fixed (float* psrc = src) + 
fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pSrcEnd = psrc + src.Length; + + Vector256 sqDistanceVector256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 distanceVector = Avx.Subtract(Avx.LoadVector256(pSrcCurrent), + Avx.LoadVector256(pDstCurrent)); + sqDistanceVector256 = Avx.Add(sqDistanceVector256, + Avx.Multiply(distanceVector, distanceVector)); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + sqDistanceVector256 = VectorSum256(in sqDistanceVector256); + Vector128 sqDistanceVectorPadded = Sse.AddScalar(GetLow(sqDistanceVector256), GetHigh(sqDistanceVector256)); + + Vector128 sqDistanceVector128 = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), + Sse.LoadVector128(pDstCurrent)); + sqDistanceVector128 = Sse.Add(sqDistanceVector128, + Sse.Multiply(distanceVector, distanceVector)); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + sqDistanceVector128 = VectorSum128(in sqDistanceVector128); + + float norm = Sse.ConvertToSingle(Sse.AddScalar(sqDistanceVector128, sqDistanceVectorPadded)); + while (pSrcCurrent < pSrcEnd) + { + float distance = (*pSrcCurrent) - (*pDstCurrent); + norm += distance * distance; + + pSrcCurrent++; + pDstCurrent++; + } + + ZeroUpper(); + return norm; + } + } + + internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + float* pDst1Current = pdst1; + float* pDst2Current = pdst2; + + Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); + + Vector256 signMask256 = Avx.SetAllVector256(-0.0f); // 0x8000 0000 + Vector256 xThreshold256 = Avx.SetAllVector256(threshold); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); + 
+ Vector256 xDst1 = Avx.LoadVector256(pDst1Current); + xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); + Vector256 xDst2 = GetNewDst256(xDst1, signMask256, xThreshold256); + + Avx.Store(pDst1Current, xDst1); + Avx.Store(pDst2Current, xDst2); + + pSrcCurrent += 8; + pDst1Current += 8; + pDst2Current += 8; + } + + Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); + + Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 + Vector128 xThreshold128 = Sse.SetAllVector128(threshold); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + + Vector128 xDst1 = Sse.LoadVector128(pDst1Current); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); + Vector128 xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128); + + Sse.Store(pDst1Current, xDst1); + Sse.Store(pDst2Current, xDst2); + + pSrcCurrent += 4; + pDst1Current += 4; + pDst2Current += 4; + } + + while (pSrcCurrent < pSrcEnd) + { + *pDst1Current += (*pSrcCurrent) * primalUpdate; + float dst1 = *pDst1Current; + *pDst2Current = Math.Abs(dst1) > threshold ? (dst1 > 0 ? 
dst1 - threshold : dst1 + threshold) : 0; + + pSrcCurrent++; + pDst1Current++; + pDst2Current++; + } + } + } + + internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (int* pidx = indices) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + int* pIdxEnd = pidx + indices.Length; + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + + Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); + + Vector256 signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000 + Vector256 xThreshold = Avx.SetAllVector256(threshold); + + while (pIdxCurrent + 8 <= pIdxEnd) + { + Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); + + Vector256 xDst1 = Load8(pdst1, pIdxCurrent); + xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); + Vector256 xDst2 = GetNewDst256(xDst1, signMask, xThreshold); + + Store8(in xDst1, pdst1, pIdxCurrent); + Store8(in xDst2, pdst2, pIdxCurrent); + + pIdxCurrent += 8; + pSrcCurrent += 8; + } + + Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); + + Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 + Vector128 xThreshold128 = Sse.SetAllVector128(threshold); + + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + + Vector128 xDst1 = Load4(pdst1, pIdxCurrent); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); + Vector128 xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128); + + Store4(in xDst1, pdst1, pIdxCurrent); + Store4(in xDst2, pdst2, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + int index = *pIdxCurrent; + pdst1[index] += (*pSrcCurrent) * primalUpdate; + float dst1 = pdst1[index]; + pdst2[index] = Math.Abs(dst1) > threshold ? (dst1 > 0 ? 
dst1 - threshold : dst1 + threshold) : 0; + + pIdxCurrent++; + pSrcCurrent++; + } + } + } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index c192052ca6..1e944aea55 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -196,7 +196,11 @@ public static void Add(float a, float[] dst, int count) private static void Add(float a, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScalarU(a, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScalarU(a, dst); } @@ -261,7 +265,11 @@ public static void Scale(float a, float[] src, float[] dst, int count) private static void Scale(float a, Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleSrcU(a, src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleSrcU(a, src, dst); } @@ -286,7 +294,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count) private static void ScaleAdd(float a, float b, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleAddU(a, b, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleAddU(a, b, dst); } @@ -373,7 +385,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in private static void AddScale(float a, Span src, Span indices, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleSU(a, src, indices, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleSU(a, src, indices, dst); } @@ -402,7 +418,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, private static void AddScaleCopy(float a, Span src, Span dst, Span res) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleCopyU(a, src, dst, res); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleCopyU(a, src, dst, res); } @@ -476,7 
+496,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i private static void Add(Span src, Span indices, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddSU(src, indices, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddSU(src, indices, dst); } @@ -505,7 +529,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c private static void MulElementWise(Span src1, Span src2, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.MulElementWiseU(src1, src2, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.MulElementWiseU(src1, src2, dst); } @@ -579,7 +607,11 @@ public static float SumSq(float[] src, int offset, int count) private static float SumSq(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumSqU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumSqU(src); } @@ -606,7 +638,11 @@ public static float SumSq(float mean, float[] src, int offset, int count) private static float SumSq(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src); + } + else if (Sse.IsSupported) { return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); } @@ -642,7 +678,11 @@ public static float SumAbs(float[] src, int offset, int count) private static float SumAbs(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumAbsU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumAbsU(src); } @@ -669,7 +709,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count) private static float SumAbs(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src); + } + else if (Sse.IsSupported) { return (mean == 0) ? 
SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); } @@ -705,7 +749,11 @@ public static float MaxAbs(float[] src, int offset, int count) private static float MaxAbs(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.MaxAbsU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsU(src); } @@ -735,7 +783,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count) private static float MaxAbsDiff(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.MaxAbsDiffU(mean, src); + } + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsDiffU(mean, src); } @@ -779,7 +831,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count) private static float DotProductDense(Span a, Span b) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.DotU(a, b); + } + else if (Sse.IsSupported) { return SseIntrinsics.DotU(a, b); } @@ -826,7 +882,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind private static float DotProductSparse(Span a, Span b, Span indices) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.DotSU(a, b, indices); + } + else if (Sse.IsSupported) { return SseIntrinsics.DotSU(a, b, indices); } @@ -855,7 +915,11 @@ public static float L2DistSquared(float[] a, float[] b, int count) private static float L2DistSquared(Span a, Span b) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.Dist2(a, b); + } + else if (Sse.IsSupported) { return SseIntrinsics.Dist2(a, b); } @@ -951,7 +1015,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src private static void SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); + } + else if (Sse.IsSupported) { 
SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } @@ -985,7 +1053,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); + } + else if (Sse.IsSupported) { SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 44157364a8..02be0f7aaf 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -417,18 +417,18 @@ internal static unsafe void AddScalarU(float scalar, Span dst) while (pDstCurrent + 4 <= pDstEnd) { - Vector128 x2 = Sse.LoadVector128(pDstCurrent); - x2 = Sse.Add(x2, scalarVector); - Sse.Store(pDstCurrent, x2); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, scalarVector); + Sse.Store(pDstCurrent, dstVector); pDstCurrent += 4; } while (pDstCurrent < pDstEnd) { - Vector128 x2 = Sse.LoadScalarVector128(pDstCurrent); - x2 = Sse.AddScalar(x2, scalarVector); - Sse.StoreScalar(pDstCurrent, x2); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, scalarVector); + Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; } @@ -437,13 +437,13 @@ internal static unsafe void AddScalarU(float scalar, Span dst) internal static unsafe void ScaleU(float scale, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* pdst = dst) { float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -468,8 +468,6 @@ internal static unsafe void ScaleU(float scale, Span 
dst) internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (float* pdst = dst) { @@ -477,6 +475,8 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span float* pSrcCurrent = psrc; float* pDstCurrent = pdst; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pDstEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -502,19 +502,19 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span // dst[i] = a * (dst[i] + b) internal static unsafe void ScaleAddU(float a, float b, Span dst) { - Vector128 x1 = Sse.SetAllVector128(a); - Vector128 x2 = Sse.SetAllVector128(b); - fixed (float* pdst = dst) { float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; + Vector128 aVector = Sse.SetAllVector128(a); + Vector128 bVector = Sse.SetAllVector128(b); + while (pDstCurrent + 4 <= pDstEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); - dstVector = Sse.Add(dstVector, x2); - dstVector = Sse.Multiply(dstVector, x1); + dstVector = Sse.Add(dstVector, bVector); + dstVector = Sse.Multiply(dstVector, aVector); Sse.Store(pDstCurrent, dstVector); pDstCurrent += 4; @@ -523,8 +523,8 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) while (pDstCurrent < pDstEnd) { Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); - dstVector = Sse.AddScalar(dstVector, x2); - dstVector = Sse.MultiplyScalar(dstVector, x1); + dstVector = Sse.AddScalar(dstVector, bVector); + dstVector = Sse.MultiplyScalar(dstVector, aVector); Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; @@ -534,8 +534,6 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) internal static unsafe void AddScaleU(float scale, Span src, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (float* pdst = dst) { @@ -543,6 +541,8 @@ internal static unsafe void 
AddScaleU(float scale, Span src, Span float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -582,15 +582,15 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span x1 = Sse.SetAllVector128(scale); + Vector128 scaleVector = Sse.SetAllVector128(scale); while (pResCurrent + 4 <= pResEnd) { - Vector128 x2 = Sse.LoadVector128(pSrcCurrent); - Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x2 = Sse.Multiply(x2, x1); - x3 = Sse.Add(x3, x2); - Sse.Store(pResCurrent, x3); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pResCurrent, dstVector); pSrcCurrent += 4; pDstCurrent += 4; @@ -599,11 +599,11 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span x2 = Sse.LoadScalarVector128(pSrcCurrent); - Vector128 x3 = Sse.LoadScalarVector128(pDstCurrent); - x2 = Sse.MultiplyScalar(x2, x1); - x3 = Sse.AddScalar(x3, x2); - Sse.StoreScalar(pResCurrent, x3); + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector); + dstVector = Sse.AddScalar(dstVector, srcVector); + Sse.StoreScalar(pResCurrent, dstVector); pSrcCurrent++; pDstCurrent++; @@ -614,8 +614,6 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span src, Span idx, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (int* pidx = idx) fixed (float* pdst = dst) @@ -625,6 +623,8 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i float* pDstCurrent = pdst; int* pEnd = pidx + idx.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pIdxCurrent + 4 <= 
pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -782,14 +782,14 @@ internal static unsafe float SumU(Span src) internal static unsafe float SumSqU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Add(result, Sse.Multiply(srcVector, srcVector)); @@ -799,16 +799,16 @@ internal static unsafe float SumSqU(Span src) result = VectorSum(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float SumSqDiffU(float mean, Span src) @@ -823,9 +823,9 @@ internal static unsafe float SumSqDiffU(float mean, Span src) while (pSrcCurrent + 4 <= pSrcEnd) { - Vector128 x = Sse.LoadVector128(pSrcCurrent); - x = Sse.Subtract(x, meanVector); - result = Sse.Add(result, Sse.Multiply(x, x)); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector); + result = Sse.Add(result, Sse.Multiply(srcVector, srcVector)); pSrcCurrent += 4; } @@ -834,9 +834,9 @@ internal static unsafe float SumSqDiffU(float mean, Span src) while (pSrcCurrent < pSrcEnd) { - Vector128 x = Sse.LoadScalarVector128(pSrcCurrent); - x = Sse.SubtractScalar(x, meanVector); - result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x)); + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector); + result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector)); pSrcCurrent++; } @@ -847,15 +847,15 @@ 
internal static unsafe float SumSqDiffU(float mean, Span src) internal static unsafe float SumAbsU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 mask = GetAbsMask(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Add(result, Sse.And(srcVector, mask)); @@ -865,30 +865,30 @@ internal static unsafe float SumAbsU(Span src) result = VectorSum(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); result = Sse.AddScalar(result, Sse.And(srcVector, mask)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float SumAbsDiffU(float mean, Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + Vector128 mask = GetAbsMask(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); @@ -899,7 +899,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) result = VectorSum(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); @@ -907,22 +907,22 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) pSrcCurrent++; } - } - 
return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float MaxAbsU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 mask = GetAbsMask(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Max(result, Sse.And(srcVector, mask)); @@ -932,30 +932,30 @@ internal static unsafe float MaxAbsU(Span src) result = VectorMax(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float MaxAbsDiffU(float mean, Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + Vector128 mask = GetAbsMask(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); @@ -966,7 +966,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) result = VectorMax(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); @@ -974,23 +974,23 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) 
pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float DotU(Span src, Span dst) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) { float* pSrcCurrent = psrc; float* pDstCurrent = pdst; - float* pEnd = psrc + src.Length; + float* pSrcEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1003,7 +1003,7 @@ internal static unsafe float DotU(Span src, Span dst) result = VectorSum(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); @@ -1013,15 +1013,13 @@ internal static unsafe float DotU(Span src, Span dst) pSrcCurrent++; pDstCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float DotSU(Span src, Span dst, Span idx) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) fixed (int* pidx = idx) @@ -1029,9 +1027,11 @@ internal static unsafe float DotSU(Span src, Span dst, Span i float* pSrcCurrent = psrc; float* pDstCurrent = pdst; int* pIdxCurrent = pidx; - int* pEnd = pidx + idx.Length; + int* pIdxEnd = pidx + idx.Length; - while (pIdxCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pIdxCurrent + 4 <= pIdxEnd) { Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1044,7 +1044,7 @@ internal static unsafe float DotSU(Span src, Span dst, Span i result = VectorSum(in result); - while (pIdxCurrent < pEnd) + while (pIdxCurrent < pIdxEnd) { Vector128 srcVector = Load1(pSrcCurrent, pIdxCurrent); Vector128 
dstVector = Sse.LoadScalarVector128(pDstCurrent); @@ -1054,23 +1054,23 @@ internal static unsafe float DotSU(Span src, Span dst, Span i pIdxCurrent++; pDstCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } internal static unsafe float Dist2(Span src, Span dst) { - Vector128 sqDistanceVector = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) { float* pSrcCurrent = psrc; float* pDstCurrent = pdst; - float* pEnd = psrc + src.Length; + float* pSrcEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 sqDistanceVector = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), Sse.LoadVector128(pDstCurrent)); @@ -1084,7 +1084,7 @@ internal static unsafe float Dist2(Span src, Span dst) sqDistanceVector = VectorSum(in sqDistanceVector); float norm = Sse.ConvertToSingle(sqDistanceVector); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { float distance = (*pSrcCurrent) - (*pDstCurrent); norm += distance * distance; From 41d65f51ec031f57efc227ceef2519fbf351d01d Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 16 Aug 2018 23:35:25 -0700 Subject: [PATCH 06/29] Implemented perf tests for AVX via CpuMathUtils class --- .../AvxPerformanceTests.cs | 55 +++++++++++++++++++ .../SsePerformanceTests.cs | 2 +- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index fdb7140738..7625ce987d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -95,16 +95,71 @@ public void GlobalCleanup() original.CopyTo(result, 0); } + [Benchmark] + public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + [Benchmark] public void ManagedScaleUPerf() => 
CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + [Benchmark] + public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); + + [Benchmark] + public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + [Benchmark] public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + [Benchmark] + public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + + [Benchmark] + public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN); + [Benchmark] public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN); + [Benchmark] + public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + + + [Benchmark] + public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + [Benchmark] public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + + [Benchmark] + public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + + [Benchmark] + public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN); + + [Benchmark] + public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + + [Benchmark] + public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); + + [Benchmark] + public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + + [Benchmark] + public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + + [Benchmark] + public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); + + [Benchmark] + public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); 
+ + [Benchmark] + public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index ff1f451550..8893a2f877 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -9,7 +9,7 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class SsePerformanceTests + public class AvxVSSseNativePerformanceTests { private const int EXP_MAX = 127; private const int EXP_MIN = 0; From 8c34e8783c03f01cbb0d68906757b390281cb3a2 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 10:57:37 -0700 Subject: [PATCH 07/29] Implemented switching logic for Vector128/256Alignment between SSE and AVX support --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 60 +++++++++---------- .../CpuAligenedMathUtils.cs | 4 +- .../CpuMathUtils.netcoreapp.cs | 9 +++ .../CpuMathUtils.netstandard.cs | 8 +++ src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 54 ++++++++--------- src/Microsoft.ML.Transforms/RffTransform.cs | 18 +++--- 6 files changed, 85 insertions(+), 68 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 4c006ea868..21ef1a16f7 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -15,22 +15,22 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { - internal static class AvxIntrinsics + public static class AvxIntrinsics { - private const int CbAlign = 32; + private const int Vector256Alignment = 32; private static bool Compat(AlignedArray a) { Contracts.AssertValue(a); Contracts.Assert(a.Size > 0); - return a.CbAlign == CbAlign; + return a.CbAlign == Vector256Alignment; } private static unsafe float* Ptr(AlignedArray a, float* p) { 
Contracts.AssertValue(a); float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + Contracts.Assert(((long)q & (Vector256Alignment - 1)) == 0); return q; } @@ -201,7 +201,7 @@ private static Vector256 GetNewDst256(in Vector256 xDst1, in Vecto } // Multiply matrix times vector into vector. - internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(src)); @@ -269,7 +269,7 @@ internal static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src } // Partial sparse source vector. - internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); @@ -330,7 +330,7 @@ internal static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, } } - internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(src)); @@ -445,7 +445,7 @@ internal static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray } // Partial sparse source vector. 
- internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { Contracts.Assert(Compat(mat)); @@ -514,7 +514,7 @@ internal static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgpos } // dst[i] += scale - internal static unsafe void AddScalarU(float scalar, Span dst) + public static unsafe void AddScalarU(float scalar, Span dst) { fixed (float* pdst = dst) { @@ -556,7 +556,7 @@ internal static unsafe void AddScalarU(float scalar, Span dst) ZeroUpper(); } - internal static unsafe void ScaleU(float scale, Span dst) + public static unsafe void ScaleU(float scale, Span dst) { fixed (float* pdst = dst) { @@ -601,7 +601,7 @@ internal static unsafe void ScaleU(float scale, Span dst) ZeroUpper(); } - internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) + public static unsafe void ScaleSrcU(float scale, Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -649,7 +649,7 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span } // dst[i] = a * (dst[i] + b) - internal static unsafe void ScaleAddU(float a, float b, Span dst) + public static unsafe void ScaleAddU(float a, float b, Span dst) { fixed (float* pdst = dst) { @@ -696,7 +696,7 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) ZeroUpper(); } - internal static unsafe void AddScaleU(float scale, Span src, Span dst) + public static unsafe void AddScaleU(float scale, Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -752,7 +752,7 @@ internal static unsafe void AddScaleU(float scale, Span src, Span ZeroUpper(); } - internal static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + public static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) { fixed (float* 
psrc = src) fixed (float* pdst = dst) @@ -810,7 +810,7 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span src, Span idx, Span dst) + public static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) { fixed (float* psrc = src) fixed (int* pidx = idx) @@ -863,7 +863,7 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i ZeroUpper(); } - internal static unsafe void AddU(Span src, Span dst) + public static unsafe void AddU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -912,7 +912,7 @@ internal static unsafe void AddU(Span src, Span dst) ZeroUpper(); } - internal static unsafe void AddSU(Span src, Span idx, Span dst) + public static unsafe void AddSU(Span src, Span idx, Span dst) { fixed (float* psrc = src) fixed (int* pidx = idx) @@ -959,7 +959,7 @@ internal static unsafe void AddSU(Span src, Span idx, Span ds ZeroUpper(); } - internal static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) + public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) { fixed (float* psrc1 = &src1[0]) fixed (float* psrc2 = &src2[0]) @@ -1010,7 +1010,7 @@ internal static unsafe void MulElementWiseU(Span src1, Span src2, ZeroUpper(); } - internal static unsafe float SumU(Span src) + public static unsafe float SumU(Span src) { fixed (float* psrc = src) { @@ -1050,7 +1050,7 @@ internal static unsafe float SumU(Span src) } } - internal static unsafe float SumSqU(Span src) + public static unsafe float SumSqU(Span src) { fixed (float* psrc = src) { @@ -1096,7 +1096,7 @@ internal static unsafe float SumSqU(Span src) } } - internal static unsafe float SumSqDiffU(float mean, Span src) + public static unsafe float SumSqDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -1147,7 +1147,7 @@ internal static unsafe float SumSqDiffU(float mean, Span src) } } - internal static unsafe float SumAbsU(Span src) + public static unsafe float SumAbsU(Span src) { fixed (float* psrc = src) { 
@@ -1195,7 +1195,7 @@ internal static unsafe float SumAbsU(Span src) } } - internal static unsafe float SumAbsDiffU(float mean, Span src) + public static unsafe float SumAbsDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -1248,7 +1248,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) } } - internal static unsafe float MaxAbsU(Span src) + public static unsafe float MaxAbsU(Span src) { fixed (float* psrc = src) { @@ -1296,7 +1296,7 @@ internal static unsafe float MaxAbsU(Span src) } } - internal static unsafe float MaxAbsDiffU(float mean, Span src) + public static unsafe float MaxAbsDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -1349,7 +1349,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) } } - internal static unsafe float DotU(Span src, Span dst) + public static unsafe float DotU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1406,7 +1406,7 @@ internal static unsafe float DotU(Span src, Span dst) } } - internal static unsafe float DotSU(Span src, Span dst, Span idx) + public static unsafe float DotSU(Span src, Span dst, Span idx) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1465,7 +1465,7 @@ internal static unsafe float DotSU(Span src, Span dst, Span i } } - internal static unsafe float Dist2(Span src, Span dst) + public static unsafe float Dist2(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1520,7 +1520,7 @@ internal static unsafe float Dist2(Span src, Span dst) } } - internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (float* pdst1 = v) @@ -1586,7 +1586,7 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, f } } - internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float 
threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (int* pidx = indices) diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs index 9c7fa5ae1f..d217ccf6f9 100644 --- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs +++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs @@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values) #if DEBUG var mat = values as TMatrix; Contracts.AssertValue(mat); - Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.Vector128Alignment); + Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.GetVectorAlignment()); #endif } @@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values) #if DEBUG CpuAlignedVector vec = values as CpuAlignedVector; Contracts.AssertValue(vec); - Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.Vector128Alignment); + Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.GetVectorAlignment()); #endif } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 1e944aea55..807f7239d7 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -12,6 +12,15 @@ public static partial class CpuMathUtils // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray public const int Vector128Alignment = 16; + // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray + public const int Vector256Alignment = 32; + + public static int GetVectorAlignment() + { + // Assumes SSE support on machines that run ML.NET. + return Avx.IsSupported ? 
Vector256Alignment : Vector128Alignment; + } + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { Contracts.Assert(mat.Size == dst.Size * src.Size); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 6f480b0f25..db620dbbb6 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -6,9 +6,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE. Is it okay? + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray public const int Vector128Alignment = 16; + public static int GetVectorAlignment() + { + // Assumes SSE support on machines that run ML.NET. + return Vector128Alignment; + } + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 02be0f7aaf..a555b8ba8c 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -20,7 +20,7 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { - internal static class SseIntrinsics + public static class SseIntrinsics { // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; @@ -115,7 +115,7 @@ private static Vector128 GetNewDst(in Vector128 xDst1, in Vector12 } // Multiply matrix times vector into vector. 
- internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(src)); @@ -180,7 +180,7 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src } // Partial sparse source vector. - internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); @@ -237,7 +237,7 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, } } - internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(src)); @@ -339,7 +339,7 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray } // Partial sparse source vector. 
- internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { Contracts.Assert(Compat(mat)); @@ -406,7 +406,7 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos } // dst[i] += scale - internal static unsafe void AddScalarU(float scalar, Span dst) + public static unsafe void AddScalarU(float scalar, Span dst) { fixed (float* pdst = dst) { @@ -435,7 +435,7 @@ internal static unsafe void AddScalarU(float scalar, Span dst) } } - internal static unsafe void ScaleU(float scale, Span dst) + public static unsafe void ScaleU(float scale, Span dst) { fixed (float* pdst = dst) { @@ -466,7 +466,7 @@ internal static unsafe void ScaleU(float scale, Span dst) } } - internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) + public static unsafe void ScaleSrcU(float scale, Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -500,7 +500,7 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span } // dst[i] = a * (dst[i] + b) - internal static unsafe void ScaleAddU(float a, float b, Span dst) + public static unsafe void ScaleAddU(float a, float b, Span dst) { fixed (float* pdst = dst) { @@ -532,7 +532,7 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) } } - internal static unsafe void AddScaleU(float scale, Span src, Span dst) + public static unsafe void AddScaleU(float scale, Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -571,7 +571,7 @@ internal static unsafe void AddScaleU(float scale, Span src, Span } } - internal static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + public static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ 
-612,7 +612,7 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span src, Span idx, Span dst) + public static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) { fixed (float* psrc = src) fixed (int* pidx = idx) @@ -648,7 +648,7 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i } } - internal static unsafe void AddU(Span src, Span dst) + public static unsafe void AddU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -683,7 +683,7 @@ internal static unsafe void AddU(Span src, Span dst) } } - internal static unsafe void AddSU(Span src, Span idx, Span dst) + public static unsafe void AddSU(Span src, Span idx, Span dst) { fixed (float* psrc = src) fixed (int* pidx = idx) @@ -716,7 +716,7 @@ internal static unsafe void AddSU(Span src, Span idx, Span ds } } - internal static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) + public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) { fixed (float* psrc1 = &src1[0]) fixed (float* psrc2 = &src2[0]) @@ -753,7 +753,7 @@ internal static unsafe void MulElementWiseU(Span src1, Span src2, } } - internal static unsafe float SumU(Span src) + public static unsafe float SumU(Span src) { fixed (float* psrc = src) { @@ -780,7 +780,7 @@ internal static unsafe float SumU(Span src) } } - internal static unsafe float SumSqU(Span src) + public static unsafe float SumSqU(Span src) { fixed (float* psrc = src) { @@ -811,7 +811,7 @@ internal static unsafe float SumSqU(Span src) } } - internal static unsafe float SumSqDiffU(float mean, Span src) + public static unsafe float SumSqDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -845,7 +845,7 @@ internal static unsafe float SumSqDiffU(float mean, Span src) } } - internal static unsafe float SumAbsU(Span src) + public static unsafe float SumAbsU(Span src) { fixed (float* psrc = src) { @@ -877,7 +877,7 @@ internal static unsafe float SumAbsU(Span src) } } - internal static 
unsafe float SumAbsDiffU(float mean, Span src) + public static unsafe float SumAbsDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -912,7 +912,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) } } - internal static unsafe float MaxAbsU(Span src) + public static unsafe float MaxAbsU(Span src) { fixed (float* psrc = src) { @@ -944,7 +944,7 @@ internal static unsafe float MaxAbsU(Span src) } } - internal static unsafe float MaxAbsDiffU(float mean, Span src) + public static unsafe float MaxAbsDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -979,7 +979,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) } } - internal static unsafe float DotU(Span src, Span dst) + public static unsafe float DotU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1018,7 +1018,7 @@ internal static unsafe float DotU(Span src, Span dst) } } - internal static unsafe float DotSU(Span src, Span dst, Span idx) + public static unsafe float DotSU(Span src, Span dst, Span idx) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1059,7 +1059,7 @@ internal static unsafe float DotSU(Span src, Span dst, Span i } } - internal static unsafe float Dist2(Span src, Span dst) + public static unsafe float Dist2(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -1097,7 +1097,7 @@ internal static unsafe float Dist2(Span src, Span dst) } } - internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (float* pdst1 = v) @@ -1142,7 +1142,7 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, f } } - internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span 
indices, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (int* pidx = indices) diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index 6ad6ceec5f..e1f1d5f39d 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -122,8 +122,8 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi int roundedUpD = RoundUp(NewDim, CfltAlign); int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -158,8 +158,8 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou // initialize the transform matrix int roundedUpD = RoundUp(NewDim, CfltAlign); int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? 
null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -227,7 +227,7 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - private const int CfltAlign = CpuMathUtils.Vector128Alignment / sizeof(float); + private const int CfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); private static string TestColumnType(ColumnType type) { @@ -496,8 +496,8 @@ private ValueGetter> GetterFromVectorType(IRow input, int iinfo) var getSrc = GetSrcGetter>(input, iinfo); var src = default(VBuffer); - var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.Vector128Alignment); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment); + var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment()); return (ref VBuffer dst) => @@ -512,8 +512,8 @@ private ValueGetter> GetterFromFloatType(IRow input, int iinfo) var getSrc = GetSrcGetter(input, iinfo); var src = default(Float); - var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.Vector128Alignment); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment); + var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment()); var oneDimensionalVector = new VBuffer(1, new Float[] { 0 }); From ddeb6551145e6e5f438ad8b6de47f0591ca296c6 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 11:37:12 
-0700 Subject: [PATCH 08/29] Changed perf tests to reveal SSE and AVX intrinsics perf separately --- .../AvxPerformanceTests.cs | 113 ++++++++++++++---- .../CpuMathNativeUtils.cs | 12 -- .../SsePerformanceTests.cs | 113 ++++++++++++++---- 3 files changed, 180 insertions(+), 58 deletions(-) diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 7625ce987d..01058384f8 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -96,70 +96,137 @@ public void GlobalCleanup() } [Benchmark] - public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + public void ManagedAddScalarUPerf() + { + AvxIntrinsics.AddScalarU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + public void ManagedScaleUPerf() + { + AvxIntrinsics.ScaleU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedScaleSrcUPerf() + { + AvxIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + public void ManagedScaleAddUPerf() + { + AvxIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedAddScaleUPerf() + { + AvxIntrinsics.AddScaleU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + public void ManagedAddScaleSUPerf() + { + AvxIntrinsics.AddScaleSU(DEFAULT_SCALE, new 
Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] - public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN); + public void ManagedAddScaleCopyUPerf() + { + AvxIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] - public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN); + public void ManagedAddUPerf() + { + AvxIntrinsics.AddU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + public void ManagedAddSUPerf() + { + AvxIntrinsics.AddSU(new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] - public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + public void ManagedMulElementWiseUPerf() + { + AvxIntrinsics.MulElementWiseU(new Span(src1, 0, LEN), new Span(src2, 0, LEN), + new Span(dst, 0, LEN)); + } [Benchmark] - public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + public float ManagedSumUPerf() + { + return AvxIntrinsics.SumU(new Span(src, 0, LEN)); + } [Benchmark] - public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + public float ManagedSumSqUPerf() + { + return AvxIntrinsics.SumSqU(new Span(src, 0, LEN)); + } [Benchmark] - public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumSqDiffUPerf() + { + return AvxIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } - [Benchmark] - public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN); + [Benchmark] + public float ManagedSumAbsUPerf() + { + return AvxIntrinsics.SumAbsU(new Span(src, 0, LEN)); + } [Benchmark] - public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumAbsDiffUPerf() + { + return AvxIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } 
[Benchmark] - public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + public float ManagedMaxAbsUPerf() + { + return AvxIntrinsics.MaxAbsU(new Span(src, 0, LEN)); + } [Benchmark] - public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); + public float ManagedMaxAbsDiffUPerf() + { + return AvxIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] - public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + public float ManagedDotUPerf() + { + return AvxIntrinsics.DotU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] - public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + public float ManagedDotSUPerf() + { + return AvxIntrinsics.DotSU(new Span(src), new Span(dst), new Span(idx, 0, IDXLEN)); + } [Benchmark] - public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); + public float ManagedDist2Perf() + { + return AvxIntrinsics.Dist2(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] - public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateUPerf() + { + AvxIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span(src, 0, LEN), DEFAULT_SCALE, new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] - public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateSUPerf() + { + AvxIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span(src, 0, IDXLEN), new Span(idx, 0, IDXLEN), DEFAULT_SCALE, new Span(dst), new Span(result)); + } } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 27f46022eb..8df3352556 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ 
b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -85,17 +85,5 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); - - [DllImport("CpuMathNative", EntryPoint = "ScaleX"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleX(float a, /*_Inout_*/ float* pd, int c); - - [DllImport("CpuMathNative", EntryPoint = "AddScaleX"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddScaleX(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); - - [DllImport("CpuMathNative", EntryPoint = "AddX"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddX(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); - - [DllImport("CpuMathNative", EntryPoint = "SumX"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float SumX(/*const*/ float* ps, int c); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 8893a2f877..c3869c63d9 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -9,7 +9,7 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class AvxVSSseNativePerformanceTests + public class SsePerformanceTests { private const int EXP_MAX = 127; private const int EXP_MIN = 0; @@ -105,7 +105,10 @@ public unsafe void NativeAddScalarUPerf() } [Benchmark] - public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + public void ManagedAddScalarUPerf() + { + SseIntrinsics.AddScalarU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleUPerf() @@ 
-117,7 +120,10 @@ public unsafe void NativeScaleUPerf() } [Benchmark] - public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + public void ManagedScaleUPerf() + { + SseIntrinsics.ScaleU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleSrcUPerf() @@ -130,7 +136,10 @@ public unsafe void NativeScaleSrcUPerf() } [Benchmark] - public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedScaleSrcUPerf() + { + SseIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleAddUPerf() @@ -142,7 +151,10 @@ public unsafe void NativeScaleAddUPerf() } [Benchmark] - public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + public void ManagedScaleAddUPerf() + { + SseIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddScaleUPerf() @@ -155,7 +167,10 @@ public unsafe void NativeAddScaleUPerf() } [Benchmark] - public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedAddScaleUPerf() + { + SseIntrinsics.AddScaleU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddScaleSUPerf() @@ -169,7 +184,10 @@ public unsafe void NativeAddScaleSUPerf() } [Benchmark] - public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + public void ManagedAddScaleSUPerf() + { + SseIntrinsics.AddScaleSU(DEFAULT_SCALE, new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] public unsafe void NativeAddScaleCopyUPerf() @@ -183,7 +201,10 @@ public unsafe void NativeAddScaleCopyUPerf() } [Benchmark] - public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN); + public void ManagedAddScaleCopyUPerf() + { + 
SseIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] public unsafe void NativeAddUPerf() @@ -196,7 +217,10 @@ public unsafe void NativeAddUPerf() } [Benchmark] - public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN); + public void ManagedAddUPerf() + { + SseIntrinsics.AddU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddSUPerf() @@ -210,7 +234,10 @@ public unsafe void NativeAddSUPerf() } [Benchmark] - public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + public void ManagedAddSUPerf() + { + SseIntrinsics.AddSU(new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] @@ -225,7 +252,11 @@ public unsafe void NativeMulElementWiseUPerf() } [Benchmark] - public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + public void ManagedMulElementWiseUPerf() + { + SseIntrinsics.MulElementWiseU(new Span(src1, 0, LEN), new Span(src2, 0, LEN), + new Span(dst, 0, LEN)); + } [Benchmark] public unsafe float NativeSumUPerf() @@ -237,7 +268,10 @@ public unsafe float NativeSumUPerf() } [Benchmark] - public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + public float ManagedSumUPerf() + { + return SseIntrinsics.SumU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumSqUPerf() @@ -249,7 +283,10 @@ public unsafe float NativeSumSqUPerf() } [Benchmark] - public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + public float ManagedSumSqUPerf() + { + return SseIntrinsics.SumSqU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumSqDiffUPerf() @@ -261,7 +298,10 @@ public unsafe float NativeSumSqDiffUPerf() } [Benchmark] - public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumSqDiffUPerf() + { + return SseIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } 
[Benchmark] public unsafe float NativeSumAbsUPerf() @@ -273,7 +313,10 @@ public unsafe float NativeSumAbsUPerf() } [Benchmark] - public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN); + public float ManagedSumAbsUPerf() + { + return SseIntrinsics.SumAbsU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumAbsDiffUPerf() @@ -285,7 +328,10 @@ public unsafe float NativeSumAbsDiffUPerf() } [Benchmark] - public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumAbsDiffUPerf() + { + return SseIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeMaxAbsUPerf() @@ -297,7 +343,10 @@ public unsafe float NativeMaxAbsUPerf() } [Benchmark] - public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + public float ManagedMaxAbsUPerf() + { + return SseIntrinsics.MaxAbsU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeMaxAbsDiffUPerf() @@ -309,7 +358,10 @@ public unsafe float NativeMaxAbsDiffUPerf() } [Benchmark] - public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); + public float ManagedMaxAbsDiffUPerf() + { + return SseIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeDotUPerf() @@ -322,7 +374,10 @@ public unsafe float NativeDotUPerf() } [Benchmark] - public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + public float ManagedDotUPerf() + { + return SseIntrinsics.DotU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe float NativeDotSUPerf() @@ -336,7 +391,10 @@ public unsafe float NativeDotSUPerf() } [Benchmark] - public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + public float ManagedDotSUPerf() + { + return SseIntrinsics.DotSU(new Span(src), new Span(dst), new Span(idx, 0, IDXLEN)); + } [Benchmark] public unsafe float 
NativeDist2Perf() @@ -349,7 +407,10 @@ public unsafe float NativeDist2Perf() } [Benchmark] - public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); + public float ManagedDist2Perf() + { + return SseIntrinsics.Dist2(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeSdcaL1UpdateUPerf() @@ -363,7 +424,10 @@ public unsafe void NativeSdcaL1UpdateUPerf() } [Benchmark] - public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateUPerf() + { + SseIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span(src, 0, LEN), DEFAULT_SCALE, new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] public unsafe void NativeSdcaL1UpdateSUPerf() @@ -378,6 +442,9 @@ public unsafe void NativeSdcaL1UpdateSUPerf() } [Benchmark] - public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateSUPerf() + { + SseIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span(src, 0, IDXLEN), new Span(idx, 0, IDXLEN), DEFAULT_SCALE, new Span(dst), new Span(result)); + } } } From c776fb098527b631bd1eebca26ec379e31916f75 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 11:40:30 -0700 Subject: [PATCH 09/29] Fixed access modifiers of private fields --- src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs | 4 ++-- src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 807f7239d7..6843cd4757 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -10,10 +10,10 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath public static partial class CpuMathUtils { // The count of bytes in 
Vector128, corresponding to _cbAlign in AlignedArray - public const int Vector128Alignment = 16; + private const int Vector128Alignment = 16; // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray - public const int Vector256Alignment = 32; + private const int Vector256Alignment = 32; public static int GetVectorAlignment() { diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index db620dbbb6..bbb7f3bd6a 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -9,7 +9,7 @@ public static partial class CpuMathUtils // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE. Is it okay? // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray - public const int Vector128Alignment = 16; + private const int Vector128Alignment = 16; public static int GetVectorAlignment() { From df09fe3d6bd4875d4d4081f5792e942eedee09aa Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 15:18:47 -0700 Subject: [PATCH 10/29] Implemented all unit tests for AVX intrinsics that do not involve matrix operations with longer input arrays --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 20 +++---- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 8 +-- .../UnitTests.cs | 59 ++++++++++++------- 3 files changed, 51 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 21ef1a16f7..73981981b6 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -925,11 +925,11 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) while (pIdxCurrent + 8 <= pEnd) { - Vector256 srcVector = Load8(pDstCurrent, pIdxCurrent); - Vector256 dstVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); + 
Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); - srcVector = Avx.Add(srcVector, dstVector); - Store8(in srcVector, pDstCurrent, pIdxCurrent); + dstVector = Avx.Add(dstVector, srcVector); + Store8(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 8; pSrcCurrent += 8; @@ -937,11 +937,11 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) while (pIdxCurrent + 4 <= pEnd) { - Vector128 srcVector = Load4(pDstCurrent, pIdxCurrent); - Vector128 dstVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - srcVector = Sse.Add(srcVector, dstVector); - Store4(in srcVector, pDstCurrent, pIdxCurrent); + dstVector = Sse.Add(dstVector, srcVector); + Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -961,8 +961,8 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) { - fixed (float* psrc1 = &src1[0]) - fixed (float* psrc2 = &src2[0]) + fixed (float* psrc1 = src1) + fixed (float* psrc2 = src2) fixed (float* pdst = dst) { float* pSrc1Current = psrc1; diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index a555b8ba8c..e6bc9d6dd4 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -696,11 +696,11 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) while (pIdxCurrent + 4 <= pEnd) { - Vector128 srcVector = Load4(pDstCurrent, pIdxCurrent); - Vector128 dstVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - srcVector = Sse.Add(srcVector, dstVector); - Store4(in srcVector, pDstCurrent, pIdxCurrent); + dstVector = Sse.Add(dstVector, srcVector); + Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; diff 
--git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index b57066be8c..89424f9177 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using Xunit; +using Xunit.Abstractions; using Microsoft.ML.Runtime.Internal.CpuMath; namespace Microsoft.ML.CpuMath.UnitTests @@ -20,14 +21,18 @@ public class CpuMathUtilsUnitTests private const int SseCbAlign = 16; private FloatEqualityComparer comparer; - public CpuMathUtilsUnitTests() + private readonly ITestOutputHelper output; + + public CpuMathUtilsUnitTests(ITestOutputHelper output) { + this.output = output; + // Padded array whose length is a multiple of 4 - float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testArray1 = new float[16] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; // Unpadded array whose length is not a multiple of 4. 
- float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; + float[] testArray2 = new float[15] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; testArrays = new float[][] { testArray1, testArray2 }; - testIndexArray = new int[4] { 0, 2, 5, 6 }; + testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; comparer = new FloatEqualityComparer(); // Padded matrices whose dimensions are multiples of 4 @@ -308,6 +313,11 @@ public void AddScaleSUTest(int test) expected[2] = -13.806f; expected[5] = -43.522f; expected[6] = 55.978f; + expected[8] = -178.869f; + expected[11] = -31.941f; + expected[12] = -51.205f; + expected[13] = -21.337f; + expected[14] = 35.782f; CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length); var actual = dst; @@ -373,6 +383,11 @@ public void AddSUTest(int test) expected[2] = -12.14f; expected[5] = -36.69f; expected[6] = 46.29f; + expected[8] = -104.41f; + expected[11] = -13.09f; + expected[12] = -73.92f; + expected[13] = -23.64f; + expected[14] = 34.41f; CpuMathUtils.Add(src, idx, dst, idx.Length); var actual = dst; @@ -407,8 +422,8 @@ public void MulElementWiseUTest(int test) } [Theory] - [InlineData(0, -93.9f)] - [InlineData(1, -97.19f)] + [InlineData(0, -187.8f)] + [InlineData(1, -191.09f)] public void SumUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -417,8 +432,8 @@ public void SumUTest(int test, float expected) } [Theory] - [InlineData(0, 13399.9376f)] - [InlineData(1, 13389.1135f)] + [InlineData(0, 26799.8752f)] + [InlineData(1, 26789.0511f)] public void SumSqUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -427,8 +442,8 @@ public void SumSqUTest(int test, float expected) } [Theory] - [InlineData(0, 13742.3176f)] - [InlineData(1, 13739.7895f)] + [InlineData(0, 27484.6352f)] + [InlineData(1, 27482.1071f)] public void SumSqDiffUTest(int 
test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -437,8 +452,8 @@ public void SumSqDiffUTest(int test, float expected) } [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 193.69f)] + [InlineData(0, 393.96f)] + [InlineData(1, 390.67f)] public void SumAbsUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -447,8 +462,8 @@ public void SumAbsUTest(int test, float expected) } [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 195.39f)] + [InlineData(0, 393.96f)] + [InlineData(1, 392.37f)] public void SumAbsDiffUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -477,8 +492,8 @@ public void MaxAbsDiffUTest(int test, float expected) } [Theory] - [InlineData(0, 13306.0376f)] - [InlineData(1, 13291.9235f)] + [InlineData(0, 26612.0752f)] + [InlineData(1, 26597.9611f)] public void DotUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -490,12 +505,12 @@ public void DotUTest(int test, float expected) } var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); - Assert.Equal(expected, actual, 2); + Assert.Equal(expected, actual, 1); } [Theory] - [InlineData(0, 736.7352f)] - [InlineData(1, 736.7352f)] + [InlineData(0, -3406.2154f)] + [InlineData(1, -3406.2154f)] public void DotSUTest(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); @@ -509,12 +524,12 @@ public void DotSUTest(int test, float expected) } var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); - Assert.Equal(expected, actual, 4); + Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 8.0f)] - [InlineData(1, 7.0f)] + [InlineData(0, 16.0f)] + [InlineData(1, 15.0f)] public void Dist2Test(int test, float expected) { float[] src = (float[])testArrays[test].Clone(); From a9c481f46c7693fbe409bed9f4584a36e64634e1 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 18:02:50 -0700 Subject: [PATCH 11/29] Implemented unit 
tests for AVX intrinsics --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 1 + .../UnitTests.cs | 314 +++++++++--------- 2 files changed, 162 insertions(+), 153 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 73981981b6..863a40b787 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -5,6 +5,7 @@ // The exported function names need to be unique (can't be disambiguated based on signature), hence // we introduce suffix letters to indicate the general patterns used. // * A suffix means aligned and padded for SSE operations. +// * U suffix means unaligned and unpadded. // * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. // * Tran means the matrix is transposed. diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index 89424f9177..a155396448 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -18,7 +18,8 @@ public class CpuMathUtilsUnitTests private readonly AlignedArray[] testSrcVectors; private readonly AlignedArray[] testDstVectors; private const float DEFAULT_SCALE = 1.7f; - private const int SseCbAlign = 16; + private const int Vector128Assignment = 16; + private const int Vector256Assignment = 32; private FloatEqualityComparer comparer; private readonly ITestOutputHelper output; @@ -35,177 +36,183 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output) testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; comparer = new FloatEqualityComparer(); - // Padded matrices whose dimensions are multiples of 4 - float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, - 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; - float[] testMatrix2 = new 
float[4 * 8]; + // Padded matrices whose dimensions are multiples of 8 + float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix2 = new float[8 * 16]; for (int i = 0; i < testMatrix2.Length; i++) { testMatrix2[i] = i + 1; } - AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign); - AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign); + AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, Vector256Assignment); + AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, Vector256Assignment); testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; - // Padded source vectors whose dimensions are multiples of 4 - float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; - float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + // Padded source vectors whose dimensions are multiples of 8 + float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign); - AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign); + AlignedArray testSrcVectorAligned1 = new AlignedArray(8, Vector256Assignment); + AlignedArray 
testSrcVectorAligned2 = new AlignedArray(16, Vector256Assignment); testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; - // Padded destination vectors whose dimensions are multiples of 4 - float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; - float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + // Padded destination vectors whose dimensions are multiples of 8 + float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign); - AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign); + AlignedArray testDstVectorAligned1 = new AlignedArray(8, Vector256Assignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(16, Vector256Assignment); testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } - //[Theory] - //[InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] - //[InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] - //[InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] - //public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - - // CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - 
//[Theory] - //[InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] - //[InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] - //[InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] - //public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - - // CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] - //[InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] - //[InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] - //public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - - // CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] - //[InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] - //[InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] - //public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - - // CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); - // float[] actual = new float[dst.Size]; - // 
dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] - //[InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] - //[InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] - //public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - // int[] idx = testIndexArray; - - // CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] - //[InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] - //[InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] - //public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - // int[] idx = testIndexArray; - - // CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] - //[InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] - //[InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] - //public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = 
testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - // int[] idx = testIndexArray; - - // CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} - - //[Theory] - //[InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] - //[InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] - //[InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] - //public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - //{ - // AlignedArray mat = testMatrices[matTest]; - // AlignedArray src = testSrcVectors[srcTest]; - // AlignedArray dst = testDstVectors[dstTest]; - // int[] idx = testIndexArray; - - // CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - // float[] actual = new float[dst.Size]; - // dst.CopyTo(actual, 0, dst.Size); - // Assert.Equal(expected, actual, comparer); - //} + [Theory] + [InlineData(0, 0, 0, new float[] { -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f })] + [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })] + [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] + public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -416.68f, -415.68f, -414.68f, -413.68f, -412.68f, -411.68f, 
-410.68f, -409.68f })] + [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })] + [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })] + public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 70.56f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })] + [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })] + [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] + public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 70.56f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })] + [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })] + [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })] + public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = 
testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f })] + [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })] + [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] + public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 38.25f, 39.25f, 40.25f, 41.25f, 42.25f, 43.25f, 44.25f, 45.25f })] + [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })] + [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })] + public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 33.32f, -40.46f, -165.92f, 235.28f, -1808.29f, -457.81f, 551.65f, 55.93f })] + [InlineData(1, 0, 1, new float[] { 1265f, 1282f, 1299f, 1316f, 1333f, 1350f, 1367f, 1384f, 1401f, 1418f, 1435f, 1452f, 1469f, 1486f, 1503f, 1520f })] + [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })] + public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 33.32f, -39.46f, -163.92f, 238.28f, -1804.29f, -452.81f, 557.65f, 62.93f })] + [InlineData(1, 0, 1, new float[] { 1265f, 1283f, 1301f, 1319f, 1337f, 1355f, 1373f, 1391f, 1409f, 1427f, 1445f, 1463f, 1481f, 1499f, 1517f, 1535f })] + [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })] + public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } [Theory] [InlineData(0)] @@ -546,11 +553,11 @@ public void Dist2Test(int test, float expected) } [Theory] - [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })] public void ZeroItemsUTest(int test, int[] idx, float[] expected) { - AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); + AlignedArray src = new AlignedArray(8 + 8 * test, Vector256Assignment); src.CopyFrom(testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); @@ -564,7 +571,8 @@ public void ZeroItemsUTest(int test, int[] idx, float[] expected) [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) { - AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); + // Uses Vector128Assignment since the intrinsic does not use any SSE/AVX algorithm. 
+ AlignedArray src = new AlignedArray(4 + 4 * test, Vector128Assignment); src.CopyFrom(testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); @@ -622,7 +630,7 @@ internal class FloatEqualityComparer : IEqualityComparer { public bool Equals(float a, float b) { - return Math.Abs(a - b) < 1e-5f; + return Math.Abs(a - b) < 1e-3f; } public int GetHashCode(float a) From c692a6fe907213df1d58108c3a01be5565be23f7 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 18:26:50 -0700 Subject: [PATCH 12/29] Fixed errors on the RffTransform.CfltAlign const-expression requirement --- src/Microsoft.ML.Transforms/RffTransform.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index e1f1d5f39d..c118ce232a 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -227,7 +227,10 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - private const int CfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); + + // REVIEW NEEDED: Used 32 (CpuMathUtils.Vector256Alignment) instead of CpuMathUtils.GetVectorAlignment() + // to silence the error that restricts the expression for CfltAlign to be constant. 
+ private const int CfltAlign = 32 / sizeof(float); private static string TestColumnType(ColumnType type) { From 40528e4dcc30134314219f21a808d9413c5f7c55 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 18:52:53 -0700 Subject: [PATCH 13/29] Fixed Debug errors by making RffTransform.CfltAlign read-only --- src/Microsoft.ML.Transforms/RffTransform.cs | 22 ++++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index c118ce232a..79e6bdb01c 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -120,8 +120,8 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi sub = args.MatrixGenerator; _matrixGenerator = sub.CreateInstance(host, avgDist); - int roundedUpD = RoundUp(NewDim, CfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); @@ -156,8 +156,8 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou ctx.LoadModelOrNull(env, out _matrixGenerator, directoryName)); // initialize the transform matrix - int roundedUpD = RoundUp(NewDim, CfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); RotationTerms = _useSin ? 
null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); @@ -227,10 +227,7 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - - // REVIEW NEEDED: Used 32 (CpuMathUtils.Vector256Alignment) instead of CpuMathUtils.GetVectorAlignment() - // to silence the error that restricts the expression for CfltAlign to be constant. - private const int CfltAlign = 32 / sizeof(float); + private readonly int _cfltAlign; private static string TestColumnType(ColumnType type) { @@ -254,6 +251,7 @@ public RffTransform(IHostEnvironment env, string source = null) : this(env, new Arguments() { Column = new[] { new Column() { Source = source ?? name, Name = name } }, NewDim = newDim }, input) { + _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); } /// @@ -499,8 +497,8 @@ private ValueGetter> GetterFromVectorType(IRow input, int iinfo) var getSrc = GetSrcGetter>(input, iinfo); var src = default(VBuffer); - var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.GetVectorAlignment()); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.GetVectorAlignment()); + var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); return (ref VBuffer dst) => @@ -515,8 +513,8 @@ private ValueGetter> GetterFromFloatType(IRow input, int iinfo) var getSrc = GetSrcGetter(input, iinfo); var src = default(Float); - var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.GetVectorAlignment()); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), 
CpuMathUtils.GetVectorAlignment()); + var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); var oneDimensionalVector = new VBuffer(1, new Float[] { 0 }); From 4d7d8effa7cc62eb02d0368c4a76b8031fce1abc Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 17 Aug 2018 19:05:03 -0700 Subject: [PATCH 14/29] Fixed errors by making CfltAlign static (and read-only) --- src/Microsoft.ML.Transforms/RffTransform.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index 79e6bdb01c..675235d6ef 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -227,7 +227,7 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - private readonly int _cfltAlign; + private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); private static string TestColumnType(ColumnType type) { @@ -251,7 +251,6 @@ public RffTransform(IHostEnvironment env, string source = null) : this(env, new Arguments() { Column = new[] { new Column() { Source = source ?? 
name, Name = name } }, NewDim = newDim }, input) { - _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); } /// From 75e4cde97f470eafed114db075096e8fb29d95ee Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Sat, 18 Aug 2018 18:54:29 -0700 Subject: [PATCH 15/29] Developed two unit tests for netcoreapp and netstandard to deal with different alignments separately, with some style changes to readonly variables --- ...oft.ML.CpuMath.UnitTests.netcoreapp.csproj | 4 - .../UnitTests.netcoreapp.cs} | 216 +++--- .../UnitTests.netstandard.cs | 619 ++++++++++++++++++ 3 files changed, 724 insertions(+), 115 deletions(-) rename test/{Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs => Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs} (77%) create mode 100644 test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj index e611b15032..44ad91ed90 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj @@ -8,9 +8,5 @@ - - - - diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs similarity index 77% rename from test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs rename to test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs index a155396448..2d59a2acf1 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs @@ -1,40 +1,35 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. 
// The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; using System.Collections.Generic; using Xunit; -using Xunit.Abstractions; using Microsoft.ML.Runtime.Internal.CpuMath; namespace Microsoft.ML.CpuMath.UnitTests { public class CpuMathUtilsUnitTests { - private readonly float[][] testArrays; - private readonly int[] testIndexArray; - private readonly AlignedArray[] testMatrices; - private readonly AlignedArray[] testSrcVectors; - private readonly AlignedArray[] testDstVectors; - private const float DEFAULT_SCALE = 1.7f; - private const int Vector128Assignment = 16; - private const int Vector256Assignment = 32; - private FloatEqualityComparer comparer; + private readonly float[][] _testArrays; + private readonly int[] _testIndexArray; + private readonly AlignedArray[] _testMatrices; + private readonly AlignedArray[] _testSrcVectors; + private readonly AlignedArray[] _testDstVectors; + private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); + private readonly FloatEqualityComparer _comparer; - private readonly ITestOutputHelper output; + private const float DEFAULT_SCALE = 1.7f; - public CpuMathUtilsUnitTests(ITestOutputHelper output) + public CpuMathUtilsUnitTests() { - this.output = output; - // Padded array whose length is a multiple of 4 float[] testArray1 = new float[16] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; // Unpadded array whose length is not a multiple of 4. 
float[] testArray2 = new float[15] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; - testArrays = new float[][] { testArray1, testArray2 }; - testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; - comparer = new FloatEqualityComparer(); + _testArrays = new float[][] { testArray1, testArray2 }; + _testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; + _comparer = new FloatEqualityComparer(); // Padded matrices whose dimensions are multiples of 8 float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, @@ -52,34 +47,34 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output) testMatrix2[i] = i + 1; } - AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, Vector256Assignment); - AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, Vector256Assignment); + AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment); + AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment); testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); - testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; + _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; // Padded source vectors whose dimensions are multiples of 8 float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - AlignedArray testSrcVectorAligned1 = new AlignedArray(8, Vector256Assignment); - AlignedArray testSrcVectorAligned2 = new AlignedArray(16, Vector256Assignment); + AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment); testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, 
testSrcVector1.Length); testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); - testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; // Padded destination vectors whose dimensions are multiples of 8 float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - AlignedArray testDstVectorAligned1 = new AlignedArray(8, Vector256Assignment); - AlignedArray testDstVectorAligned2 = new AlignedArray(16, Vector256Assignment); + AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment); testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); - testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; + _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] @@ -88,14 +83,14 @@ public CpuMathUtilsUnitTests(ITestOutputHelper output) [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] 
@@ -104,14 +99,14 @@ public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })] public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -120,14 +115,14 @@ public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expect [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -136,14 +131,14 @@ public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expec [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })] public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - 
AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -152,15 +147,15 @@ public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] ex [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -169,15 +164,15 @@ public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })] public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -186,15 +181,15 @@ public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expec [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })] public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -203,15 +198,15 @@ public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expe [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })] public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -219,7 +214,7 @@ public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] e [InlineData(1)] public void AddScalarUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -229,7 +224,7 @@ public void AddScalarUTest(int test) CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -237,7 +232,7 @@ public void AddScalarUTest(int test) [InlineData(1)] public void ScaleUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -247,7 +242,7 @@ public void ScaleUTest(int 
test) CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -255,7 +250,7 @@ public void ScaleUTest(int test) [InlineData(1)] public void ScaleSrcUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])dst.Clone(); @@ -266,7 +261,7 @@ public void ScaleSrcUTest(int test) CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -274,7 +269,7 @@ public void ScaleSrcUTest(int test) [InlineData(1)] public void ScaleAddUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -284,7 +279,7 @@ public void ScaleAddUTest(int test) CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -292,7 +287,7 @@ public void ScaleAddUTest(int test) [InlineData(1)] public void AddScaleUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])dst.Clone(); @@ -303,7 +298,7 @@ public void AddScaleUTest(int test) CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -311,9 +306,9 @@ public void AddScaleUTest(int test) [InlineData(1)] public void AddScaleSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = 
(float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])dst.Clone(); expected[0] = 5.292f; @@ -328,7 +323,7 @@ public void AddScaleSUTest(int test) CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -336,7 +331,7 @@ public void AddScaleSUTest(int test) [InlineData(1)] public void AddScaleCopyUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] result = (float[])dst.Clone(); float[] expected = (float[])dst.Clone(); @@ -348,7 +343,7 @@ public void AddScaleCopyUTest(int test) CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); var actual = result; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -356,7 +351,7 @@ public void AddScaleCopyUTest(int test) [InlineData(1)] public void AddUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])src.Clone(); @@ -373,7 +368,7 @@ public void AddUTest(int test) CpuMathUtils.Add(src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -381,9 +376,9 @@ public void AddUTest(int test) [InlineData(1)] public void AddSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])dst.Clone(); expected[0] = 3.92f; @@ -398,7 +393,7 @@ public void AddSUTest(int test) CpuMathUtils.Add(src, idx, dst, idx.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + 
Assert.Equal(expected, actual, _comparer); } [Theory] @@ -406,7 +401,7 @@ public void AddSUTest(int test) [InlineData(1)] public void MulElementWiseUTest(int test) { - float[] src1 = (float[])testArrays[test].Clone(); + float[] src1 = (float[])_testArrays[test].Clone(); float[] src2 = (float[])src1.Clone(); float[] dst = (float[])src1.Clone(); @@ -425,7 +420,7 @@ public void MulElementWiseUTest(int test) CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -433,7 +428,7 @@ public void MulElementWiseUTest(int test) [InlineData(1, -191.09f)] public void SumUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.Sum(src, src.Length); Assert.Equal(expected, actual, 2); } @@ -443,7 +438,7 @@ public void SumUTest(int test, float expected) [InlineData(1, 26789.0511f)] public void SumSqUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumSq(src, src.Length); Assert.Equal(expected, actual, 2); } @@ -453,7 +448,7 @@ public void SumSqUTest(int test, float expected) [InlineData(1, 27482.1071f)] public void SumSqDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); Assert.Equal(expected, actual, 2); } @@ -463,7 +458,7 @@ public void SumSqDiffUTest(int test, float expected) [InlineData(1, 390.67f)] public void SumAbsUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumAbs(src, src.Length); Assert.Equal(expected, actual, 2); } @@ -473,7 +468,7 @@ public void SumAbsUTest(int 
test, float expected) [InlineData(1, 392.37f)] public void SumAbsDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); Assert.Equal(expected, actual, 2); } @@ -483,7 +478,7 @@ public void SumAbsDiffUTest(int test, float expected) [InlineData(1, 106.37f)] public void MaxAbsUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.MaxAbs(src, src.Length); Assert.Equal(expected, actual, 2); } @@ -493,7 +488,7 @@ public void MaxAbsUTest(int test, float expected) [InlineData(1, 108.07f)] public void MaxAbsDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); Assert.Equal(expected, actual, 2); } @@ -503,7 +498,7 @@ public void MaxAbsDiffUTest(int test, float expected) [InlineData(1, 26597.9611f)] public void DotUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); for (int i = 0; i < dst.Length; i++) @@ -520,9 +515,9 @@ public void DotUTest(int test, float expected) [InlineData(1, -3406.2154f)] public void DotSUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; // Ensures src and dst are different arrays for (int i = 0; i < dst.Length; i++) @@ -539,7 +534,7 @@ public void DotSUTest(int test, float expected) [InlineData(1, 15.0f)] public void Dist2Test(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = 
(float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); // Ensures src and dst are different arrays @@ -557,28 +552,27 @@ public void Dist2Test(int test, float expected) [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })] public void ZeroItemsUTest(int test, int[] idx, float[] expected) { - AlignedArray src = new AlignedArray(8 + 8 * test, Vector256Assignment); - src.CopyFrom(testSrcVectors[test]); + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); float[] actual = new float[src.Size]; src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] - [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] + [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })] public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) { - // Uses Vector128Assignment since the intrinsic does not use any SSE/AVX algorithm. 
- AlignedArray src = new AlignedArray(4 + 4 * test, Vector128Assignment); - src.CopyFrom(testSrcVectors[test]); + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); float[] actual = new float[src.Size]; src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -586,7 +580,7 @@ public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) [InlineData(1)] public void SdcaL1UpdateUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] v = (float[])src.Clone(); float[] w = (float[])src.Clone(); float[] expected = (float[])w.Clone(); @@ -599,7 +593,7 @@ public void SdcaL1UpdateUTest(int test) CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); var actual = w; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -607,10 +601,10 @@ public void SdcaL1UpdateUTest(int test) [InlineData(1)] public void SdcaL1UpdateSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] v = (float[])src.Clone(); float[] w = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])w.Clone(); for (int i = 0; i < idx.Length; i++) @@ -622,7 +616,7 @@ public void SdcaL1UpdateSUTest(int test) CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); var actual = w; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } } @@ -638,4 +632,4 @@ public int GetHashCode(float a) throw new NotImplementedException(); } } -} +} \ No newline at end of file diff --git 
a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs new file mode 100644 index 0000000000..f453c0749d --- /dev/null +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs @@ -0,0 +1,619 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Xunit; +using Microsoft.ML.Runtime.Internal.CpuMath; + +namespace Microsoft.ML.CpuMath.UnitTests +{ + public class CpuMathUtilsUnitTests + { + private readonly float[][] _testArrays; + private readonly int[] _testIndexArray; + private readonly AlignedArray[] _testMatrices; + private readonly AlignedArray[] _testSrcVectors; + private readonly AlignedArray[] _testDstVectors; + private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); + private readonly FloatEqualityComparer _comparer; + + private const float DEFAULT_SCALE = 1.7f; + + public CpuMathUtilsUnitTests() + { + // Padded array whose length is a multiple of 4 + float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + // Unpadded array whose length is not a multiple of 4. 
+ float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; + _testArrays = new float[][] { testArray1, testArray2 }; + _testIndexArray = new int[4] { 0, 2, 5, 6 }; + _comparer = new FloatEqualityComparer(); + + // Padded matrices whose dimensions are multiples of 4 + float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix2 = new float[4 * 8]; + + for (int i = 0; i < testMatrix2.Length; i++) + { + testMatrix2[i] = i + 1; + } + + AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, _vectorAlignment); + AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, _vectorAlignment); + testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); + testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); + + _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; + + // Padded source vectors whose dimensions are multiples of 4 + float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; + float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + + AlignedArray testSrcVectorAligned1 = new AlignedArray(4, _vectorAlignment); + AlignedArray testSrcVectorAligned2 = new AlignedArray(8, _vectorAlignment); + testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); + testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); + + _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + + // Padded destination vectors whose dimensions are multiples of 4 + float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; + float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + + AlignedArray testDstVectorAligned1 = new AlignedArray(4, _vectorAlignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(8, _vectorAlignment); + testDstVectorAligned1.CopyFrom(testDstVector1, 
0, testDstVector1.Length); + testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); + + _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] + [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] + public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] + [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] + public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] + [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] + public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + + 
CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] + [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] + public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] + [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] + public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; + + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] + [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] + public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = 
_testDstVectors[dstTest]; + int[] idx = _testIndexArray; + + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] + [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] + public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; + + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] + [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] + public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; + + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddScalarUTest(int test) + { + float[] dst = (float[])_testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] += DEFAULT_SCALE; + } + + 
CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleUTest(int test) + { + float[] dst = (float[])_testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= DEFAULT_SCALE; + } + + CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleSrcUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= DEFAULT_SCALE; + } + + CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleAddUTest(int test) + { + float[] dst = (float[])_testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE); + } + + CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddScaleUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= (1 + DEFAULT_SCALE); + } + + CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddScaleSUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] 
dst = (float[])src.Clone(); + int[] idx = _testIndexArray; + float[] expected = (float[])dst.Clone(); + + expected[0] = 5.292f; + expected[2] = -13.806f; + expected[5] = -43.522f; + expected[6] = 55.978f; + + CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddScaleCopyUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] result = (float[])dst.Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= (1 + DEFAULT_SCALE); + } + + CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); + var actual = result; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] expected = (float[])src.Clone(); + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = 2 * expected[i] + 1; + } + + CpuMathUtils.Add(src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void AddSUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + int[] idx = _testIndexArray; + float[] expected = (float[])dst.Clone(); + + expected[0] = 3.92f; + expected[2] = -12.14f; + expected[5] = -36.69f; + expected[6] = 46.29f; + + CpuMathUtils.Add(src, idx, dst, idx.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void MulElementWiseUTest(int test) + { + float[] src1 = 
(float[])_testArrays[test].Clone(); + float[] src2 = (float[])src1.Clone(); + float[] dst = (float[])src1.Clone(); + + // Ensures src1 and src2 are different arrays + for (int i = 0; i < src2.Length; i++) + { + src2[i] += 1; + } + + float[] expected = (float[])src1.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= (1 + expected[i]); + } + + CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, -93.9f)] + [InlineData(1, -97.19f)] + public void SumUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.Sum(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13399.9376f)] + [InlineData(1, 13389.1135f)] + public void SumSqUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13742.3176f)] + [InlineData(1, 13739.7895f)] + public void SumSqDiffUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 193.69f)] + public void SumAbsUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 195.39f)] + public void SumAbsDiffUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 106.37f)] + [InlineData(1, 106.37f)] + public void MaxAbsUTest(int test, float 
expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 108.07f)] + [InlineData(1, 108.07f)] + public void MaxAbsDiffUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13306.0376f)] + [InlineData(1, 13291.9235f)] + public void DotUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 736.7352f)] + [InlineData(1, 736.7352f)] + public void DotSUTest(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + int[] idx = _testIndexArray; + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); + Assert.Equal(expected, actual, 4); + } + + [Theory] + [InlineData(0, 8.0f)] + [InlineData(1, 7.0f)] + public void Dist2Test(int test, float expected) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length); + Assert.Equal(expected, actual, 0); + } + + [Theory] + [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + public void ZeroItemsUTest(int test, int[] idx, float[] expected) + { + 
AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] + public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + float value = src[i] * (1 + DEFAULT_SCALE); + expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateSUTest(int test) + { + float[] src = (float[])_testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + int[] idx = _testIndexArray; + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < idx.Length; i++) + { + int index = idx[i]; + float value = v[index] + src[i] * DEFAULT_SCALE; + expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? 
value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, _comparer); + } + } + + internal class FloatEqualityComparer : IEqualityComparer + { + public bool Equals(float a, float b) + { + return Math.Abs(a - b) < 1e-5f; + } + + public int GetHashCode(float a) + { + throw new NotImplementedException(); + } + } +} \ No newline at end of file From a763059a22000ab8cabfb08cb1fca4bfaa99aa5b Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Sat, 18 Aug 2018 19:03:00 -0700 Subject: [PATCH 16/29] Kept only the most recent unit tests which are sufficient for both netcoreapp and netstandard --- .../{UnitTests.netcoreapp.cs => UnitTests.cs} | 0 ...ft.ML.CpuMath.UnitTests.netstandard.csproj | 6 +- .../UnitTests.netstandard.cs | 619 ------------------ 3 files changed, 5 insertions(+), 620 deletions(-) rename test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/{UnitTests.netcoreapp.cs => UnitTests.cs} (100%) delete mode 100644 test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs similarity index 100% rename from test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.netcoreapp.cs rename to test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj index 9552f688a8..862c95ef90 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj @@ -12,5 +12,9 @@ + + + + - + \ No newline at end of file diff --git 
a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs deleted file mode 100644 index f453c0749d..0000000000 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.netstandard.cs +++ /dev/null @@ -1,619 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Xunit; -using Microsoft.ML.Runtime.Internal.CpuMath; - -namespace Microsoft.ML.CpuMath.UnitTests -{ - public class CpuMathUtilsUnitTests - { - private readonly float[][] _testArrays; - private readonly int[] _testIndexArray; - private readonly AlignedArray[] _testMatrices; - private readonly AlignedArray[] _testSrcVectors; - private readonly AlignedArray[] _testDstVectors; - private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); - private readonly FloatEqualityComparer _comparer; - - private const float DEFAULT_SCALE = 1.7f; - - public CpuMathUtilsUnitTests() - { - // Padded array whose length is a multiple of 4 - float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; - // Unpadded array whose length is not a multiple of 4. 
- float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; - _testArrays = new float[][] { testArray1, testArray2 }; - _testIndexArray = new int[4] { 0, 2, 5, 6 }; - _comparer = new FloatEqualityComparer(); - - // Padded matrices whose dimensions are multiples of 4 - float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, - 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; - float[] testMatrix2 = new float[4 * 8]; - - for (int i = 0; i < testMatrix2.Length; i++) - { - testMatrix2[i] = i + 1; - } - - AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, _vectorAlignment); - AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, _vectorAlignment); - testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); - testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); - - _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; - - // Padded source vectors whose dimensions are multiples of 4 - float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; - float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; - - AlignedArray testSrcVectorAligned1 = new AlignedArray(4, _vectorAlignment); - AlignedArray testSrcVectorAligned2 = new AlignedArray(8, _vectorAlignment); - testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); - testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); - - _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; - - // Padded destination vectors whose dimensions are multiples of 4 - float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; - float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; - - AlignedArray testDstVectorAligned1 = new AlignedArray(4, _vectorAlignment); - AlignedArray testDstVectorAligned2 = new AlignedArray(8, _vectorAlignment); - testDstVectorAligned1.CopyFrom(testDstVector1, 
0, testDstVector1.Length); - testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); - - _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] - [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] - public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] - [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] - public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] - [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] - public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - - 
CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] - [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] - public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - - CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] - [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] - public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - int[] idx = _testIndexArray; - - CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] - [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] - public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = 
_testDstVectors[dstTest]; - int[] idx = _testIndexArray; - - CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] - [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] - public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - int[] idx = _testIndexArray; - - CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] - [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] - public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) - { - AlignedArray mat = _testMatrices[matTest]; - AlignedArray src = _testSrcVectors[srcTest]; - AlignedArray dst = _testDstVectors[dstTest]; - int[] idx = _testIndexArray; - - CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); - float[] actual = new float[dst.Size]; - dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddScalarUTest(int test) - { - float[] dst = (float[])_testArrays[test].Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] += DEFAULT_SCALE; - } - - 
CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void ScaleUTest(int test) - { - float[] dst = (float[])_testArrays[test].Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] *= DEFAULT_SCALE; - } - - CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void ScaleSrcUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] *= DEFAULT_SCALE; - } - - CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void ScaleAddUTest(int test) - { - float[] dst = (float[])_testArrays[test].Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE); - } - - CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddScaleUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] *= (1 + DEFAULT_SCALE); - } - - CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddScaleSUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] 
dst = (float[])src.Clone(); - int[] idx = _testIndexArray; - float[] expected = (float[])dst.Clone(); - - expected[0] = 5.292f; - expected[2] = -13.806f; - expected[5] = -43.522f; - expected[6] = 55.978f; - - CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddScaleCopyUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] result = (float[])dst.Clone(); - float[] expected = (float[])dst.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] *= (1 + DEFAULT_SCALE); - } - - CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); - var actual = result; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] expected = (float[])src.Clone(); - - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } - - for (int i = 0; i < expected.Length; i++) - { - expected[i] = 2 * expected[i] + 1; - } - - CpuMathUtils.Add(src, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void AddSUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - int[] idx = _testIndexArray; - float[] expected = (float[])dst.Clone(); - - expected[0] = 3.92f; - expected[2] = -12.14f; - expected[5] = -36.69f; - expected[6] = 46.29f; - - CpuMathUtils.Add(src, idx, dst, idx.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void MulElementWiseUTest(int test) - { - float[] src1 = 
(float[])_testArrays[test].Clone(); - float[] src2 = (float[])src1.Clone(); - float[] dst = (float[])src1.Clone(); - - // Ensures src1 and src2 are different arrays - for (int i = 0; i < src2.Length; i++) - { - src2[i] += 1; - } - - float[] expected = (float[])src1.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - expected[i] *= (1 + expected[i]); - } - - CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length); - var actual = dst; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, -93.9f)] - [InlineData(1, -97.19f)] - public void SumUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.Sum(src, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 13399.9376f)] - [InlineData(1, 13389.1135f)] - public void SumSqUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.SumSq(src, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 13742.3176f)] - [InlineData(1, 13739.7895f)] - public void SumSqDiffUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 193.69f)] - public void SumAbsUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.SumAbs(src, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 195.39f)] - public void SumAbsDiffUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 106.37f)] - [InlineData(1, 106.37f)] - public void MaxAbsUTest(int test, float 
expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.MaxAbs(src, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 108.07f)] - [InlineData(1, 108.07f)] - public void MaxAbsDiffUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 13306.0376f)] - [InlineData(1, 13291.9235f)] - public void DotUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } - - var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); - Assert.Equal(expected, actual, 2); - } - - [Theory] - [InlineData(0, 736.7352f)] - [InlineData(1, 736.7352f)] - public void DotSUTest(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - int[] idx = _testIndexArray; - - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } - - var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); - Assert.Equal(expected, actual, 4); - } - - [Theory] - [InlineData(0, 8.0f)] - [InlineData(1, 7.0f)] - public void Dist2Test(int test, float expected) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } - - var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length); - Assert.Equal(expected, actual, 0); - } - - [Theory] - [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] - public void ZeroItemsUTest(int test, int[] idx, float[] expected) - { - 
AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment); - src.CopyFrom(_testSrcVectors[test]); - - CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); - float[] actual = new float[src.Size]; - src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] - public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) - { - AlignedArray src = new AlignedArray(4 + 4 * test, _vectorAlignment); - src.CopyFrom(_testSrcVectors[test]); - - CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); - float[] actual = new float[src.Size]; - src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void SdcaL1UpdateUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] v = (float[])src.Clone(); - float[] w = (float[])src.Clone(); - float[] expected = (float[])w.Clone(); - - for (int i = 0; i < expected.Length; i++) - { - float value = src[i] * (1 + DEFAULT_SCALE); - expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; - } - - CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); - var actual = w; - Assert.Equal(expected, actual, _comparer); - } - - [Theory] - [InlineData(0)] - [InlineData(1)] - public void SdcaL1UpdateSUTest(int test) - { - float[] src = (float[])_testArrays[test].Clone(); - float[] v = (float[])src.Clone(); - float[] w = (float[])src.Clone(); - int[] idx = _testIndexArray; - float[] expected = (float[])w.Clone(); - - for (int i = 0; i < idx.Length; i++) - { - int index = idx[i]; - float value = v[index] + src[i] * DEFAULT_SCALE; - expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? 
value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; - } - - CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); - var actual = w; - Assert.Equal(expected, actual, _comparer); - } - } - - internal class FloatEqualityComparer : IEqualityComparer - { - public bool Equals(float a, float b) - { - return Math.Abs(a - b) < 1e-5f; - } - - public int GetHashCode(float a) - { - throw new NotImplementedException(); - } - } -} \ No newline at end of file From 8bc8cc83183befbf71912738381e8ea138be4e7c Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 13:28:50 -0700 Subject: [PATCH 17/29] Respond to PR feedback: Style changes --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 117 +++++------------- .../CpuMathUtils.netstandard.cs | 2 +- 2 files changed, 32 insertions(+), 87 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 863a40b787..db9335b4ed 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -45,14 +45,6 @@ private static Vector256 ToVector256(in Vector128 a, in Vector128< return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static void ZeroUpper() - { - // Currently no-op since _mm256_zeroupper is not supported (ref: https://github.com/dotnet/coreclr/pull/16955) - // This is a placeholder in case the intrinsic is supported later on. 
- return; - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetLow(in Vector256 x) { @@ -264,8 +256,6 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, pDstCurrent += 4; pMatCurrent += 3 * ccol; } - - ZeroUpper(); } } @@ -326,8 +316,6 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A pDstCurrent += 8; pm0 += 8 * ccol; } - - ZeroUpper(); } } @@ -440,8 +428,6 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s pMatCurrent += 3 * crow; pSrcCurrent += 4; } - - ZeroUpper(); } } @@ -509,8 +495,6 @@ public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSr ppos++; } - - ZeroUpper(); } } @@ -535,7 +519,7 @@ public static unsafe void AddScalarU(float scalar, Span dst) Vector128 scalarVector128 = Sse.SetAllVector128(scalar); - while (pDstCurrent + 4 <= pDstEnd) + if (pDstCurrent + 4 <= pDstEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); dstVector = Sse.Add(dstVector, scalarVector128); @@ -553,8 +537,6 @@ public static unsafe void AddScalarU(float scalar, Span dst) pDstCurrent++; } } - - ZeroUpper(); } public static unsafe void ScaleU(float scale, Span dst) @@ -578,7 +560,7 @@ public static unsafe void ScaleU(float scale, Span dst) Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + if (pDstCurrent + 4 <= pEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -598,8 +580,6 @@ public static unsafe void ScaleU(float scale, Span dst) pDstCurrent++; } } - - ZeroUpper(); } public static unsafe void ScaleSrcU(float scale, Span src, Span dst) @@ -625,7 +605,7 @@ public static unsafe void ScaleSrcU(float scale, Span src, Span ds Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pDstEnd) + if (pDstCurrent + 4 <= pDstEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Multiply(srcVector, scaleVector128); @@ 
-645,8 +625,6 @@ public static unsafe void ScaleSrcU(float scale, Span src, Span ds pDstCurrent++; } } - - ZeroUpper(); } // dst[i] = a * (dst[i] + b) @@ -673,7 +651,7 @@ public static unsafe void ScaleAddU(float a, float b, Span dst) Vector128 a128 = Sse.SetAllVector128(a); Vector128 b128 = Sse.SetAllVector128(b); - while (pDstCurrent + 4 <= pDstEnd) + if (pDstCurrent + 4 <= pDstEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); dstVector = Sse.Add(dstVector, b128); @@ -693,8 +671,6 @@ public static unsafe void ScaleAddU(float a, float b, Span dst) pDstCurrent++; } } - - ZeroUpper(); } public static unsafe void AddScaleU(float scale, Span src, Span dst) @@ -723,7 +699,7 @@ public static unsafe void AddScaleU(float scale, Span src, Span ds Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + if (pDstCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -749,8 +725,6 @@ public static unsafe void AddScaleU(float scale, Span src, Span ds pDstCurrent++; } } - - ZeroUpper(); } public static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) @@ -781,7 +755,7 @@ public static unsafe void AddScaleCopyU(float scale, Span src, Span scaleVector128 = Sse.SetAllVector128(scale); - while (pResCurrent + 4 <= pResEnd) + if (pResCurrent + 4 <= pResEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -807,8 +781,6 @@ public static unsafe void AddScaleCopyU(float scale, Span src, Span src, Span idx, Span dst) @@ -839,7 +811,7 @@ public static unsafe void AddScaleSU(float scale, Span src, Span idx Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pIdxCurrent + 4 <= pEnd) + if (pIdxCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); @@ -860,8 +832,6 @@ public static unsafe void 
AddScaleSU(float scale, Span src, Span idx pSrcCurrent++; } } - - ZeroUpper(); } public static unsafe void AddU(Span src, Span dst) @@ -885,7 +855,7 @@ public static unsafe void AddU(Span src, Span dst) pDstCurrent += 8; } - while (pSrcCurrent + 4 <= pEnd) + if (pSrcCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -909,8 +879,6 @@ public static unsafe void AddU(Span src, Span dst) pDstCurrent++; } } - - ZeroUpper(); } public static unsafe void AddSU(Span src, Span idx, Span dst) @@ -936,7 +904,7 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) pSrcCurrent += 8; } - while (pIdxCurrent + 4 <= pEnd) + if (pIdxCurrent + 4 <= pEnd) { Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -956,8 +924,6 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) pSrcCurrent++; } } - - ZeroUpper(); } public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) @@ -983,7 +949,7 @@ public static unsafe void MulElementWiseU(Span src1, Span src2, Sp pDstCurrent += 8; } - while (pDstCurrent + 4 <= pEnd) + if (pDstCurrent + 4 <= pEnd) { Vector128 src1Vector = Sse.LoadVector128(pSrc1Current); Vector128 src2Vector = Sse.LoadVector128(pSrc2Current); @@ -1007,8 +973,6 @@ public static unsafe void MulElementWiseU(Span src1, Span src2, Sp pDstCurrent++; } } - - ZeroUpper(); } public static unsafe float SumU(Span src) @@ -1031,7 +995,7 @@ public static unsafe float SumU(Span src) Vector128 result128 = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent)); pSrcCurrent += 4; @@ -1045,9 +1009,7 @@ public static unsafe float SumU(Span src) pSrcCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, 
resultPadded)); } } @@ -1073,7 +1035,7 @@ public static unsafe float SumSqU(Span src) Vector128 result128 = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector)); @@ -1091,9 +1053,7 @@ public static unsafe float SumSqU(Span src) pSrcCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1122,7 +1082,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); @@ -1142,9 +1102,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) pSrcCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1172,7 +1130,7 @@ public static unsafe float SumAbsU(Span src) Vector128 result128 = Sse.SetZeroVector128(); Vector128 mask128 = GetAbsMask128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result128 = Sse.Add(result128, Sse.And(srcVector, mask128)); @@ -1190,9 +1148,7 @@ public static unsafe float SumAbsU(Span src) pSrcCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1223,7 +1179,7 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector128 meanVector128 = Sse.SetAllVector128(mean); Vector128 mask128 = GetAbsMask128(); - 
while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); @@ -1243,9 +1199,7 @@ public static unsafe float SumAbsDiffU(float mean, Span src) pSrcCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1273,7 +1227,7 @@ public static unsafe float MaxAbsU(Span src) Vector128 result128 = Sse.SetZeroVector128(); Vector128 mask128 = GetAbsMask128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result128 = Sse.Max(result128, Sse.And(srcVector, mask128)); @@ -1291,9 +1245,7 @@ public static unsafe float MaxAbsU(Span src) pSrcCurrent++; } - float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); - ZeroUpper(); - return max; + return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); } } @@ -1324,7 +1276,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector128 meanVector128 = Sse.SetAllVector128(mean); Vector128 mask128 = GetAbsMask128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); @@ -1344,9 +1296,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) pSrcCurrent++; } - float max = Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); - ZeroUpper(); - return max; + return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); } } @@ -1377,7 +1327,7 @@ public static unsafe float DotU(Span src, Span dst) Vector128 result128 = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1401,9 
+1351,7 @@ public static unsafe float DotU(Span src, Span dst) pDstCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1436,7 +1384,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx Vector128 result128 = Sse.SetZeroVector128(); - while (pIdxCurrent + 4 <= pIdxEnd) + if (pIdxCurrent + 4 <= pIdxEnd) { Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1460,9 +1408,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx pDstCurrent++; } - float sum = Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); - ZeroUpper(); - return sum; + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); } } @@ -1493,7 +1439,7 @@ public static unsafe float Dist2(Span src, Span dst) Vector128 sqDistanceVector128 = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), Sse.LoadVector128(pDstCurrent)); @@ -1516,7 +1462,6 @@ public static unsafe float Dist2(Span src, Span dst) pDstCurrent++; } - ZeroUpper(); return norm; } } @@ -1558,7 +1503,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold128 = Sse.SetAllVector128(threshold); - while (pSrcCurrent + 4 <= pSrcEnd) + if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); @@ -1623,7 +1568,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold128 = Sse.SetAllVector128(threshold); - while (pIdxCurrent + 4 <= pIdxEnd) + if (pIdxCurrent + 4 <= pIdxEnd) { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); diff --git 
a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index bbb7f3bd6a..706f4529bb 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -6,7 +6,7 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { - // REVIEW NEEDED: AVX support cannot be checked in .NET Core App 2.0, so we assume Vector128 alignment for SSE. Is it okay? + // REVIEW NEEDED: AVX support cannot be checked in .NET Standard 2.0, so we assume Vector128 alignment for SSE. Is it okay? // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; From f1664faa061864e5137fa63bf4f0af5055147bea Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 16:52:26 -0700 Subject: [PATCH 18/29] Respond to PR feedback: More style changes --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 2 +- src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index db9335b4ed..1bcc8e651e 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -48,7 +48,7 @@ private static Vector256 ToVector256(in Vector128 a, in Vector128< [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetLow(in Vector256 x) { - return Avx.ExtractVector128(x, 0); + return Avx.GetLowerHalf(x); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 706f4529bb..497dd59003 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -6,8 +6,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { 
public static partial class CpuMathUtils { - // REVIEW NEEDED: AVX support cannot be checked in .NET Standard 2.0, so we assume Vector128 alignment for SSE. Is it okay? - // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; From 26ed88490570de6194cff4f327cd23f0797a225f Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 16:53:02 -0700 Subject: [PATCH 19/29] Implemented class inheritance in perf tests to reduce overlapping code --- .../AvxPerformanceTests.cs | 86 +-------------- .../PerformanceTests.cs | 101 ++++++++++++++++++ .../SsePerformanceTests.cs | 86 +-------------- 3 files changed, 103 insertions(+), 170 deletions(-) create mode 100644 test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 01058384f8..2e4b598540 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -9,92 +9,8 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class AvxPerformanceTests + public class AvxPerformanceTests : PerformanceTests { - private const int EXP_MAX = 127; - private const int EXP_MIN = 0; - - private const int IDXLEN = 1000003; - private const int LEN = 1000003; - private const int EXP_RANGE = EXP_MAX / 8; - private const int DEFAULT_SEED = 253421; - private const float DEFAULT_SCALE = 1.11f; - private const int DEFAULT_CROW = 500; - private const int DEFAULT_CCOL = 2000; - private const bool ADD = true; - - private float[] src, dst, original, src1, src2, result; - private int[] idx; - private int seed = DEFAULT_SEED; - - private static float NextFloat(Random rand, int expRange) - { - double mantissa = (rand.NextDouble() * 2.0) - 1.0; - double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); - return 
(float)(mantissa * exponent); - } - - private static int GetSeed() - { - int seed = DEFAULT_SEED; - string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - - if (CPUMATH_SEED != null) - { - if (!int.TryParse(CPUMATH_SEED, out seed)) - { - if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) - { - seed = new Random().Next(); - } - else - { - seed = DEFAULT_SEED; - } - } - } - - Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); - return seed; - } - - [GlobalSetup] - public void Setup() - { - src = new float[LEN]; - dst = new float[LEN]; - src1 = new float[LEN]; - src2 = new float[LEN]; - original = new float[LEN]; - result = new float[LEN]; - idx = new int[IDXLEN]; - - seed = GetSeed(); - Random rand = new Random(seed); - - for (int i = 0; i < LEN; i++) - { - src[i] = NextFloat(rand, EXP_RANGE); - dst[i] = NextFloat(rand, EXP_RANGE); - original[i] = dst[i]; - result[i] = dst[i]; - src1[i] = NextFloat(rand, EXP_RANGE); - src2[i] = NextFloat(rand, EXP_RANGE); - } - - for (int i = 0; i < IDXLEN; i++) - { - idx[i] = rand.Next(0, LEN); - } - } - - [GlobalCleanup] - public void GlobalCleanup() - { - original.CopyTo(dst, 0); - original.CopyTo(result, 0); - } - [Benchmark] public void ManagedAddScalarUPerf() { diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs new file mode 100644 index 0000000000..64278eaf21 --- /dev/null +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs @@ -0,0 +1,101 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.ML.Runtime.Internal.CpuMath; + +namespace Microsoft.ML.CpuMath.PerformanceTests +{ + public class PerformanceTests + { + private const int EXP_MAX = 127; + private const int EXP_MIN = 0; + private const int EXP_RANGE = EXP_MAX / 8; + + protected const int IDXLEN = 1000003; + protected const int LEN = 1000003; + + private const int DEFAULT_SEED = 253421; + protected const float DEFAULT_SCALE = 1.11f; + + protected const int DEFAULT_CROW = 500; + protected const int DEFAULT_CCOL = 2000; + protected const bool ADD = true; + + protected float[] src, dst, original, src1, src2, result; + protected int[] idx; + + private int seed = DEFAULT_SEED; + + private float NextFloat(Random rand, int expRange) + { + double mantissa = (rand.NextDouble() * 2.0) - 1.0; + double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); + return (float)(mantissa * exponent); + } + + private int GetSeed() + { + int seed = DEFAULT_SEED; + string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); + + if (CPUMATH_SEED != null) + { + if (!int.TryParse(CPUMATH_SEED, out seed)) + { + if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) + { + seed = new Random().Next(); + } + else + { + seed = DEFAULT_SEED; + } + } + } + + Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); + return seed; + } + + [GlobalSetup] + public void Setup() + { + src = new float[LEN]; + dst = new float[LEN]; + src1 = new float[LEN]; + src2 = new float[LEN]; + original = new float[LEN]; + result = new float[LEN]; + idx = new int[IDXLEN]; + + seed = GetSeed(); + Random rand = new Random(seed); + + for (int i = 0; i < LEN; i++) + { + src[i] = NextFloat(rand, EXP_RANGE); + dst[i] = NextFloat(rand, EXP_RANGE); + original[i] = dst[i]; + result[i] = dst[i]; + src1[i] = NextFloat(rand, EXP_RANGE); + src2[i] = 
NextFloat(rand, EXP_RANGE); + } + + for (int i = 0; i < IDXLEN; i++) + { + idx[i] = rand.Next(0, LEN); + } + } + + [GlobalCleanup] + public void GlobalCleanup() + { + original.CopyTo(dst, 0); + original.CopyTo(result, 0); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index c3869c63d9..3188c64db9 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -9,92 +9,8 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class SsePerformanceTests + public class SsePerformanceTests : PerformanceTests { - private const int EXP_MAX = 127; - private const int EXP_MIN = 0; - - private const int IDXLEN = 1000003; - private const int LEN = 1000003; - private const int EXP_RANGE = EXP_MAX / 8; - private const int DEFAULT_SEED = 253421; - private const float DEFAULT_SCALE = 1.11f; - private const int DEFAULT_CROW = 500; - private const int DEFAULT_CCOL = 2000; - private const bool ADD = true; - - private float[] src, dst, original, src1, src2, result; - private int[] idx; - private int seed = DEFAULT_SEED; - - private static float NextFloat(Random rand, int expRange) - { - double mantissa = (rand.NextDouble() * 2.0) - 1.0; - double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); - return (float)(mantissa * exponent); - } - - private static int GetSeed() - { - int seed = DEFAULT_SEED; - string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - - if (CPUMATH_SEED != null) - { - if (!int.TryParse(CPUMATH_SEED, out seed)) - { - if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) - { - seed = new Random().Next(); - } - else - { - seed = DEFAULT_SEED; - } - } - } - - Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce 
results"); - return seed; - } - - [GlobalSetup] - public void Setup() - { - src = new float[LEN]; - dst = new float[LEN]; - src1 = new float[LEN]; - src2 = new float[LEN]; - original = new float[LEN]; - result = new float[LEN]; - idx = new int[IDXLEN]; - - seed = GetSeed(); - Random rand = new Random(seed); - - for (int i = 0; i < LEN; i++) - { - src[i] = NextFloat(rand, EXP_RANGE); - dst[i] = NextFloat(rand, EXP_RANGE); - original[i] = dst[i]; - result[i] = dst[i]; - src1[i] = NextFloat(rand, EXP_RANGE); - src2[i] = NextFloat(rand, EXP_RANGE); - } - - for (int i = 0; i < IDXLEN; i++) - { - idx[i] = rand.Next(0, LEN); - } - } - - [GlobalCleanup] - public void GlobalCleanup() - { - original.CopyTo(dst, 0); - original.CopyTo(result, 0); - } - [Benchmark] public unsafe void NativeAddScalarUPerf() { From 31de89587928ffeb4cbe0023c50854e9f9ab115d Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 17:21:57 -0700 Subject: [PATCH 20/29] Respond to PR feedback: Changed Sse/AvxIntrinsics from public to internal, adding InternalsVisibleTo attributes --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 7 ++++++- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 1bcc8e651e..2419e319cb 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -14,9 +14,14 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" + + "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" + + "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" + + "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] + namespace
Microsoft.ML.Runtime.Internal.CpuMath { - public static class AvxIntrinsics + internal static class AvxIntrinsics { private const int Vector256Alignment = 32; diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index e6bc9d6dd4..2ef3de95a0 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -18,9 +18,14 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" + + "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" + + "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" + + "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] + namespace Microsoft.ML.Runtime.Internal.CpuMath { - public static class SseIntrinsics + internal static class SseIntrinsics { // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; From f07afb2d9688956abd39a478402060a05224fa95 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 17:37:23 -0700 Subject: [PATCH 21/29] Respond to PR feedback: Used env vars to determine whether to use AVX/SSE --- .../CpuMathUtils.netcoreapp.cs | 103 +++++++++--------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6843cd4757..9eb094d4aa 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -9,6 +9,11 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + private static string _enableAvx = Environment.GetEnvironmentVariable("COMPlus_EnableAVX"); + private static string _featureSimd = 
Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD"); + private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase); + private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase); + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; @@ -18,7 +23,7 @@ public static partial class CpuMathUtils public static int GetVectorAlignment() { // Assumes SSE support on machines that run ML.NET. - return Avx.IsSupported ? Vector256Alignment : Vector128Alignment; + return _useAvx ? Vector256Alignment : Vector128Alignment; } public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) @@ -26,7 +31,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr Contracts.Assert(mat.Size == dst.Size * src.Size); Contracts.Assert(crun >= 0); - if (Avx.IsSupported) + if (_useAvx) { if (!tran) { @@ -39,7 +44,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun); } } - else if (Sse.IsSupported) + else if (_useSse) { if (!tran) { @@ -118,7 +123,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo Contracts.AssertNonEmpty(rgposSrc); Contracts.Assert(crun >= 0); - if (Avx.IsSupported) + if (_useAvx) { if (!tran) { @@ -131,7 +136,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); } } - else if (Sse.IsSupported) + else if (_useSse) { if (!tran) { @@ -205,11 +210,11 @@ public static void Add(float a, float[] dst, int count) private static void Add(float a, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddScalarU(a, dst); } - else if 
(Sse.IsSupported) + else if (_useSse) { SseIntrinsics.AddScalarU(a, dst); } @@ -243,11 +248,11 @@ public static void Scale(float a, float[] dst, int offset, int count) private static void Scale(float a, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.ScaleU(a, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.ScaleU(a, dst); } @@ -274,11 +279,11 @@ public static void Scale(float a, float[] src, float[] dst, int count) private static void Scale(float a, Span src, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.ScaleSrcU(a, src, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.ScaleSrcU(a, src, dst); } @@ -303,11 +308,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count) private static void ScaleAdd(float a, float b, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.ScaleAddU(a, b, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.ScaleAddU(a, b, dst); } @@ -346,11 +351,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in private static void AddScale(float a, Span src, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddScaleU(a, src, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.AddScaleU(a, src, dst); } @@ -394,11 +399,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in private static void AddScale(float a, Span src, Span indices, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddScaleSU(a, src, indices, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.AddScaleSU(a, src, indices, dst); } @@ -427,11 +432,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, private static void AddScaleCopy(float a, Span src, Span dst, Span res) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddScaleCopyU(a, src, dst, res); } - else if (Sse.IsSupported) 
+ else if (_useSse) { SseIntrinsics.AddScaleCopyU(a, src, dst, res); } @@ -457,11 +462,11 @@ public static void Add(float[] src, float[] dst, int count) private static void Add(Span src, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddU(src, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.AddU(src, dst); } @@ -505,11 +510,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i private static void Add(Span src, Span indices, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.AddSU(src, indices, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.AddSU(src, indices, dst); } @@ -538,11 +543,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c private static void MulElementWise(Span src1, Span src2, Span dst) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.MulElementWiseU(src1, src2, dst); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.MulElementWiseU(src1, src2, dst); } @@ -576,11 +581,11 @@ public static float Sum(float[] src, int offset, int count) private static float Sum(Span src) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.SumU(src); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.SumU(src); } @@ -616,11 +621,11 @@ public static float SumSq(float[] src, int offset, int count) private static float SumSq(Span src) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.SumSqU(src); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.SumSqU(src); } @@ -647,11 +652,11 @@ public static float SumSq(float mean, float[] src, int offset, int count) private static float SumSq(float mean, Span src) { - if (Avx.IsSupported) + if (_useAvx) { return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src); } - else if (Sse.IsSupported) + else if (_useSse) { return (mean == 0) ? 
SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); } @@ -687,11 +692,11 @@ public static float SumAbs(float[] src, int offset, int count) private static float SumAbs(Span src) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.SumAbsU(src); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.SumAbsU(src); } @@ -718,11 +723,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count) private static float SumAbs(float mean, Span src) { - if (Avx.IsSupported) + if (_useAvx) { return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src); } - else if (Sse.IsSupported) + else if (_useSse) { return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); } @@ -758,11 +763,11 @@ public static float MaxAbs(float[] src, int offset, int count) private static float MaxAbs(Span src) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.MaxAbsU(src); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.MaxAbsU(src); } @@ -792,11 +797,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count) private static float MaxAbsDiff(float mean, Span src) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.MaxAbsDiffU(mean, src); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.MaxAbsDiffU(mean, src); } @@ -840,11 +845,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count) private static float DotProductDense(Span a, Span b) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.DotU(a, b); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.DotU(a, b); } @@ -891,11 +896,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind private static float DotProductSparse(Span a, Span b, Span indices) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.DotSU(a, b, indices); } - else if (Sse.IsSupported) + 
else if (_useSse) { return SseIntrinsics.DotSU(a, b, indices); } @@ -924,11 +929,11 @@ public static float L2DistSquared(float[] a, float[] b, int count) private static float L2DistSquared(Span a, Span b) { - if (Avx.IsSupported) + if (_useAvx) { return AvxIntrinsics.Dist2(a, b); } - else if (Sse.IsSupported) + else if (_useSse) { return SseIntrinsics.Dist2(a, b); } @@ -1024,11 +1029,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src private static void SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } @@ -1062,11 +1067,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { - if (Avx.IsSupported) + if (_useAvx) { AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } - else if (Sse.IsSupported) + else if (_useSse) { SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } From 9a9d27216a5b913d8ad6bbf2184c957fca6f4841 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 20 Aug 2018 18:09:48 -0700 Subject: [PATCH 22/29] Respond to PR feedback: Included 0 into consideration for parsing env vars --- src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 9eb094d4aa..3ca679a74e 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -11,8 +11,8 @@ public static partial class CpuMathUtils { private static string _enableAvx = 
Environment.GetEnvironmentVariable("COMPlus_EnableAVX"); private static string _featureSimd = Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD"); - private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase); - private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase); + private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase); + private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase); // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; From c249d8846d2870b0f43a99c3e3a7df8cda63b0c0 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Tue, 21 Aug 2018 10:47:29 -0700 Subject: [PATCH 23/29] Respond to PR feedback: env vars, InternalsVisibleTo, and abstract --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 5 - .../CpuMathUtils.netcoreapp.cs | 103 +++++++++--------- .../Microsoft.ML.CpuMath.csproj | 7 +- .../Properties/AssemblyInfo.cs | 7 ++ src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 5 - .../PerformanceTests.cs | 2 +- 6 files changed, 60 insertions(+), 69 deletions(-) create mode 100644 src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 2419e319cb..5f44625bbe 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -14,11 +14,6 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; -[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" + - 
"00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" + - "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" + - "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] - namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 3ca679a74e..6843cd4757 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -9,11 +9,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { - private static string _enableAvx = Environment.GetEnvironmentVariable("COMPlus_EnableAVX"); - private static string _featureSimd = Environment.GetEnvironmentVariable("COMPlus_FeatureSIMD"); - private static bool _useAvx = Avx.IsSupported && !string.Equals(_enableAvx, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase); - private static bool _useSse = Sse.IsSupported && !string.Equals(_featureSimd, "false", StringComparison.OrdinalIgnoreCase) && !string.Equals(_enableAvx, "0", StringComparison.OrdinalIgnoreCase); - // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; @@ -23,7 +18,7 @@ public static partial class CpuMathUtils public static int GetVectorAlignment() { // Assumes SSE support on machines that run ML.NET. - return _useAvx ? Vector256Alignment : Vector128Alignment; + return Avx.IsSupported ? 
Vector256Alignment : Vector128Alignment; } public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) @@ -31,7 +26,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr Contracts.Assert(mat.Size == dst.Size * src.Size); Contracts.Assert(crun >= 0); - if (_useAvx) + if (Avx.IsSupported) { if (!tran) { @@ -44,7 +39,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun); } } - else if (_useSse) + else if (Sse.IsSupported) { if (!tran) { @@ -123,7 +118,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo Contracts.AssertNonEmpty(rgposSrc); Contracts.Assert(crun >= 0); - if (_useAvx) + if (Avx.IsSupported) { if (!tran) { @@ -136,7 +131,7 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); } } - else if (_useSse) + else if (Sse.IsSupported) { if (!tran) { @@ -210,11 +205,11 @@ public static void Add(float a, float[] dst, int count) private static void Add(float a, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddScalarU(a, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddScalarU(a, dst); } @@ -248,11 +243,11 @@ public static void Scale(float a, float[] dst, int offset, int count) private static void Scale(float a, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.ScaleU(a, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.ScaleU(a, dst); } @@ -279,11 +274,11 @@ public static void Scale(float a, float[] src, float[] dst, int count) private static void Scale(float a, Span src, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.ScaleSrcU(a, src, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.ScaleSrcU(a, src, 
dst); } @@ -308,11 +303,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count) private static void ScaleAdd(float a, float b, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.ScaleAddU(a, b, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.ScaleAddU(a, b, dst); } @@ -351,11 +346,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in private static void AddScale(float a, Span src, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddScaleU(a, src, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddScaleU(a, src, dst); } @@ -399,11 +394,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in private static void AddScale(float a, Span src, Span indices, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddScaleSU(a, src, indices, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddScaleSU(a, src, indices, dst); } @@ -432,11 +427,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, private static void AddScaleCopy(float a, Span src, Span dst, Span res) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddScaleCopyU(a, src, dst, res); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddScaleCopyU(a, src, dst, res); } @@ -462,11 +457,11 @@ public static void Add(float[] src, float[] dst, int count) private static void Add(Span src, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddU(src, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddU(src, dst); } @@ -510,11 +505,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i private static void Add(Span src, Span indices, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.AddSU(src, indices, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.AddSU(src, indices, 
dst); } @@ -543,11 +538,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c private static void MulElementWise(Span src1, Span src2, Span dst) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.MulElementWiseU(src1, src2, dst); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.MulElementWiseU(src1, src2, dst); } @@ -581,11 +576,11 @@ public static float Sum(float[] src, int offset, int count) private static float Sum(Span src) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.SumU(src); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.SumU(src); } @@ -621,11 +616,11 @@ public static float SumSq(float[] src, int offset, int count) private static float SumSq(Span src) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.SumSqU(src); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.SumSqU(src); } @@ -652,11 +647,11 @@ public static float SumSq(float mean, float[] src, int offset, int count) private static float SumSq(float mean, Span src) { - if (_useAvx) + if (Avx.IsSupported) { return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src); } - else if (_useSse) + else if (Sse.IsSupported) { return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); } @@ -692,11 +687,11 @@ public static float SumAbs(float[] src, int offset, int count) private static float SumAbs(Span src) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.SumAbsU(src); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.SumAbsU(src); } @@ -723,11 +718,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count) private static float SumAbs(float mean, Span src) { - if (_useAvx) + if (Avx.IsSupported) { return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src); } - else if (_useSse) + else if (Sse.IsSupported) { return (mean == 0) ? 
SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); } @@ -763,11 +758,11 @@ public static float MaxAbs(float[] src, int offset, int count) private static float MaxAbs(Span src) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.MaxAbsU(src); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsU(src); } @@ -797,11 +792,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count) private static float MaxAbsDiff(float mean, Span src) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.MaxAbsDiffU(mean, src); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsDiffU(mean, src); } @@ -845,11 +840,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count) private static float DotProductDense(Span a, Span b) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.DotU(a, b); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.DotU(a, b); } @@ -896,11 +891,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind private static float DotProductSparse(Span a, Span b, Span indices) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.DotSU(a, b, indices); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.DotSU(a, b, indices); } @@ -929,11 +924,11 @@ public static float L2DistSquared(float[] a, float[] b, int count) private static float L2DistSquared(Span a, Span b) { - if (_useAvx) + if (Avx.IsSupported) { return AvxIntrinsics.Dist2(a, b); } - else if (_useSse) + else if (Sse.IsSupported) { return SseIntrinsics.Dist2(a, b); } @@ -1029,11 +1024,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src private static void SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } - else if (_useSse) + 
else if (Sse.IsSupported) { SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } @@ -1067,11 +1062,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { - if (_useAvx) + if (Avx.IsSupported) { AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } - else if (_useSse) + else if (Sse.IsSupported) { SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj index ef24bf2762..05f97d3040 100644 --- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj +++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj @@ -10,10 +10,9 @@ 7.3 - - Auto - true - + + + diff --git a/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..ab9968b399 --- /dev/null +++ b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs @@ -0,0 +1,7 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.CpuMath.PerformanceTests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 2ef3de95a0..faaa2a44de 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -18,11 +18,6 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; -[assembly: InternalsVisibleTo("Microsoft.ML.CpuMath.PerformanceTests, PublicKey=0024000004800000940000000602" + - "00000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8" + - "cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a349" + - "28e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] - namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class SseIntrinsics diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs index 64278eaf21..1eb9157a2f 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs @@ -9,7 +9,7 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class PerformanceTests + public abstract class PerformanceTests { private const int EXP_MAX = 127; private const int EXP_MIN = 0; From f606432b8b2f452d0e48ad1ec9de01a7f4c06af0 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Tue, 21 Aug 2018 14:06:29 -0700 Subject: [PATCH 24/29] 
Respond to PR feedback: Added new comparer class specifically for MatMul --- .../UnitTests.cs | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 2d59a2acf1..1877ebe6b0 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using Xunit; +using Xunit.Abstractions; using Microsoft.ML.Runtime.Internal.CpuMath; namespace Microsoft.ML.CpuMath.UnitTests @@ -18,6 +19,7 @@ public class CpuMathUtilsUnitTests private readonly AlignedArray[] _testDstVectors; private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); private readonly FloatEqualityComparer _comparer; + private readonly FloatEqualityComparerForMatMul _matMulComparer; private const float DEFAULT_SCALE = 1.7f; @@ -30,6 +32,7 @@ public CpuMathUtilsUnitTests() _testArrays = new float[][] { testArray1, testArray2 }; _testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; _comparer = new FloatEqualityComparer(); + _matMulComparer = new FloatEqualityComparerForMatMul(); // Padded matrices whose dimensions are multiples of 8 float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, @@ -78,7 +81,7 @@ public CpuMathUtilsUnitTests() } [Theory] - [InlineData(0, 0, 0, new float[] { -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f, -416.68f })] + [InlineData(0, 0, 0, new float[] { -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f })] [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })] [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] public void 
MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) @@ -90,11 +93,11 @@ public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { -416.68f, -415.68f, -414.68f, -413.68f, -412.68f, -411.68f, -410.68f, -409.68f })] + [InlineData(0, 0, 0, new float[] { -416.6801f, -415.6801f, -414.6801f, -413.6801f, -412.6801f, -411.6801f, -410.6801f, -409.6801f })] [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })] [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })] public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) @@ -106,11 +109,11 @@ public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expect CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 70.56f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })] + [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })] [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })] [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) @@ -122,11 +125,11 @@ public void MatMulTranATest(int matTest, int srcTest, int 
dstTest, float[] expec CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 70.56f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })] + [InlineData(0, 0, 0, new float[] { 70.56001f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })] [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })] [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })] public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) @@ -138,11 +141,11 @@ public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] ex CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f, 38.25f })] + [InlineData(0, 0, 0, new float[] { 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f })] [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })] [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) @@ -155,11 +158,11 @@ public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 38.25f, 39.25f, 40.25f, 41.25f, 42.25f, 43.25f, 44.25f, 45.25f })] + [InlineData(0, 0, 0, new float[] { 38.25002f, 39.25002f, 40.25002f, 41.25002f, 42.25002f, 43.25002f, 44.25002f, 45.25002f })] [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })] [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })] public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) @@ -172,7 +175,7 @@ public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expec CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] @@ -189,7 +192,7 @@ public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expe CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] @@ -206,7 +209,7 @@ public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] e CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, _comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] @@ -621,6 +624,19 @@ public void SdcaL1UpdateSUTest(int test) } internal class FloatEqualityComparer : IEqualityComparer + { + public bool Equals(float a, float b) + { + return Math.Abs(a - b) < 1e-5f; + } + + public int GetHashCode(float a) + { + throw new NotImplementedException(); + } + } + + internal class FloatEqualityComparerForMatMul : IEqualityComparer { public bool Equals(float a, float b) { From 27ad82930c1488dc3fe9bdd57697771df54e12d4 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 22 Aug 2018 17:35:54 -0700 Subject: [PATCH 25/29] Respond to PR feedback: Changes to intrinsics --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 216 ++++++++---------- .../CpuMathUtils.netcoreapp.cs | 11 +- .../CpuMathUtils.netstandard.cs | 8 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 +- 4 files changed, 108 insertions(+), 129 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 5f44625bbe..09804f88ed 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -18,69 +18,57 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { + private static readonly Vector128 _absMask128 = Sse2.IsSupported ? 
+ Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : + Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); + private const int Vector256Alignment = 32; - private static bool Compat(AlignedArray a) + private static bool HasCompatibleAlignment(AlignedArray alignedArray) { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == Vector256Alignment; + Contracts.AssertValue(alignedArray); + Contracts.Assert(alignedArray.Size > 0); + return (alignedArray.CbAlign % Vector256Alignment) == 0; } - private static unsafe float* Ptr(AlignedArray a, float* p) + private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (Vector256Alignment - 1)) == 0); - return q; + Contracts.AssertValue(alignedArray); + float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); + Contracts.Assert(((long)alignedBase & (Vector256Alignment - 1)) == 0); + return alignedBase; } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 ToVector256(in Vector128 a, in Vector128 b) - { - // REVIEW NEEDED: Is it the correct port of the following code? 
- // #ifndef _WIN32 - // #define _mm256_set_m128(va, vb) _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) - // #endif - return Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); - } + private static Vector256 SetHighLow(in Vector128 a, in Vector128 b) + => Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetLow(in Vector256 x) - { - return Avx.GetLowerHalf(x); - } + => Avx.GetLowerHalf(x); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetHigh(in Vector256 x) - { - return Avx.ExtractVector128(x, 1); - } + => Avx.ExtractVector128(x, 1); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe Vector128 Load1(float* src, int* idx) - { - return Sse.SetScalarVector128(src[idx[0]]); - } + => Sse.SetScalarVector128(src[idx[0]]); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe Vector128 Load4(float* src, int* idx) - { - return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - } + => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe Vector256 Load8(float* src, int* idx) - { - return Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - } + => Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 Rotate(in Vector128 x) - { - // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. 
- return Sse.Shuffle(x, x, 0x39); - } + => Sse.Shuffle(x, x, 0x39); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe void Store4(in Vector128 x, float* dst, int* idx) @@ -141,38 +129,44 @@ private static Vector256 VectorSum256(in Vector256 vector) [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 VectorMax128(in Vector128 vector) { + // The control byte shuffles the four 32-bit floats of vector: ABCD -> BADC. Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); + + // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become + // max(A, B) and max(C, D). Vector128 partialMax = Sse.Max(vector, x1); + + // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA. x1 = Sse.Shuffle(partialMax, partialMax, 0x02); + + // Performs element-wise maximum operation: The 1st 32-bit slot becomes + // max(A, B, C, D). return Sse.MaxScalar(partialMax, x1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector256 VectorMax256(in Vector256 vector) { + // The control byte shuffles the eight 32-bit floats of vector: ABCD|EFGH -> BADC|FEHG. Vector256 x1 = Avx.Shuffle(vector, vector, 0xB1); + + // Performs element-wise maximum operation: The 1st, 3rd, 5th, and 7th 32-bit slots become + // max(A, B), max(C, D), max(E, F), and max(G, H). Vector256 partialMax = Avx.Max(vector, x1); - x1 = Avx.Shuffle(partialMax, partialMax, 0x02); - return Avx.Max(partialMax, x1); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetAbsMask128() - { - return Sse2.IsSupported ? - Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : - Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - } + // The control byte shuffles the eight 32-bit floats of partialMax: ABCD|EFGH -> CAAA|GEEE.
+ x1 = Avx.Shuffle(partialMax, partialMax, 0x02); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 GetAbsMask256() - { - return Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); + // Performs element-wise maximum operation: The 1st and 5th 32-bit slots become + // max(max(A, B), max(C, D)) = max(A, B, C, D) and + // max(max(E, F), max(G, H)) = max(E, F, G, H). + return Avx.Max(partialMax, x1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 signMask, in Vector128 xThreshold) + private static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 xThreshold) { + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true @@ -181,14 +175,12 @@ private static Vector128 GetNewDst128(in Vector128 xDst1, in Vecto } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 GetNewDst256(in Vector256 xDst1, in Vector256 signMask, in Vector256 xThreshold) + private static Vector256 GetNewDst256(in Vector256 xDst1, in Vector256 xThreshold) { + Vector256 signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000 Vector256 xSign = Avx.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise Vector256 xDst1Abs = Avx.Xor(xDst1, xSign); - - // REVIEW NEEDED: Do we want Signaling or NonSignaling? The original functionality is NonSignaling, which does not throw an exception even when there is an NaN. 
- // Signaling means that if an operand contains an NaN, an exception is raised (ref: https://stackoverflow.com/questions/16988199/how-to-choose-avx-compare-predicate-variants) - Vector256 xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedSignaling); // result = 0xFFFF FFFF if true + Vector256 xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedNonSignaling); // result = 0xFFFF FFFF if true Vector256 x2 = Avx.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise return Avx.And(Avx.Subtract(xDst1, x2), xCond); } @@ -196,17 +188,17 @@ private static Vector256 GetNewDst256(in Vector256 xDst1, in Vecto // Multiply matrix times vector into vector. public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -263,9 +255,9 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + 
Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient. @@ -274,9 +266,9 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* pposMin = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -321,17 +313,17 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -348,10 +340,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D h01 = Sse.Shuffle(h01, h01, 0x00); // A - Vector256 x01 = ToVector256(h01, h01); - Vector256 x11 = ToVector256(h11, h11); - Vector256 x21 = ToVector256(h21, h21); - Vector256 x31 = ToVector256(h31, h31); + Vector256 x01 = 
SetHighLow(h01, h01); + Vector256 x11 = SetHighLow(h11, h11); + Vector256 x21 = SetHighLow(h21, h21); + Vector256 x31 = SetHighLow(h31, h31); pSrcCurrent += 4; @@ -392,10 +384,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D h01 = Sse.Shuffle(h01, h01, 0x00); // A - Vector256 x01 = ToVector256(h01, h01); - Vector256 x11 = ToVector256(h11, h11); - Vector256 x21 = ToVector256(h21, h21); - Vector256 x31 = ToVector256(h31, h31); + Vector256 x01 = SetHighLow(h01, h01); + Vector256 x11 = SetHighLow(h11, h11); + Vector256 x21 = SetHighLow(h21, h21); + Vector256 x31 = SetHighLow(h31, h31); float* pDstCurrent = pdst; @@ -435,18 +427,18 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* ppos = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -1114,12 +1106,11 @@ public static unsafe float SumAbsU(Span src) float* pSrcCurrent = psrc; Vector256 result256 = Avx.SetZeroVector256(); - Vector256 mask256 = GetAbsMask256(); while (pSrcCurrent + 8 <= pSrcEnd) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); - result256 = 
Avx.Add(result256, Avx.And(srcVector, mask256)); + result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256)); pSrcCurrent += 8; } @@ -1128,12 +1119,11 @@ public static unsafe float SumAbsU(Span src) Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); - Vector128 mask128 = GetAbsMask128(); if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result128 = Sse.Add(result128, Sse.And(srcVector, mask128)); + result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent += 4; } @@ -1143,7 +1133,7 @@ public static unsafe float SumAbsU(Span src) while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128)); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent++; } @@ -1161,13 +1151,12 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector256 result256 = Avx.SetZeroVector256(); Vector256 meanVector256 = Avx.SetAllVector256(mean); - Vector256 mask256 = GetAbsMask256(); while (pSrcCurrent + 8 <= pSrcEnd) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Subtract(srcVector, meanVector256); - result256 = Avx.Add(result256, Avx.And(srcVector, mask256)); + result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256)); pSrcCurrent += 8; } @@ -1177,13 +1166,12 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); - Vector128 mask128 = GetAbsMask128(); if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); - result128 = Sse.Add(result128, Sse.And(srcVector, mask128)); + result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent += 4; } @@ -1194,7 +1182,7 @@ public static unsafe 
float SumAbsDiffU(float mean, Span src) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector128); - result128 = Sse.AddScalar(result128, Sse.And(srcVector, mask128)); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent++; } @@ -1211,12 +1199,11 @@ public static unsafe float MaxAbsU(Span src) float* pSrcCurrent = psrc; Vector256 result256 = Avx.SetZeroVector256(); - Vector256 mask256 = GetAbsMask256(); while (pSrcCurrent + 8 <= pSrcEnd) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); - result256 = Avx.Max(result256, Avx.And(srcVector, mask256)); + result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256)); pSrcCurrent += 8; } @@ -1225,12 +1212,11 @@ public static unsafe float MaxAbsU(Span src) Vector128 resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); - Vector128 mask128 = GetAbsMask128(); if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result128 = Sse.Max(result128, Sse.And(srcVector, mask128)); + result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent += 4; } @@ -1240,7 +1226,7 @@ public static unsafe float MaxAbsU(Span src) while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128)); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent++; } @@ -1258,13 +1244,12 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector256 result256 = Avx.SetZeroVector256(); Vector256 meanVector256 = Avx.SetAllVector256(mean); - Vector256 mask256 = GetAbsMask256(); while (pSrcCurrent + 8 <= pSrcEnd) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Subtract(srcVector, meanVector256); - result256 = Avx.Max(result256, Avx.And(srcVector, mask256)); + result256 = Avx.Max(result256, 
Avx.And(srcVector, _absMask256)); pSrcCurrent += 8; } @@ -1274,13 +1259,12 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); - Vector128 mask128 = GetAbsMask128(); if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); - result128 = Sse.Max(result128, Sse.And(srcVector, mask128)); + result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent += 4; } @@ -1291,7 +1275,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector128); - result128 = Sse.MaxScalar(result128, Sse.And(srcVector, mask128)); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128)); pSrcCurrent++; } @@ -1478,8 +1462,6 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo float* pDst2Current = pdst2; Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); - - Vector256 signMask256 = Avx.SetAllVector256(-0.0f); // 0x8000 0000 Vector256 xThreshold256 = Avx.SetAllVector256(threshold); while (pSrcCurrent + 8 <= pSrcEnd) @@ -1488,7 +1470,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector256 xDst1 = Avx.LoadVector256(pDst1Current); xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); - Vector256 xDst2 = GetNewDst256(xDst1, signMask256, xThreshold256); + Vector256 xDst2 = GetNewDst256(xDst1, xThreshold256); Avx.Store(pDst1Current, xDst1); Avx.Store(pDst2Current, xDst2); @@ -1499,8 +1481,6 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo } Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); - - Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold128 = Sse.SetAllVector128(threshold); if (pSrcCurrent + 4 <= pSrcEnd) @@ -1509,7 
+1489,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector128 xDst1 = Sse.LoadVector128(pDst1Current); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); - Vector128 xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold128); Sse.Store(pDst1Current, xDst1); Sse.Store(pDst2Current, xDst2); @@ -1544,8 +1524,6 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp int* pIdxCurrent = pidx; Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); - - Vector256 signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000 Vector256 xThreshold = Avx.SetAllVector256(threshold); while (pIdxCurrent + 8 <= pIdxEnd) @@ -1554,7 +1532,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector256 xDst1 = Load8(pdst1, pIdxCurrent); xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); - Vector256 xDst2 = GetNewDst256(xDst1, signMask, xThreshold); + Vector256 xDst2 = GetNewDst256(xDst1, xThreshold); Store8(in xDst1, pdst1, pIdxCurrent); Store8(in xDst2, pdst2, pIdxCurrent); @@ -1564,8 +1542,6 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp } Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); - - Vector128 signMask128 = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold128 = Sse.SetAllVector128(threshold); if (pIdxCurrent + 4 <= pIdxEnd) @@ -1574,7 +1550,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector128 xDst1 = Load4(pdst1, pIdxCurrent); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); - Vector128 xDst2 = GetNewDst128(xDst1, signMask128, xThreshold128); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold128); Store4(in xDst1, pdst1, pIdxCurrent); Store4(in xDst2, pdst2, pIdxCurrent); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6843cd4757..f15f5c3938 100644 --- 
a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System; @@ -15,11 +16,13 @@ public static partial class CpuMathUtils // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray private const int Vector256Alignment = 32; + // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray + private const int FloatAlignment = 4; + + // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float. + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] public static int GetVectorAlignment() - { - // Assumes SSE support on machines that run ML.NET. - return Avx.IsSupported ? Vector256Alignment : Vector128Alignment; - } + => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment); public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 497dd59003..b35f171388 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System.Runtime.CompilerServices; + namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils @@ -9,11 +11,9 @@ public static partial class CpuMathUtils // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] public static int GetVectorAlignment() - { - // Assumes SSE support on machines that run ML.NET. - return Vector128Alignment; - } + => Vector128Alignment; public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index faaa2a44de..323300888c 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -29,7 +29,7 @@ private static bool Compat(AlignedArray a) { Contracts.AssertValue(a); Contracts.Assert(a.Size > 0); - return a.CbAlign == Vector128Alignment; + return (a.CbAlign % Vector128Alignment) == 0; } private static unsafe float* Ptr(AlignedArray a, float* p) From 0fd78a63021fd850e6b9bef7e2042454dbf826aa Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 23 Aug 2018 12:11:15 -0700 Subject: [PATCH 26/29] Respond to PR comment: Makes alignment checking consistent in external and internal calls --- src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs index d217ccf6f9..30308f219d 100644 --- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs +++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs @@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values) #if DEBUG var mat = values as TMatrix; Contracts.AssertValue(mat); - Contracts.Assert(mat.Items.CbAlign == 
CpuMathUtils.GetVectorAlignment()); + Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); #endif } @@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values) #if DEBUG CpuAlignedVector vec = values as CpuAlignedVector; Contracts.AssertValue(vec); - Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.GetVectorAlignment()); + Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); #endif } From 3380ded08b654fb4b228d25f05a5005a90a0b60e Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 23 Aug 2018 12:42:07 -0700 Subject: [PATCH 27/29] Respond to PR feedback: Refactored Sse/AvxIntrinsics helper functions --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 164 ++++++---------------- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 160 ++++++++++----------- 2 files changed, 121 insertions(+), 203 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 09804f88ed..eb4ac3b817 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -18,10 +18,6 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { - private static readonly Vector128 _absMask128 = Sse2.IsSupported ? 
- Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : - Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); private const int Vector256Alignment = 32; @@ -41,10 +37,6 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray) return alignedBase; } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector256 SetHighLow(in Vector128 a, in Vector128 b) - => Avx.InsertVector128(Avx.ExtendToVector256(b), a, 1); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetLow(in Vector256 x) => Avx.GetLowerHalf(x); @@ -53,72 +45,31 @@ private static Vector128 GetLow(in Vector256 x) private static Vector128 GetHigh(in Vector256 x) => Avx.ExtractVector128(x, 1); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 Load1(float* src, int* idx) - => Sse.SetScalarVector128(src[idx[0]]); - - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 Load4(float* src, int* idx) - => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe Vector256 Load8(float* src, int* idx) => Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. 
- [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 Rotate(in Vector128 x) - => Sse.Shuffle(x, x, 0x39); - - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe void Store4(in Vector128 x, float* dst, int* idx) - { - Sse.StoreScalar(dst + idx[0], x); - Vector128 rotated = Rotate(in x); - Sse.StoreScalar(dst + idx[1], rotated); - rotated = Rotate(in rotated); - Sse.StoreScalar(dst + idx[2], rotated); - rotated = Rotate(in rotated); - Sse.StoreScalar(dst + idx[3], rotated); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe void Store8(in Vector256 x, float* dst, int* idx) { Vector128 tmp = GetLow(in x); Sse.StoreScalar(dst + idx[0], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[1], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[2], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[3], tmp); tmp = GetHigh(in x); Sse.StoreScalar(dst + idx[4], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[5], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[6], tmp); - tmp = Rotate(in tmp); + tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[7], tmp); } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorSum128(in Vector128 vector) - { - if (Sse3.IsSupported) - { - Vector128 partialSum = Sse3.HorizontalAdd(vector, vector); - return Sse3.HorizontalAdd(partialSum, partialSum); - } - else - { - Vector128 partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector)); - // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC. 
- return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1)); - } - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector256 VectorSum256(in Vector256 vector) { @@ -126,24 +77,6 @@ private static Vector256 VectorSum256(in Vector256 vector) return Avx.HorizontalAdd(partialSum, partialSum); } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorMax128(in Vector128 vector) - { - // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> BADC. - Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); - - // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become - // max(A, B) and max(C, D). - Vector128 partialMax = Sse.Max(vector, x1); - - // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA. - x1 = Sse.Shuffle(partialMax, partialMax, 0x02); - - // Performs element-wise maximum operation: The 1st 32-bit slot becomes - // max(A, B, C, D). - return Sse.MaxScalar(partialMax, x1); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector256 VectorMax256(in Vector256 vector) { @@ -163,17 +96,6 @@ private static Vector256 VectorMax256(in Vector256 vector) return Avx.Max(partialMax, x1); } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 xThreshold) - { - Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 - Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise - Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); - Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true - Vector128 x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise - return Sse.And(Sse.Subtract(xDst1, x2), xCond); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector256 
GetNewDst256(in Vector256 xDst1, in Vector256 xThreshold) { @@ -340,10 +262,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D h01 = Sse.Shuffle(h01, h01, 0x00); // A - Vector256 x01 = SetHighLow(h01, h01); - Vector256 x11 = SetHighLow(h11, h11); - Vector256 x21 = SetHighLow(h21, h21); - Vector256 x31 = SetHighLow(h31, h31); + Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); pSrcCurrent += 4; @@ -384,10 +306,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D h01 = Sse.Shuffle(h01, h01, 0x00); // A - Vector256 x01 = SetHighLow(h01, h01); - Vector256 x11 = SetHighLow(h11, h11); - Vector256 x21 = SetHighLow(h21, h21); - Vector256 x31 = SetHighLow(h31, h31); + Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); float* pDstCurrent = pdst; @@ -806,11 +728,11 @@ public static unsafe void AddScaleSU(float scale, Span src, Span idx if (pIdxCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent); srcVector = Sse.Multiply(srcVector, scaleVector128); dstVector = Sse.Add(dstVector, srcVector); - Store4(in dstVector, pDstCurrent, pIdxCurrent); + SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -898,11 +820,11 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) if (pIdxCurrent + 4 <= pEnd) { - Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent); Vector128 srcVector = 
Sse.LoadVector128(pSrcCurrent); dstVector = Sse.Add(dstVector, srcVector); - Store4(in dstVector, pDstCurrent, pIdxCurrent); + SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -993,7 +915,7 @@ public static unsafe float SumU(Span src) pSrcCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { @@ -1035,7 +957,7 @@ public static unsafe float SumSqU(Span src) pSrcCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { @@ -1083,7 +1005,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) pSrcCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { @@ -1123,17 +1045,17 @@ public static unsafe float SumAbsU(Span src) if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent++; } @@ -1171,18 +1093,18 @@ public static unsafe float SumAbsDiffU(float mean, Span src) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); - result128 = Sse.Add(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = 
SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector128); - result128 = Sse.AddScalar(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent++; } @@ -1216,17 +1138,17 @@ public static unsafe float MaxAbsU(Span src) if (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent += 4; } - result128 = VectorMax128(in result128); + result128 = SseIntrinsics.VectorMax128(in result128); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent++; } @@ -1264,18 +1186,18 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector128); - result128 = Sse.Max(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent += 4; } - result128 = VectorMax128(in result128); + result128 = SseIntrinsics.VectorMax128(in result128); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector128); - result128 = Sse.MaxScalar(result128, Sse.And(srcVector, _absMask128)); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); pSrcCurrent++; } @@ -1322,7 +1244,7 @@ public static unsafe float DotU(Span src, Span dst) pDstCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = 
SseIntrinsics.VectorSum128(in result128); while (pSrcCurrent < pSrcEnd) { @@ -1370,7 +1292,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx if (pIdxCurrent + 4 <= pIdxEnd) { - Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); + Vector128 srcVector = SseIntrinsics.Load4(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector)); @@ -1379,11 +1301,11 @@ public static unsafe float DotSU(Span src, Span dst, Span idx pDstCurrent += 4; } - result128 = VectorSum128(in result128); + result128 = SseIntrinsics.VectorSum128(in result128); while (pIdxCurrent < pIdxEnd) { - Vector128 srcVector = Load1(pSrcCurrent, pIdxCurrent); + Vector128 srcVector = SseIntrinsics.Load1(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector)); @@ -1434,7 +1356,7 @@ public static unsafe float Dist2(Span src, Span dst) pDstCurrent += 4; } - sqDistanceVector128 = VectorSum128(in sqDistanceVector128); + sqDistanceVector128 = SseIntrinsics.VectorSum128(in sqDistanceVector128); float norm = Sse.ConvertToSingle(Sse.AddScalar(sqDistanceVector128, sqDistanceVectorPadded)); while (pSrcCurrent < pSrcEnd) @@ -1489,7 +1411,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector128 xDst1 = Sse.LoadVector128(pDst1Current); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); - Vector128 xDst2 = GetNewDst128(xDst1, xThreshold128); + Vector128 xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128); Sse.Store(pDst1Current, xDst1); Sse.Store(pDst2Current, xDst2); @@ -1548,12 +1470,12 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); - Vector128 xDst1 = Load4(pdst1, pIdxCurrent); + Vector128 xDst1 = SseIntrinsics.Load4(pdst1, pIdxCurrent); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, 
xPrimal128)); - Vector128 xDst2 = GetNewDst128(xDst1, xThreshold128); + Vector128 xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128); - Store4(in xDst1, pdst1, pIdxCurrent); - Store4(in xDst2, pdst2, pIdxCurrent); + SseIntrinsics.Store4(in xDst1, pdst1, pIdxCurrent); + SseIntrinsics.Store4(in xDst2, pdst2, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 323300888c..ee326dc5ba 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -22,45 +22,43 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class SseIntrinsics { + internal static readonly Vector128 AbsMask128 = Sse2.IsSupported ? + Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : + Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; - private static bool Compat(AlignedArray a) + private static bool HasCompatibleAlignment(AlignedArray alignedArray) { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return (a.CbAlign % Vector128Alignment) == 0; + Contracts.AssertValue(alignedArray); + Contracts.Assert(alignedArray.Size > 0); + return (alignedArray.CbAlign % Vector128Alignment) == 0; } - private static unsafe float* Ptr(AlignedArray a, float* p) + private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (Vector128Alignment - 1)) == 0); - return q; + Contracts.AssertValue(alignedArray); + float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); + Contracts.Assert(((long)alignedBase & (Vector128Alignment - 1)) == 0); + return alignedBase; } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 
Load1(float* src, int* idx) - { - return Sse.SetScalarVector128(src[idx[0]]); - } + internal static unsafe Vector128 Load1(float* src, int* idx) + => Sse.SetScalarVector128(src[idx[0]]); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 Load4(float* src, int* idx) - { - return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - } + internal static unsafe Vector128 Load4(float* src, int* idx) + => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 Rotate(in Vector128 x) - { - // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. - return Sse.Shuffle(x, x, 0x39); - } + internal static Vector128 Rotate(in Vector128 x) + => Sse.Shuffle(x, x, 0x39); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe void Store4(in Vector128 x, float* dst, int* idx) + internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) { Sse.StoreScalar(dst + idx[0], x); Vector128 rotated = Rotate(in x); @@ -72,7 +70,7 @@ private static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorSum(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -88,25 +86,27 @@ private static Vector128 VectorSum(in Vector128 vector) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorMax(in Vector128 vector) + internal static Vector128 VectorMax128(in Vector128 vector) { + // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> BADC. Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); + + // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become + // max(A, B) and max(C, D). 
Vector128 partialMax = Sse.Max(vector, x1); + + // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA. x1 = Sse.Shuffle(partialMax, partialMax, 0x02); - return Sse.MaxScalar(partialMax, x1); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetAbsMask() - { - return Sse2.IsSupported ? - Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : - Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + // Performs element-wise maximum operation: The 1st 32-bit slot becomes + // max(A, B, C, D). + return Sse.MaxScalar(partialMax, x1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetNewDst(in Vector128 xDst1, in Vector128 signMask, in Vector128 xThreshold) + internal static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 xThreshold) { + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true @@ -117,17 +117,17 @@ private static Vector128 GetNewDst(in Vector128 xDst1, in Vector12 // Multiply matrix times vector into vector. 
public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -183,9 +183,9 @@ public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient. 
@@ -194,9 +194,9 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* pposMin = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -239,17 +239,17 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -342,18 +342,18 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart 
= &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* ppos = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -768,7 +768,7 @@ public static unsafe float SumU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { @@ -797,7 +797,7 @@ public static unsafe float SumSqU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { @@ -830,7 +830,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { @@ -853,22 +853,21 @@ public static unsafe float SumAbsU(Span src) float* pSrcCurrent = psrc; Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result = Sse.Add(result, Sse.And(srcVector, mask)); + result = Sse.Add(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } @@ -886,24 +885,23 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = 
Sse.Subtract(srcVector, meanVector); - result = Sse.Add(result, Sse.And(srcVector, mask)); + result = Sse.Add(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); - result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } @@ -920,22 +918,21 @@ public static unsafe float MaxAbsU(Span src) float* pSrcCurrent = psrc; Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result = Sse.Max(result, Sse.And(srcVector, mask)); + result = Sse.Max(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorMax(in result); + result = VectorMax128(in result); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + result = Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } @@ -953,24 +950,23 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); - result = Sse.Max(result, Sse.And(srcVector, mask)); + result = Sse.Max(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorMax(in result); + result = VectorMax128(in result); while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); - result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + result = 
Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } @@ -1001,7 +997,7 @@ public static unsafe float DotU(Span src, Span dst) pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { @@ -1042,7 +1038,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pIdxCurrent < pIdxEnd) { @@ -1081,7 +1077,7 @@ public static unsafe float Dist2(Span src, Span dst) pDstCurrent += 4; } - sqDistanceVector = VectorSum(in sqDistanceVector); + sqDistanceVector = VectorSum128(in sqDistanceVector); float norm = Sse.ConvertToSingle(sqDistanceVector); while (pSrcCurrent < pSrcEnd) @@ -1119,7 +1115,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector128 xDst1 = Sse.LoadVector128(pDst1Current); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); - Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold); Sse.Store(pDst1Current, xDst1); Sse.Store(pDst2Current, xDst2); @@ -1164,7 +1160,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector128 xDst1 = Load4(pdst1, pIdxCurrent); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); - Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold); Store4(in xDst1, pdst1, pIdxCurrent); Store4(in xDst2, pdst2, pIdxCurrent); From b8d63cccbdca22dd1b24b5b524b4d25e6e65eb67 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 23 Aug 2018 12:44:53 -0700 Subject: [PATCH 28/29] Made two Sse/AvxIntrinsics helper functions about AlignedArray inline in hopes of improving perf --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 2 ++ src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs 
index eb4ac3b817..d5e2fc0e38 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -22,6 +22,7 @@ internal static class AvxIntrinsics private const int Vector256Alignment = 32; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool HasCompatibleAlignment(AlignedArray alignedArray) { Contracts.AssertValue(alignedArray); @@ -29,6 +30,7 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray) return (alignedArray.CbAlign % Vector256Alignment) == 0; } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) { Contracts.AssertValue(alignedArray); diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index ee326dc5ba..0f4fb54d18 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -29,6 +29,7 @@ internal static class SseIntrinsics // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool HasCompatibleAlignment(AlignedArray alignedArray) { Contracts.AssertValue(alignedArray); @@ -36,6 +37,7 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray) return (alignedArray.CbAlign % Vector128Alignment) == 0; } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) { Contracts.AssertValue(alignedArray); From 32a3704748ad7bcb654c967cfcccf25990ac0207 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 29 Aug 2018 14:41:37 -0700 Subject: [PATCH 29/29] Respond to PR feedback: styles for Vector256Alignment and Avx.GetLowerHalf --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 30 ++++++++++------------- 1 file changed, 13 insertions(+), 17 
deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index d5e2fc0e38..06f89d097e 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -35,14 +35,10 @@ private static bool HasCompatibleAlignment(AlignedArray alignedArray) { Contracts.AssertValue(alignedArray); float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); - Contracts.Assert(((long)alignedBase & (Vector256Alignment - 1)) == 0); + Contracts.Assert(((long)alignedBase % Vector256Alignment) == 0); return alignedBase; } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetLow(in Vector256 x) - => Avx.GetLowerHalf(x); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetHigh(in Vector256 x) => Avx.ExtractVector128(x, 1); @@ -54,7 +50,7 @@ private static unsafe Vector256 Load8(float* src, int* idx) [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe void Store8(in Vector256 x, float* dst, int* idx) { - Vector128 tmp = GetLow(in x); + Vector128 tmp = Avx.GetLowerHalf(in x); Sse.StoreScalar(dst + idx[0], tmp); tmp = SseIntrinsics.Rotate(in tmp); Sse.StoreScalar(dst + idx[1], tmp); @@ -162,7 +158,7 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, res2 = Avx.HorizontalAdd(res2, res3); res0 = Avx.HorizontalAdd(res0, res2); - Vector128 sum = Sse.Add(GetLow(in res0), GetHigh(in res0)); + Vector128 sum = Sse.Add(Avx.GetLowerHalf(in res0), GetHigh(in res0)); if (add) { sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent)); @@ -907,7 +903,7 @@ public static unsafe float SumU(Span src) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ 
-947,7 +943,7 @@ public static unsafe float SumSqU(Span src) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ -993,7 +989,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); @@ -1040,7 +1036,7 @@ public static unsafe float SumAbsU(Span src) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ -1086,7 +1082,7 @@ public static unsafe float SumAbsDiffU(float mean, Span src) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); @@ -1133,7 +1129,7 @@ public static unsafe float MaxAbsU(Span src) } result256 = VectorMax256(in result256); - Vector128 resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ -1179,7 +1175,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) } result256 = VectorMax256(in result256); - Vector128 resultPadded = Sse.MaxScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = 
Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); Vector128 meanVector128 = Sse.SetAllVector128(mean); @@ -1231,7 +1227,7 @@ public static unsafe float DotU(Span src, Span dst) } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ -1288,7 +1284,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx } result256 = VectorSum256(in result256); - Vector128 resultPadded = Sse.AddScalar(GetLow(result256), GetHigh(result256)); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); Vector128 result128 = Sse.SetZeroVector128(); @@ -1343,7 +1339,7 @@ public static unsafe float Dist2(Span src, Span dst) } sqDistanceVector256 = VectorSum256(in sqDistanceVector256); - Vector128 sqDistanceVectorPadded = Sse.AddScalar(GetLow(sqDistanceVector256), GetHigh(sqDistanceVector256)); + Vector128 sqDistanceVectorPadded = Sse.AddScalar(Avx.GetLowerHalf(sqDistanceVector256), GetHigh(sqDistanceVector256)); Vector128 sqDistanceVector128 = Sse.SetZeroVector128();