diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs new file mode 100644 index 0000000000..06f89d097e --- /dev/null +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -0,0 +1,1495 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +// The exported function names need to be unique (can't be disambiguated based on signature), hence +// we introduce suffix letters to indicate the general patterns used. +// * A suffix means aligned and padded for SSE operations. +// * U suffix means unaligned and unpadded. +// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. +// * Tran means the matrix is transposed. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + internal static class AvxIntrinsics + { + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); + + private const int Vector256Alignment = 32; + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static bool HasCompatibleAlignment(AlignedArray alignedArray) + { + Contracts.AssertValue(alignedArray); + Contracts.Assert(alignedArray.Size > 0); + return (alignedArray.CbAlign % Vector256Alignment) == 0; + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) + { + Contracts.AssertValue(alignedArray); + float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); + Contracts.Assert(((long)alignedBase % Vector256Alignment) == 0); + return alignedBase; + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 GetHigh(in Vector256 x) + => Avx.ExtractVector128(x, 1); + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector256 Load8(float* src, int* idx) + => Avx.SetVector256(src[idx[7]], src[idx[6]], src[idx[5]], src[idx[4]], src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe void Store8(in Vector256 x, float* dst, int* idx) + { + Vector128 tmp = Avx.GetLowerHalf(in x); + Sse.StoreScalar(dst + idx[0], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[1], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[2], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[3], tmp); + tmp = GetHigh(in x); + Sse.StoreScalar(dst + idx[4], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[5], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[6], tmp); + tmp = SseIntrinsics.Rotate(in tmp); + Sse.StoreScalar(dst + idx[7], tmp); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 VectorSum256(in Vector256 vector) + { + Vector256 partialSum = Avx.HorizontalAdd(vector, vector); + return Avx.HorizontalAdd(partialSum, partialSum); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 VectorMax256(in Vector256 vector) + { + // The control byte shuffles the eight 32-bit floats of partialMax: ABCD|EFGH -> BADC|FEHG. 
+ Vector256 x1 = Avx.Shuffle(vector, vector, 0xB1); + + // Performs element-wise maximum operation: The 1st, 3rd, 5th, and 7th 32-bit slots become + // max(A, B), max(C, D), max(E, F), and max(G, H). + Vector256 partialMax = Avx.Max(vector, x1); + + // The control byte shuffles the eight 32-bit floats of partialMax: ABCD|EFGH -> CAAA|GEEE. + x1 = Avx.Shuffle(partialMax, partialMax, 0x02); + + // Performs element-wise maximum operation: The 1st and 5th 32-bit slots become + // max(max(A, B), max(C, D)) = max(A, B, C, D) and + // max(max(E, F), max(G, H)) = max(E, F, G, H). + return Avx.Max(partialMax, x1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector256 GetNewDst256(in Vector256 xDst1, in Vector256 xThreshold) + { + Vector256 signMask = Avx.SetAllVector256(-0.0f); // 0x8000 0000 + Vector256 xSign = Avx.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise + Vector256 xDst1Abs = Avx.Xor(xDst1, xSign); + Vector256 xCond = Avx.Compare(xDst1Abs, xThreshold, FloatComparisonMode.GreaterThanOrderedNonSignaling); // result = 0xFFFF FFFF if true + Vector256 x2 = Avx.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise + return Avx.And(Avx.Subtract(xDst1, x2), xCond); + } + + // Multiply matrix times vector into vector. + public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + { + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); + + float* pSrcEnd = psrc + ccol; + float* pDstEnd = pdst + crow; + float* pDstCurrent = pdst; + float* pMatCurrent = pmat; + + while (pDstCurrent < pDstEnd) + { + Vector256 res0 = Avx.SetZeroVector256(); + Vector256 res1 = res0; + Vector256 res2 = res0; + Vector256 res3 = res0; + + float* pSrcCurrent = psrc; + + while (pSrcCurrent < pSrcEnd) + { + float* pMatTemp = pMatCurrent; + + Vector256 x01 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x11 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x21 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x31 = Avx.LoadAlignedVector256(pMatTemp += ccol); + Vector256 x02 = Avx.LoadAlignedVector256(pSrcCurrent); + + res0 = Avx.Add(res0, Avx.Multiply(x01, x02)); + res1 = Avx.Add(res1, Avx.Multiply(x11, x02)); + res2 = Avx.Add(res2, Avx.Multiply(x21, x02)); + res3 = Avx.Add(res3, Avx.Multiply(x31, x02)); + + pSrcCurrent += 8; + pMatCurrent += 8; + } + + // Add up the entries of each, with the 4 results in res0 + res0 = Avx.HorizontalAdd(res0, res1); + res2 = Avx.HorizontalAdd(res2, res3); + res0 = Avx.HorizontalAdd(res0, res2); + + Vector128 sum = Sse.Add(Avx.GetLowerHalf(in res0), GetHigh(in res0)); + if (add) + { + sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent)); + } + Sse.StoreAligned(pDstCurrent, sum); + + pDstCurrent += 4; + pMatCurrent += 3 * ccol; + } + } + } + + // Partial sparse source vector. 
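
For review purposes, here is a scalar reference of what the MatMulX kernel above computes; it is not part of the PR and only restates the semantics. The AVX path walks four matrix rows and eight columns per iteration, accumulates partial products per row, reduces each accumulator with HorizontalAdd, and finally adds the low and high 128-bit halves before the optional accumulate into dst. The sparse MatMulPX variant that follows computes the same product, but only over the active source positions listed in rgposSrc.

    // Scalar reference (illustrative only, not part of this change) for MatMulX:
    // mat is row-major with crow rows and ccol columns; dst = mat * src, optionally accumulated.
    private static void MatMulReference(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol)
    {
        for (int r = 0; r < crow; r++)
        {
            float sum = 0f;
            for (int c = 0; c < ccol; c++)
                sum += mat[r * ccol + c] * src[c];
            dst[r] = add ? dst[r] + sum : sum;
        }
    }
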
+ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); + + // REVIEW: For extremely sparse inputs, interchanging the loops would + // likely be more efficient. + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + fixed (int* pposSrc = &rgposSrc[0]) + { + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); + + int* pposMin = pposSrc + iposMin; + int* pposEnd = pposSrc + iposEnd; + float* pDstEnd = pdst + crow; + float* pm0 = pmat - posMin; + float* pSrcCurrent = psrc - posMin; + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pm1 = pm0 + ccol; + float* pm2 = pm1 + ccol; + float* pm3 = pm2 + ccol; + Vector256 result = Avx.SetZeroVector256(); + + int* ppos = pposMin; + + while (ppos < pposEnd) + { + int col1 = *ppos; + int col2 = col1 + 4 * ccol; + Vector256 x1 = Avx.SetVector256(pm3[col2], pm2[col2], pm1[col2], pm0[col2], + pm3[col1], pm2[col1], pm1[col1], pm0[col1]); + Vector256 x2 = Avx.SetAllVector256(pSrcCurrent[col1]); + x2 = Avx.Multiply(x2, x1); + result = Avx.Add(result, x2); + + ppos++; + } + + if (add) + { + result = Avx.Add(result, Avx.LoadAlignedVector256(pDstCurrent)); + } + Avx.StoreAligned(pDstCurrent, result); + + pDstCurrent += 8; + pm0 += 8 * ccol; + } + } + } + + public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + { + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); + + float* pSrcEnd = psrc + ccol; + float* pDstEnd = pdst + crow; + float* pSrcCurrent = psrc; + float* pMatCurrent = pmat; + + // We do 4-way unrolling + if (!add) + { + Vector128 h01 = Sse.LoadAlignedVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. 
+ Vector128 h11 = Sse.Shuffle(h01, h01, 0x55); // B + Vector128 h21 = Sse.Shuffle(h01, h01, 0xAA); // C + Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D + h01 = Sse.Shuffle(h01, h01, 0x00); // A + + Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); + + pSrcCurrent += 4; + + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x12 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x22 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x32 = Avx.LoadAlignedVector256(pMatTemp += crow); + + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + + Avx.StoreAligned(pDstCurrent, x02); + + pDstCurrent += 8; + pMatCurrent += 8; + } + + pMatCurrent += 3 * crow; + } + + while (pSrcCurrent < pSrcEnd) + { + Vector128 h01 = Sse.LoadAlignedVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. + Vector128 h11 = Sse.Shuffle(h01, h01, 0x55); // B + Vector128 h21 = Sse.Shuffle(h01, h01, 0xAA); // C + Vector128 h31 = Sse.Shuffle(h01, h01, 0xFF); // D + h01 = Sse.Shuffle(h01, h01, 0x00); // A + + Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); + + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatTemp = pMatCurrent; + + Vector256 x02 = Avx.LoadAlignedVector256(pMatTemp); + Vector256 x12 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x22 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x32 = Avx.LoadAlignedVector256(pMatTemp += crow); + Vector256 x3 = Avx.LoadAlignedVector256(pDstCurrent); + + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + x3 = Avx.Add(x02, x3); + + Avx.StoreAligned(pDstCurrent, x3); + + pDstCurrent += 8; + pMatCurrent += 8; + } + + pMatCurrent += 3 * crow; + pSrcCurrent += 4; + } + } + } + + // Partial sparse source vector. 
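
The transposed kernel MatMulTranX above walks the source four entries at a time, broadcasts each entry across a 256-bit register, and accumulates scaled matrix rows into dst. In scalar form (a sketch for review, not part of the PR), with the matrix stored as ccol rows of length crow:

    // Scalar reference (illustrative only) for MatMulTranX: dst[j] (+)= sum_i mat[i * crow + j] * src[i].
    private static void MatMulTranReference(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol)
    {
        for (int j = 0; j < crow; j++)
        {
            float sum = 0f;
            for (int i = 0; i < ccol; i++)
                sum += mat[i * crow + j] * src[i];
            dst[j] = add ? dst[j] + sum : sum;
        }
    }

The MatMulTranPX variant below performs the same transposed product, restricted to the active source positions.
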
+ public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) + { + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); + + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pMatStart = &mat.Items[0]) + fixed (int* pposSrc = &rgposSrc[0]) + { + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); + + int* ppos = pposSrc + iposMin; + int* pposEnd = pposSrc + iposEnd; + float* pDstEnd = pdst + crow; + + if (!add) + { + int col = *ppos - posMin; + ppos++; + + Vector256 x0 = Avx.SetAllVector256(psrc[col]); + float* pDstCurrent = pdst; + float* pMatCurrent = pmat + col * crow; + + while (pDstCurrent < pDstEnd) + { + Vector256 x1 = Avx.LoadAlignedVector256(pMatCurrent); + x1 = Avx.Multiply(x1, x0); + Avx.StoreAligned(pDstCurrent, x1); + + pDstCurrent += 8; + pMatCurrent += 8; + } + } + + // REVIEW: Should we explore unrolling the outer loop? + while (ppos < pposEnd) + { + int col = *ppos - posMin; + + Vector256 x0 = Avx.SetAllVector256(psrc[col]); + float* pDstCurrent = pdst; + float* pMatCurrent = pmat + col * crow; + + while (pDstCurrent < pDstEnd) + { + Vector256 x1 = Avx.LoadAlignedVector256(pMatCurrent); + Vector256 x2 = Avx.LoadAlignedVector256(pDstCurrent); + x1 = Avx.Multiply(x1, x0); + x2 = Avx.Add(x2, x1); + Avx.StoreAligned(pDstCurrent, x2); + + pDstCurrent += 8; + pMatCurrent += 8; + } + + ppos++; + } + } + } + + // dst[i] += scale + public static unsafe void AddScalarU(float scalar, Span dst) + { + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pDstCurrent = pdst; + + Vector256 scalarVector256 = Avx.SetAllVector256(scalar); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + dstVector = Avx.Add(dstVector, scalarVector256); + Avx.Store(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + + Vector128 scalarVector128 = Sse.SetAllVector128(scalar); + + if (pDstCurrent + 4 <= pDstEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, scalarVector128); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + while (pDstCurrent < pDstEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, scalarVector128); + Sse.StoreScalar(pDstCurrent, dstVector); + + pDstCurrent++; + } + } + } + + public static unsafe void ScaleU(float scale, Span dst) + { + fixed (float* pdst = dst) + { + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pEnd) + { + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + dstVector = Avx.Multiply(scaleVector256, dstVector); + Avx.Store(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + if (pDstCurrent + 4 <= pEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + dstVector = Sse.Multiply(scaleVector128, dstVector); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + while (pDstCurrent < pEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); + Sse.StoreScalar(pDstCurrent, dstVector); + 
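
AddScalarU above and every unaligned (U-suffixed) kernel that follows share the same width-descending loop structure, summarized here as comments (a sketch, not part of the change):

    // while (p + 8 <= end) { /* one 8-float AVX step */  p += 8; }
    // if    (p + 4 <= end) { /* one 4-float SSE step */  p += 4; }
    // while (p < end)      { /* scalar remainder */      p += 1; }
    // No iteration reads or writes past the end of the span, so no masking or padding is needed.
    // The indexed SU kernels (AddScaleSU, AddSU, DotSU, SdcaL1UpdateSU) follow the same shape but
    // gather/scatter through an index array via Load8/Store8 and Load4/Store4.
    // The reductions (SumU, SumSqU, DotU, Dist2, and the abs/max variants) keep a 256-bit partial,
    // a 128-bit partial, and a scalar partial, combined once at the end.
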
+ pDstCurrent++; + } + } + } + + public static unsafe void ScaleSrcU(float scale, Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Multiply(srcVector, scaleVector256); + Avx.Store(pDstCurrent, srcVector); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + if (pDstCurrent + 4 <= pDstEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector128); + Sse.Store(pDstCurrent, srcVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pDstCurrent < pDstEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + Sse.StoreScalar(pDstCurrent, srcVector); + + pSrcCurrent++; + pDstCurrent++; + } + } + } + + // dst[i] = a * (dst[i] + b) + public static unsafe void ScaleAddU(float a, float b, Span dst) + { + fixed (float* pdst = dst) + { + float* pDstEnd = pdst + dst.Length; + float* pDstCurrent = pdst; + + Vector256 a256 = Avx.SetAllVector256(a); + Vector256 b256 = Avx.SetAllVector256(b); + + while (pDstCurrent + 8 <= pDstEnd) + { + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + dstVector = Avx.Add(dstVector, b256); + dstVector = Avx.Multiply(dstVector, a256); + Avx.Store(pDstCurrent, dstVector); + + pDstCurrent += 8; + } + + Vector128 a128 = Sse.SetAllVector128(a); + Vector128 b128 = Sse.SetAllVector128(b); + + if (pDstCurrent + 4 <= pDstEnd) + { + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, b128); + dstVector = Sse.Multiply(dstVector, a128); + Sse.Store(pDstCurrent, dstVector); + + pDstCurrent += 4; + } + + while (pDstCurrent < pDstEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, b128); + dstVector = Sse.MultiplyScalar(dstVector, a128); + Sse.StoreScalar(pDstCurrent, dstVector); + + pDstCurrent++; + } + } + } + + public static unsafe void AddScaleU(float scale, Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pDstCurrent + 8 <= pEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + srcVector = Avx.Multiply(srcVector, scaleVector256); + dstVector = Avx.Add(dstVector, srcVector); + Avx.Store(pDstCurrent, dstVector); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + if (pDstCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pDstCurrent, dstVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pDstCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + dstVector = Sse.AddScalar(dstVector, srcVector); + 
Sse.StoreScalar(pDstCurrent, dstVector); + + pSrcCurrent++; + pDstCurrent++; + } + } + } + + public static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) + { + float* pResEnd = pres + result.Length; + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pResCurrent = pres; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pResCurrent + 8 <= pResEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + srcVector = Avx.Multiply(srcVector, scaleVector256); + dstVector = Avx.Add(dstVector, srcVector); + Avx.Store(pResCurrent, dstVector); + + pSrcCurrent += 8; + pDstCurrent += 8; + pResCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + if (pResCurrent + 4 <= pResEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pResCurrent, dstVector); + + pSrcCurrent += 4; + pDstCurrent += 4; + pResCurrent += 4; + } + + while (pResCurrent < pResEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector128); + dstVector = Sse.AddScalar(dstVector, srcVector); + Sse.StoreScalar(pResCurrent, dstVector); + + pSrcCurrent++; + pDstCurrent++; + pResCurrent++; + } + } + } + + public static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) + { + fixed (float* psrc = src) + fixed (int* pidx = idx) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + float* pDstCurrent = pdst; + int* pEnd = pidx + idx.Length; + + Vector256 scaleVector256 = Avx.SetAllVector256(scale); + + while (pIdxCurrent + 8 <= pEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); + + srcVector = Avx.Multiply(srcVector, scaleVector256); + dstVector = Avx.Add(dstVector, srcVector); + Store8(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 8; + pSrcCurrent += 8; + } + + Vector128 scaleVector128 = Sse.SetAllVector128(scale); + + if (pIdxCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent); + + srcVector = Sse.Multiply(srcVector, scaleVector128); + dstVector = Sse.Add(dstVector, srcVector); + SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pEnd) + { + pDstCurrent[*pIdxCurrent] += scale * (*pSrcCurrent); + + pIdxCurrent++; + pSrcCurrent++; + } + } + } + + public static unsafe void AddU(Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent + 8 <= pEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + Vector256 result = Avx.Add(srcVector, dstVector); + Avx.Store(pDstCurrent, result); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + if (pSrcCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + Vector128 
result = Sse.Add(srcVector, dstVector); + Sse.Store(pDstCurrent, result); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + Vector128 result = Sse.AddScalar(srcVector, dstVector); + Sse.StoreScalar(pDstCurrent, result); + + pSrcCurrent++; + pDstCurrent++; + } + } + } + + public static unsafe void AddSU(Span src, Span idx, Span dst) + { + fixed (float* psrc = src) + fixed (int* pidx = idx) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + float* pDstCurrent = pdst; + int* pEnd = pidx + idx.Length; + + while (pIdxCurrent + 8 <= pEnd) + { + Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + + dstVector = Avx.Add(dstVector, srcVector); + Store8(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 8; + pSrcCurrent += 8; + } + + if (pIdxCurrent + 4 <= pEnd) + { + Vector128 dstVector = SseIntrinsics.Load4(pDstCurrent, pIdxCurrent); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + + dstVector = Sse.Add(dstVector, srcVector); + SseIntrinsics.Store4(in dstVector, pDstCurrent, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pEnd) + { + pDstCurrent[*pIdxCurrent] += *pSrcCurrent; + + pIdxCurrent++; + pSrcCurrent++; + } + } + } + + public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) + { + fixed (float* psrc1 = src1) + fixed (float* psrc2 = src2) + fixed (float* pdst = dst) + { + float* pSrc1Current = psrc1; + float* pSrc2Current = psrc2; + float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; + + while (pDstCurrent + 8 <= pEnd) + { + Vector256 src1Vector = Avx.LoadVector256(pSrc1Current); + Vector256 src2Vector = Avx.LoadVector256(pSrc2Current); + src2Vector = Avx.Multiply(src1Vector, src2Vector); + Avx.Store(pDstCurrent, src2Vector); + + pSrc1Current += 8; + pSrc2Current += 8; + pDstCurrent += 8; + } + + if (pDstCurrent + 4 <= pEnd) + { + Vector128 src1Vector = Sse.LoadVector128(pSrc1Current); + Vector128 src2Vector = Sse.LoadVector128(pSrc2Current); + src2Vector = Sse.Multiply(src1Vector, src2Vector); + Sse.Store(pDstCurrent, src2Vector); + + pSrc1Current += 4; + pSrc2Current += 4; + pDstCurrent += 4; + } + + while (pDstCurrent < pEnd) + { + Vector128 src1Vector = Sse.LoadScalarVector128(pSrc1Current); + Vector128 src2Vector = Sse.LoadScalarVector128(pSrc2Current); + src2Vector = Sse.MultiplyScalar(src1Vector, src2Vector); + Sse.StoreScalar(pDstCurrent, src2Vector); + + pSrc1Current++; + pSrc2Current++; + pDstCurrent++; + } + } + } + + public static unsafe float SumU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent)); + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + result128 = Sse.Add(result128, Sse.LoadVector128(pSrcCurrent)); + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + result128 = Sse.AddScalar(result128, Sse.LoadScalarVector128(pSrcCurrent)); + pSrcCurrent++; + } + + 
return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float SumSqU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = Avx.Add(result256, Avx.Multiply(srcVector, srcVector)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, srcVector)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float SumSqDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Add(result256, Avx.Multiply(srcVector, srcVector)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Add(result128, Sse.Multiply(srcVector, srcVector)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, srcVector)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float SumAbsU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = 
Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float SumAbsDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256)); + + pSrcCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Add(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.AddScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float MaxAbsU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256)); + + pSrcCurrent += 8; + } + + result256 = VectorMax256(in result256); + Vector128 resultPadded = Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorMax128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); + } + } + + public static unsafe float MaxAbsDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector256 result256 = Avx.SetZeroVector256(); + Vector256 meanVector256 = Avx.SetAllVector256(mean); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + srcVector = Avx.Subtract(srcVector, meanVector256); + result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256)); + + pSrcCurrent += 8; + } + + result256 = VectorMax256(in result256); + Vector128 resultPadded = Sse.MaxScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + Vector128 meanVector128 = Sse.SetAllVector128(mean); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = 
Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector128); + result128 = Sse.Max(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent += 4; + } + + result128 = SseIntrinsics.VectorMax128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector128); + result128 = Sse.MaxScalar(result128, Sse.And(srcVector, SseIntrinsics.AbsMask128)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(Sse.MaxScalar(result128, resultPadded)); + } + } + + public static unsafe float DotU(Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pSrcEnd = psrc + src.Length; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + result256 = Avx.Add(result256, Avx.Multiply(srcVector, dstVector)); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector)); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector)); + + pSrcCurrent++; + pDstCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float DotSU(Span src, Span dst, Span idx) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (int* pidx = idx) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + int* pIdxCurrent = pidx; + int* pIdxEnd = pidx + idx.Length; + + Vector256 result256 = Avx.SetZeroVector256(); + + while (pIdxCurrent + 8 <= pIdxEnd) + { + Vector256 srcVector = Load8(pSrcCurrent, pIdxCurrent); + Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + + result256 = Avx.Add(result256, Avx.Multiply(srcVector, dstVector)); + + pIdxCurrent += 8; + pDstCurrent += 8; + } + + result256 = VectorSum256(in result256); + Vector128 resultPadded = Sse.AddScalar(Avx.GetLowerHalf(result256), GetHigh(result256)); + + Vector128 result128 = Sse.SetZeroVector128(); + + if (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 srcVector = SseIntrinsics.Load4(pSrcCurrent, pIdxCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + + result128 = Sse.Add(result128, Sse.Multiply(srcVector, dstVector)); + + pIdxCurrent += 4; + pDstCurrent += 4; + } + + result128 = SseIntrinsics.VectorSum128(in result128); + + while (pIdxCurrent < pIdxEnd) + { + Vector128 srcVector = SseIntrinsics.Load1(pSrcCurrent, pIdxCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + + result128 = Sse.AddScalar(result128, Sse.MultiplyScalar(srcVector, dstVector)); + + pIdxCurrent++; + pDstCurrent++; + } + + return Sse.ConvertToSingle(Sse.AddScalar(result128, resultPadded)); + } + } + + public static unsafe float 
Dist2(Span src, Span dst) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pSrcEnd = psrc + src.Length; + + Vector256 sqDistanceVector256 = Avx.SetZeroVector256(); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 distanceVector = Avx.Subtract(Avx.LoadVector256(pSrcCurrent), + Avx.LoadVector256(pDstCurrent)); + sqDistanceVector256 = Avx.Add(sqDistanceVector256, + Avx.Multiply(distanceVector, distanceVector)); + + pSrcCurrent += 8; + pDstCurrent += 8; + } + + sqDistanceVector256 = VectorSum256(in sqDistanceVector256); + Vector128 sqDistanceVectorPadded = Sse.AddScalar(Avx.GetLowerHalf(sqDistanceVector256), GetHigh(sqDistanceVector256)); + + Vector128 sqDistanceVector128 = Sse.SetZeroVector128(); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), + Sse.LoadVector128(pDstCurrent)); + sqDistanceVector128 = Sse.Add(sqDistanceVector128, + Sse.Multiply(distanceVector, distanceVector)); + + pSrcCurrent += 4; + pDstCurrent += 4; + } + + sqDistanceVector128 = SseIntrinsics.VectorSum128(in sqDistanceVector128); + + float norm = Sse.ConvertToSingle(Sse.AddScalar(sqDistanceVector128, sqDistanceVectorPadded)); + while (pSrcCurrent < pSrcEnd) + { + float distance = (*pSrcCurrent) - (*pDstCurrent); + norm += distance * distance; + + pSrcCurrent++; + pDstCurrent++; + } + + return norm; + } + } + + public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + float* pDst1Current = pdst1; + float* pDst2Current = pdst2; + + Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); + Vector256 xThreshold256 = Avx.SetAllVector256(threshold); + + while (pSrcCurrent + 8 <= pSrcEnd) + { + Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); + + Vector256 xDst1 = Avx.LoadVector256(pDst1Current); + xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); + Vector256 xDst2 = GetNewDst256(xDst1, xThreshold256); + + Avx.Store(pDst1Current, xDst1); + Avx.Store(pDst2Current, xDst2); + + pSrcCurrent += 8; + pDst1Current += 8; + pDst2Current += 8; + } + + Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); + Vector128 xThreshold128 = Sse.SetAllVector128(threshold); + + if (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + + Vector128 xDst1 = Sse.LoadVector128(pDst1Current); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); + Vector128 xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128); + + Sse.Store(pDst1Current, xDst1); + Sse.Store(pDst2Current, xDst2); + + pSrcCurrent += 4; + pDst1Current += 4; + pDst2Current += 4; + } + + while (pSrcCurrent < pSrcEnd) + { + *pDst1Current += (*pSrcCurrent) * primalUpdate; + float dst1 = *pDst1Current; + *pDst2Current = Math.Abs(dst1) > threshold ? (dst1 > 0 ? 
dst1 - threshold : dst1 + threshold) : 0; + + pSrcCurrent++; + pDst1Current++; + pDst2Current++; + } + } + } + + public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (int* pidx = indices) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + int* pIdxEnd = pidx + indices.Length; + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + + Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); + Vector256 xThreshold = Avx.SetAllVector256(threshold); + + while (pIdxCurrent + 8 <= pIdxEnd) + { + Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); + + Vector256 xDst1 = Load8(pdst1, pIdxCurrent); + xDst1 = Avx.Add(xDst1, Avx.Multiply(xSrc, xPrimal256)); + Vector256 xDst2 = GetNewDst256(xDst1, xThreshold); + + Store8(in xDst1, pdst1, pIdxCurrent); + Store8(in xDst2, pdst2, pIdxCurrent); + + pIdxCurrent += 8; + pSrcCurrent += 8; + } + + Vector128 xPrimal128 = Sse.SetAllVector128(primalUpdate); + Vector128 xThreshold128 = Sse.SetAllVector128(threshold); + + if (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + + Vector128 xDst1 = SseIntrinsics.Load4(pdst1, pIdxCurrent); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal128)); + Vector128 xDst2 = SseIntrinsics.GetNewDst128(xDst1, xThreshold128); + + SseIntrinsics.Store4(in xDst1, pdst1, pIdxCurrent); + SseIntrinsics.Store4(in xDst2, pdst2, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + int index = *pIdxCurrent; + pdst1[index] += (*pSrcCurrent) * primalUpdate; + float dst1 = pdst1[index]; + pdst2[index] = Math.Abs(dst1) > threshold ? (dst1 > 0 ? dst1 - threshold : dst1 + threshold) : 0; + + pIdxCurrent++; + pSrcCurrent++; + } + } + } + } +} diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs index 9c7fa5ae1f..30308f219d 100644 --- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs +++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs @@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values) #if DEBUG var mat = values as TMatrix; Contracts.AssertValue(mat); - Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.Vector128Alignment); + Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); #endif } @@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values) #if DEBUG CpuAlignedVector vec = values as CpuAlignedVector; Contracts.AssertValue(vec); - Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.Vector128Alignment); + Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); #endif } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 81d7acf25a..f15f5c3938 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
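
Stepping back to the SdcaL1UpdateU and SdcaL1UpdateSU kernels that close AvxIntrinsics.cs above: GetNewDst256 (and GetNewDst128 in SseIntrinsics) vectorize the usual L1 soft-thresholding step, which the scalar tails spell out directly. As a standalone sketch (not part of the PR):

    // w = sign(v) * max(|v| - threshold, 0), i.e. shrink v toward zero and clip at zero.
    static float SoftThreshold(float v, float threshold)
        => Math.Abs(v) > threshold ? (v > 0 ? v - threshold : v + threshold) : 0f;
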
+using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System; @@ -10,14 +11,38 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath public static partial class CpuMathUtils { // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray - public const int Vector128Alignment = 16; + private const int Vector128Alignment = 16; + + // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray + private const int Vector256Alignment = 32; + + // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray + private const int FloatAlignment = 4; + + // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float. + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment); public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { Contracts.Assert(mat.Size == dst.Size * src.Size); Contracts.Assert(crun >= 0); - if (Sse.IsSupported) + if (Avx.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size); + } + else + { + Contracts.Assert(crun <= src.Size); + AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun); + } + } + else if (Sse.IsSupported) { if (!tran) { @@ -96,7 +121,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo Contracts.AssertNonEmpty(rgposSrc); Contracts.Assert(crun >= 0); - if (Sse.IsSupported) + if (Avx.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); + } + else + { + Contracts.Assert(crun <= srcValues.Size); + AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); + } + } + else if (Sse.IsSupported) { if (!tran) { @@ -170,7 +208,11 @@ public static void Add(float a, float[] dst, int count) private static void Add(float a, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScalarU(a, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScalarU(a, dst); } @@ -204,7 +246,11 @@ public static void Scale(float a, float[] dst, int offset, int count) private static void Scale(float a, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleU(a, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleU(a, dst); } @@ -231,7 +277,11 @@ public static void Scale(float a, float[] src, float[] dst, int count) private static void Scale(float a, Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleSrcU(a, src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleSrcU(a, src, dst); } @@ -256,7 +306,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count) private static void ScaleAdd(float a, float b, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.ScaleAddU(a, b, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.ScaleAddU(a, b, dst); } @@ -295,7 +349,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in private static void AddScale(float a, Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleU(a, src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleU(a, src, dst); } 
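
The pattern repeated across CpuMathUtils.netcoreapp.cs in the hunks above and below is a three-way dispatch: prefer the new AVX kernels, fall back to the existing SSE kernels, and finally to the managed loop. A condensed sketch (illustrative; the software branches in the real file keep their existing bodies):

    private static void Add(float a, Span<float> dst)
    {
        if (Avx.IsSupported)
            AvxIntrinsics.AddScalarU(a, dst);      // 256-bit path, 8 floats per step
        else if (Sse.IsSupported)
            SseIntrinsics.AddScalarU(a, dst);      // 128-bit path, 4 floats per step
        else
            for (int i = 0; i < dst.Length; i++)   // managed fallback
                dst[i] += a;
    }

Because Vector128Alignment is now private, call sites that need the alignment for AlignedArray buffers go through GetVectorAlignment(), which reports 32 when AVX is available, 16 for SSE, and 4 otherwise. The compatibility asserts in CpuAligenedMathUtils.cs were relaxed to a modulo check accordingly, so a 32-byte-aligned buffer still satisfies the 16-byte SSE kernels.
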
@@ -339,7 +397,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in private static void AddScale(float a, Span src, Span indices, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleSU(a, src, indices, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleSU(a, src, indices, dst); } @@ -368,7 +430,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, private static void AddScaleCopy(float a, Span src, Span dst, Span res) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddScaleCopyU(a, src, dst, res); + } + else if (Sse.IsSupported) { SseIntrinsics.AddScaleCopyU(a, src, dst, res); } @@ -394,7 +460,11 @@ public static void Add(float[] src, float[] dst, int count) private static void Add(Span src, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddU(src, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddU(src, dst); } @@ -438,7 +508,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i private static void Add(Span src, Span indices, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.AddSU(src, indices, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.AddSU(src, indices, dst); } @@ -467,7 +541,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c private static void MulElementWise(Span src1, Span src2, Span dst) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.MulElementWiseU(src1, src2, dst); + } + else if (Sse.IsSupported) { SseIntrinsics.MulElementWiseU(src1, src2, dst); } @@ -501,7 +579,11 @@ public static float Sum(float[] src, int offset, int count) private static float Sum(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumU(src); } @@ -537,7 +619,11 @@ public static float SumSq(float[] src, int offset, int count) private static float SumSq(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumSqU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumSqU(src); } @@ -564,7 +650,11 @@ public static float SumSq(float mean, float[] src, int offset, int count) private static float SumSq(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src); + } + else if (Sse.IsSupported) { return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); } @@ -600,7 +690,11 @@ public static float SumAbs(float[] src, int offset, int count) private static float SumAbs(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.SumAbsU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.SumAbsU(src); } @@ -627,7 +721,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count) private static float SumAbs(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src); + } + else if (Sse.IsSupported) { return (mean == 0) ? 
SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); } @@ -663,7 +761,11 @@ public static float MaxAbs(float[] src, int offset, int count) private static float MaxAbs(Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.MaxAbsU(src); + } + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsU(src); } @@ -693,7 +795,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count) private static float MaxAbsDiff(float mean, Span src) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.MaxAbsDiffU(mean, src); + } + else if (Sse.IsSupported) { return SseIntrinsics.MaxAbsDiffU(mean, src); } @@ -737,7 +843,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count) private static float DotProductDense(Span a, Span b) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.DotU(a, b); + } + else if (Sse.IsSupported) { return SseIntrinsics.DotU(a, b); } @@ -784,7 +894,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind private static float DotProductSparse(Span a, Span b, Span indices) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.DotSU(a, b, indices); + } + else if (Sse.IsSupported) { return SseIntrinsics.DotSU(a, b, indices); } @@ -813,7 +927,11 @@ public static float L2DistSquared(float[] a, float[] b, int count) private static float L2DistSquared(Span a, Span b) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + return AvxIntrinsics.Dist2(a, b); + } + else if (Sse.IsSupported) { return SseIntrinsics.Dist2(a, b); } @@ -909,7 +1027,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src private static void SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); + } + else if (Sse.IsSupported) { SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); } @@ -943,7 +1065,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { - if (Sse.IsSupported) + if (Avx.IsSupported) + { + AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); + } + else if (Sse.IsSupported) { SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 6f480b0f25..b35f171388 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -2,12 +2,18 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using System.Runtime.CompilerServices; + namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray - public const int Vector128Alignment = 16; + private const int Vector128Alignment = 16; + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Vector128Alignment; public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); diff --git a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj index b6c95b93f4..05f97d3040 100644 --- a/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj +++ b/src/Microsoft.ML.CpuMath/Microsoft.ML.CpuMath.csproj @@ -10,6 +10,10 @@ 7.3 + + + + @@ -26,5 +30,7 @@ + + \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs new file mode 100644 index 0000000000..ab9968b399 --- /dev/null +++ b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs @@ -0,0 +1,7 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.CpuMath.PerformanceTests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index bf7ad03e34..0f4fb54d18 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -22,45 +22,45 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class SseIntrinsics { + internal static readonly Vector128 AbsMask128 = Sse2.IsSupported ? 
+ Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : + Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; - private static bool Compat(AlignedArray a) + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static bool HasCompatibleAlignment(AlignedArray alignedArray) { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == Vector128Alignment; + Contracts.AssertValue(alignedArray); + Contracts.Assert(alignedArray.Size > 0); + return (alignedArray.CbAlign % Vector128Alignment) == 0; } - private static unsafe float* Ptr(AlignedArray a, float* p) + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (Vector128Alignment - 1)) == 0); - return q; + Contracts.AssertValue(alignedArray); + float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); + Contracts.Assert(((long)alignedBase & (Vector128Alignment - 1)) == 0); + return alignedBase; } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 Load1(float* src, int* idx) - { - return Sse.SetScalarVector128(src[idx[0]]); - } + internal static unsafe Vector128 Load1(float* src, int* idx) + => Sse.SetScalarVector128(src[idx[0]]); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 Load4(float* src, int* idx) - { - return Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); - } + internal static unsafe Vector128 Load4(float* src, int* idx) + => Sse.SetVector128(src[idx[3]], src[idx[2]], src[idx[1]], src[idx[0]]); + // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 Rotate(in Vector128 x) - { - // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. - return Sse.Shuffle(x, x, 0x39); - } + internal static Vector128 Rotate(in Vector128 x) + => Sse.Shuffle(x, x, 0x39); [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe void Store4(in Vector128 x, float* dst, int* idx) + internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) { Sse.StoreScalar(dst + idx[0], x); Vector128 rotated = Rotate(in x); @@ -72,7 +72,7 @@ private static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorSum(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -88,25 +88,27 @@ private static Vector128 VectorSum(in Vector128 vector) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorMax(in Vector128 vector) + internal static Vector128 VectorMax128(in Vector128 vector) { + // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> BADC. Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); + + // Performs element-wise maximum operation: The 1st and 3rd 32-bit slots become + // max(A, B) and max(C, D). Vector128 partialMax = Sse.Max(vector, x1); + + // The control byte shuffles the four 32-bit floats of partialMax: ABCD -> CAAA. 
x1 = Sse.Shuffle(partialMax, partialMax, 0x02); - return Sse.MaxScalar(partialMax, x1); - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetAbsMask() - { - return Sse2.IsSupported ? - Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : - Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + // Performs element-wise maximum operation: The 1st 32-bit slot becomes + // max(A, B, C, D). + return Sse.MaxScalar(partialMax, x1); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 GetNewDst(in Vector128 xDst1, in Vector128 signMask, in Vector128 xThreshold) + internal static Vector128 GetNewDst128(in Vector128 xDst1, in Vector128 xThreshold) { + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true @@ -115,19 +117,19 @@ private static Vector128 GetNewDst(in Vector128 xDst1, in Vector12 } // Multiply matrix times vector into vector. - internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -180,12 +182,12 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src } // Partial sparse source vector. - internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient. 
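(Editor's aside, not part of the patch.) GetNewDst128 is the vector form of the soft-thresholding (shrinkage) step used by the SDCA L1 updates further down in this file: each lane of xDst1 is pulled toward zero by the threshold, and zeroed entirely once its magnitude is at or below the threshold. Below is a minimal scalar sketch of the per-lane rule, assuming that reading of the bit manipulation is right; `SoftThreshold` is an illustrative name, not a method added by this change.

```csharp
using System;

internal static class SoftThresholdSketch
{
    // Scalar equivalent of one lane of GetNewDst128 (soft-thresholding):
    //   |dst1| <= threshold  ->  0
    //   |dst1|  > threshold  ->  dst1 shrunk toward zero by threshold, keeping its sign.
    internal static float SoftThreshold(float dst1, float threshold)
        => Math.Abs(dst1) > threshold
            ? dst1 - Math.Sign(dst1) * threshold
            : 0f;
}
```

This matches the vector code: xSign isolates the sign bit, xDst1Abs is |xDst1|, xCond masks lanes whose magnitude exceeds the threshold, and the final And zeroes the remaining lanes. In SdcaL1UpdateU the rule is applied after v[i] += primalUpdate * src[i], with the thresholded result written to w[i].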
@@ -194,9 +196,9 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* pposMin = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -237,19 +239,19 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, } } - internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -339,21 +341,21 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray } // Partial sparse source vector. - internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Contracts.Assert(HasCompatibleAlignment(mat)); + Contracts.Assert(HasCompatibleAlignment(src)); + Contracts.Assert(HasCompatibleAlignment(dst)); fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - float* pmat = Ptr(mat, pMatStart); + float* psrc = GetAlignedBase(src, pSrcStart); + float* pdst = GetAlignedBase(dst, pDstStart); + float* pmat = GetAlignedBase(mat, pMatStart); int* ppos = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -406,44 +408,44 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos } // dst[i] += scale - internal static unsafe void AddScalarU(float scale, Span dst) + public static unsafe void AddScalarU(float scalar, Span dst) { fixed (float* pdst = dst) { float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; - Vector128 x1 = Sse.SetAllVector128(scale); + Vector128 scalarVector = Sse.SetAllVector128(scalar); while (pDstCurrent + 4 <= pDstEnd) { - Vector128 x2 = Sse.LoadVector128(pDstCurrent); - x2 = Sse.Add(x2, x1); - Sse.Store(pDstCurrent, x2); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + dstVector = Sse.Add(dstVector, scalarVector); + Sse.Store(pDstCurrent, dstVector); pDstCurrent += 4; } while (pDstCurrent < pDstEnd) { - Vector128 x2 = Sse.LoadScalarVector128(pDstCurrent); - x2 = Sse.AddScalar(x2, x1); - 
Sse.StoreScalar(pDstCurrent, x2); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + dstVector = Sse.AddScalar(dstVector, scalarVector); + Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; } } } - internal static unsafe void ScaleU(float scale, Span dst) + public static unsafe void ScaleU(float scale, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* pdst = dst) { float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -466,10 +468,8 @@ internal static unsafe void ScaleU(float scale, Span dst) } } - internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) + public static unsafe void ScaleSrcU(float scale, Span src, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (float* pdst = dst) { @@ -477,6 +477,8 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span float* pSrcCurrent = psrc; float* pDstCurrent = pdst; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pDstEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -500,21 +502,21 @@ internal static unsafe void ScaleSrcU(float scale, Span src, Span } // dst[i] = a * (dst[i] + b) - internal static unsafe void ScaleAddU(float a, float b, Span dst) + public static unsafe void ScaleAddU(float a, float b, Span dst) { - Vector128 x1 = Sse.SetAllVector128(a); - Vector128 x2 = Sse.SetAllVector128(b); - fixed (float* pdst = dst) { float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; + Vector128 aVector = Sse.SetAllVector128(a); + Vector128 bVector = Sse.SetAllVector128(b); + while (pDstCurrent + 4 <= pDstEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); - dstVector = Sse.Add(dstVector, x2); - dstVector = Sse.Multiply(dstVector, x1); + dstVector = Sse.Add(dstVector, bVector); + dstVector = Sse.Multiply(dstVector, aVector); Sse.Store(pDstCurrent, dstVector); pDstCurrent += 4; @@ -523,8 +525,8 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) while (pDstCurrent < pDstEnd) { Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); - dstVector = Sse.AddScalar(dstVector, x2); - dstVector = Sse.MultiplyScalar(dstVector, x1); + dstVector = Sse.AddScalar(dstVector, bVector); + dstVector = Sse.MultiplyScalar(dstVector, aVector); Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; @@ -532,10 +534,8 @@ internal static unsafe void ScaleAddU(float a, float b, Span dst) } } - internal static unsafe void AddScaleU(float scale, Span src, Span dst) + public static unsafe void AddScaleU(float scale, Span src, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (float* pdst = dst) { @@ -543,6 +543,8 @@ internal static unsafe void AddScaleU(float scale, Span src, Span float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pDstCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -571,7 +573,7 @@ internal static unsafe void AddScaleU(float scale, Span src, Span } } - internal static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + public static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -582,15 +584,15 @@ internal static unsafe void AddScaleCopyU(float scale, 
Span src, Span x1 = Sse.SetAllVector128(scale); + Vector128 scaleVector = Sse.SetAllVector128(scale); while (pResCurrent + 4 <= pResEnd) { - Vector128 x2 = Sse.LoadVector128(pSrcCurrent); - Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x2 = Sse.Multiply(x2, x1); - x3 = Sse.Add(x3, x2); - Sse.Store(pResCurrent, x3); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector); + dstVector = Sse.Add(dstVector, srcVector); + Sse.Store(pResCurrent, dstVector); pSrcCurrent += 4; pDstCurrent += 4; @@ -599,11 +601,11 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span x2 = Sse.LoadScalarVector128(pSrcCurrent); - Vector128 x3 = Sse.LoadScalarVector128(pDstCurrent); - x2 = Sse.MultiplyScalar(x2, x1); - x3 = Sse.AddScalar(x3, x2); - Sse.StoreScalar(pResCurrent, x3); + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector); + dstVector = Sse.AddScalar(dstVector, srcVector); + Sse.StoreScalar(pResCurrent, dstVector); pSrcCurrent++; pDstCurrent++; @@ -612,10 +614,8 @@ internal static unsafe void AddScaleCopyU(float scale, Span src, Span src, Span idx, Span dst) + public static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (float* psrc = src) fixed (int* pidx = idx) fixed (float* pdst = dst) @@ -625,6 +625,8 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i float* pDstCurrent = pdst; int* pEnd = pidx + idx.Length; + Vector128 scaleVector = Sse.SetAllVector128(scale); + while (pIdxCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -648,7 +650,7 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i } } - internal static unsafe void AddU(Span src, Span dst) + public static unsafe void AddU(Span src, Span dst) { fixed (float* psrc = src) fixed (float* pdst = dst) @@ -683,7 +685,7 @@ internal static unsafe void AddU(Span src, Span dst) } } - internal static unsafe void AddSU(Span src, Span idx, Span dst) + public static unsafe void AddSU(Span src, Span idx, Span dst) { fixed (float* psrc = src) fixed (int* pidx = idx) @@ -696,11 +698,11 @@ internal static unsafe void AddSU(Span src, Span idx, Span ds while (pIdxCurrent + 4 <= pEnd) { - Vector128 srcVector = Load4(pDstCurrent, pIdxCurrent); - Vector128 dstVector = Sse.LoadVector128(pSrcCurrent); + Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - srcVector = Sse.Add(srcVector, dstVector); - Store4(in srcVector, pDstCurrent, pIdxCurrent); + dstVector = Sse.Add(dstVector, srcVector); + Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -716,7 +718,7 @@ internal static unsafe void AddSU(Span src, Span idx, Span ds } } - internal static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) + public static unsafe void MulElementWiseU(Span src1, Span src2, Span dst) { fixed (float* psrc1 = &src1[0]) fixed (float* psrc2 = &src2[0]) @@ -753,7 +755,7 @@ internal static unsafe void MulElementWiseU(Span src1, Span src2, } } - internal static unsafe float SumU(Span src) + public static unsafe float SumU(Span src) { fixed (float* psrc = src) { @@ -768,7 +770,7 @@ internal static unsafe float SumU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = 
VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { @@ -780,16 +782,16 @@ internal static unsafe float SumU(Span src) } } - internal static unsafe float SumSqU(Span src) + public static unsafe float SumSqU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Add(result, Sse.Multiply(srcVector, srcVector)); @@ -797,21 +799,21 @@ internal static unsafe float SumSqU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float SumSqDiffU(float mean, Span src) + public static unsafe float SumSqDiffU(float mean, Span src) { fixed (float* psrc = src) { @@ -823,20 +825,20 @@ internal static unsafe float SumSqDiffU(float mean, Span src) while (pSrcCurrent + 4 <= pSrcEnd) { - Vector128 x = Sse.LoadVector128(pSrcCurrent); - x = Sse.Subtract(x, meanVector); - result = Sse.Add(result, Sse.Multiply(x, x)); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector); + result = Sse.Add(result, Sse.Multiply(srcVector, srcVector)); pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); while (pSrcCurrent < pSrcEnd) { - Vector128 x = Sse.LoadScalarVector128(pSrcCurrent); - x = Sse.SubtractScalar(x, meanVector); - result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x)); + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector); + result = Sse.AddScalar(result, Sse.MultiplyScalar(srcVector, srcVector)); pSrcCurrent++; } @@ -845,152 +847,148 @@ internal static unsafe float SumSqDiffU(float mean, Span src) } } - internal static unsafe float SumAbsU(Span src) + public static unsafe float SumAbsU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result = Sse.Add(result, Sse.And(srcVector, mask)); + result = Sse.Add(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float SumAbsDiffU(float mean, Span src) + public static unsafe float SumAbsDiffU(float mean, Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); - fixed 
(float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); - result = Sse.Add(result, Sse.And(srcVector, mask)); + result = Sse.Add(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); - result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + result = Sse.AddScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float MaxAbsU(Span src) + public static unsafe float MaxAbsU(Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); - result = Sse.Max(result, Sse.And(srcVector, mask)); + result = Sse.Max(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorMax(in result); + result = VectorMax128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); - result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + result = Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float MaxAbsDiffU(float mean, Span src) + public static unsafe float MaxAbsDiffU(float mean, Span src) { - Vector128 result = Sse.SetZeroVector128(); - Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask = GetAbsMask(); - fixed (float* psrc = src) { + float* pSrcEnd = psrc + src.Length; float* pSrcCurrent = psrc; - float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); - result = Sse.Max(result, Sse.And(srcVector, mask)); + result = Sse.Max(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent += 4; } - result = VectorMax(in result); + result = VectorMax128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); srcVector = Sse.SubtractScalar(srcVector, meanVector); - result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + result = Sse.MaxScalar(result, Sse.And(srcVector, AbsMask128)); pSrcCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float DotU(Span src, Span dst) + public static unsafe float DotU(Span src, Span dst) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) { float* pSrcCurrent = 
psrc; float* pDstCurrent = pdst; - float* pEnd = psrc + src.Length; + float* pSrcEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1001,9 +999,9 @@ internal static unsafe float DotU(Span src, Span dst) pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); @@ -1013,15 +1011,13 @@ internal static unsafe float DotU(Span src, Span dst) pSrcCurrent++; pDstCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float DotSU(Span src, Span dst, Span idx) + public static unsafe float DotSU(Span src, Span dst, Span idx) { - Vector128 result = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) fixed (int* pidx = idx) @@ -1029,9 +1025,11 @@ internal static unsafe float DotSU(Span src, Span dst, Span i float* pSrcCurrent = psrc; float* pDstCurrent = pdst; int* pIdxCurrent = pidx; - int* pEnd = pidx + idx.Length; + int* pIdxEnd = pidx + idx.Length; - while (pIdxCurrent + 4 <= pEnd) + Vector128 result = Sse.SetZeroVector128(); + + while (pIdxCurrent + 4 <= pIdxEnd) { Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1042,9 +1040,9 @@ internal static unsafe float DotSU(Span src, Span dst, Span i pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum128(in result); - while (pIdxCurrent < pEnd) + while (pIdxCurrent < pIdxEnd) { Vector128 srcVector = Load1(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); @@ -1054,23 +1052,23 @@ internal static unsafe float DotSU(Span src, Span dst, Span i pIdxCurrent++; pDstCurrent++; } - } - return Sse.ConvertToSingle(result); + return Sse.ConvertToSingle(result); + } } - internal static unsafe float Dist2(Span src, Span dst) + public static unsafe float Dist2(Span src, Span dst) { - Vector128 sqDistanceVector = Sse.SetZeroVector128(); - fixed (float* psrc = src) fixed (float* pdst = dst) { float* pSrcCurrent = psrc; float* pDstCurrent = pdst; - float* pEnd = psrc + src.Length; + float* pSrcEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + Vector128 sqDistanceVector = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), Sse.LoadVector128(pDstCurrent)); @@ -1081,10 +1079,10 @@ internal static unsafe float Dist2(Span src, Span dst) pDstCurrent += 4; } - sqDistanceVector = VectorSum(in sqDistanceVector); + sqDistanceVector = VectorSum128(in sqDistanceVector); float norm = Sse.ConvertToSingle(sqDistanceVector); - while (pSrcCurrent < pEnd) + while (pSrcCurrent < pSrcEnd) { float distance = (*pSrcCurrent) - (*pDstCurrent); norm += distance * distance; @@ -1097,7 +1095,7 @@ internal static unsafe float Dist2(Span src, Span dst) } } - internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (float* pdst1 = v) @@ -1119,7 +1117,7 @@ internal static unsafe void 
SdcaL1UpdateU(float primalUpdate, Span src, f Vector128 xDst1 = Sse.LoadVector128(pDst1Current); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); - Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold); Sse.Store(pDst1Current, xDst1); Sse.Store(pDst2Current, xDst2); @@ -1142,7 +1140,7 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, f } } - internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) { fixed (float* psrc = src) fixed (int* pidx = indices) @@ -1164,7 +1162,7 @@ internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Vector128 xDst1 = Load4(pdst1, pIdxCurrent); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); - Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); + Vector128 xDst2 = GetNewDst128(xDst1, xThreshold); Store4(in xDst1, pdst1, pIdxCurrent); Store4(in xDst2, pdst2, pIdxCurrent); diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index 6ad6ceec5f..675235d6ef 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -120,10 +120,10 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi sub = args.MatrixGenerator; _matrixGenerator = sub.CreateInstance(host, avgDist); - int roundedUpD = RoundUp(NewDim, CfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment); + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -156,10 +156,10 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou ctx.LoadModelOrNull(env, out _matrixGenerator, directoryName)); // initialize the transform matrix - int roundedUpD = RoundUp(NewDim, CfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, CfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.Vector128Alignment); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.Vector128Alignment); + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? 
null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -227,7 +227,7 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - private const int CfltAlign = CpuMathUtils.Vector128Alignment / sizeof(float); + private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); private static string TestColumnType(ColumnType type) { @@ -496,8 +496,8 @@ private ValueGetter> GetterFromVectorType(IRow input, int iinfo) var getSrc = GetSrcGetter>(input, iinfo); var src = default(VBuffer); - var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, CfltAlign), CpuMathUtils.Vector128Alignment); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment); + var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); return (ref VBuffer dst) => @@ -512,8 +512,8 @@ private ValueGetter> GetterFromFloatType(IRow input, int iinfo) var getSrc = GetSrcGetter(input, iinfo); var src = default(Float); - var featuresAligned = new AlignedArray(RoundUp(1, CfltAlign), CpuMathUtils.Vector128Alignment); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, CfltAlign), CpuMathUtils.Vector128Alignment); + var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); var oneDimensionalVector = new VBuffer(1, new Float[] { 0 }); diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs new file mode 100644 index 0000000000..2e4b598540 --- /dev/null +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -0,0 +1,148 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
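(Editor's aside, not part of the patch.) The RffTransform change above replaces the compile-time CpuMathUtils.Vector128Alignment constant with the new CpuMathUtils.GetVectorAlignment() call, so the padding arithmetic follows whatever alignment the CpuMath layer reports (16 bytes today for the SSE path; 32 bytes would be the AVX value). Here is a small sketch of that arithmetic, assuming RoundUp rounds its first argument up to the next multiple of the second, as its call sites suggest; the RoundUp local below is illustrative, not the transform's own helper.

```csharp
using System;
using Microsoft.ML.Runtime.Internal.CpuMath;

internal static class AlignmentSketch
{
    // Assumed semantics of RffTransform's RoundUp: round n up to the next multiple of align.
    private static int RoundUp(int n, int align) => ((n + align - 1) / align) * align;

    internal static void Show()
    {
        int alignmentBytes = CpuMathUtils.GetVectorAlignment(); // bytes per aligned block, e.g. 16
        int cfltAlign = alignmentBytes / sizeof(float);         // floats per aligned block: 4 (SSE) or 8 (AVX)

        // A 10-dimensional output pads to 12 floats at 16-byte alignment
        // and to 16 floats at 32-byte alignment.
        Console.WriteLine(RoundUp(10, cfltAlign));
    }
}
```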
+ +using System; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.ML.Runtime.Internal.CpuMath; + +namespace Microsoft.ML.CpuMath.PerformanceTests +{ + public class AvxPerformanceTests : PerformanceTests + { + [Benchmark] + public void ManagedAddScalarUPerf() + { + AvxIntrinsics.AddScalarU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedScaleUPerf() + { + AvxIntrinsics.ScaleU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedScaleSrcUPerf() + { + AvxIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedScaleAddUPerf() + { + AvxIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedAddScaleUPerf() + { + AvxIntrinsics.AddScaleU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedAddScaleSUPerf() + { + AvxIntrinsics.AddScaleSU(DEFAULT_SCALE, new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } + + [Benchmark] + public void ManagedAddScaleCopyUPerf() + { + AvxIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } + + [Benchmark] + public void ManagedAddUPerf() + { + AvxIntrinsics.AddU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedAddSUPerf() + { + AvxIntrinsics.AddSU(new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } + + + [Benchmark] + public void ManagedMulElementWiseUPerf() + { + AvxIntrinsics.MulElementWiseU(new Span(src1, 0, LEN), new Span(src2, 0, LEN), + new Span(dst, 0, LEN)); + } + + [Benchmark] + public float ManagedSumUPerf() + { + return AvxIntrinsics.SumU(new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedSumSqUPerf() + { + return AvxIntrinsics.SumSqU(new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedSumSqDiffUPerf() + { + return AvxIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedSumAbsUPerf() + { + return AvxIntrinsics.SumAbsU(new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedSumAbsDiffUPerf() + { + return AvxIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedMaxAbsUPerf() + { + return AvxIntrinsics.MaxAbsU(new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedMaxAbsDiffUPerf() + { + return AvxIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } + + [Benchmark] + public float ManagedDotUPerf() + { + return AvxIntrinsics.DotU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } + + [Benchmark] + public float ManagedDotSUPerf() + { + return AvxIntrinsics.DotSU(new Span(src), new Span(dst), new Span(idx, 0, IDXLEN)); + } + + [Benchmark] + public float ManagedDist2Perf() + { + return AvxIntrinsics.Dist2(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } + + [Benchmark] + public void ManagedSdcaL1UpdateUPerf() + { + AvxIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span(src, 0, LEN), DEFAULT_SCALE, new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } + + [Benchmark] + public void ManagedSdcaL1UpdateSUPerf() + { + AvxIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span(src, 0, IDXLEN), new Span(idx, 0, IDXLEN), DEFAULT_SCALE, new Span(dst), new Span(result)); + } + } +} diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs new file mode 
100644 index 0000000000..1eb9157a2f --- /dev/null +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/PerformanceTests.cs @@ -0,0 +1,101 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.ML.Runtime.Internal.CpuMath; + +namespace Microsoft.ML.CpuMath.PerformanceTests +{ + public abstract class PerformanceTests + { + private const int EXP_MAX = 127; + private const int EXP_MIN = 0; + private const int EXP_RANGE = EXP_MAX / 8; + + protected const int IDXLEN = 1000003; + protected const int LEN = 1000003; + + private const int DEFAULT_SEED = 253421; + protected const float DEFAULT_SCALE = 1.11f; + + protected const int DEFAULT_CROW = 500; + protected const int DEFAULT_CCOL = 2000; + protected const bool ADD = true; + + protected float[] src, dst, original, src1, src2, result; + protected int[] idx; + + private int seed = DEFAULT_SEED; + + private float NextFloat(Random rand, int expRange) + { + double mantissa = (rand.NextDouble() * 2.0) - 1.0; + double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); + return (float)(mantissa * exponent); + } + + private int GetSeed() + { + int seed = DEFAULT_SEED; + string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); + + if (CPUMATH_SEED != null) + { + if (!int.TryParse(CPUMATH_SEED, out seed)) + { + if (string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) + { + seed = new Random().Next(); + } + else + { + seed = DEFAULT_SEED; + } + } + } + + Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); + return seed; + } + + [GlobalSetup] + public void Setup() + { + src = new float[LEN]; + dst = new float[LEN]; + src1 = new float[LEN]; + src2 = new float[LEN]; + original = new float[LEN]; + result = new float[LEN]; + idx = new int[IDXLEN]; + + seed = GetSeed(); + Random rand = new Random(seed); + + for (int i = 0; i < LEN; i++) + { + src[i] = NextFloat(rand, EXP_RANGE); + dst[i] = NextFloat(rand, EXP_RANGE); + original[i] = dst[i]; + result[i] = dst[i]; + src1[i] = NextFloat(rand, EXP_RANGE); + src2[i] = NextFloat(rand, EXP_RANGE); + } + + for (int i = 0; i < IDXLEN; i++) + { + idx[i] = rand.Next(0, LEN); + } + } + + [GlobalCleanup] + public void GlobalCleanup() + { + original.CopyTo(dst, 0); + original.CopyTo(result, 0); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index ade2ea6a0e..3188c64db9 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -9,105 +9,22 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { - public class SsePerformanceTests + public class SsePerformanceTests : PerformanceTests { - private const int EXP_MAX = 127; - private const int EXP_MIN = 0; - - private const int IDXLEN = 1000003; - private const int LEN = 1000003; - private const int EXP_RANGE = EXP_MAX / 2; - private const int DEFAULT_SEED = 253421; - private const float DEFAULT_SCALE = 1.11f; - private const int DEFAULT_CROW = 500; - private const int DEFAULT_CCOL = 2000; - private const bool ADD = true; - - private float[] src, dst, original, src1, src2, result; - 
private int[] idx; - private int seed = DEFAULT_SEED; - - private static float NextFloat(Random rand, int expRange) - { - double mantissa = (rand.NextDouble() * 2.0) - 1.0; - double exponent = Math.Pow(2.0, rand.Next(-expRange + 1, expRange + 1)); - return (float)(mantissa * exponent); - } - - private static int GetSeed() - { - int seed = DEFAULT_SEED; - - if (Environment.GetEnvironmentVariable("CPUMATH_SEED") != null) - { - string CPUMATH_SEED = Environment.GetEnvironmentVariable("CPUMATH_SEED"); - - if (!int.TryParse(CPUMATH_SEED, out seed)) - { - if(string.Equals(CPUMATH_SEED, "random", StringComparison.OrdinalIgnoreCase)) - { - seed = new Random().Next(); - } - else - { - seed = DEFAULT_SEED; - } - } - } - - Console.WriteLine("Random seed: " + seed + "; set environment variable CPUMATH_SEED to this value to reproduce results"); - - return seed; - } - - [GlobalSetup] - public void Setup() - { - src = new float[LEN]; - dst = new float[LEN]; - src1 = new float[LEN]; - src2 = new float[LEN]; - original = new float[LEN]; - result = new float[LEN]; - idx = new int[IDXLEN]; - - seed = GetSeed(); - Random rand = new Random(seed); - - for (int i = 0; i < LEN; i++) - { - src[i] = NextFloat(rand, EXP_RANGE); - dst[i] = NextFloat(rand, EXP_RANGE); - original[i] = dst[i]; - result[i] = dst[i]; - src1[i] = NextFloat(rand, EXP_RANGE); - src2[i] = NextFloat(rand, EXP_RANGE); - } - - for (int i = 0; i < IDXLEN; i++) - { - idx[i] = rand.Next(0, LEN); - } - } - - [GlobalCleanup] - public void GlobalCleanup() - { - original.CopyTo(dst, 0); - original.CopyTo(result, 0); - } - [Benchmark] - public unsafe float NativeAddScalarUPerf() + public unsafe void NativeAddScalarUPerf() { fixed (float* pdst = dst) { - return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); + CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); } } [Benchmark] - public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + public void ManagedAddScalarUPerf() + { + SseIntrinsics.AddScalarU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleUPerf() @@ -119,7 +36,10 @@ public unsafe void NativeScaleUPerf() } [Benchmark] - public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + public void ManagedScaleUPerf() + { + SseIntrinsics.ScaleU(DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleSrcUPerf() @@ -132,7 +52,10 @@ public unsafe void NativeScaleSrcUPerf() } [Benchmark] - public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedScaleSrcUPerf() + { + SseIntrinsics.ScaleSrcU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeScaleAddUPerf() @@ -144,7 +67,10 @@ public unsafe void NativeScaleAddUPerf() } [Benchmark] - public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + public void ManagedScaleAddUPerf() + { + SseIntrinsics.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddScaleUPerf() @@ -157,7 +83,10 @@ public unsafe void NativeAddScaleUPerf() } [Benchmark] - public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedAddScaleUPerf() + { + SseIntrinsics.AddScaleU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddScaleSUPerf() @@ -171,7 +100,10 @@ public unsafe void NativeAddScaleSUPerf() } [Benchmark] 
- public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + public void ManagedAddScaleSUPerf() + { + SseIntrinsics.AddScaleSU(DEFAULT_SCALE, new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] public unsafe void NativeAddScaleCopyUPerf() @@ -185,7 +117,10 @@ public unsafe void NativeAddScaleCopyUPerf() } [Benchmark] - public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN); + public void ManagedAddScaleCopyUPerf() + { + SseIntrinsics.AddScaleCopyU(DEFAULT_SCALE, new Span(src, 0, LEN), new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] public unsafe void NativeAddUPerf() @@ -198,7 +133,10 @@ public unsafe void NativeAddUPerf() } [Benchmark] - public void ManagedAddUPerf() => CpuMathUtils.Add(src, dst, LEN); + public void ManagedAddUPerf() + { + SseIntrinsics.AddU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeAddSUPerf() @@ -212,7 +150,10 @@ public unsafe void NativeAddSUPerf() } [Benchmark] - public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + public void ManagedAddSUPerf() + { + SseIntrinsics.AddSU(new Span(src), new Span(idx, 0, IDXLEN), new Span(dst)); + } [Benchmark] @@ -227,7 +168,11 @@ public unsafe void NativeMulElementWiseUPerf() } [Benchmark] - public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + public void ManagedMulElementWiseUPerf() + { + SseIntrinsics.MulElementWiseU(new Span(src1, 0, LEN), new Span(src2, 0, LEN), + new Span(dst, 0, LEN)); + } [Benchmark] public unsafe float NativeSumUPerf() @@ -239,7 +184,10 @@ public unsafe float NativeSumUPerf() } [Benchmark] - public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + public float ManagedSumUPerf() + { + return SseIntrinsics.SumU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumSqUPerf() @@ -251,7 +199,10 @@ public unsafe float NativeSumSqUPerf() } [Benchmark] - public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + public float ManagedSumSqUPerf() + { + return SseIntrinsics.SumSqU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumSqDiffUPerf() @@ -263,7 +214,10 @@ public unsafe float NativeSumSqDiffUPerf() } [Benchmark] - public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumSqDiffUPerf() + { + return SseIntrinsics.SumSqDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumAbsUPerf() @@ -275,7 +229,10 @@ public unsafe float NativeSumAbsUPerf() } [Benchmark] - public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN); + public float ManagedSumAbsUPerf() + { + return SseIntrinsics.SumAbsU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeSumAbsDiffUPerf() @@ -287,7 +244,10 @@ public unsafe float NativeSumAbsDiffUPerf() } [Benchmark] - public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + public float ManagedSumAbsDiffUPerf() + { + return SseIntrinsics.SumAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeMaxAbsUPerf() @@ -299,7 +259,10 @@ public unsafe float NativeMaxAbsUPerf() } [Benchmark] - public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + public float ManagedMaxAbsUPerf() + { + return SseIntrinsics.MaxAbsU(new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeMaxAbsDiffUPerf() @@ -311,8 +274,10 @@ 
public unsafe float NativeMaxAbsDiffUPerf() } [Benchmark] - public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); - // TODO: MaxAbsU!!! + public float ManagedMaxAbsDiffUPerf() + { + return SseIntrinsics.MaxAbsDiffU(DEFAULT_SCALE, new Span(src, 0, LEN)); + } [Benchmark] public unsafe float NativeDotUPerf() @@ -325,7 +290,10 @@ public unsafe float NativeDotUPerf() } [Benchmark] - public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + public float ManagedDotUPerf() + { + return SseIntrinsics.DotU(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe float NativeDotSUPerf() @@ -339,7 +307,10 @@ public unsafe float NativeDotSUPerf() } [Benchmark] - public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + public float ManagedDotSUPerf() + { + return SseIntrinsics.DotSU(new Span(src), new Span(dst), new Span(idx, 0, IDXLEN)); + } [Benchmark] public unsafe float NativeDist2Perf() @@ -352,7 +323,10 @@ public unsafe float NativeDist2Perf() } [Benchmark] - public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); + public float ManagedDist2Perf() + { + return SseIntrinsics.Dist2(new Span(src, 0, LEN), new Span(dst, 0, LEN)); + } [Benchmark] public unsafe void NativeSdcaL1UpdateUPerf() @@ -366,7 +340,10 @@ public unsafe void NativeSdcaL1UpdateUPerf() } [Benchmark] - public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateUPerf() + { + SseIntrinsics.SdcaL1UpdateU(DEFAULT_SCALE, new Span(src, 0, LEN), DEFAULT_SCALE, new Span(dst, 0, LEN), new Span(result, 0, LEN)); + } [Benchmark] public unsafe void NativeSdcaL1UpdateSUPerf() @@ -381,6 +358,9 @@ public unsafe void NativeSdcaL1UpdateSUPerf() } [Benchmark] - public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); + public void ManagedSdcaL1UpdateSUPerf() + { + SseIntrinsics.SdcaL1UpdateSU(DEFAULT_SCALE, new Span(src, 0, IDXLEN), new Span(idx, 0, IDXLEN), DEFAULT_SCALE, new Span(dst), new Span(result)); + } } } diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj index e611b15032..44ad91ed90 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/Microsoft.ML.CpuMath.UnitTests.netcoreapp.csproj @@ -8,9 +8,5 @@ - - - - diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs similarity index 52% rename from test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs rename to test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index d1d5955a8e..1877ebe6b0 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -1,205 +1,215 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
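(Editor's aside, not part of the patch.) The managed benchmarks above now call SseIntrinsics (and AvxIntrinsics) directly over Spans, and the shared PerformanceTests base class seeds its input data from the CPUMATH_SEED environment variable: an integer pins the seed, "random" draws a fresh one, and anything else falls back to the default. A hypothetical driver for pinning a run to a fixed seed is sketched below; the Program class is not part of this change, and it assumes the environment variable set here is visible to the benchmark processes BenchmarkDotNet spawns.

```csharp
using System;
using BenchmarkDotNet.Running;

namespace Microsoft.ML.CpuMath.PerformanceTests
{
    internal static class Program
    {
        private static void Main()
        {
            // Pin data generation so managed and native benchmarks see identical inputs;
            // GetSeed() prints the effective seed so a run can be reproduced later.
            Environment.SetEnvironmentVariable("CPUMATH_SEED", "253421");

            BenchmarkRunner.Run<SsePerformanceTests>();
            BenchmarkRunner.Run<AvxPerformanceTests>();
        }
    }
}
```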
using System; using System.Collections.Generic; using Xunit; +using Xunit.Abstractions; using Microsoft.ML.Runtime.Internal.CpuMath; namespace Microsoft.ML.CpuMath.UnitTests { public class CpuMathUtilsUnitTests { - private readonly float[][] testArrays; - private readonly int[] testIndexArray; - private readonly AlignedArray[] testMatrices; - private readonly AlignedArray[] testSrcVectors; - private readonly AlignedArray[] testDstVectors; + private readonly float[][] _testArrays; + private readonly int[] _testIndexArray; + private readonly AlignedArray[] _testMatrices; + private readonly AlignedArray[] _testSrcVectors; + private readonly AlignedArray[] _testDstVectors; + private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); + private readonly FloatEqualityComparer _comparer; + private readonly FloatEqualityComparerForMatMul _matMulComparer; + private const float DEFAULT_SCALE = 1.7f; - private const int SseCbAlign = 16; - private FloatEqualityComparer comparer; public CpuMathUtilsUnitTests() { // Padded array whose length is a multiple of 4 - float[] testArray1 = new float[8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testArray1 = new float[16] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; // Unpadded array whose length is not a multiple of 4. - float[] testArray2 = new float[7] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; - testArrays = new float[][] { testArray1, testArray2 }; - testIndexArray = new int[4] { 0, 2, 5, 6 }; - comparer = new FloatEqualityComparer(); - - // Padded matrices whose dimensions are multiples of 4 - float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, - 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; - float[] testMatrix2 = new float[4 * 8]; + float[] testArray2 = new float[15] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f }; + _testArrays = new float[][] { testArray1, testArray2 }; + _testIndexArray = new int[9] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }; + _comparer = new FloatEqualityComparer(); + _matMulComparer = new FloatEqualityComparerForMatMul(); + + // Padded matrices whose dimensions are multiples of 8 + float[] testMatrix1 = new float[8 * 8] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix2 = new float[8 * 16]; for (int i = 0; i < testMatrix2.Length; i++) { testMatrix2[i] = i + 1; } - AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign); - AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign); + AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment); + AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment); testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); - testMatrices = new AlignedArray[] { 
testMatrixAligned1, testMatrixAligned2 }; + _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; - // Padded source vectors whose dimensions are multiples of 4 - float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; - float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + // Padded source vectors whose dimensions are multiples of 8 + float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign); - AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign); + AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment); testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); - testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; - // Padded destination vectors whose dimensions are multiples of 4 - float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; - float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + // Padded destination vectors whose dimensions are multiples of 8 + float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign); - AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign); + AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment); testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); - testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; + _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] - [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] - [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] + [InlineData(0, 0, 0, new float[] { -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f })] + [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })] + [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 
23.28f, -48.72f, 25.28f, -46.72f })] - [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] - [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] + [InlineData(0, 0, 0, new float[] { -416.6801f, -415.6801f, -414.6801f, -413.6801f, -412.6801f, -411.6801f, -410.6801f, -409.6801f })] + [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })] + [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })] public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] - [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] + [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })] + [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })] + [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] - [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] - [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] + [InlineData(0, 0, 0, new float[] { 70.56001f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })] + [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })] + [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })] public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, 
actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] - [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] + [InlineData(0, 0, 0, new float[] { 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f })] + [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })] + [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; - CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] - [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] - [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] + [InlineData(0, 0, 0, new float[] { 38.25002f, 39.25002f, 40.25002f, 41.25002f, 42.25002f, 43.25002f, 44.25002f, 45.25002f })] + [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })] + [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })] public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; - CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] - [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] + [InlineData(0, 0, 0, new float[] { 33.32f, -40.46f, -165.92f, 235.28f, -1808.29f, -457.81f, 551.65f, 55.93f })] + [InlineData(1, 0, 1, new float[] { 1265f, 1282f, 1299f, 1316f, 1333f, 1350f, 1367f, 1384f, 1401f, 1418f, 1435f, 1452f, 1469f, 1486f, 1503f, 1520f })] + [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })] public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; - CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] - [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] - [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] - [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] + [InlineData(0, 0, 0, new float[] { 33.32f, -39.46f, -163.92f, 238.28f, -1804.29f, -452.81f, 557.65f, 62.93f })] + [InlineData(1, 0, 1, new float[] { 1265f, 1283f, 1301f, 1319f, 1337f, 1355f, 1373f, 1391f, 1409f, 1427f, 1445f, 1463f, 1481f, 1499f, 1517f, 1535f })] + [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })] public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[matTest]; - AlignedArray src = testSrcVectors[srcTest]; - AlignedArray dst = testDstVectors[dstTest]; - int[] idx = testIndexArray; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; + int[] idx = _testIndexArray; - CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _matMulComparer); } [Theory] @@ -207,7 +217,7 @@ public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] e [InlineData(1)] public void AddScalarUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -217,7 +227,7 @@ public void AddScalarUTest(int test) CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -225,7 +235,7 @@ public void AddScalarUTest(int test) [InlineData(1)] public void ScaleUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -235,7 +245,7 @@ public void ScaleUTest(int test) CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -243,7 +253,7 @@ public void ScaleUTest(int test) [InlineData(1)] public void ScaleSrcUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])dst.Clone(); @@ -254,7 +264,7 @@ public void ScaleSrcUTest(int test) CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -262,7 +272,7 @@ public void ScaleSrcUTest(int test) [InlineData(1)] public void ScaleAddUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); + float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); for (int i = 0; i < expected.Length; i++) @@ -272,7 +282,7 @@ public void ScaleAddUTest(int test) CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -280,7 +290,7 @@ public void ScaleAddUTest(int test) [InlineData(1)] public void AddScaleUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])dst.Clone(); @@ -291,7 +301,7 @@ public void AddScaleUTest(int test) CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -299,19 +309,24 @@ public void AddScaleUTest(int test) [InlineData(1)] public void AddScaleSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])dst.Clone(); expected[0] = 5.292f; expected[2] = -13.806f; expected[5] = -43.522f; expected[6] = 55.978f; + expected[8] = -178.869f; + expected[11] = -31.941f; + expected[12] = -51.205f; + expected[13] = -21.337f; + expected[14] = 35.782f; CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, idx.Length); var actual = dst; - Assert.Equal(expected, 
actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -319,7 +334,7 @@ public void AddScaleSUTest(int test) [InlineData(1)] public void AddScaleCopyUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] result = (float[])dst.Clone(); float[] expected = (float[])dst.Clone(); @@ -331,7 +346,7 @@ public void AddScaleCopyUTest(int test) CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); var actual = result; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -339,7 +354,7 @@ public void AddScaleCopyUTest(int test) [InlineData(1)] public void AddUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); float[] expected = (float[])src.Clone(); @@ -356,7 +371,7 @@ public void AddUTest(int test) CpuMathUtils.Add(src, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -364,19 +379,24 @@ public void AddUTest(int test) [InlineData(1)] public void AddSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])dst.Clone(); expected[0] = 3.92f; expected[2] = -12.14f; expected[5] = -36.69f; expected[6] = 46.29f; + expected[8] = -104.41f; + expected[11] = -13.09f; + expected[12] = -73.92f; + expected[13] = -23.64f; + expected[14] = 34.41f; CpuMathUtils.Add(src, idx, dst, idx.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -384,7 +404,7 @@ public void AddSUTest(int test) [InlineData(1)] public void MulElementWiseUTest(int test) { - float[] src1 = (float[])testArrays[test].Clone(); + float[] src1 = (float[])_testArrays[test].Clone(); float[] src2 = (float[])src1.Clone(); float[] dst = (float[])src1.Clone(); @@ -403,55 +423,55 @@ public void MulElementWiseUTest(int test) CpuMathUtils.MulElementWise(src1, src2, dst, dst.Length); var actual = dst; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] - [InlineData(0, -93.9f)] - [InlineData(1, -97.19f)] + [InlineData(0, -187.8f)] + [InlineData(1, -191.09f)] public void SumUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.Sum(src, src.Length); Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 13399.9376f)] - [InlineData(1, 13389.1135f)] + [InlineData(0, 26799.8752f)] + [InlineData(1, 26789.0511f)] public void SumSqUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumSq(src, src.Length); Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 13742.3176f)] - [InlineData(1, 13739.7895f)] + [InlineData(0, 27484.6352f)] + [InlineData(1, 27482.1071f)] public void SumSqDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); Assert.Equal(expected, actual, 2); } [Theory] - 
[InlineData(0, 196.98f)] - [InlineData(1, 193.69f)] + [InlineData(0, 393.96f)] + [InlineData(1, 390.67f)] public void SumAbsUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumAbs(src, src.Length); Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 195.39f)] + [InlineData(0, 393.96f)] + [InlineData(1, 392.37f)] public void SumAbsDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); Assert.Equal(expected, actual, 2); } @@ -461,7 +481,7 @@ public void SumAbsDiffUTest(int test, float expected) [InlineData(1, 106.37f)] public void MaxAbsUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.MaxAbs(src, src.Length); Assert.Equal(expected, actual, 2); } @@ -471,17 +491,17 @@ public void MaxAbsUTest(int test, float expected) [InlineData(1, 108.07f)] public void MaxAbsDiffUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 13306.0376f)] - [InlineData(1, 13291.9235f)] + [InlineData(0, 26612.0752f)] + [InlineData(1, 26597.9611f)] public void DotUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); for (int i = 0; i < dst.Length; i++) @@ -490,17 +510,17 @@ public void DotUTest(int test, float expected) } var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); - Assert.Equal(expected, actual, 2); + Assert.Equal(expected, actual, 1); } [Theory] - [InlineData(0, 736.7352f)] - [InlineData(1, 736.7352f)] + [InlineData(0, -3406.2154f)] + [InlineData(1, -3406.2154f)] public void DotSUTest(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; // Ensures src and dst are different arrays for (int i = 0; i < dst.Length; i++) @@ -509,15 +529,15 @@ public void DotSUTest(int test, float expected) } var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); - Assert.Equal(expected, actual, 4); + Assert.Equal(expected, actual, 2); } [Theory] - [InlineData(0, 8.0f)] - [InlineData(1, 7.0f)] + [InlineData(0, 16.0f)] + [InlineData(1, 15.0f)] public void Dist2Test(int test, float expected) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] dst = (float[])src.Clone(); // Ensures src and dst are different arrays @@ -531,31 +551,31 @@ public void Dist2Test(int test, float expected) } [Theory] - [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })] public void ZeroItemsUTest(int test, int[] idx, 
float[] expected) { - AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); - src.CopyFrom(testSrcVectors[test]); + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); float[] actual = new float[src.Size]; src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] - [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] - [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] + [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })] public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) { - AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); - src.CopyFrom(testSrcVectors[test]); + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); float[] actual = new float[src.Size]; src.CopyTo(actual, 0, src.Size); - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -563,7 +583,7 @@ public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) [InlineData(1)] public void SdcaL1UpdateUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] v = (float[])src.Clone(); float[] w = (float[])src.Clone(); float[] expected = (float[])w.Clone(); @@ -576,7 +596,7 @@ public void SdcaL1UpdateUTest(int test) CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); var actual = w; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } [Theory] @@ -584,10 +604,10 @@ public void SdcaL1UpdateUTest(int test) [InlineData(1)] public void SdcaL1UpdateSUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); + float[] src = (float[])_testArrays[test].Clone(); float[] v = (float[])src.Clone(); float[] w = (float[])src.Clone(); - int[] idx = testIndexArray; + int[] idx = _testIndexArray; float[] expected = (float[])w.Clone(); for (int i = 0; i < idx.Length; i++) @@ -599,7 +619,7 @@ public void SdcaL1UpdateSUTest(int test) CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); var actual = w; - Assert.Equal(expected, actual, comparer); + Assert.Equal(expected, actual, _comparer); } } @@ -615,4 +635,17 @@ public int GetHashCode(float a) throw new NotImplementedException(); } } -} + + internal class FloatEqualityComparerForMatMul : IEqualityComparer<float> + { + public bool Equals(float a, float b) + { + return Math.Abs(a - b) < 1e-3f; + } + + public int GetHashCode(float a) + { + throw new NotImplementedException(); + } + } +} \ No newline at end of file
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj index 9552f688a8..862c95ef90 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/Microsoft.ML.CpuMath.UnitTests.netstandard.csproj @@ -12,5 +12,9 @@ + + + + - + \ No newline at end of file
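
A minimal scalar cross-check of the new dense expectations, not part of the patch itself. It assumes the second test matrix is filled row-major with 1..128 and the 16-element source vector with 1..16; those values are inferred from the asserted results above (row 0 of the MatMulATest case (1, 1, 0) is 1*1 + 2*2 + ... + 16*16 = 1496), and the class and method names below are purely illustrative.

using System;

internal static class MatMulExpectedValueCheck
{
    internal static void Main()
    {
        const int crow = 8, ccol = 16;

        // Assumed sequential test data: 8x16 matrix holding 1..128, source vector holding 1..16.
        var mat = new float[crow * ccol];
        for (int i = 0; i < mat.Length; i++)
            mat[i] = i + 1;

        var src = new float[ccol];
        for (int i = 0; i < src.Length; i++)
            src[i] = i + 1;

        // Plain scalar product: dst[r] = sum over c of mat[r, c] * src[c].
        var dst = new float[crow];
        for (int r = 0; r < crow; r++)
        {
            float sum = 0;
            for (int c = 0; c < ccol; c++)
                sum += mat[r * ccol + c] * src[c];
            dst[r] = sum;
        }

        // Prints 1496, 3672, 5848, 8024, 10200, 12376, 14552, 16728,
        // matching the expected array in the MatMulATest case (1, 1, 0).
        Console.WriteLine(string.Join(", ", dst));
    }
}

Running the same loops with the roles swapped (a 16x8 view of the matrix and the 8-element source 1..8) reproduces the 204, 492, ..., 4524 sequence asserted in the (1, 0, 1) case, so the widened 8- and 16-element expectations follow directly from the sequential test data rather than from the vector width change.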