From 3e407095aefd85b8b58bd203baff2fe63e7957c4 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 23 May 2024 22:33:14 -0700 Subject: [PATCH 01/11] basic impl --- .../Lfu/CmSketchTests.cs | 2 +- BitFaster.Caching/Intrinsics.cs | 28 +++- BitFaster.Caching/Lfu/CmSketchCore.cs | 153 ++++++++++++++++++ 3 files changed, 179 insertions(+), 4 deletions(-) diff --git a/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs b/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs index 85de5040..3aa61087 100644 --- a/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs +++ b/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs @@ -6,7 +6,7 @@ namespace BitFaster.Caching.UnitTests.Lfu { - // Test with AVX2 if it is supported + // Test with AVX2 or ARM64 if it is supported public class CMSketchAvx2Tests : CmSketchTestBase { } diff --git a/BitFaster.Caching/Intrinsics.cs b/BitFaster.Caching/Intrinsics.cs index 8a1bd29a..45908a01 100644 --- a/BitFaster.Caching/Intrinsics.cs +++ b/BitFaster.Caching/Intrinsics.cs @@ -2,6 +2,10 @@ using System.Runtime.Intrinsics.X86; #endif +#if NET6_0 +using System.Runtime.Intrinsics.Arm; +#endif + namespace BitFaster.Caching { /// @@ -12,7 +16,14 @@ public interface IsaProbe /// /// Gets a value indicating whether AVX2 is supported. /// - bool IsAvx2Supported { get; } + bool IsAvx2Supported { get; } + +#if NET6_0_OR_GREATER + /// + /// Gets a value indicating whether Arm64 is supported. + /// + bool IsArm64Supported { get => false; } +#endif } /// @@ -25,7 +36,15 @@ public interface IsaProbe public bool IsAvx2Supported => false; #else /// - public bool IsAvx2Supported => Avx2.IsSupported; + public bool IsAvx2Supported => Avx2.IsSupported; +#endif + +#if NET6_0_OR_GREATER + /// + public bool IsArm64Supported => AdvSimd.Arm64.IsSupported; +#else + /// + public bool IsArm64Supported => false; #endif } @@ -35,6 +54,9 @@ public interface IsaProbe public readonly struct DisableHardwareIntrinsics : IsaProbe { /// - public bool IsAvx2Supported => false; + public bool IsAvx2Supported => false; + + /// + public bool IsArm64Supported => false; } } diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index de255840..e8d35f95 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -8,6 +8,10 @@ using System.Runtime.Intrinsics.X86; #endif +#if NET6_0_OR_GREATER +using System.Runtime.Intrinsics.Arm; +#endif + namespace BitFaster.Caching.Lfu { /// @@ -76,6 +80,12 @@ public int EstimateFrequency(T value) { return EstimateFrequencyAvx(value); } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + return EstimateFrequencyArm(value); + } +#endif else { return EstimateFrequencyStd(value); @@ -99,6 +109,12 @@ public void Increment(T value) { IncrementAvx(value); } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + IncrementArm(value); + } +#endif else { IncrementStd(value); @@ -329,5 +345,142 @@ private unsafe void IncrementAvx(T value) } } #endif + +#if NET6_0_OR_GREATER + private unsafe void IncrementArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = AdvSimd.ShiftArithmetic(h, Vector128.Create(0, -8, -16, -24)); + + Vector128 index = AdvSimd.ShiftRightLogical(h, 1); + index = AdvSimd.And(index, Vector128.Create(15)); // j - counter index + Vector128 offset = AdvSimd.And(h, Vector128.Create(1)); + Vector128 blockOffset = AdvSimd.Add(Vector128.Create(block), offset); // i - table index + blockOffset = AdvSimd.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + + fixed (long* tablePtr = table) + { + int t0 = AdvSimd.Extract(blockOffset, 0); + int t1 = AdvSimd.Extract(blockOffset, 1); + int t2 = AdvSimd.Extract(blockOffset, 2); + int t3 = AdvSimd.Extract(blockOffset, 3); + + var ta0 = AdvSimd.LoadVector64(tablePtr + t0); + var ta1 = AdvSimd.LoadVector64(tablePtr + t1); + var ta2 = AdvSimd.LoadVector64(tablePtr + t2); + var ta3 = AdvSimd.LoadVector64(tablePtr + t3); + + Vector128 tableVectorA = Vector128.Create(ta0, ta1); + Vector128 tableVectorB = Vector128.Create(ta2, ta3); + + // TODO: VectorTableLookup + //Vector128 tableVectorA = Vector128.Create( + // tablePtr[t0], + // tablePtr[t1]); + //Vector128 tableVectorB = Vector128.Create( + // tablePtr[t2], + // tablePtr[t3]); + + // j == index + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0); + longOffA = AdvSimd.Arm64.InsertSelectedScalar(longOffA, 2, index, 1); + + Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2); + longOffB = AdvSimd.Arm64.InsertSelectedScalar(longOffB, 2, index, 3); + + Vector128 fifteen = Vector128.Create(0xfL); + Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); + Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); + + Vector128 maskedA = AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA); + Vector128 maskedB = AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB); + + var one = Vector128.Create(1L); + Vector128 incA = AdvSimd.ShiftArithmetic(one, longOffA.AsInt64()); + Vector128 incB = AdvSimd.ShiftArithmetic(one, longOffB.AsInt64()); + + maskedA = AdvSimd.Not(maskedA); + maskedB = AdvSimd.Not(maskedB); + + incA = AdvSimd.And(maskedA, incA); + incB = AdvSimd.And(maskedA, incB); + + tablePtr[t0] += AdvSimd.Extract(incA, 0); + tablePtr[t1] += AdvSimd.Extract(incA, 1); + tablePtr[t2] += AdvSimd.Extract(incB, 0); + tablePtr[t3] += AdvSimd.Extract(incB, 1); + + var maxA = AdvSimd.Arm64.MaxAcross(incA.AsInt32()); + var maxB = AdvSimd.Arm64.MaxAcross(incB.AsInt32()); + maxA = AdvSimd.Arm64.InsertSelectedScalar(maxA, 1, maxB, 0); + var max = AdvSimd.Arm64.MaxAcross(maxA.AsInt16()); + + if (max.ToScalar() != 0 && (++size == sampleSize)) + { + Reset(); + } + } + } + + private unsafe int EstimateFrequencyArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Vector128.Create(counterHash); + h = AdvSimd.ShiftArithmetic(h, Vector128.Create(0, -8, -16, -24)); + + Vector128 index = AdvSimd.ShiftRightLogical(h, 1); + + index = AdvSimd.And(index, Vector128.Create(0xf)); // j - counter index + Vector128 offset = AdvSimd.And(h, Vector128.Create(1)); + Vector128 blockOffset = AdvSimd.Add(Vector128.Create(block), offset); // i - table index + blockOffset = AdvSimd.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + + fixed (long* tablePtr = table) + { + // TODO: VectorTableLookup + Vector128 tableVectorA = Vector128.Create( + tablePtr[AdvSimd.Extract(blockOffset, 0)], + tablePtr[AdvSimd.Extract(blockOffset, 1)]); + Vector128 tableVectorB = Vector128.Create( + tablePtr[AdvSimd.Extract(blockOffset, 2)], + tablePtr[AdvSimd.Extract(blockOffset, 3)]); + + // j == index + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 indexA = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0); + indexA = AdvSimd.Arm64.InsertSelectedScalar(indexA, 2, index, 1); + + Vector128 indexB = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2); + indexB = AdvSimd.Arm64.InsertSelectedScalar(indexB, 2, index, 3); + + indexA = AdvSimd.Negate(indexA); + indexB = AdvSimd.Negate(indexB); + + Vector128 a = AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()); + Vector128 b = AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()); + + var fifteen = Vector128.Create(0xfL); + a = AdvSimd.And(a, fifteen); + b = AdvSimd.And(b, fifteen); + + var minA = AdvSimd.Arm64.MinAcross(a.AsInt32()); + var minB = AdvSimd.Arm64.MinAcross(b.AsInt32()); + minA = AdvSimd.Arm64.InsertSelectedScalar(minA, 1, minB, 0); + var min = AdvSimd.Arm64.MinAcross(minA.AsInt16()); + + return min.ToScalar(); + } + } +#endif } } From adc98697bdc3cae9e1282c9e6efce22b921e8f2d Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 25 May 2024 00:56:58 +0000 Subject: [PATCH 02/11] run tests --- BitFaster.Caching.UnitTests/Intrinsics.cs | 11 +++++++++++ BitFaster.Caching/Lfu/CmSketchCore.cs | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/BitFaster.Caching.UnitTests/Intrinsics.cs b/BitFaster.Caching.UnitTests/Intrinsics.cs index 312d78a1..6827a93b 100644 --- a/BitFaster.Caching.UnitTests/Intrinsics.cs +++ b/BitFaster.Caching.UnitTests/Intrinsics.cs @@ -1,6 +1,10 @@ #if NETCOREAPP3_1_OR_GREATER using System.Runtime.Intrinsics.X86; #endif +#if NET6_0_OR_GREATER +using System.Runtime.Intrinsics.Arm; +#endif + using Xunit; namespace BitFaster.Caching.UnitTests @@ -10,8 +14,15 @@ public static class Intrinsics public static void SkipAvxIfNotSupported() { #if NETCOREAPP3_1_OR_GREATER +#if NET6_0_OR_GREATER // when we are trying to test Avx2, skip the test if it's not supported + Skip.If(typeof(I) == typeof(DetectIsa) && !(Avx2.IsSupported || AdvSimd.Arm64.IsSupported)); +#else +// when we are trying to test Avx2, skip the test if it's not supported Skip.If(typeof(I) == typeof(DetectIsa) && !Avx2.IsSupported); +#endif + + #else Skip.If(true); #endif diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index e8d35f95..f4e28eed 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -409,7 +409,7 @@ private unsafe void IncrementArm(T value) maskedB = AdvSimd.Not(maskedB); incA = AdvSimd.And(maskedA, incA); - incB = AdvSimd.And(maskedA, incB); + incB = AdvSimd.And(maskedB, incB); tablePtr[t0] += AdvSimd.Extract(incA, 0); tablePtr[t1] += AdvSimd.Extract(incA, 1); From dbe9b79f3e7748a10cf0488e689892cef67a2e4a Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 25 May 2024 02:33:00 +0000 Subject: [PATCH 03/11] fix --- .../Lfu/SketchIncrement.cs | 2 ++ BitFaster.Caching.UnitTests/Intrinsics.cs | 11 +++++------ BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs | 15 ++++++++++++--- BitFaster.Caching/Lfu/CmSketchCore.cs | 13 ++++++++----- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index eb005032..886d8349 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -1,5 +1,6 @@  using System.Collections.Generic; +using Benchly; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; using BitFaster.Caching.Lfu; @@ -9,6 +10,7 @@ namespace BitFaster.Caching.Benchmarks.Lfu [SimpleJob(RuntimeMoniker.Net60)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] + [ColumnChart(Title = "Sketch Increment ({JOB})")] public class SketchIncrement { const int iterations = 1_048_576; diff --git a/BitFaster.Caching.UnitTests/Intrinsics.cs b/BitFaster.Caching.UnitTests/Intrinsics.cs index 6827a93b..ebbe194a 100644 --- a/BitFaster.Caching.UnitTests/Intrinsics.cs +++ b/BitFaster.Caching.UnitTests/Intrinsics.cs @@ -14,14 +14,13 @@ public static class Intrinsics public static void SkipAvxIfNotSupported() { #if NETCOREAPP3_1_OR_GREATER -#if NET6_0_OR_GREATER - // when we are trying to test Avx2, skip the test if it's not supported + #if NET6_0_OR_GREATER + // when we are trying to test Avx2/Arm64, skip the test if it's not supported Skip.If(typeof(I) == typeof(DetectIsa) && !(Avx2.IsSupported || AdvSimd.Arm64.IsSupported)); -#else -// when we are trying to test Avx2, skip the test if it's not supported + #else + // when we are trying to test Avx2, skip the test if it's not supported Skip.If(typeof(I) == typeof(DetectIsa) && !Avx2.IsSupported); -#endif - + #endif #else Skip.If(true); diff --git a/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs b/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs index 3aa61087..ba32b42e 100644 --- a/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs +++ b/BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs @@ -6,12 +6,12 @@ namespace BitFaster.Caching.UnitTests.Lfu { - // Test with AVX2 or ARM64 if it is supported - public class CMSketchAvx2Tests : CmSketchTestBase + // Test with AVX2/ARM64 if it is supported + public class CMSketchIntrinsicsTests : CmSketchTestBase { } - // Test with AVX2 disabled + // Test with AVX2/ARM64 disabled public class CmSketchTests : CmSketchTestBase { } @@ -29,14 +29,23 @@ public CmSketchTestBase() public void Repro() { sketch = new CmSketchCore(1_048_576, EqualityComparer.Default); + var baseline = new CmSketchCore(1_048_576, EqualityComparer.Default); for (int i = 0; i < 1_048_576; i++) { if (i % 3 == 0) { sketch.Increment(i); + baseline.Increment(i); } } + + baseline.Size.Should().Be(sketch.Size); + + for (int i = 0; i < 1_048_576; i++) + { + sketch.EstimateFrequency(i).Should().Be(baseline.EstimateFrequency(i)); + } } diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index f4e28eed..6c1a4062 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -473,12 +473,15 @@ private unsafe int EstimateFrequencyArm(T value) a = AdvSimd.And(a, fifteen); b = AdvSimd.And(b, fifteen); - var minA = AdvSimd.Arm64.MinAcross(a.AsInt32()); - var minB = AdvSimd.Arm64.MinAcross(b.AsInt32()); - minA = AdvSimd.Arm64.InsertSelectedScalar(minA, 1, minB, 0); - var min = AdvSimd.Arm64.MinAcross(minA.AsInt16()); + // TODO: VectorTableLookup + Vector128 x = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, a.AsInt32(), 0); + x = AdvSimd.Arm64.InsertSelectedScalar(x, 1, a.AsInt32(), 2); + x = AdvSimd.Arm64.InsertSelectedScalar(x, 2, b.AsInt32(), 0); + x = AdvSimd.Arm64.InsertSelectedScalar(x, 3, b.AsInt32(), 2); + + var minA = AdvSimd.Arm64.MinAcross(x); - return min.ToScalar(); + return minA.ToScalar(); } } #endif From 75481ffa487abae1c8b1b85da9c0e9db668ac3a2 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 25 May 2024 03:25:59 +0000 Subject: [PATCH 04/11] table lookup --- BitFaster.Caching/Lfu/CmSketchCore.cs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 6c1a4062..573baa97 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -474,14 +474,17 @@ private unsafe int EstimateFrequencyArm(T value) b = AdvSimd.And(b, fifteen); // TODO: VectorTableLookup - Vector128 x = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, a.AsInt32(), 0); - x = AdvSimd.Arm64.InsertSelectedScalar(x, 1, a.AsInt32(), 2); - x = AdvSimd.Arm64.InsertSelectedScalar(x, 2, b.AsInt32(), 0); - x = AdvSimd.Arm64.InsertSelectedScalar(x, 3, b.AsInt32(), 2); + //Vector128 x = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, a.AsInt32(), 0); + //x = AdvSimd.Arm64.InsertSelectedScalar(x, 1, a.AsInt32(), 2); + //x = AdvSimd.Arm64.InsertSelectedScalar(x, 2, b.AsInt32(), 0); + //x = AdvSimd.Arm64.InsertSelectedScalar(x, 3, b.AsInt32(), 2); - var minA = AdvSimd.Arm64.MinAcross(x); + var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); + min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); - return minA.ToScalar(); + var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32()); + + return min32.ToScalar(); } } #endif From 87105f6ce6122285c2db0a01d06b8631efab8dcd Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 25 May 2024 04:59:06 +0000 Subject: [PATCH 05/11] opt --- BitFaster.Caching/Lfu/CmSketchCore.cs | 100 +++++++------------------- 1 file changed, 25 insertions(+), 75 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 573baa97..05e40196 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; #if !NETSTANDARD2_0 @@ -347,20 +348,16 @@ private unsafe void IncrementAvx(T value) #endif #if NET6_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveOptimization | MethodImplOptions.AggressiveInlining)] private unsafe void IncrementArm(T value) { int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Vector128.Create(counterHash); - h = AdvSimd.ShiftArithmetic(h, Vector128.Create(0, -8, -16, -24)); - - Vector128 index = AdvSimd.ShiftRightLogical(h, 1); - index = AdvSimd.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = AdvSimd.And(h, Vector128.Create(1)); - Vector128 blockOffset = AdvSimd.Add(Vector128.Create(block), offset); // i - table index - blockOffset = AdvSimd.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); fixed (long* tablePtr = table) { @@ -369,47 +366,24 @@ private unsafe void IncrementArm(T value) int t2 = AdvSimd.Extract(blockOffset, 2); int t3 = AdvSimd.Extract(blockOffset, 3); - var ta0 = AdvSimd.LoadVector64(tablePtr + t0); - var ta1 = AdvSimd.LoadVector64(tablePtr + t1); - var ta2 = AdvSimd.LoadVector64(tablePtr + t2); - var ta3 = AdvSimd.LoadVector64(tablePtr + t3); - - Vector128 tableVectorA = Vector128.Create(ta0, ta1); - Vector128 tableVectorB = Vector128.Create(ta2, ta3); - - // TODO: VectorTableLookup - //Vector128 tableVectorA = Vector128.Create( - // tablePtr[t0], - // tablePtr[t1]); - //Vector128 tableVectorB = Vector128.Create( - // tablePtr[t2], - // tablePtr[t3]); + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1)); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3)); - // j == index index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0); - longOffA = AdvSimd.Arm64.InsertSelectedScalar(longOffA, 2, index, 1); - - Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2); - longOffB = AdvSimd.Arm64.InsertSelectedScalar(longOffB, 2, index, 3); + Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1); + Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3); Vector128 fifteen = Vector128.Create(0xfL); Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); - Vector128 maskedA = AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA); - Vector128 maskedB = AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB); + Vector128 maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA)); + Vector128 maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB)); var one = Vector128.Create(1L); - Vector128 incA = AdvSimd.ShiftArithmetic(one, longOffA.AsInt64()); - Vector128 incB = AdvSimd.ShiftArithmetic(one, longOffB.AsInt64()); - - maskedA = AdvSimd.Not(maskedA); - maskedB = AdvSimd.Not(maskedB); - - incA = AdvSimd.And(maskedA, incA); - incB = AdvSimd.And(maskedB, incB); + Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); + Vector128 incB = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); tablePtr[t0] += AdvSimd.Extract(incA, 0); tablePtr[t1] += AdvSimd.Extract(incA, 1); @@ -428,57 +402,33 @@ private unsafe void IncrementArm(T value) } } + [MethodImpl(MethodImplOptions.AggressiveOptimization | MethodImplOptions.AggressiveInlining)] private unsafe int EstimateFrequencyArm(T value) { int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Vector128.Create(counterHash); - h = AdvSimd.ShiftArithmetic(h, Vector128.Create(0, -8, -16, -24)); - - Vector128 index = AdvSimd.ShiftRightLogical(h, 1); - - index = AdvSimd.And(index, Vector128.Create(0xf)); // j - counter index - Vector128 offset = AdvSimd.And(h, Vector128.Create(1)); - Vector128 blockOffset = AdvSimd.Add(Vector128.Create(block), offset); // i - table index - blockOffset = AdvSimd.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); fixed (long* tablePtr = table) { - // TODO: VectorTableLookup - Vector128 tableVectorA = Vector128.Create( - tablePtr[AdvSimd.Extract(blockOffset, 0)], - tablePtr[AdvSimd.Extract(blockOffset, 1)]); - Vector128 tableVectorB = Vector128.Create( - tablePtr[AdvSimd.Extract(blockOffset, 2)], - tablePtr[AdvSimd.Extract(blockOffset, 3)]); + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); - // j == index index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - Vector128 indexA = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0); - indexA = AdvSimd.Arm64.InsertSelectedScalar(indexA, 2, index, 1); - - Vector128 indexB = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2); - indexB = AdvSimd.Arm64.InsertSelectedScalar(indexB, 2, index, 3); - - indexA = AdvSimd.Negate(indexA); - indexB = AdvSimd.Negate(indexB); - - Vector128 a = AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()); - Vector128 b = AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()); + Vector128 indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1)); + Vector128 indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3)); var fifteen = Vector128.Create(0xfL); - a = AdvSimd.And(a, fifteen); - b = AdvSimd.And(b, fifteen); - - // TODO: VectorTableLookup - //Vector128 x = AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, a.AsInt32(), 0); - //x = AdvSimd.Arm64.InsertSelectedScalar(x, 1, a.AsInt32(), 2); - //x = AdvSimd.Arm64.InsertSelectedScalar(x, 2, b.AsInt32(), 0); - //x = AdvSimd.Arm64.InsertSelectedScalar(x, 3, b.AsInt32(), 2); + Vector128 a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen); + Vector128 b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen); + // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F > + // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F > var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); From 7d7d0232a02ca034affc06709644ff90b9b91c23 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 25 May 2024 05:42:17 +0000 Subject: [PATCH 06/11] opt --- BitFaster.Caching/Lfu/CmSketchCore.cs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 05e40196..9b13c2fd 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -383,17 +383,14 @@ private unsafe void IncrementArm(T value) var one = Vector128.Create(1L); Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); - Vector128 incB = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); + Vector128 incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); tablePtr[t0] += AdvSimd.Extract(incA, 0); tablePtr[t1] += AdvSimd.Extract(incA, 1); tablePtr[t2] += AdvSimd.Extract(incB, 0); tablePtr[t3] += AdvSimd.Extract(incB, 1); - var maxA = AdvSimd.Arm64.MaxAcross(incA.AsInt32()); - var maxB = AdvSimd.Arm64.MaxAcross(incB.AsInt32()); - maxA = AdvSimd.Arm64.InsertSelectedScalar(maxA, 1, maxB, 0); - var max = AdvSimd.Arm64.MaxAcross(maxA.AsInt16()); + var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16()); if (max.ToScalar() != 0 && (++size == sampleSize)) { From 8a3be6e9ea059773c75d7b286d4b9131ded6f40b Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sun, 24 Nov 2024 03:01:15 +0000 Subject: [PATCH 07/11] temp --- .../BitFaster.Caching.Benchmarks.csproj | 2 +- BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs | 2 ++ BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs | 2 ++ BitFaster.Caching/BitFaster.Caching.csproj | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index 11d69738..7192cd7c 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe - net48;net6.0;net8.0 + net6.0;net8.0 True true diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index b97bc19d..11462b4c 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -8,6 +8,8 @@ namespace BitFaster.Caching.Benchmarks.Lfu { [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] [ColumnChart(Title ="Sketch Frequency ({JOB})")] diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 886d8349..08f42cfe 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -8,6 +8,8 @@ namespace BitFaster.Caching.Benchmarks.Lfu { [SimpleJob(RuntimeMoniker.Net60)] + [SimpleJob(RuntimeMoniker.Net80)] + [SimpleJob(RuntimeMoniker.Net90)] [MemoryDiagnoser(displayGenColumns: false)] [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")] [ColumnChart(Title = "Sketch Increment ({JOB})")] diff --git a/BitFaster.Caching/BitFaster.Caching.csproj b/BitFaster.Caching/BitFaster.Caching.csproj index 9cb1279c..eff52630 100644 --- a/BitFaster.Caching/BitFaster.Caching.csproj +++ b/BitFaster.Caching/BitFaster.Caching.csproj @@ -29,7 +29,7 @@ true enable - true + false 2.5.0 From c972f72904e573236f69b2e4557f482987689e80 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 27 Nov 2024 01:53:43 +0000 Subject: [PATCH 08/11] cleanup --- .../BitFaster.Caching.Benchmarks.csproj | 8 + .../Lfu/CmSketchNoPin.cs | 105 +++- .../Lfu/SketchFrequency.cs | 25 +- .../Lfu/SketchIncrement.cs | 25 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 2 - BitFaster.Caching/Lfu/CmSketchCore512.cs | 473 ------------------ 6 files changed, 134 insertions(+), 504 deletions(-) delete mode 100644 BitFaster.Caching/Lfu/CmSketchCore512.cs diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index 892da036..aa79c2b9 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -9,6 +9,8 @@ true true true + true + true @@ -41,5 +43,11 @@ MacOS + + Arm64 + + + X64 + diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs index 809a3f9b..56ea0ced 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs @@ -1,9 +1,12 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; + #if NET6_0_OR_GREATER using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; #endif @@ -61,6 +64,12 @@ public int EstimateFrequency(T value) { return EstimateFrequencyAvx(value); } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + return EstimateFrequencyArm(value); + } +#endif else { return EstimateFrequencyStd(value); @@ -84,11 +93,16 @@ public void Increment(T value) { IncrementAvx(value); } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + IncrementArm(value); + } +#endif else { IncrementStd(value); } -#endif } /// @@ -314,5 +328,94 @@ private unsafe void IncrementAvx(T value) } } #endif + +#if NET6_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void IncrementArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + fixed (long* tablePtr = table) + { + int t0 = AdvSimd.Extract(blockOffset, 0); + int t1 = AdvSimd.Extract(blockOffset, 1); + int t2 = AdvSimd.Extract(blockOffset, 2); + int t3 = AdvSimd.Extract(blockOffset, 3); + + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1)); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3)); + + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1); + Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3); + + Vector128 fifteen = Vector128.Create(0xfL); + Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); + Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); + + Vector128 maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA)); + Vector128 maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB)); + + var one = Vector128.Create(1L); + Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); + Vector128 incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); + + tablePtr[t0] += AdvSimd.Extract(incA, 0); + tablePtr[t1] += AdvSimd.Extract(incA, 1); + tablePtr[t2] += AdvSimd.Extract(incB, 0); + tablePtr[t3] += AdvSimd.Extract(incB, 1); + + var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16()); + + if (max.ToScalar() != 0 && (++size == sampleSize)) + { + Reset(); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int EstimateFrequencyArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + fixed (long* tablePtr = table) + { + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); + + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1)); + Vector128 indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3)); + + var fifteen = Vector128.Create(0xfL); + Vector128 a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen); + Vector128 b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen); + + // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F > + // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F > + var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); + min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); + + var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32()); + + return min32.ToScalar(); + } + } +#endif } } diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 2e855078..9fb8a968 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -24,7 +24,6 @@ public class SketchFrequency private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockVector; - private CmSketchCore512 blockVector512; [Params(32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } @@ -38,7 +37,6 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockVector = new CmSketchCore(Size, EqualityComparer.Default); - blockVector512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -50,7 +48,7 @@ public int FrequencyFlat() return count; } - +#if X64 [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyFlatAvx() { @@ -60,7 +58,7 @@ public int FrequencyFlatAvx() return count; } - +#endif [Benchmark(OperationsPerInvoke = iterations)] public int FrequencyBlock() { @@ -72,7 +70,11 @@ public int FrequencyBlock() } [Benchmark(OperationsPerInvoke = iterations)] +#if Arm64 + public int FrequencyBlockNeonNotPinned() +#else public int FrequencyBlockAvxNotPinned() +#endif { int count = 0; for (int i = 0; i < iterations; i++) @@ -82,7 +84,12 @@ public int FrequencyBlockAvxNotPinned() } [Benchmark(OperationsPerInvoke = iterations)] + +#if Arm64 + public int FrequencyBlockNeonPinned() +#else public int FrequencyBlockAvxPinned() +#endif { int count = 0; for (int i = 0; i < iterations; i++) @@ -90,15 +97,5 @@ public int FrequencyBlockAvxPinned() return count; } - - [Benchmark(OperationsPerInvoke = iterations)] - public int FrequencyBlockAvxPinned512() - { - int count = 0; - for (int i = 0; i < iterations; i++) - count += blockVector512.EstimateFrequency(i) > blockVector.EstimateFrequency(i + 1) ? 1 : 0; - - return count; - } } } diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 8607cdcd..dcfa1e7e 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -23,7 +23,6 @@ public class SketchIncrement private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; - private CmSketchCore512 blockVector512; [Params(32_768, 524_288, 8_388_608, 134_217_728)] @@ -38,7 +37,6 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); - blockVector512 = new CmSketchCore512(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -49,7 +47,7 @@ public void IncFlat() flatStd.Increment(i); } } - +#if X64 [Benchmark(OperationsPerInvoke = iterations)] public void IncFlatAvx() { @@ -58,7 +56,7 @@ public void IncFlatAvx() flatAvx.Increment(i); } } - +#endif [Benchmark(OperationsPerInvoke = iterations)] public void IncBlock() { @@ -69,7 +67,11 @@ public void IncBlock() } [Benchmark(OperationsPerInvoke = iterations)] - public void IncBlockAvxNotPinned() +#if Arm64 + public void IncBlockNeonNotPinned() +#else + public int IncBlockAvxNotPinned() +#endif { for (int i = 0; i < iterations; i++) { @@ -78,21 +80,16 @@ public void IncBlockAvxNotPinned() } [Benchmark(OperationsPerInvoke = iterations)] +#if Arm64 + public void IncBlockNeonPinned() +#else public void IncBlockAvxPinned() +#endif { for (int i = 0; i < iterations; i++) { blockAvx.Increment(i); } } - - [Benchmark(OperationsPerInvoke = iterations)] - public void IncBlockAvxPinned512() - { - for (int i = 0; i < iterations; i++) - { - blockVector512.Increment(i); - } - } } } diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index f144ae9d..acc822e6 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -392,7 +392,6 @@ private unsafe void IncrementArm(T value) Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); long* tablePtr = tableAddr; - //fixed (long* tablePtr = table) { int t0 = AdvSimd.Extract(blockOffset, 0); int t1 = AdvSimd.Extract(blockOffset, 1); @@ -444,7 +443,6 @@ private unsafe int EstimateFrequencyArm(T value) Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); long* tablePtr = tableAddr; - //fixed (long* tablePtr = table) { Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); diff --git a/BitFaster.Caching/Lfu/CmSketchCore512.cs b/BitFaster.Caching/Lfu/CmSketchCore512.cs deleted file mode 100644 index 268be533..00000000 --- a/BitFaster.Caching/Lfu/CmSketchCore512.cs +++ /dev/null @@ -1,473 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - - -#if !NETSTANDARD2_0 -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif - -#if NET6_0_OR_GREATER -using System.Runtime.Intrinsics.Arm; -#endif - -namespace BitFaster.Caching.Lfu -{ - /// - /// A probabilistic data structure used to estimate the frequency of a given value. Periodic aging reduces the - /// accumulated count across all values over time, such that a historic popular value will decay to zero frequency - /// over time if it is not accessed. - /// - /// - /// The maximum frequency of an element is limited to 15 (4-bits). Each element is hashed to a 64 byte 'block' - /// consisting of 4 segments of 32 4-bit counters. The 64 byte blocks are the same size as x64 L1 cache lines. - /// While the blocks are not guaranteed to be aligned, this scheme minimizes L1 cache misses resulting in a - /// significant speedup. When supported, a vectorized AVX2 code path provides a further speedup. Together, block - /// and AVX2 are approximately 2x faster than the original implementation. - /// - /// This is a direct C# translation of FrequencySketch in the Caffeine library by ben.manes@gmail.com (Ben Manes). - /// https://github.com/ben-manes/caffeine - public unsafe class CmSketchCore512 - where T : notnull - where I : struct, IsaProbe - { - private const long ResetMask = 0x7777777777777777L; - private const long OneMask = 0x1111111111111111L; - - private long[] table; - private int sampleSize; - private int blockMask; - private int size; - -#if NET6_0_OR_GREATER - private long* tableAddr; -#endif - - private readonly IEqualityComparer comparer; - - /// - /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. - /// - /// The maximum size. - /// The equality comparer. - public CmSketchCore512(long maximumSize, IEqualityComparer comparer) - { - EnsureCapacity(maximumSize); - this.comparer = comparer; - } - - /// - /// Gets the reset sample size. - /// - public int ResetSampleSize => this.sampleSize; - - /// - /// Gets the size. - /// - public int Size => this.size; - - /// - /// Estimate the frequency of the specified value, up to the maximum of 15. - /// - /// The value. - /// The estimated frequency of the value. - public int EstimateFrequency(T value) - { -#if NETSTANDARD2_0 - return EstimateFrequencyStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - return EstimateFrequencyAvx(value); - } -#if NET6_0_OR_GREATER - else if (isa.IsArm64Supported) - { - return EstimateFrequencyArm(value); - } -#endif - else - { - return EstimateFrequencyStd(value); - } -#endif - } - - /// - /// Increment the count of the specified value. - /// - /// The value. - public void Increment(T value) - { -#if NETSTANDARD2_0 - IncrementStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - IncrementAvx(value); - } -#if NET6_0_OR_GREATER - else if (isa.IsArm64Supported) - { - IncrementArm(value); - } -#endif - else - { - IncrementStd(value); - } -#endif - } - - /// - /// Clears the count for all items. - /// - public void Clear() - { - Array.Clear(table, 0, table.Length); - size = 0; - } - - [MemberNotNull(nameof(table))] - private void EnsureCapacity(long maximumSize) - { - int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); - -#if NET6_0_OR_GREATER - I isa = default; - if (isa.IsAvx2Supported || isa.IsArm64Supported) - { - // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes - const int pad = 8; - bool pinned = true; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); - - tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); - - blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; - } - else -#endif - { - table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)(table.Length) >> 3) - 1; - } - - sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); - - size = 0; - } - - private unsafe int EstimateFrequencyStd(T value) - { - var count = stackalloc int[4]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); - } - - private unsafe void IncrementStd(T value) - { - var index = stackalloc int[8]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } - - bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); - - if (added && (++size == sampleSize)) - { - Reset(); - } - } - - // Applies another round of hashing for additional randomization. - private static int Rehash(int x) - { - x = (int)(x * 0x31848bab); - x ^= (int)((uint)x >> 14); - return x; - } - - // Applies a supplemental hash function to defend against poor quality hash. - private static int Spread(int x) - { - x ^= (int)((uint)x >> 17); - x = (int)(x * 0xed5ad4bb); - x ^= (int)((uint)x >> 11); - x = (int)(x * 0xac4c1b51); - x ^= (int)((uint)x >> 15); - return x; - } - - private bool IncrementAt(int i, int j) - { - int offset = j << 2; - long mask = (0xfL << offset); - - if ((table[i] & mask) != mask) - { - table[i] += (1L << offset); - return true; - } - - return false; - } - - private void Reset() - { - // unroll, almost 2x faster - int count0 = 0; - int count1 = 0; - int count2 = 0; - int count3 = 0; - - for (int i = 0; i < table.Length; i += 4) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - count2 += BitOps.BitCount(table[i + 2] & OneMask); - count3 += BitOps.BitCount(table[i + 3] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - count0 = (count0 + count1) + (count2 + count3); - - size = (size - (count0 >> 2)) >> 1; - } - -#if !NETSTANDARD2_0 - private unsafe int EstimateFrequencyAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - - var index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - index = Avx2.ShiftLeftLogical(index, 2); - - // convert index from int to long via permute - Vector256 indexLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - Vector256 permuteMask2 = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - indexLong = Avx2.PermuteVar8x32(indexLong.AsInt32(), permuteMask2).AsInt64(); - tableVector = Avx2.ShiftRightLogicalVariable(tableVector, indexLong.AsUInt64()); - tableVector = Avx2.And(tableVector, Vector256.Create(0xfL)); - - Vector256 permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7); - Vector128 count = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask) - .GetLower() - .AsUInt16(); - - // set the zeroed high parts of the long value to ushort.Max -#if NET6_0_OR_GREATER - count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); -#else - count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); -#endif - - return Avx2.MinHorizontal(count).GetElement(0); - } - } - - private unsafe void IncrementAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Vector128.Create(counterHash); - h = Avx2.ShiftRightLogicalVariable(h.AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - - Vector128 index = Avx2.ShiftRightLogical(h, 1); - index = Avx2.And(index, Vector128.Create(15)); // j - counter index - Vector128 offset = Avx2.And(h, Vector128.Create(1)); - Vector128 blockOffset = Avx2.Add(Vector128.Create(block), offset); // i - table index - blockOffset = Avx2.Add(blockOffset, Vector128.Create(0, 2, 4, 6)); // + (i << 1) - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector256 tableVector = Avx2.GatherVector256(tablePtr, blockOffset, 8); - - // j == index - index = Avx2.ShiftLeftLogical(index, 2); - Vector256 offsetLong = Vector256.Create(index, Vector128.Zero).AsInt64(); - - Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7); - offsetLong = Avx2.PermuteVar8x32(offsetLong.AsInt32(), permuteMask).AsInt64(); - - // mask = (0xfL << offset) - Vector256 fifteen = Vector256.Create(0xfL); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(fifteen, offsetLong.AsUInt64()); - - // (table[i] & mask) != mask) - // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask); - - // 1L << offset - Vector256 inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong.AsUInt64()); - - // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - inc = Avx2.AndNot(masked, inc); - - Vector256 result = Avx2.CompareEqual(masked.AsByte(), Vector256.Zero); - bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - - tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); - tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); - tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); - tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); - - if (wasInc && (++size == sampleSize)) - { - Reset(); - } - } - } -#endif - -#if NET6_0_OR_GREATER - [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] - private unsafe void IncrementArm(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); - Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); - Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - long* tablePtr = tableAddr; - //fixed (long* tablePtr = table) - { - int t0 = AdvSimd.Extract(blockOffset, 0); - int t1 = AdvSimd.Extract(blockOffset, 1); - int t2 = AdvSimd.Extract(blockOffset, 2); - int t3 = AdvSimd.Extract(blockOffset, 3); - - Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1)); - Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3)); - - index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - - Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1); - Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3); - - Vector128 fifteen = Vector128.Create(0xfL); - Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); - Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); - - Vector128 maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA)); - Vector128 maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB)); - - var one = Vector128.Create(1L); - Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); - Vector128 incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); - - tablePtr[t0] += AdvSimd.Extract(incA, 0); - tablePtr[t1] += AdvSimd.Extract(incA, 1); - tablePtr[t2] += AdvSimd.Extract(incB, 0); - tablePtr[t3] += AdvSimd.Extract(incB, 1); - - var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16()); - - if (max.ToScalar() != 0 && (++size == sampleSize)) - { - Reset(); - } - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] - private unsafe int EstimateFrequencyArm(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); - Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); - Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - long* tablePtr = tableAddr; - //fixed (long* tablePtr = table) - { - Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); - Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); - - index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - - Vector128 indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1)); - Vector128 indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3)); - - var fifteen = Vector128.Create(0xfL); - Vector128 a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen); - Vector128 b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen); - - // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F > - // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F > - var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); - min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); - - var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32()); - - return min32.ToScalar(); - } - } -#endif - } -} From 48677bb17f75281f1b5e975f657d3f2a00785b62 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 27 Nov 2024 03:39:49 +0000 Subject: [PATCH 09/11] endif --- BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs index 56ea0ced..68428e06 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs @@ -103,6 +103,7 @@ public void Increment(T value) { IncrementStd(value); } +#endif } /// From 94b896834d3f227a1b594dae27a55cdb87dc6968 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 27 Nov 2024 04:50:43 +0000 Subject: [PATCH 10/11] fix return --- BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index dcfa1e7e..9e8330d3 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -70,7 +70,7 @@ public void IncBlock() #if Arm64 public void IncBlockNeonNotPinned() #else - public int IncBlockAvxNotPinned() + public void IncBlockAvxNotPinned() #endif { for (int i = 0; i < iterations; i++) From 77800424c1b8ef3619787d0a2e7ea96d1b6623d3 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Fri, 29 Nov 2024 00:02:27 +0000 Subject: [PATCH 11/11] cleanup --- BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs | 8 ++++---- BitFaster.Caching/Lfu/CmSketchCore.cs | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 39c2fd59..ba206ddf 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -26,7 +26,7 @@ public class SketchFrequency private CmSketchCore blockStd; private CmSketchNoPin blockAvxNoPin; - private CmSketchCore blockVector; + private CmSketchCore blockAvx; [Params(32_768, 524_288, 8_388_608, 134_217_728)] public int Size { get; set; } @@ -39,7 +39,7 @@ public void Setup() blockStd = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); - blockVector = new CmSketchCore(Size, EqualityComparer.Default); + blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } [Benchmark(Baseline = true, OperationsPerInvoke = iterations)] @@ -81,7 +81,7 @@ public int FrequencyBlockAvxNotPinned() { int count = 0; for (int i = 0; i < iterations; i++) - count += blockAvxNoPin.EstimateFrequency(i) > blockVector.EstimateFrequency(i + 1) ? 1 : 0; + count += blockAvxNoPin.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; return count; } @@ -96,7 +96,7 @@ public int FrequencyBlockAvxPinned() { int count = 0; for (int i = 0; i < iterations; i++) - count += blockVector.EstimateFrequency(i) > blockVector.EstimateFrequency(i + 1) ? 1 : 0; + count += blockAvx.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0; return count; } diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 7c95ce20..46f516b7 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -38,13 +38,12 @@ public unsafe class CmSketchCore private const long OneMask = 0x1111111111111111L; private long[] table; - private int sampleSize; - private int blockMask; - private int size; - #if NET6_0_OR_GREATER private long* tableAddr; #endif + private int sampleSize; + private int blockMask; + private int size; private readonly IEqualityComparer comparer;