From 5257c04051c877778d2d0b9dad36e90fe2698dae Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 7 Jan 2025 10:58:44 -0800 Subject: [PATCH 1/5] unroll --- BitFaster.Caching/BitFaster.Caching.csproj | 2 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 31 +++++++++++++--------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/BitFaster.Caching/BitFaster.Caching.csproj b/BitFaster.Caching/BitFaster.Caching.csproj index 20405be8..4860e2e0 100644 --- a/BitFaster.Caching/BitFaster.Caching.csproj +++ b/BitFaster.Caching/BitFaster.Caching.csproj @@ -2,7 +2,7 @@ netstandard2.0;netcoreapp3.1;net6.0 - 10.0 + 11.0 Alex Peck BitFaster.Caching diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 46f516b7..7c1b7597 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -186,24 +186,31 @@ private unsafe int EstimateFrequencyStd(T value) private unsafe void IncrementStd(T value) { - var index = stackalloc int[8]; int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } + // Loop unrolling improves throughput by 10m ops/s + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = (h0 >>> 1) & 15; + int index1 = (h1 >>> 1) & 15; + int index2 = (h2 >>> 1) & 15; + int index3 = (h3 >>> 1) & 15; + + int slot0 = block + (h0 & 1); + int slot1 = block + (h1 & 1) + 2; + int slot2 = block + (h2 & 1) + 4; + int slot3 = block + (h3 & 1) + 6; bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); + IncrementAt(slot0, index0) + | IncrementAt(slot1, index1) + | IncrementAt(slot2, index2) + | IncrementAt(slot3, index3); if (added && (++size == sampleSize)) { From 874a5155b0d8fab7bf726ca0f83a4cfa92125700 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Tue, 7 Jan 2025 12:11:07 -0800 Subject: [PATCH 2/5] bench --- .../Lfu/CmSketchLooped.cs | 419 ++++++++++++++++++ .../Lfu/SketchIncrement.cs | 17 +- 2 files changed, 433 insertions(+), 3 deletions(-) create mode 100644 BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs new file mode 100644 index 00000000..720b8942 --- /dev/null +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs @@ -0,0 +1,419 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; +using System.Text; +using System.Threading.Tasks; + +namespace BitFaster.Caching.Benchmarks.Lfu +{ + public unsafe class CmSketchLooped + where T : notnull + where I : struct, IsaProbe + { + private const long ResetMask = 0x7777777777777777L; + private const long OneMask = 0x1111111111111111L; + + private long[] table; +#if NET6_0_OR_GREATER + private long* tableAddr; +#endif + private int sampleSize; + private int blockMask; + private int size; + + private readonly IEqualityComparer comparer; + + /// + /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. + /// + /// The maximum size. + /// The equality comparer. + public CmSketchLooped(long maximumSize, IEqualityComparer comparer) + { + EnsureCapacity(maximumSize); + this.comparer = comparer; + } + + /// + /// Gets the reset sample size. + /// + public int ResetSampleSize => this.sampleSize; + + /// + /// Gets the size. + /// + public int Size => this.size; + + /// + /// Estimate the frequency of the specified value, up to the maximum of 15. + /// + /// The value. + /// The estimated frequency of the value. + public int EstimateFrequency(T value) + { +#if NETSTANDARD2_0 + return EstimateFrequencyStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + return EstimateFrequencyAvx(value); + } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + return EstimateFrequencyArm(value); + } +#endif + else + { + return EstimateFrequencyStd(value); + } +#endif + } + + /// + /// Increment the count of the specified value. + /// + /// The value. + public void Increment(T value) + { +#if NETSTANDARD2_0 + IncrementStd(value); +#else + + I isa = default; + + if (isa.IsAvx2Supported) + { + IncrementAvx(value); + } +#if NET6_0_OR_GREATER + else if (isa.IsArm64Supported) + { + IncrementArm(value); + } +#endif + else + { + IncrementStd(value); + } +#endif + } + + /// + /// Clears the count for all items. + /// + public void Clear() + { + Array.Clear(table, 0, table.Length); + size = 0; + } + + [MemberNotNull(nameof(table))] + private void EnsureCapacity(long maximumSize) + { + int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + +#if NET6_0_OR_GREATER + I isa = default; + if (isa.IsAvx2Supported || isa.IsArm64Supported) + { + // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes + const int pad = 8; + bool pinned = true; + table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); + + tableAddr = (long*)Unsafe.AsPointer(ref table[0]); + tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); + + blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; + } + else +#endif + { + table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; + blockMask = (int)((uint)(table.Length) >> 3) - 1; + } + + sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); + + size = 0; + } + + private unsafe int EstimateFrequencyStd(T value) + { + var count = stackalloc int[4]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + int index = (h >> 1) & 15; + int offset = h & 1; + count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); + } + return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + } + + private unsafe void IncrementStd(T value) + { + var index = stackalloc int[8]; + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + for (int i = 0; i < 4; i++) + { + int h = (int)((uint)counterHash >> (i << 3)); + index[i] = (h >> 1) & 15; + int offset = h & 1; + index[i + 4] = block + offset + (i << 1); + } + + bool added = + IncrementAt(index[4], index[0]) + | IncrementAt(index[5], index[1]) + | IncrementAt(index[6], index[2]) + | IncrementAt(index[7], index[3]); + + if (added && (++size == sampleSize)) + { + Reset(); + } + } + + // Applies another round of hashing for additional randomization. + private static int Rehash(int x) + { + x = (int)(x * 0x31848bab); + x ^= (int)((uint)x >> 14); + return x; + } + + // Applies a supplemental hash function to defend against poor quality hash. + private static int Spread(int x) + { + x ^= (int)((uint)x >> 17); + x = (int)(x * 0xed5ad4bb); + x ^= (int)((uint)x >> 11); + x = (int)(x * 0xac4c1b51); + x ^= (int)((uint)x >> 15); + return x; + } + + private bool IncrementAt(int i, int j) + { + int offset = j << 2; + long mask = (0xfL << offset); + + if ((table[i] & mask) != mask) + { + table[i] += (1L << offset); + return true; + } + + return false; + } + + private void Reset() + { + // unroll, almost 2x faster + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + count0 = (count0 + count1) + (count2 + count3); + + size = (size - (count0 >> 2)) >> 1; + } + +#if !NETSTANDARD2_0 + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int EstimateFrequencyAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) + .GetLower() + .AsUInt16(); + + // set the zeroed high parts of the long value to ushort.Max +#if NET6_0_OR_GREATER + count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); +#else + count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); +#endif + + return Avx2.MinHorizontal(count).GetElement(0); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void IncrementAvx(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); + Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); + Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); + +#if NET6_0_OR_GREATER + long* tablePtr = tableAddr; +#else + fixed (long* tablePtr = table) +#endif + { + // Note masked is 'equal' - therefore use AndNot below + Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); + + // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) + Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); + + bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); + + tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); + tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); + tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); + tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); + + if (wasInc && (++size == sampleSize)) + { + Reset(); + } + } + } +#endif + +#if NET6_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void IncrementArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + long* tablePtr = tableAddr; + { + int t0 = AdvSimd.Extract(blockOffset, 0); + int t1 = AdvSimd.Extract(blockOffset, 1); + int t2 = AdvSimd.Extract(blockOffset, 2); + int t3 = AdvSimd.Extract(blockOffset, 3); + + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1)); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3)); + + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1); + Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3); + + Vector128 fifteen = Vector128.Create(0xfL); + Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); + Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); + + Vector128 maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA)); + Vector128 maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB)); + + var one = Vector128.Create(1L); + Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); + Vector128 incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); + + tablePtr[t0] += AdvSimd.Extract(incA, 0); + tablePtr[t1] += AdvSimd.Extract(incA, 1); + tablePtr[t2] += AdvSimd.Extract(incB, 0); + tablePtr[t3] += AdvSimd.Extract(incB, 1); + + var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16()); + + if (max.ToScalar() != 0 && (++size == sampleSize)) + { + Reset(); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int EstimateFrequencyArm(T value) + { + int blockHash = Spread(comparer.GetHashCode(value)); + int counterHash = Rehash(blockHash); + int block = (blockHash & blockMask) << 3; + + Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); + Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); + Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + + long* tablePtr = tableAddr; + { + Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); + Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); + + index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); + + Vector128 indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1)); + Vector128 indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3)); + + var fifteen = Vector128.Create(0xfL); + Vector128 a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen); + Vector128 b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen); + + // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F > + // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F > + var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); + min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); + + var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32()); + + return min32.ToScalar(); + } + } +#endif + } +} diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index e2fb1e02..3531220c 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -23,7 +23,8 @@ public class SketchIncrement private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchCore blockStd; + private CmSketchLooped blockStdNoUnroll; + private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -37,7 +38,8 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchLooped(Size, EqualityComparer.Default); + blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -65,7 +67,16 @@ public void IncBlock() { for (int i = 0; i < iterations; i++) { - blockStd.Increment(i); + blockStdNoUnroll.Increment(i); + } + } + + [Benchmark(OperationsPerInvoke = iterations)] + public void IncBlockUnroll() + { + for (int i = 0; i < iterations; i++) + { + blockStdUnroll.Increment(i); } } From 0a9bcff64d1d3b60ddd419422350717c0731637d Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 8 Jan 2025 10:40:30 -0800 Subject: [PATCH 3/5] unroll freq --- .../Lfu/SketchFrequency.cs | 18 ++++++++-- BitFaster.Caching/Lfu/CmSketchCore.cs | 33 +++++++++++++------ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index ba206ddf..8bab8480 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -24,7 +24,8 @@ public class SketchFrequency private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchCore blockStd; + private CmSketchLooped blockStdNoUnroll; + private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -37,7 +38,8 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchLooped(Size, EqualityComparer.Default); + blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -67,7 +69,17 @@ public int FrequencyBlock() { int count = 0; for (int i = 0; i < iterations; i++) - count += blockStd.EstimateFrequency(i) > blockStd.EstimateFrequency(i + 1) ? 1 : 0; + count += blockStdNoUnroll.EstimateFrequency(i) > blockStdNoUnroll.EstimateFrequency(i + 1) ? 1 : 0; + + return count; + } + + [Benchmark(OperationsPerInvoke = iterations)] + public int FrequencyBlockUnroll() + { + int count = 0; + for (int i = 0; i < iterations; i++) + count += blockStdUnroll.EstimateFrequency(i) > blockStdUnroll.EstimateFrequency(i + 1) ? 1 : 0; return count; } diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 7c1b7597..0a3f5c24 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -8,6 +8,7 @@ #if !NETSTANDARD2_0 using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; + #endif #if NET6_0_OR_GREATER @@ -169,19 +170,31 @@ private void EnsureCapacity(long maximumSize) private unsafe int EstimateFrequencyStd(T value) { - var count = stackalloc int[4]; int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = (h0 >>> 1) & 15; + int index1 = (h1 >>> 1) & 15; + int index2 = (h2 >>> 1) & 15; + int index3 = (h3 >>> 1) & 15; + + int slot0 = block + (h0 & 1); + int slot1 = block + (h1 & 1) + 2; + int slot2 = block + (h2 & 1) + 4; + int slot3 = block + (h3 & 1) + 6; + + int count0 = (int)((table[slot0] >>> (index0 << 2)) & 0xfL); + int count1 = (int)((table[slot1] >>> (index1 << 2)) & 0xfL); + int count2 = (int)((table[slot2] >>> (index2 << 2)) & 0xfL); + int count3 = (int)((table[slot3] >>> (index3 << 2)) & 0xfL); + + return Math.Min(Math.Min(count0, count1), Math.Min(count2, count3)); } private unsafe void IncrementStd(T value) @@ -190,7 +203,7 @@ private unsafe void IncrementStd(T value) int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - // Loop unrolling improves throughput by 10m ops/s + // Loop unrolling improves throughput int h0 = counterHash; int h1 = counterHash >>> 8; int h2 = counterHash >>> 16; From a714cdb32e57fcb684a2a92784867dac38efe583 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 8 Jan 2025 11:02:21 -0800 Subject: [PATCH 4/5] comment --- BitFaster.Caching/Lfu/CmSketchCore.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 0a3f5c24..9764c51b 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -174,6 +174,7 @@ private unsafe int EstimateFrequencyStd(T value) int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; + // Loop unrolling improves throughput int h0 = counterHash; int h1 = counterHash >>> 8; int h2 = counterHash >>> 16; From 2280f83e3626a8897cd4fa1572c3b937d768ebe5 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 9 Jan 2025 16:21:30 -0800 Subject: [PATCH 5/5] rem extra file --- .../Lfu/CmSketchLooped.cs | 419 ------------------ .../Lfu/CmSketchNoPin.cs | 3 + .../Lfu/SketchFrequency.cs | 4 +- .../Lfu/SketchIncrement.cs | 4 +- 4 files changed, 7 insertions(+), 423 deletions(-) delete mode 100644 BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs deleted file mode 100644 index 720b8942..00000000 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchLooped.cs +++ /dev/null @@ -1,419 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.Arm; -using System.Runtime.Intrinsics.X86; -using System.Runtime.Intrinsics; -using System.Text; -using System.Threading.Tasks; - -namespace BitFaster.Caching.Benchmarks.Lfu -{ - public unsafe class CmSketchLooped - where T : notnull - where I : struct, IsaProbe - { - private const long ResetMask = 0x7777777777777777L; - private const long OneMask = 0x1111111111111111L; - - private long[] table; -#if NET6_0_OR_GREATER - private long* tableAddr; -#endif - private int sampleSize; - private int blockMask; - private int size; - - private readonly IEqualityComparer comparer; - - /// - /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer. - /// - /// The maximum size. - /// The equality comparer. - public CmSketchLooped(long maximumSize, IEqualityComparer comparer) - { - EnsureCapacity(maximumSize); - this.comparer = comparer; - } - - /// - /// Gets the reset sample size. - /// - public int ResetSampleSize => this.sampleSize; - - /// - /// Gets the size. - /// - public int Size => this.size; - - /// - /// Estimate the frequency of the specified value, up to the maximum of 15. - /// - /// The value. - /// The estimated frequency of the value. - public int EstimateFrequency(T value) - { -#if NETSTANDARD2_0 - return EstimateFrequencyStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - return EstimateFrequencyAvx(value); - } -#if NET6_0_OR_GREATER - else if (isa.IsArm64Supported) - { - return EstimateFrequencyArm(value); - } -#endif - else - { - return EstimateFrequencyStd(value); - } -#endif - } - - /// - /// Increment the count of the specified value. - /// - /// The value. - public void Increment(T value) - { -#if NETSTANDARD2_0 - IncrementStd(value); -#else - - I isa = default; - - if (isa.IsAvx2Supported) - { - IncrementAvx(value); - } -#if NET6_0_OR_GREATER - else if (isa.IsArm64Supported) - { - IncrementArm(value); - } -#endif - else - { - IncrementStd(value); - } -#endif - } - - /// - /// Clears the count for all items. - /// - public void Clear() - { - Array.Clear(table, 0, table.Length); - size = 0; - } - - [MemberNotNull(nameof(table))] - private void EnsureCapacity(long maximumSize) - { - int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); - -#if NET6_0_OR_GREATER - I isa = default; - if (isa.IsAvx2Supported || isa.IsArm64Supported) - { - // over alloc by 8 to give 64 bytes padding, tableAddr is then aligned to 64 bytes - const int pad = 8; - bool pinned = true; - table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); - - tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); - - blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; - } - else -#endif - { - table = new long[Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8)]; - blockMask = (int)((uint)(table.Length) >> 3) - 1; - } - - sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); - - size = 0; - } - - private unsafe int EstimateFrequencyStd(T value) - { - var count = stackalloc int[4]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); - } - - private unsafe void IncrementStd(T value) - { - var index = stackalloc int[8]; - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } - - bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); - - if (added && (++size == sampleSize)) - { - Reset(); - } - } - - // Applies another round of hashing for additional randomization. - private static int Rehash(int x) - { - x = (int)(x * 0x31848bab); - x ^= (int)((uint)x >> 14); - return x; - } - - // Applies a supplemental hash function to defend against poor quality hash. - private static int Spread(int x) - { - x ^= (int)((uint)x >> 17); - x = (int)(x * 0xed5ad4bb); - x ^= (int)((uint)x >> 11); - x = (int)(x * 0xac4c1b51); - x ^= (int)((uint)x >> 15); - return x; - } - - private bool IncrementAt(int i, int j) - { - int offset = j << 2; - long mask = (0xfL << offset); - - if ((table[i] & mask) != mask) - { - table[i] += (1L << offset); - return true; - } - - return false; - } - - private void Reset() - { - // unroll, almost 2x faster - int count0 = 0; - int count1 = 0; - int count2 = 0; - int count3 = 0; - - for (int i = 0; i < table.Length; i += 4) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - count2 += BitOps.BitCount(table[i + 2] & OneMask); - count3 += BitOps.BitCount(table[i + 3] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - count0 = (count0 + count1) + (count2 + count3); - - size = (size - (count0 >> 2)) >> 1; - } - -#if !NETSTANDARD2_0 - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe int EstimateFrequencyAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) - .GetLower() - .AsUInt16(); - - // set the zeroed high parts of the long value to ushort.Max -#if NET6_0_OR_GREATER - count = Avx2.Blend(count, Vector128.AllBitsSet, 0b10101010); -#else - count = Avx2.Blend(count, Vector128.Create(ushort.MaxValue), 0b10101010); -#endif - - return Avx2.MinHorizontal(count).GetElement(0); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe void IncrementAvx(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); - -#if NET6_0_OR_GREATER - long* tablePtr = tableAddr; -#else - fixed (long* tablePtr = table) -#endif - { - // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); - - // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); - - bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - - tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); - tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); - tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); - tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); - - if (wasInc && (++size == sampleSize)) - { - Reset(); - } - } - } -#endif - -#if NET6_0_OR_GREATER - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe void IncrementArm(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); - Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); - Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - long* tablePtr = tableAddr; - { - int t0 = AdvSimd.Extract(blockOffset, 0); - int t1 = AdvSimd.Extract(blockOffset, 1); - int t2 = AdvSimd.Extract(blockOffset, 2); - int t3 = AdvSimd.Extract(blockOffset, 3); - - Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1)); - Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3)); - - index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - - Vector128 longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1); - Vector128 longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3); - - Vector128 fifteen = Vector128.Create(0xfL); - Vector128 maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64()); - Vector128 maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64()); - - Vector128 maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA)); - Vector128 maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB)); - - var one = Vector128.Create(1L); - Vector128 incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64())); - Vector128 incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64())); - - tablePtr[t0] += AdvSimd.Extract(incA, 0); - tablePtr[t1] += AdvSimd.Extract(incA, 1); - tablePtr[t2] += AdvSimd.Extract(incB, 0); - tablePtr[t3] += AdvSimd.Extract(incB, 1); - - var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16()); - - if (max.ToScalar() != 0 && (++size == sampleSize)) - { - Reset(); - } - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe int EstimateFrequencyArm(T value) - { - int blockHash = Spread(comparer.GetHashCode(value)); - int counterHash = Rehash(blockHash); - int block = (blockHash & blockMask) << 3; - - Vector128 h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24)); - Vector128 index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf)); - Vector128 blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); - - long* tablePtr = tableAddr; - { - Vector128 tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1))); - Vector128 tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3))); - - index = AdvSimd.ShiftLeftLogicalSaturate(index, 2); - - Vector128 indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 0), 2, index, 1)); - Vector128 indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128.Zero, 0, index, 2), 2, index, 3)); - - var fifteen = Vector128.Create(0xfL); - Vector128 a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen); - Vector128 b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen); - - // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F > - // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F > - var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte()); - min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte()); - - var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32()); - - return min32.ToScalar(); - } - } -#endif - } -} diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs index 68428e06..fdc8629a 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs @@ -12,6 +12,9 @@ namespace BitFaster.Caching.Benchmarks.Lfu { + // Block sketch implementation without: + // - Pinned buffer for vector code paths + // - Loop unroll for non-vector code paths internal class CmSketchNoPin where T : notnull where I : struct, IsaProbe diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index 8bab8480..970c2d9f 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -24,7 +24,7 @@ public class SketchFrequency private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchLooped blockStdNoUnroll; + private CmSketchNoPin blockStdNoUnroll; private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -38,7 +38,7 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStdNoUnroll = new CmSketchLooped(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchNoPin(Size, EqualityComparer.Default); blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index 3531220c..0f868ed1 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -23,7 +23,7 @@ public class SketchIncrement private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchLooped blockStdNoUnroll; + private CmSketchNoPin blockStdNoUnroll; private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -38,7 +38,7 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStdNoUnroll = new CmSketchLooped(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchNoPin(Size, EqualityComparer.Default); blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default);