From 0ccca6a59da724ef715c7412f87a82d3e1467713 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Sun, 1 Oct 2023 13:46:32 +0300 Subject: [PATCH 1/2] Fix DataFrame incorrectly sets column value for index higher than Buffer.MaxCapacity --- .../PrimitiveColumnContainer.cs | 18 +++++++++--------- .../BufferTests.cs | 19 +++++++++++++++++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 1824e595ae..654efe3520 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -359,13 +359,13 @@ private byte SetBit(byte curBitMap, int index, bool value) } // private function. Faster to use when we already have a span since it avoids indexing - private void SetValidityBit(Span bitMapBufferSpan, int index, bool value) + private void SetValidityBit(Span validitySpan, int index, bool value) { int bitMapBufferIndex = (int)((uint)index / 8); - Debug.Assert(bitMapBufferSpan.Length >= bitMapBufferIndex); - byte curBitMap = bitMapBufferSpan[bitMapBufferIndex]; + Debug.Assert(validitySpan.Length >= bitMapBufferIndex); + byte curBitMap = validitySpan[bitMapBufferIndex]; byte newBitMap = SetBit(curBitMap, index, value); - bitMapBufferSpan[bitMapBufferIndex] = newBitMap; + validitySpan[bitMapBufferIndex] = newBitMap; } /// @@ -457,25 +457,25 @@ public T? this[long rowIndex] return null; } int arrayIndex = GetArrayContainingRowIndex(rowIndex); - rowIndex = rowIndex - arrayIndex * ReadOnlyDataFrameBuffer.MaxCapacity; - return Buffers[arrayIndex][(int)rowIndex]; + var bufferOffset = (int)(rowIndex % ReadOnlyDataFrameBuffer.MaxCapacity); + return Buffers[arrayIndex][bufferOffset]; } set { int arrayIndex = GetArrayContainingRowIndex(rowIndex); - rowIndex = rowIndex - arrayIndex * ReadOnlyDataFrameBuffer.MaxCapacity; + var bufferOffset = (int)(rowIndex % ReadOnlyDataFrameBuffer.MaxCapacity); Buffers.GetOrCreateMutable(arrayIndex); NullBitMapBuffers.GetOrCreateMutable(arrayIndex); if (value.HasValue) { - Buffers[arrayIndex][(int)rowIndex] = value.Value; + Buffers[arrayIndex][bufferOffset] = value.Value; SetValidityBit(rowIndex, true); } else { - Buffers[arrayIndex][(int)rowIndex] = default; + Buffers[arrayIndex][bufferOffset] = default; SetValidityBit(rowIndex, false); } } diff --git a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs index 16672818db..56c26267fd 100644 --- a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs @@ -8,12 +8,27 @@ using System.Text; using Apache.Arrow; using Microsoft.ML.TestFramework.Attributes; +using Newtonsoft.Json.Linq; using Xunit; namespace Microsoft.Data.Analysis.Tests { public class BufferTests { + [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] + public void TestGetterAndSetterForColumnsGreaterThanMaxCapacity() + { + const int MaxCapacityInBytes = 2147483591; + + var length = MaxCapacityInBytes + 5; + var column = new PrimitiveDataFrameColumn("LargeColumn", length); + var index = length - 1; + column[index] = 33; + + Assert.Equal((byte)33, column[index]); + Assert.Null(column[index % MaxCapacityInBytes]); + } + [Fact] public void TestNullCounts() { @@ -446,7 +461,7 @@ public void TestArrowStringColumnClone() Assert.Null(clone[i]); } - [X64Fact("32-bit dosn't allow to allocate more than 2 Gb")] + [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] public void TestAppend_SizeMoreThanMaxBufferCapacity() { //Check appending value, than can increase buffer size over MaxCapacity (default strategy is to double buffer capacity) @@ -454,7 +469,7 @@ public void TestAppend_SizeMoreThanMaxBufferCapacity() intColumn.Append(10); } - [X64Fact("32-bit dosn't allow to allocate more than 2 Gb")] + [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] public void TestAppendMany_SizeMoreThanMaxBufferCapacity() { const int MaxCapacityInBytes = 2147483591; From 607bc8c3b74278e6ec2b8d95f726bce76a2dde91 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Sun, 1 Oct 2023 13:59:58 +0300 Subject: [PATCH 2/2] Revert renaming --- src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 654efe3520..22578217d3 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -359,13 +359,13 @@ private byte SetBit(byte curBitMap, int index, bool value) } // private function. Faster to use when we already have a span since it avoids indexing - private void SetValidityBit(Span validitySpan, int index, bool value) + private void SetValidityBit(Span bitMapBufferSpan, int index, bool value) { int bitMapBufferIndex = (int)((uint)index / 8); - Debug.Assert(validitySpan.Length >= bitMapBufferIndex); - byte curBitMap = validitySpan[bitMapBufferIndex]; + Debug.Assert(bitMapBufferSpan.Length >= bitMapBufferIndex); + byte curBitMap = bitMapBufferSpan[bitMapBufferIndex]; byte newBitMap = SetBit(curBitMap, index, value); - validitySpan[bitMapBufferIndex] = newBitMap; + bitMapBufferSpan[bitMapBufferIndex] = newBitMap; } ///