diff --git a/src/Microsoft.Data.Analysis/ArrayUtility.cs b/src/Microsoft.Data.Analysis/ArrayUtility.cs new file mode 100644 index 0000000000..daffef9f8d --- /dev/null +++ b/src/Microsoft.Data.Analysis/ArrayUtility.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.Data.Analysis +{ + internal static class ArrayUtility + { + // Maximum size of one-dimensional array. + // See: https://msdn.microsoft.com/en-us/library/hh285054(v=vs.110).aspx + // Polyfilling Array.MaxLength API for netstandard2.0 + public const int ArrayMaxSize = 0X7FEFFFFF; + } +} diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index fd7f23964f..b4a3fc15a8 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -80,7 +80,7 @@ public void EnsureCapacity(int numberOfValues) if (newLength > Capacity) { //Double buffer size, but not higher than MaxByteCapacity - var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, MaxCapacityInBytes); + var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, ArrayUtility.ArrayMaxSize); var newCapacity = Math.Max(newLength * Size, doubledSize); var memory = new Memory(new byte[newCapacity]); diff --git a/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs b/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs index 069aa3e94a..f08b2120b1 100644 --- a/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs @@ -36,11 +36,7 @@ public ReadOnlyMemory RawReadOnlyMemory protected int Capacity => ReadOnlyBuffer.Length / Size; - //The maximum size in any single dimension for byte array is 0x7FFFFFc7 - 2147483591 - //See https://learn.microsoft.com/en-us/dotnet/framework/configure-apps/file-schema/runtime/gcallowverylargeobjects-element - public const int MaxCapacityInBytes = 2147483591; - - public static int MaxCapacity => MaxCapacityInBytes / Size; + public static int MaxCapacity => ArrayUtility.ArrayMaxSize / Size; public ReadOnlySpan ReadOnlySpan { diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index 1a46ff74a9..03d112d027 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -6,6 +6,7 @@ using System.Collections; using System.Collections.Generic; using System.Diagnostics; +using System.Runtime.CompilerServices; using Microsoft.ML; using Microsoft.ML.Data; @@ -17,15 +18,17 @@ namespace Microsoft.Data.Analysis /// Is NOT Arrow compatible public partial class StringDataFrameColumn : DataFrameColumn, IEnumerable { + public static int MaxCapacity = ArrayUtility.ArrayMaxSize / Unsafe.SizeOf(); // Max Size in bytes / size of pointer (8 bytes on x64) + private readonly List> _stringBuffers = new List>(); // To store more than intMax number of strings public StringDataFrameColumn(string name, long length = 0) : base(name, length, typeof(string)) { - int numberOfBuffersRequired = Math.Max((int)(length / int.MaxValue), 1); + int numberOfBuffersRequired = (int)(length / MaxCapacity + 1); for (int i = 0; i < numberOfBuffersRequired; i++) { - long bufferLen = length - _stringBuffers.Count * int.MaxValue; - List buffer = new List((int)Math.Min(int.MaxValue, bufferLen)); + long bufferLen = length - _stringBuffers.Count * MaxCapacity; + List buffer = new List((int)Math.Min(MaxCapacity, bufferLen)); _stringBuffers.Add(buffer); for (int j = 0; j < bufferLen; j++) { @@ -64,7 +67,7 @@ protected internal override void Resize(long length) public void Append(string value) { List lastBuffer = _stringBuffers[_stringBuffers.Count - 1]; - if (lastBuffer.Count == int.MaxValue) + if (lastBuffer.Count == MaxCapacity) { lastBuffer = new List(); _stringBuffers.Add(lastBuffer); @@ -75,33 +78,34 @@ public void Append(string value) Length++; } - private int GetBufferIndexContainingRowIndex(ref long rowIndex) + private int GetBufferIndexContainingRowIndex(long rowIndex) { if (rowIndex >= Length) { throw new ArgumentOutOfRangeException(Strings.ColumnIndexOutOfRange, nameof(rowIndex)); } - return (int)(rowIndex / int.MaxValue); + return (int)(rowIndex / MaxCapacity); } protected override object GetValue(long rowIndex) { - int bufferIndex = GetBufferIndexContainingRowIndex(ref rowIndex); - return _stringBuffers[bufferIndex][(int)rowIndex]; + int bufferIndex = GetBufferIndexContainingRowIndex(rowIndex); + return _stringBuffers[bufferIndex][(int)(rowIndex % MaxCapacity)]; } protected override IReadOnlyList GetValues(long startIndex, int length) { var ret = new List(); - int bufferIndex = GetBufferIndexContainingRowIndex(ref startIndex); + int bufferIndex = GetBufferIndexContainingRowIndex(startIndex); + int bufferOffset = (int)(startIndex % MaxCapacity); while (ret.Count < length && bufferIndex < _stringBuffers.Count) { - for (int i = (int)startIndex; ret.Count < length && i < _stringBuffers[bufferIndex].Count; i++) + for (int i = bufferOffset; ret.Count < length && i < _stringBuffers[bufferIndex].Count; i++) { ret.Add(_stringBuffers[bufferIndex][i]); } bufferIndex++; - startIndex = 0; + bufferOffset = 0; } return ret; } @@ -110,9 +114,10 @@ protected override void SetValue(long rowIndex, object value) { if (value == null || value is string) { - int bufferIndex = GetBufferIndexContainingRowIndex(ref rowIndex); + int bufferIndex = GetBufferIndexContainingRowIndex(rowIndex); + int bufferOffset = (int)(rowIndex % MaxCapacity); var oldValue = this[rowIndex]; - _stringBuffers[bufferIndex][(int)rowIndex] = (string)value; + _stringBuffers[bufferIndex][bufferOffset] = (string)value; if (oldValue != (string)value) { if (value == null) @@ -138,15 +143,16 @@ protected override void SetValue(long rowIndex, object value) get { var ret = new List(); - int bufferIndex = GetBufferIndexContainingRowIndex(ref startIndex); + int bufferIndex = GetBufferIndexContainingRowIndex(startIndex); + int bufferOffset = (int)(startIndex % MaxCapacity); while (ret.Count < length && bufferIndex < _stringBuffers.Count) { - for (int i = (int)startIndex; ret.Count < length && i < _stringBuffers[bufferIndex].Count; i++) + for (int i = bufferOffset; ret.Count < length && i < _stringBuffers[bufferIndex].Count; i++) { ret.Add(_stringBuffers[bufferIndex][i]); } bufferIndex++; - startIndex = 0; + bufferOffset = 0; } return ret; } @@ -194,7 +200,7 @@ private PrimitiveDataFrameColumn GetSortIndices(Comparer comparer, sortIndices[i] = i; if (buffer[i] == null) { - columnNullIndices[nullIndicesSlot] = i + bufferSortIndices.Count * int.MaxValue; + columnNullIndices[nullIndicesSlot] = i + bufferSortIndices.Count * MaxCapacity; nullIndicesSlot++; } } @@ -295,11 +301,11 @@ private StringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn List setBuffer = ret._stringBuffers[0]; long setBufferMinRange = 0; - long setBufferMaxRange = int.MaxValue; + long setBufferMaxRange = MaxCapacity; List getBuffer = _stringBuffers[0]; long getBufferMinRange = 0; - long getBufferMaxRange = int.MaxValue; - long maxCapacity = int.MaxValue; + long getBufferMaxRange = MaxCapacity; + long maxCapacity = MaxCapacity; if (mapIndices.DataType == typeof(long)) { PrimitiveDataFrameColumn longMapIndices = mapIndices as PrimitiveDataFrameColumn; diff --git a/src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs b/src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs index ee860cfa33..fc15c873cb 100644 --- a/src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs @@ -5,12 +5,9 @@ using System; using System.Collections; using System.Collections.Generic; +using System.Data; using System.Diagnostics; -using System.Linq; -using System.Runtime.InteropServices; -using System.Text; -using Apache.Arrow; -using Apache.Arrow.Types; +using System.Runtime.CompilerServices; using Microsoft.ML; using Microsoft.ML.Data; @@ -21,6 +18,9 @@ namespace Microsoft.Data.Analysis /// public partial class VBufferDataFrameColumn : DataFrameColumn, IEnumerable> { + + public static int MaxCapacity = ArrayUtility.ArrayMaxSize / Unsafe.SizeOf>(); + private readonly List>> _vBuffers = new List>>(); // To store more than intMax number of vbuffers /// @@ -30,11 +30,11 @@ public partial class VBufferDataFrameColumn : DataFrameColumn, IEnumerableLength of values public VBufferDataFrameColumn(string name, long length = 0) : base(name, length, typeof(VBuffer)) { - int numberOfBuffersRequired = Math.Max((int)(length / int.MaxValue), 1); + int numberOfBuffersRequired = (int)(length / MaxCapacity + 1); for (int i = 0; i < numberOfBuffersRequired; i++) { - long bufferLen = length - _vBuffers.Count * int.MaxValue; - List> buffer = new List>((int)Math.Min(int.MaxValue, bufferLen)); + int bufferLen = (int)Math.Min(MaxCapacity, length - _vBuffers.Count * MaxCapacity); + List> buffer = new List>(bufferLen); _vBuffers.Add(buffer); for (int j = 0; j < bufferLen; j++) { @@ -74,7 +74,7 @@ protected internal override void Resize(long length) public void Append(VBuffer value) { List> lastBuffer = _vBuffers[_vBuffers.Count - 1]; - if (lastBuffer.Count == int.MaxValue) + if (lastBuffer.Count == MaxCapacity) { lastBuffer = new List>(); _vBuffers.Add(lastBuffer); @@ -83,34 +83,35 @@ public void Append(VBuffer value) Length++; } - private int GetBufferIndexContainingRowIndex(ref long rowIndex) + private int GetBufferIndexContainingRowIndex(long rowIndex) { if (rowIndex >= Length) { throw new ArgumentOutOfRangeException(Strings.ColumnIndexOutOfRange, nameof(rowIndex)); } - return (int)(rowIndex / int.MaxValue); + return (int)(rowIndex / MaxCapacity); } protected override object GetValue(long rowIndex) { - int bufferIndex = GetBufferIndexContainingRowIndex(ref rowIndex); - return _vBuffers[bufferIndex][(int)rowIndex]; + int bufferIndex = GetBufferIndexContainingRowIndex(rowIndex); + return _vBuffers[bufferIndex][(int)(rowIndex % MaxCapacity)]; } protected override IReadOnlyList GetValues(long startIndex, int length) { var ret = new List(); - int bufferIndex = GetBufferIndexContainingRowIndex(ref startIndex); + int bufferIndex = GetBufferIndexContainingRowIndex(startIndex); + int bufferOffset = (int)(startIndex % MaxCapacity); while (ret.Count < length && bufferIndex < _vBuffers.Count) { - for (int i = (int)startIndex; ret.Count < length && i < _vBuffers[bufferIndex].Count; i++) + for (int i = bufferOffset; ret.Count < length && i < _vBuffers[bufferIndex].Count; i++) { ret.Add(_vBuffers[bufferIndex][i]); } bufferIndex++; - startIndex = 0; + bufferOffset = 0; } return ret; } @@ -119,9 +120,10 @@ protected override void SetValue(long rowIndex, object value) { if (value == null || value is VBuffer) { - int bufferIndex = GetBufferIndexContainingRowIndex(ref rowIndex); - var oldValue = this[rowIndex]; - _vBuffers[bufferIndex][(int)rowIndex] = (VBuffer)value; + int bufferIndex = GetBufferIndexContainingRowIndex(rowIndex); + int bufferOffset = (int)(rowIndex % MaxCapacity); + var oldValue = _vBuffers[bufferIndex][bufferOffset]; + _vBuffers[bufferIndex][bufferOffset] = (VBuffer)value; if (!oldValue.Equals((VBuffer)value)) { if (value == null) @@ -250,11 +252,11 @@ private VBufferDataFrameColumn CloneImplementation(PrimitiveDataFrameColum List> setBuffer = ret._vBuffers[0]; long setBufferMinRange = 0; - long setBufferMaxRange = int.MaxValue; + long setBufferMaxRange = MaxCapacity; List> getBuffer = _vBuffers[0]; long getBufferMinRange = 0; - long getBufferMaxRange = int.MaxValue; - long maxCapacity = int.MaxValue; + long getBufferMaxRange = MaxCapacity; + long maxCapacity = MaxCapacity; if (mapIndices.DataType == typeof(long)) { PrimitiveDataFrameColumn longMapIndices = mapIndices as PrimitiveDataFrameColumn; diff --git a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs index 16672818db..ce797ceda3 100644 --- a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs @@ -457,7 +457,7 @@ public void TestAppend_SizeMoreThanMaxBufferCapacity() [X64Fact("32-bit dosn't allow to allocate more than 2 Gb")] public void TestAppendMany_SizeMoreThanMaxBufferCapacity() { - const int MaxCapacityInBytes = 2147483591; + const int MaxCapacityInBytes = 0X7FEFFFFF; //Check appending values with extending column size over MaxCapacity of ReadOnlyDataFrameBuffer PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Byte1", MaxCapacityInBytes - 5); diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 85e4ccd79c..61d9361ade 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -9,6 +9,7 @@ using Apache.Arrow; using Microsoft.ML; using Microsoft.ML.Data; +using Microsoft.ML.TestFramework.Attributes; using Xunit; namespace Microsoft.Data.Analysis.Tests @@ -75,7 +76,7 @@ public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, boo return new ArrowStringDataFrameColumn("ArrowString", dataMemory, offsetMemory, nullMemory, length, nullCount); } - public static VBufferDataFrameColumn CreateVBufferDataFrame(int length) + public static VBufferDataFrameColumn CreateVBufferDataFrameColumn(int length) { var buffers = Enumerable.Repeat(new VBuffer(5, new[] { 0, 1, 2, 3, 4 }), length).ToArray(); return new VBufferDataFrameColumn("VBuffer", buffers); @@ -85,7 +86,7 @@ public static DataFrame MakeDataFrameWithAllColumnTypes(int length, bool withNul { DataFrame df = MakeDataFrameWithAllMutableAndArrowColumnTypes(length, withNulls); - var vBufferColumn = CreateVBufferDataFrame(length); + var vBufferColumn = CreateVBufferDataFrameColumn(length); df.Columns.Insert(df.Columns.Count, vBufferColumn); return df; @@ -230,15 +231,51 @@ public DataFrame SplitTrainTest(DataFrame input, float testRatio, out DataFrame } [Fact] - public void TestVBufferColumn() + public void TestVBufferColumn_Creation() { - var vBufferColumn = CreateVBufferDataFrame(10); + var vBufferColumn = CreateVBufferDataFrameColumn(10); Assert.Equal(10, vBufferColumn.Length); Assert.Equal(5, vBufferColumn[0].GetValues().Length); Assert.Equal(0, vBufferColumn[0].GetValues()[0]); } + [Fact] + public void TestVBufferColumn_Indexer() + { + var buffer = new VBuffer(5, new[] { 4, 3, 2, 1, 0 }); + + var vBufferColumn = new VBufferDataFrameColumn("VBuffer", 1); + vBufferColumn[0] = buffer; + + Assert.Equal(1, vBufferColumn.Length); + Assert.Equal(5, vBufferColumn[0].GetValues().Length); + Assert.Equal(0, vBufferColumn[0].GetValues()[4]); + } + + [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] + public void TestVBufferColumn_Indexer_MoreThanMaxInt() + { + var originalValues = new[] { 4, 3, 2, 1, 0 }; + + var length = VBufferDataFrameColumn.MaxCapacity + 3; + + var vBufferColumn = new VBufferDataFrameColumn("VBuffer", length); + long index = length - 2; + + vBufferColumn[index] = new VBuffer(5, originalValues); + + var values = vBufferColumn[index].GetValues(); + + Assert.Equal(length, vBufferColumn.Length); + Assert.Equal(5, values.Length); + + for (int i = 0; i < values.Length; i++) + { + Assert.Equal(originalValues[i], values[i]); + } + } + [Fact] public void TestIndexer() {