From eb2230d65ad3f968052353c715af157794121dce Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Fri, 8 Sep 2023 20:36:06 +0300 Subject: [PATCH 1/8] Optimize PrimitiveColumnContainer.Clone method --- src/Microsoft.Data.Analysis/DataFrameBuffer.cs | 3 ++- .../PrimitiveColumnContainer.cs | 15 +++------------ test/Microsoft.Data.Analysis.Tests/BufferTests.cs | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index e4ee20f9b6..a22f94c56d 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -38,10 +38,11 @@ public Span RawSpan public DataFrameBuffer(int numberOfValues = 8) : base(numberOfValues) { } - internal DataFrameBuffer(ReadOnlyMemory buffer, int length) : base(buffer, length) + internal DataFrameBuffer(ReadOnlyMemory buffer, int length) { _memory = new byte[buffer.Length]; buffer.CopyTo(_memory); + Length = length; } public void Append(T value) diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 1e37ac2206..aa713a66c5 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -9,6 +9,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; +using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { @@ -436,13 +437,8 @@ private List> CloneNullBitMapBuffers() List> ret = new List>(); foreach (ReadOnlyDataFrameBuffer buffer in NullBitMapBuffers) { - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.ReadOnlyBuffer, buffer.Length); ret.Add(newBuffer); - ReadOnlySpan span = buffer.ReadOnlySpan; - for (int i = 0; i < span.Length; i++) - { - newBuffer.Append(span[i]); - } } return ret; } @@ -518,14 +514,9 @@ public 
PrimitiveColumnContainer Clone() var ret = new PrimitiveColumnContainer(); foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.ReadOnlyBuffer, buffer.Length); ret.Buffers.Add(newBuffer); - ReadOnlySpan span = buffer.ReadOnlySpan; ret.Length += buffer.Length; - for (int i = 0; i < span.Length; i++) - { - newBuffer.Append(span[i]); - } } ret.NullBitMapBuffers = CloneNullBitMapBuffers(); ret.NullCount = NullCount; diff --git a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs index bc8f66d822..3a88e2eddc 100644 --- a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/BufferTests.cs @@ -79,7 +79,6 @@ public void TestNullCounts() Assert.Equal(2, intColumn[3]); Assert.Null(intColumn[4]); Assert.Equal(3, intColumn[5]); - } [Fact] @@ -134,6 +133,20 @@ public void TestAppendMany() Assert.False(intColumn.IsValid(7)); } + [Fact] + public void TestClone() + { + PrimitiveDataFrameColumn intColumn = new PrimitiveDataFrameColumn("Int1", new int?[] { 1, 2, 3, 4, null }); + var copy = intColumn.Clone(); + + Assert.Equal(intColumn.Name, copy.Name); + Assert.Equal(intColumn.Length, copy.Length); + Assert.Equal(intColumn.DataType, copy.DataType); + + for (int i = 0; i < intColumn.Length; i++) + Assert.Equal(intColumn[i], copy[i]); + } + [Fact] public void TestBasicArrowStringColumn() { From 67af2766bdc0dd931d0ada2232cde6175d13e897 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Fri, 8 Sep 2023 21:22:35 +0300 Subject: [PATCH 2/8] Avoid unnecessary type conversion during binary operations --- ...BinaryOperationImplementations.Exploded.cs | 70 +++++++++---------- ...BinaryOperationImplementations.Exploded.tt | 2 +- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.cs 
b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.cs index ac8035f0ba..6a54acb85e 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.cs @@ -19,7 +19,7 @@ internal DecimalDataFrameColumn AddImplementation(DecimalDataFrameColumn column, { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DecimalDataFrameColumn newColumn = inPlace ? this : CloneAsDecimalColumn(); + DecimalDataFrameColumn newColumn = inPlace ? this : (DecimalDataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -32,7 +32,7 @@ internal DoubleDataFrameColumn AddImplementation(DoubleDataFrameColumn column, b { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DoubleDataFrameColumn newColumn = inPlace ? this : CloneAsDoubleColumn(); + DoubleDataFrameColumn newColumn = inPlace ? this : (DoubleDataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -45,7 +45,7 @@ internal SingleDataFrameColumn AddImplementation(SingleDataFrameColumn column, b { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - SingleDataFrameColumn newColumn = inPlace ? this : CloneAsSingleColumn(); + SingleDataFrameColumn newColumn = inPlace ? this : (SingleDataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -58,7 +58,7 @@ internal Int32DataFrameColumn AddImplementation(Int32DataFrameColumn column, boo { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int32DataFrameColumn newColumn = inPlace ? this : CloneAsInt32Column(); + Int32DataFrameColumn newColumn = inPlace ? 
this : (Int32DataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -71,7 +71,7 @@ internal Int64DataFrameColumn AddImplementation(Int64DataFrameColumn column, boo { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int64DataFrameColumn newColumn = inPlace ? this : CloneAsInt64Column(); + Int64DataFrameColumn newColumn = inPlace ? this : (Int64DataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -84,7 +84,7 @@ internal UInt32DataFrameColumn AddImplementation(UInt32DataFrameColumn column, b { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt32DataFrameColumn newColumn = inPlace ? this : CloneAsUInt32Column(); + UInt32DataFrameColumn newColumn = inPlace ? this : (UInt32DataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -97,7 +97,7 @@ internal UInt64DataFrameColumn AddImplementation(UInt64DataFrameColumn column, b { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt64DataFrameColumn newColumn = inPlace ? this : CloneAsUInt64Column(); + UInt64DataFrameColumn newColumn = inPlace ? this : (UInt64DataFrameColumn)Clone(); newColumn.ColumnContainer.Add(column.ColumnContainer); return newColumn; } @@ -250,7 +250,7 @@ internal DecimalDataFrameColumn SubtractImplementation(DecimalDataFrameColumn co { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DecimalDataFrameColumn newColumn = inPlace ? this : CloneAsDecimalColumn(); + DecimalDataFrameColumn newColumn = inPlace ? 
this : (DecimalDataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -263,7 +263,7 @@ internal DoubleDataFrameColumn SubtractImplementation(DoubleDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DoubleDataFrameColumn newColumn = inPlace ? this : CloneAsDoubleColumn(); + DoubleDataFrameColumn newColumn = inPlace ? this : (DoubleDataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -276,7 +276,7 @@ internal SingleDataFrameColumn SubtractImplementation(SingleDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - SingleDataFrameColumn newColumn = inPlace ? this : CloneAsSingleColumn(); + SingleDataFrameColumn newColumn = inPlace ? this : (SingleDataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -289,7 +289,7 @@ internal Int32DataFrameColumn SubtractImplementation(Int32DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int32DataFrameColumn newColumn = inPlace ? this : CloneAsInt32Column(); + Int32DataFrameColumn newColumn = inPlace ? this : (Int32DataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -302,7 +302,7 @@ internal Int64DataFrameColumn SubtractImplementation(Int64DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int64DataFrameColumn newColumn = inPlace ? this : CloneAsInt64Column(); + Int64DataFrameColumn newColumn = inPlace ? 
this : (Int64DataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -315,7 +315,7 @@ internal UInt32DataFrameColumn SubtractImplementation(UInt32DataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt32DataFrameColumn newColumn = inPlace ? this : CloneAsUInt32Column(); + UInt32DataFrameColumn newColumn = inPlace ? this : (UInt32DataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -328,7 +328,7 @@ internal UInt64DataFrameColumn SubtractImplementation(UInt64DataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt64DataFrameColumn newColumn = inPlace ? this : CloneAsUInt64Column(); + UInt64DataFrameColumn newColumn = inPlace ? this : (UInt64DataFrameColumn)Clone(); newColumn.ColumnContainer.Subtract(column.ColumnContainer); return newColumn; } @@ -481,7 +481,7 @@ internal DecimalDataFrameColumn MultiplyImplementation(DecimalDataFrameColumn co { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DecimalDataFrameColumn newColumn = inPlace ? this : CloneAsDecimalColumn(); + DecimalDataFrameColumn newColumn = inPlace ? this : (DecimalDataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -494,7 +494,7 @@ internal DoubleDataFrameColumn MultiplyImplementation(DoubleDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DoubleDataFrameColumn newColumn = inPlace ? this : CloneAsDoubleColumn(); + DoubleDataFrameColumn newColumn = inPlace ? 
this : (DoubleDataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -507,7 +507,7 @@ internal SingleDataFrameColumn MultiplyImplementation(SingleDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - SingleDataFrameColumn newColumn = inPlace ? this : CloneAsSingleColumn(); + SingleDataFrameColumn newColumn = inPlace ? this : (SingleDataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -520,7 +520,7 @@ internal Int32DataFrameColumn MultiplyImplementation(Int32DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int32DataFrameColumn newColumn = inPlace ? this : CloneAsInt32Column(); + Int32DataFrameColumn newColumn = inPlace ? this : (Int32DataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -533,7 +533,7 @@ internal Int64DataFrameColumn MultiplyImplementation(Int64DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int64DataFrameColumn newColumn = inPlace ? this : CloneAsInt64Column(); + Int64DataFrameColumn newColumn = inPlace ? this : (Int64DataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -546,7 +546,7 @@ internal UInt32DataFrameColumn MultiplyImplementation(UInt32DataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt32DataFrameColumn newColumn = inPlace ? this : CloneAsUInt32Column(); + UInt32DataFrameColumn newColumn = inPlace ? 
this : (UInt32DataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -559,7 +559,7 @@ internal UInt64DataFrameColumn MultiplyImplementation(UInt64DataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt64DataFrameColumn newColumn = inPlace ? this : CloneAsUInt64Column(); + UInt64DataFrameColumn newColumn = inPlace ? this : (UInt64DataFrameColumn)Clone(); newColumn.ColumnContainer.Multiply(column.ColumnContainer); return newColumn; } @@ -712,7 +712,7 @@ internal DecimalDataFrameColumn DivideImplementation(DecimalDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DecimalDataFrameColumn newColumn = inPlace ? this : CloneAsDecimalColumn(); + DecimalDataFrameColumn newColumn = inPlace ? this : (DecimalDataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -725,7 +725,7 @@ internal DoubleDataFrameColumn DivideImplementation(DoubleDataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DoubleDataFrameColumn newColumn = inPlace ? this : CloneAsDoubleColumn(); + DoubleDataFrameColumn newColumn = inPlace ? this : (DoubleDataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -738,7 +738,7 @@ internal SingleDataFrameColumn DivideImplementation(SingleDataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - SingleDataFrameColumn newColumn = inPlace ? this : CloneAsSingleColumn(); + SingleDataFrameColumn newColumn = inPlace ? 
this : (SingleDataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -751,7 +751,7 @@ internal Int32DataFrameColumn DivideImplementation(Int32DataFrameColumn column, { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int32DataFrameColumn newColumn = inPlace ? this : CloneAsInt32Column(); + Int32DataFrameColumn newColumn = inPlace ? this : (Int32DataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -764,7 +764,7 @@ internal Int64DataFrameColumn DivideImplementation(Int64DataFrameColumn column, { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int64DataFrameColumn newColumn = inPlace ? this : CloneAsInt64Column(); + Int64DataFrameColumn newColumn = inPlace ? this : (Int64DataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -777,7 +777,7 @@ internal UInt32DataFrameColumn DivideImplementation(UInt32DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt32DataFrameColumn newColumn = inPlace ? this : CloneAsUInt32Column(); + UInt32DataFrameColumn newColumn = inPlace ? this : (UInt32DataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -790,7 +790,7 @@ internal UInt64DataFrameColumn DivideImplementation(UInt64DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt64DataFrameColumn newColumn = inPlace ? this : CloneAsUInt64Column(); + UInt64DataFrameColumn newColumn = inPlace ? 
this : (UInt64DataFrameColumn)Clone(); newColumn.ColumnContainer.Divide(column.ColumnContainer); return newColumn; } @@ -943,7 +943,7 @@ internal DecimalDataFrameColumn ModuloImplementation(DecimalDataFrameColumn colu { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DecimalDataFrameColumn newColumn = inPlace ? this : CloneAsDecimalColumn(); + DecimalDataFrameColumn newColumn = inPlace ? this : (DecimalDataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -956,7 +956,7 @@ internal DoubleDataFrameColumn ModuloImplementation(DoubleDataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - DoubleDataFrameColumn newColumn = inPlace ? this : CloneAsDoubleColumn(); + DoubleDataFrameColumn newColumn = inPlace ? this : (DoubleDataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -969,7 +969,7 @@ internal SingleDataFrameColumn ModuloImplementation(SingleDataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - SingleDataFrameColumn newColumn = inPlace ? this : CloneAsSingleColumn(); + SingleDataFrameColumn newColumn = inPlace ? this : (SingleDataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -982,7 +982,7 @@ internal Int32DataFrameColumn ModuloImplementation(Int32DataFrameColumn column, { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int32DataFrameColumn newColumn = inPlace ? this : CloneAsInt32Column(); + Int32DataFrameColumn newColumn = inPlace ? 
this : (Int32DataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -995,7 +995,7 @@ internal Int64DataFrameColumn ModuloImplementation(Int64DataFrameColumn column, { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - Int64DataFrameColumn newColumn = inPlace ? this : CloneAsInt64Column(); + Int64DataFrameColumn newColumn = inPlace ? this : (Int64DataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -1008,7 +1008,7 @@ internal UInt32DataFrameColumn ModuloImplementation(UInt32DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt32DataFrameColumn newColumn = inPlace ? this : CloneAsUInt32Column(); + UInt32DataFrameColumn newColumn = inPlace ? this : (UInt32DataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } @@ -1021,7 +1021,7 @@ internal UInt64DataFrameColumn ModuloImplementation(UInt64DataFrameColumn column { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - UInt64DataFrameColumn newColumn = inPlace ? this : CloneAsUInt64Column(); + UInt64DataFrameColumn newColumn = inPlace ? 
this : (UInt64DataFrameColumn)Clone(); newColumn.ColumnContainer.Modulo(column.ColumnContainer); return newColumn; } diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.tt b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.tt index 001aab32b1..89eede1a0c 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.tt +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.BinaryOperationImplementations.Exploded.tt @@ -76,7 +76,7 @@ void GenerateAllBinaryCombinationsForMethod(string inputMethodName) { throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } - <#=fullReturnType#> newColumn = inPlace ? this : CloneAs<#=capitalizedReturnType#>Column(); + <#=fullReturnType#> newColumn = inPlace ? this : (<#=fullReturnType#>)Clone(); newColumn.ColumnContainer.<#=inputMethodName#>(column.ColumnContainer); return newColumn; } From cbc7c4d512e960d30fead475b59798161691f749 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Fri, 8 Sep 2023 22:02:06 +0300 Subject: [PATCH 3/8] Remove using --- src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index aa713a66c5..627efdea26 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -9,7 +9,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; -using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { From 63d983a1a26789136dfe660db95a561d85132609 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Sat, 9 Sep 2023 09:05:54 +0300 Subject: [PATCH 4/8] Fix DataFrameBuffer constructor --- .../DataFrameBuffer.cs | 11 +++++- .../PrimitiveColumnContainer.cs | 37 +++++++------------ 
2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index a22f94c56d..c0f2af2f4c 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -5,6 +5,7 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { @@ -36,7 +37,15 @@ public Span RawSpan get => MemoryMarshal.Cast(Buffer.Span); } - public DataFrameBuffer(int numberOfValues = 8) : base(numberOfValues) { } + public DataFrameBuffer(int numberOfValues = 8) + { + if ((long)numberOfValues * Size > MaxCapacity) + { + throw new ArgumentException($"{numberOfValues} exceeds buffer capacity", nameof(numberOfValues)); + } + + _memory = new byte[numberOfValues]; + } internal DataFrameBuffer(ReadOnlyMemory buffer, int length) { diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 627efdea26..78d58bbca5 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -527,9 +527,9 @@ internal PrimitiveColumnContainer CloneAsBoolContainer() var ret = new PrimitiveColumnContainer(); foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); + if (typeof(T) == typeof(bool)) { var localBuffer = buffer; @@ -554,9 +554,8 @@ internal PrimitiveColumnContainer CloneAsByteContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - 
newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -574,9 +573,8 @@ internal PrimitiveColumnContainer CloneAsSByteContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -594,9 +592,8 @@ internal PrimitiveColumnContainer CloneAsDoubleContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -614,9 +611,8 @@ internal PrimitiveColumnContainer CloneAsDecimalContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -634,9 +630,8 @@ internal PrimitiveColumnContainer CloneAsShortContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -654,9 +649,8 @@ internal PrimitiveColumnContainer CloneAsUShortContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += 
buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -674,9 +668,8 @@ internal PrimitiveColumnContainer CloneAsIntContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -694,9 +687,8 @@ internal PrimitiveColumnContainer CloneAsUIntContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -714,9 +706,8 @@ internal PrimitiveColumnContainer CloneAsLongContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { @@ -734,9 +725,8 @@ internal PrimitiveColumnContainer CloneAsULongContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < 
span.Length; i++) { @@ -754,9 +744,8 @@ internal PrimitiveColumnContainer CloneAsFloatContainer() foreach (ReadOnlyDataFrameBuffer buffer in Buffers) { ret.Length += buffer.Length; - DataFrameBuffer newBuffer = new DataFrameBuffer(); + DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); - newBuffer.EnsureCapacity(buffer.Length); ReadOnlySpan span = buffer.ReadOnlySpan; for (int i = 0; i < span.Length; i++) { From 6abf02e2394b893230888ddd829e47407f56d218 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Sat, 9 Sep 2023 09:08:51 +0300 Subject: [PATCH 5/8] remove incorrectly added using --- src/Microsoft.Data.Analysis/DataFrameBuffer.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index c0f2af2f4c..0f73eadf48 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -5,7 +5,6 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { From 1a47ce44fd1a0504e82c18e4d51ab137e6ba6c68 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Sun, 10 Sep 2023 10:20:13 +0300 Subject: [PATCH 6/8] Make DataFrameBuffer Length field protected --- .../ArrowStringDataFrameColumn.cs | 6 +++--- src/Microsoft.Data.Analysis/DataFrameBuffer.cs | 16 ++++++++++++---- .../PrimitiveColumnContainer.cs | 14 +++++--------- .../ReadOnlyDataFrameBuffer.cs | 11 ++++++----- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs index 56041711e3..d0b9479e17 100644 --- a/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs @@ -213,9 +213,9 @@ private void Append(ReadOnlySpan value)
_offsetsBuffers.Add(mutableOffsetsBuffer); mutableOffsetsBuffer.Append(0); } - mutableDataBuffer.EnsureCapacity(value.Length); - value.CopyTo(mutableDataBuffer.RawSpan.Slice(mutableDataBuffer.Length)); - mutableDataBuffer.Length += value.Length; + var startIndex = mutableDataBuffer.Length; + mutableDataBuffer.IncreaseSize(value.Length); + value.CopyTo(mutableDataBuffer.RawSpan.Slice(startIndex)); mutableOffsetsBuffer.Append(mutableOffsetsBuffer[mutableOffsetsBuffer.Length - 1] + value.Length); } SetValidityBit(Length - 1, value != default); diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index 0f73eadf48..7aee0a552e 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -15,6 +15,8 @@ namespace Microsoft.Data.Analysis internal class DataFrameBuffer : ReadOnlyDataFrameBuffer where T : unmanaged { + private const int MinCapacity = 8; + private Memory _memory; public override ReadOnlyMemory ReadOnlyBuffer => _memory; @@ -36,14 +38,14 @@ public Span RawSpan get => MemoryMarshal.Cast(Buffer.Span); } - public DataFrameBuffer(int numberOfValues = 8) + public DataFrameBuffer(int capacity = 0) { - if ((long)numberOfValues * Size > MaxCapacity) + if ((long)capacity * Size > MaxCapacity) { - throw new ArgumentException($"{numberOfValues} exceeds buffer capacity", nameof(numberOfValues)); + throw new ArgumentException($"{capacity} exceeds buffer capacity", nameof(capacity)); } - _memory = new byte[numberOfValues]; + _memory = new byte[Math.Max(capacity, MinCapacity)]; } internal DataFrameBuffer(ReadOnlyMemory buffer, int length) @@ -65,6 +67,12 @@ public void Append(T value) Span[Length - 1] = value; } + public void IncreaseSize(int numberOfValues) + { + EnsureCapacity(numberOfValues); + Length += numberOfValues; + } + public void EnsureCapacity(int numberOfValues) { long newLength = Length + (long)numberOfValues; diff --git 
a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 78d58bbca5..8c9700fb0e 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -67,9 +67,8 @@ public PrimitiveColumnContainer(ReadOnlyMemory buffer, ReadOnlyMemory dataBuffer; if (buffer.IsEmpty) { - DataFrameBuffer mutableBuffer = new DataFrameBuffer(); - mutableBuffer.EnsureCapacity(length); - mutableBuffer.Length = length; + DataFrameBuffer mutableBuffer = new DataFrameBuffer(length); + mutableBuffer.IncreaseSize(length); mutableBuffer.RawSpan.Fill(default(T)); dataBuffer = mutableBuffer; } @@ -172,15 +171,12 @@ public void AppendMany(T? value, long count) //Calculate how many values we can additionaly allocate and not exceed the MaxCapacity int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer.MaxCapacity - mutableLastBuffer.Length); - mutableLastBuffer.EnsureCapacity(allocatable); + mutableLastBuffer.IncreaseSize(allocatable); DataFrameBuffer lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1); int nullBufferAllocatable = (allocatable + 7) / 8; - lastNullBitMapBuffer.EnsureCapacity(nullBufferAllocatable); + lastNullBitMapBuffer.IncreaseSize(nullBufferAllocatable); - - mutableLastBuffer.Length += allocatable; - lastNullBitMapBuffer.Length += nullBufferAllocatable; Length += allocatable; if (value.HasValue) @@ -529,6 +525,7 @@ internal PrimitiveColumnContainer CloneAsBoolContainer() { DataFrameBuffer newBuffer = new DataFrameBuffer(buffer.Length); ret.Buffers.Add(newBuffer); + newBuffer.IncreaseSize(buffer.Length); if (typeof(T) == typeof(bool)) { @@ -540,7 +537,6 @@ internal PrimitiveColumnContainer CloneAsBoolContainer() { newBuffer.Span.Fill(false); } - newBuffer.Length = buffer.Length; ret.Length += buffer.Length; } ret.NullBitMapBuffers = CloneNullBitMapBuffers(); diff --git 
a/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs b/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs index a6a41089e7..069aa3e94a 100644 --- a/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/ReadOnlyDataFrameBuffer.cs @@ -48,15 +48,16 @@ public ReadOnlySpan ReadOnlySpan get => (MemoryMarshal.Cast(ReadOnlyBuffer.Span)).Slice(0, Length); } - public int Length { get; internal set; } + public int Length { get; protected set; } - public ReadOnlyDataFrameBuffer(int numberOfValues = 8) + public ReadOnlyDataFrameBuffer(int length = 0) { - if ((long)numberOfValues * Size > MaxCapacity) + if ((long)length * Size > MaxCapacity) { - throw new ArgumentException($"{numberOfValues} exceeds buffer capacity", nameof(numberOfValues)); + throw new ArgumentException($"{length} exceeds buffer capacity", nameof(length)); } - _readOnlyBuffer = new byte[numberOfValues * Size]; + _readOnlyBuffer = new byte[length * Size]; + Length = length; } public ReadOnlyDataFrameBuffer(ReadOnlyMemory buffer, int length) From be47f8daaf21f2de613b3ab4818521e2ddd741a6 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Tue, 19 Sep 2023 12:07:23 +0300 Subject: [PATCH 7/8] Fix typo --- test/Microsoft.ML.Fairlearn.Tests/GridSearchTest.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.Fairlearn.Tests/GridSearchTest.cs b/test/Microsoft.ML.Fairlearn.Tests/GridSearchTest.cs index e539c10907..35994e6a44 100644 --- a/test/Microsoft.ML.Fairlearn.Tests/GridSearchTest.cs +++ b/test/Microsoft.ML.Fairlearn.Tests/GridSearchTest.cs @@ -45,7 +45,7 @@ private DataFrame CreateDummyDataset() } // Data generated so it is identical from Binary_Classification.ipynb from Fairlearn.github on Github - private DataFrame CreateGridScearhDataset() + private DataFrame CreateGridSearchDataset() { float[] score_feature = new float[52]; int index = 0; @@ -89,7 +89,7 @@ public void TestGridSearchTrialRunner2() } }; var experiment = 
context.Auto().CreateExperiment(); - var df = CreateGridScearhDataset(); + var df = CreateGridSearchDataset(); var shuffledDataset = context.Data.ShuffleRows(df); var trainTestSplit = context.Data.TrainTestSplit(shuffledDataset, 0.2); var pipeline = context.Transforms.Categorical.OneHotHashEncoding("sensitiveFeature_encode", "sensitiveFeature") From d1b06865040a3c68672d3caca42b11e988ac1ecb Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Tue, 19 Sep 2023 13:03:51 +0300 Subject: [PATCH 8/8] Use RawSpan --- src/Microsoft.Data.Analysis/DataFrameBuffer.cs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs index 7aee0a552e..fd7f23964f 100644 --- a/src/Microsoft.Data.Analysis/DataFrameBuffer.cs +++ b/src/Microsoft.Data.Analysis/DataFrameBuffer.cs @@ -57,14 +57,10 @@ internal DataFrameBuffer(ReadOnlyMemory buffer, int length) public void Append(T value) { - if (Length == MaxCapacity) - { - throw new ArgumentException("Current buffer is full", nameof(value)); - } EnsureCapacity(1); - if (Length < MaxCapacity) - ++Length; - Span[Length - 1] = value; + + RawSpan[Length] = value; + Length++; } public void IncreaseSize(int numberOfValues)