Skip to content

Commit 1203495

Browse files
author
Prashanth Govindarajan
committed
Remove unused param
Docs for maxRows. More unit tests. Fixed ArrowStringDataFrameColumn construction in the unit test.
1 parent 1ce802b commit 1203495

File tree

7 files changed

+87
-64
lines changed

7 files changed

+87
-64
lines changed

src/Microsoft.Data.Analysis/DataFrame.IDataView.cs

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public partial class DataFrame : IDataView
1616
bool IDataView.CanShuffle => false;
1717

1818
private DataViewSchema _schema;
19-
internal DataViewSchema DataViewSchema
19+
private DataViewSchema DataViewSchema
2020
{
2121
get
2222
{
@@ -70,29 +70,22 @@ private sealed class RowCursor : DataViewRowCursor
7070
private bool _disposed;
7171
private long _position;
7272
private readonly DataFrame _dataFrame;
73-
private readonly List<Delegate> _getters;
74-
private Dictionary<int, int> _columnIndexToGetterIndex;
73+
private readonly Delegate[] _getters;
7574

7675
public RowCursor(DataFrame dataFrame, bool[] activeColumns)
7776
{
7877
Debug.Assert(dataFrame != null);
7978
Debug.Assert(activeColumns != null);
8079

81-
_columnIndexToGetterIndex = new Dictionary<int, int>();
8280
_position = -1;
8381
_dataFrame = dataFrame;
84-
_getters = new List<Delegate>();
85-
for (int i = 0; i < Schema.Count; i++)
82+
_getters = new Delegate[Schema.Count];
83+
for (int i = 0; i < _getters.Length; i++)
8684
{
8785
if (!activeColumns[i])
88-
{
8986
continue;
90-
}
91-
92-
Delegate getter = CreateGetterDelegate(i);
93-
_getters.Add(getter);
94-
Debug.Assert(getter != null);
95-
_columnIndexToGetterIndex[i] = _getters.Count - 1;
87+
_getters[i] = CreateGetterDelegate(i);
88+
Debug.Assert(_getters[i] != null);
9689
}
9790
}
9891

@@ -103,15 +96,11 @@ public RowCursor(DataFrame dataFrame, bool[] activeColumns)
10396
protected override void Dispose(bool disposing)
10497
{
10598
if (_disposed)
106-
{
10799
return;
108-
}
109-
110100
if (disposing)
111101
{
112102
_position = -1;
113103
}
114-
115104
_disposed = true;
116105
base.Dispose(disposing);
117106
}
@@ -127,7 +116,7 @@ public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column colu
127116
if (!IsColumnActive(column))
128117
throw new ArgumentOutOfRangeException(nameof(column));
129118

130-
return (ValueGetter<TValue>)_getters[_columnIndexToGetterIndex[column.Index]];
119+
return (ValueGetter<TValue>)_getters[column.Index];
131120
}
132121

133122
public override ValueGetter<DataViewRowId> GetIdGetter()
@@ -137,15 +126,13 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
137126

138127
public override bool IsColumnActive(DataViewSchema.Column column)
139128
{
140-
return _getters[_columnIndexToGetterIndex[column.Index]] != null;
129+
return _getters[column.Index] != null;
141130
}
142131

143132
public override bool MoveNext()
144133
{
145134
if (_disposed)
146-
{
147135
return false;
148-
}
149136
_position++;
150137
return _position < _dataFrame.Rows.Count;
151138
}

src/Microsoft.Data.Analysis/DataFrameColumn.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,15 +251,14 @@ public virtual DataFrameColumn Sort(bool ascending = true)
251251
/// Appends a value to this <see cref="DataFrameColumn"/> using <paramref name="cursor"/>
252252
/// </summary>
253253
/// <param name="cursor">The row cursor which has the current position</param>
254-
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
255254
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
256-
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException();
255+
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter) => throw new NotImplementedException();
257256

258257
/// <summary>
259258
/// Returns the ValueGetter for each active column in <paramref name="cursor"/> as a delegate to be cached.
260259
/// </summary>
261260
/// <param name="cursor">The row cursor which has the current position</param>
262-
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
261+
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the ValueGetter for.</param>
263262
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException();
264263

265264
/// <summary>

src/Microsoft.Data.Analysis/IDataView.Extension.cs

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,113 +13,131 @@ public static class IDataViewExtensions
1313
{
1414
private const int defaultMaxRows = 100;
1515

16+
/// <summary>
17+
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> from this <paramref name="dataView"/>.
18+
/// </summary>
19+
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
20+
/// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Defaults to 100. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
21+
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with at most <paramref name="maxRows"/> rows.</returns>
1622
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
1723
{
1824
return ToDataFrame(dataView, maxRows, null);
1925
}
2026

27+
/// <summary>
28+
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first 100 rows of this <paramref name="dataView"/>.
29+
/// </summary>
30+
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
31+
/// <param name="selectColumns">The columns selected for the resultant DataFrame</param>
32+
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and 100 rows.</returns>
2133
public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
2234
{
2335
return ToDataFrame(dataView, defaultMaxRows, selectColumns);
2436
}
2537

38+
/// <summary>
39+
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first <paramref name="maxRows"/> rows of this <paramref name="dataView"/>.
40+
/// </summary>
41+
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
42+
/// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
43+
/// <param name="selectColumns">The columns selected for the resultant DataFrame</param>
44+
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and <paramref name="maxRows"/> rows.</returns>
2645
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
2746
{
2847
DataViewSchema schema = dataView.Schema;
29-
List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);
48+
List<DataFrameColumn> dataFrameColumns = new List<DataFrameColumn>(schema.Count);
49+
maxRows = maxRows == -1 ? long.MaxValue : maxRows;
3050

3151
HashSet<string> selectColumnsSet = null;
3252
if (selectColumns != null && selectColumns.Length > 0)
3353
{
3454
selectColumnsSet = new HashSet<string>(selectColumns);
3555
}
3656

37-
List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>();
38-
foreach (DataViewSchema.Column column in schema)
57+
List<DataViewSchema.Column> activeDataViewColumns = new List<DataViewSchema.Column>();
58+
foreach (DataViewSchema.Column dataViewColumn in schema)
3959
{
40-
if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
60+
if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name)))
4161
{
4262
continue;
4363
}
4464

45-
activeColumns.Add(column);
46-
DataViewType type = column.Type;
65+
activeDataViewColumns.Add(dataViewColumn);
66+
DataViewType type = dataViewColumn.Type;
4767
if (type == BooleanDataViewType.Instance)
4868
{
49-
columns.Add(new BooleanDataFrameColumn(column.Name));
69+
dataFrameColumns.Add(new BooleanDataFrameColumn(dataViewColumn.Name));
5070
}
5171
else if (type == NumberDataViewType.Byte)
5272
{
53-
columns.Add(new ByteDataFrameColumn(column.Name));
73+
dataFrameColumns.Add(new ByteDataFrameColumn(dataViewColumn.Name));
5474
}
5575
else if (type == NumberDataViewType.Double)
5676
{
57-
columns.Add(new DoubleDataFrameColumn(column.Name));
77+
dataFrameColumns.Add(new DoubleDataFrameColumn(dataViewColumn.Name));
5878
}
5979
else if (type == NumberDataViewType.Single)
6080
{
61-
columns.Add(new SingleDataFrameColumn(column.Name));
81+
dataFrameColumns.Add(new SingleDataFrameColumn(dataViewColumn.Name));
6282
}
6383
else if (type == NumberDataViewType.Int32)
6484
{
65-
columns.Add(new Int32DataFrameColumn(column.Name));
85+
dataFrameColumns.Add(new Int32DataFrameColumn(dataViewColumn.Name));
6686
}
6787
else if (type == NumberDataViewType.Int64)
6888
{
69-
columns.Add(new Int64DataFrameColumn(column.Name));
89+
dataFrameColumns.Add(new Int64DataFrameColumn(dataViewColumn.Name));
7090
}
7191
else if (type == NumberDataViewType.SByte)
7292
{
73-
columns.Add(new SByteDataFrameColumn(column.Name));
93+
dataFrameColumns.Add(new SByteDataFrameColumn(dataViewColumn.Name));
7494
}
7595
else if (type == NumberDataViewType.Int16)
7696
{
77-
columns.Add(new Int16DataFrameColumn(column.Name));
97+
dataFrameColumns.Add(new Int16DataFrameColumn(dataViewColumn.Name));
7898
}
7999
else if (type == NumberDataViewType.UInt32)
80100
{
81-
columns.Add(new UInt32DataFrameColumn(column.Name));
101+
dataFrameColumns.Add(new UInt32DataFrameColumn(dataViewColumn.Name));
82102
}
83103
else if (type == NumberDataViewType.UInt64)
84104
{
85-
columns.Add(new UInt64DataFrameColumn(column.Name));
105+
dataFrameColumns.Add(new UInt64DataFrameColumn(dataViewColumn.Name));
86106
}
87107
else if (type == NumberDataViewType.UInt16)
88108
{
89-
columns.Add(new UInt16DataFrameColumn(column.Name));
109+
dataFrameColumns.Add(new UInt16DataFrameColumn(dataViewColumn.Name));
90110
}
91111
else if (type == TextDataViewType.Instance)
92112
{
93-
columns.Add(new StringDataFrameColumn(column.Name));
113+
dataFrameColumns.Add(new StringDataFrameColumn(dataViewColumn.Name));
94114
}
95115
else
96116
{
97117
throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
98118
}
99119
}
100120

101-
using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
121+
using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns))
102122
{
103-
Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count];
123+
Delegate[] activeColumnDelegates = new Delegate[activeDataViewColumns.Count];
104124
int columnIndex = 0;
105-
foreach (DataViewSchema.Column column in activeColumns)
125+
foreach (DataViewSchema.Column activeDataViewColumn in activeDataViewColumns)
106126
{
107-
Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column);
127+
Delegate valueGetter = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumn);
108128
activeColumnDelegates[columnIndex] = valueGetter;
109129
columnIndex++;
110130
}
111131
while (cursor.MoveNext() && cursor.Position < maxRows)
112132
{
113-
columnIndex = 0;
114-
foreach (DataViewSchema.Column column in activeColumns)
133+
for (int i = 0; i < activeColumnDelegates.Length; i++)
115134
{
116-
columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]);
117-
columnIndex++;
135+
dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]);
118136
}
119137
}
120138
}
121139

122-
return new DataFrame(columns);
140+
return new DataFrame(dataFrameColumns);
123141
}
124142
}
125143

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,7 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs
776776
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column) =>
777777
(ref double value) => value = (double?)column[cursor.Position] ?? double.NaN;
778778

779-
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter)
779+
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
780780
{
781781
long row = cursor.Position;
782782
T value = default;

src/Microsoft.Data.Analysis/StringDataFrameColumn.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,7 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)
468468
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor) =>
469469
(ref ReadOnlyMemory<char> value) => value = this[cursor.Position].AsMemory();
470470

471-
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter)
471+
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
472472
{
473473
long row = cursor.Position;
474474
ReadOnlyMemory<char> value = default;
@@ -489,6 +489,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D
489489
throw new IndexOutOfRangeException(nameof(row));
490490
}
491491
}
492+
492493
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
493494
{
494495
return cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);

test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -252,25 +252,41 @@ public void TestDataFrameFromIDataView_SelectColumns()
252252
Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All());
253253
}
254254

255-
[Fact]
256-
public void TestDataFrameFromIDataView_SelectRows()
255+
[Theory]
256+
[InlineData(10, 5)]
257+
[InlineData(110, 100)]
258+
[InlineData(110, -1)]
259+
public void TestDataFrameFromIDataView_SelectRows(int dataFrameSize, int rowSize)
257260
{
258-
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
261+
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(dataFrameSize, withNulls: false);
259262
df.Columns.Remove("Char"); // Because chars are returned as uint16 by DataViewSchema, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts
260263
df.Columns.Remove("Decimal"); // Because decimal is returned as double by DataViewSchema, so end up comparing DecimalDataFrameColumn to DoubleDataFrameColumn and fail asserts
261264
IDataView dfAsIDataView = df;
262-
DataFrame newDf = dfAsIDataView.ToDataFrame(5);
263-
Assert.Equal(5, newDf.Rows.Count);
265+
DataFrame newDf;
266+
if (rowSize == 100)
267+
{
268+
// Test default
269+
newDf = dfAsIDataView.ToDataFrame();
270+
}
271+
else
272+
{
273+
newDf = dfAsIDataView.ToDataFrame(rowSize);
274+
}
275+
if (rowSize == -1)
276+
{
277+
rowSize = dataFrameSize;
278+
}
279+
Assert.Equal(rowSize, newDf.Rows.Count);
264280
Assert.Equal(df.Columns.Count, newDf.Columns.Count);
265281
for (int i = 0; i < newDf.Columns.Count; i++)
266282
{
267-
Assert.Equal(5, newDf.Columns[i].Length);
283+
Assert.Equal(rowSize, newDf.Columns[i].Length);
268284
Assert.Equal(df.Columns[i].Name, newDf.Columns[i].Name);
269285
}
270286
Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count);
271287
for (int c = 0; c < df.Columns.Count; c++)
272288
{
273-
for (int r = 0; r < 5; r++)
289+
for (int r = 0; r < rowSize; r++)
274290
{
275291
Assert.Equal(df.Columns[c][r], newDf.Columns[c][r]);
276292
}

test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,12 @@ public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, boo
6262

6363
// write the current length to (index + 1)
6464
int offsetIndex = (i + 1) * 4;
65-
offsetMemory[offsetIndex++] = (byte)(3 * validStringsIndex);
66-
offsetMemory[offsetIndex++] = 0;
67-
offsetMemory[offsetIndex++] = 0;
68-
offsetMemory[offsetIndex++] = 0;
65+
int offsetValue = 3 * validStringsIndex;
66+
byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue);
67+
offsetMemory[offsetIndex++] = offsetValueBytes[0];
68+
offsetMemory[offsetIndex++] = offsetValueBytes[1];
69+
offsetMemory[offsetIndex++] = offsetValueBytes[2];
70+
offsetMemory[offsetIndex++] = offsetValueBytes[3];
6971
}
7072

7173
int nullCount = withNulls ? 1 : 0;

0 commit comments

Comments
 (0)