Skip to content

Commit 69cc4bc

Browse files
authored
Fix incorrect DataFrame min max computation with NULL (#6734)
* Step 1 * Step 2 * Fixed code review findings
1 parent 578d7bc commit 69cc4bc

9 files changed

+791
-445
lines changed

src/Microsoft.Data.Analysis/DateTimeComputation.cs

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
using System;
66
using System.Collections.Generic;
7+
using System.Diagnostics;
8+
using System.Reflection;
79
using System.Text;
810

911
namespace Microsoft.Data.Analysis
@@ -189,26 +191,35 @@ public void CumulativeSum(PrimitiveColumnContainer<DateTime> column, IEnumerable
189191
throw new NotSupportedException();
190192
}
191193

192-
public void Max(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
194+
public void Max(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
193195
{
194-
ret = column.Buffers[0].ReadOnlySpan[0];
196+
var maxDate = DateTime.MinValue;
197+
bool hasMaxValue = false;
198+
195199
for (int b = 0; b < column.Buffers.Count; b++)
196200
{
197-
var buffer = column.Buffers[b];
198-
var readOnlySpan = buffer.ReadOnlySpan;
201+
var readOnlySpan = column.Buffers[b].ReadOnlySpan;
202+
var bitmapSpan = column.NullBitMapBuffers[b].ReadOnlySpan;
199203
for (int i = 0; i < readOnlySpan.Length; i++)
200204
{
205+
//Check if bit is not set (value is null) - skip
206+
if (!BitmapHelper.IsValid(bitmapSpan, i))
207+
continue;
208+
201209
var val = readOnlySpan[i];
202210

203-
if (val > ret)
211+
if (val > maxDate)
204212
{
205-
ret = val;
213+
maxDate = val;
214+
hasMaxValue = true;
206215
}
207216
}
208217
}
218+
219+
ret = hasMaxValue ? maxDate : null;
209220
}
210221

211-
public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
222+
public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
212223
{
213224
ret = default;
214225
var readOnlySpan = column.Buffers[0].ReadOnlySpan;
@@ -237,26 +248,36 @@ public void Max(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> row
237248
}
238249
}
239250

240-
public void Min(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
251+
public void Min(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
241252
{
242-
ret = column.Buffers[0].ReadOnlySpan[0];
253+
var minDate = DateTime.MaxValue;
254+
bool hasMinValue = false;
255+
243256
for (int b = 0; b < column.Buffers.Count; b++)
244257
{
245-
var buffer = column.Buffers[b];
246-
var readOnlySpan = buffer.ReadOnlySpan;
258+
var readOnlySpan = column.Buffers[b].ReadOnlySpan;
259+
var bitmapSpan = column.NullBitMapBuffers[b].ReadOnlySpan;
260+
247261
for (int i = 0; i < readOnlySpan.Length; i++)
248262
{
263+
//Check if bit is not set (value is null) - skip
264+
if (!BitmapHelper.IsValid(bitmapSpan, i))
265+
continue;
266+
249267
var val = readOnlySpan[i];
250268

251-
if (val < ret)
269+
if (val < minDate)
252270
{
253-
ret = val;
271+
minDate = val;
272+
hasMinValue = true;
254273
}
255274
}
256275
}
276+
277+
ret = hasMinValue ? minDate : null;
257278
}
258279

259-
public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
280+
public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
260281
{
261282
ret = default;
262283
var readOnlySpan = column.Buffers[0].ReadOnlySpan;
@@ -285,22 +306,22 @@ public void Min(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> row
285306
}
286307
}
287308

288-
public void Product(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
309+
public void Product(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
289310
{
290311
throw new NotSupportedException();
291312
}
292313

293-
public void Product(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
314+
public void Product(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
294315
{
295316
throw new NotSupportedException();
296317
}
297318

298-
public void Sum(PrimitiveColumnContainer<DateTime> column, out DateTime ret)
319+
public void Sum(PrimitiveColumnContainer<DateTime> column, out DateTime? ret)
299320
{
300321
throw new NotSupportedException();
301322
}
302323

303-
public void Sum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime ret)
324+
public void Sum(PrimitiveColumnContainer<DateTime> column, IEnumerable<long> rows, out DateTime? ret)
304325
{
305326
throw new NotSupportedException();
306327
}

src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,24 @@
1212

1313
namespace Microsoft.Data.Analysis
1414
{
15+
internal static class BitmapHelper
16+
{
17+
// Faster to use when we already have a span since it avoids indexing
18+
public static bool IsValid(ReadOnlySpan<byte> bitMapBufferSpan, int index)
19+
{
20+
int nullBitMapSpanIndex = index / 8;
21+
byte thisBitMap = bitMapBufferSpan[nullBitMapSpanIndex];
22+
return IsBitSet(thisBitMap, index);
23+
}
24+
25+
public static bool IsBitSet(byte curBitMap, int index)
26+
{
27+
return ((curBitMap >> (index & 7)) & 1) != 0;
28+
}
29+
}
30+
1531
/// <summary>
16-
/// PrimitiveDataFrameColumnContainer is just a store for the column data. APIs that want to change the data must be defined in PrimitiveDataFrameColumn
32+
/// PrimitiveColumnContainer is just a store for the column data. APIs that want to change the data must be defined in PrimitiveDataFrameColumn
1733
/// </summary>
1834
/// <typeparam name="T"></typeparam>
1935
internal partial class PrimitiveColumnContainer<T> : IEnumerable<T?>
@@ -224,7 +240,7 @@ public void ApplyElementwise(Func<T?, long, T?> func)
224240
for (int i = 0; i < mutableBuffer.Length; i++)
225241
{
226242
long curIndex = i + prevLength;
227-
bool isValid = IsValid(mutableNullBitMapBuffer, i);
243+
bool isValid = BitmapHelper.IsValid(mutableNullBitMapBuffer, i);
228244
T? value = func(isValid ? mutableBuffer[i] : null, curIndex);
229245
mutableBuffer[i] = value.GetValueOrDefault();
230246
SetValidityBit(mutableNullBitMapBuffer, i, value != null);
@@ -247,22 +263,14 @@ public void Apply<TResult>(Func<T?, TResult?> func, PrimitiveColumnContainer<TRe
247263

248264
for (int i = 0; i < sourceBuffer.Length; i++)
249265
{
250-
bool isValid = IsValid(sourceNullBitMap, i);
266+
bool isValid = BitmapHelper.IsValid(sourceNullBitMap, i);
251267
TResult? value = func(isValid ? sourceBuffer[i] : null);
252268
mutableResultBuffer[i] = value.GetValueOrDefault();
253269
resultContainer.SetValidityBit(mutableResultNullBitMapBuffers, i, value != null);
254270
}
255271
}
256272
}
257273

258-
// Faster to use when we already have a span since it avoids indexing
259-
public bool IsValid(ReadOnlySpan<byte> bitMapBufferSpan, int index)
260-
{
261-
int nullBitMapSpanIndex = index / 8;
262-
byte thisBitMap = bitMapBufferSpan[nullBitMapSpanIndex];
263-
return IsBitSet(thisBitMap, index);
264-
}
265-
266274
public bool IsValid(long index) => NullCount == 0 || GetValidityBit(index);
267275

268276
private byte SetBit(byte curBitMap, int index, bool value)
@@ -330,11 +338,6 @@ internal void SetValidityBit(long index, bool value)
330338
SetValidityBit(bitMapBuffer.Span, (int)index, value);
331339
}
332340

333-
private bool IsBitSet(byte curBitMap, int index)
334-
{
335-
return ((curBitMap >> (index & 7)) & 1) != 0;
336-
}
337-
338341
private bool GetValidityBit(long index)
339342
{
340343
if ((uint)index >= Length)
@@ -351,7 +354,7 @@ private bool GetValidityBit(long index)
351354
int bitMapBufferIndex = (int)((uint)index / 8);
352355
Debug.Assert(bitMapBuffer.Length > bitMapBufferIndex);
353356
byte curBitMap = bitMapBuffer[bitMapBufferIndex];
354-
return IsBitSet(curBitMap, (int)index);
357+
return BitmapHelper.IsBitSet(curBitMap, (int)index);
355358
}
356359

357360
public long Length;
@@ -513,7 +516,7 @@ public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndic
513516
spanIndex = buffer.Length - 1 - i;
514517

515518
long mapRowIndex = mapIndicesIntSpan.IsEmpty ? mapIndicesLongSpan[spanIndex] : mapIndicesIntSpan[spanIndex];
516-
bool mapRowIndexIsValid = mapIndices.IsValid(mapIndicesNullBitMapSpan, spanIndex);
519+
bool mapRowIndexIsValid = BitmapHelper.IsValid(mapIndicesNullBitMapSpan, spanIndex);
517520
if (mapRowIndexIsValid && (mapRowIndex < minRange || mapRowIndex >= maxRange))
518521
{
519522
int bufferIndex = (int)(mapRowIndex / maxCapacity);
@@ -528,7 +531,7 @@ public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndic
528531
{
529532
mapRowIndex -= minRange;
530533
value = thisSpan[(int)mapRowIndex];
531-
isValid = IsValid(thisNullBitMapSpan, (int)mapRowIndex);
534+
isValid = BitmapHelper.IsValid(thisNullBitMapSpan, (int)mapRowIndex);
532535
}
533536

534537
retSpan[i] = isValid ? value : default;

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.Computations.cs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,49 +93,49 @@ public override DataFrameColumn CumulativeSum(IEnumerable<long> rowIndices, bool
9393
/// <inheritdoc/>
9494
public override object Max()
9595
{
96-
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, out T ret);
96+
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, out T? ret);
9797
return ret;
9898
}
9999
/// <inheritdoc/>
100100
public override object Max(IEnumerable<long> rowIndices)
101101
{
102-
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, rowIndices, out T ret);
102+
PrimitiveColumnComputation<T>.Instance.Max(_columnContainer, rowIndices, out T? ret);
103103
return ret;
104104
}
105105
/// <inheritdoc/>
106106
public override object Min()
107107
{
108-
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, out T ret);
108+
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, out T? ret);
109109
return ret;
110110
}
111111
/// <inheritdoc/>
112112
public override object Min(IEnumerable<long> rowIndices)
113113
{
114-
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, rowIndices, out T ret);
114+
PrimitiveColumnComputation<T>.Instance.Min(_columnContainer, rowIndices, out T? ret);
115115
return ret;
116116
}
117117
/// <inheritdoc/>
118118
public override object Product()
119119
{
120-
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, out T ret);
120+
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, out T? ret);
121121
return ret;
122122
}
123123
/// <inheritdoc/>
124124
public override object Product(IEnumerable<long> rowIndices)
125125
{
126-
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, rowIndices, out T ret);
126+
PrimitiveColumnComputation<T>.Instance.Product(_columnContainer, rowIndices, out T? ret);
127127
return ret;
128128
}
129129
/// <inheritdoc/>
130130
public override object Sum()
131131
{
132-
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, out T ret);
132+
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, out T? ret);
133133
return ret;
134134
}
135135
/// <inheritdoc/>
136136
public override object Sum(IEnumerable<long> rowIndices)
137137
{
138-
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, rowIndices, out T ret);
138+
PrimitiveColumnComputation<T>.Instance.Sum(_columnContainer, rowIndices, out T? ret);
139139
return ret;
140140
}
141141
/// <inheritdoc/>

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.Computations.tt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,13 @@ namespace Microsoft.Data.Analysis
5151
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(ret._columnContainer);
5252
return ret;
5353
<# } else if (compMethod.MethodType == MethodType.ElementwiseComputation && compMethod.HasReturnValue == true) {#>
54-
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T ret);
54+
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T? ret);
5555
return ret;
5656
<# } else if (compMethod.MethodType == MethodType.Reduction && compMethod.IsNumeric == true && compMethod.SupportsRowSubsets == true) { #>
57-
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, rowIndices, out T ret);
57+
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, rowIndices, out T? ret);
5858
return ret;
5959
<# } else if (compMethod.MethodType == MethodType.Reduction && compMethod.IsNumeric == true) { #>
60-
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T ret);
60+
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out T? ret);
6161
return ret;
6262
<# } else { #>
6363
PrimitiveColumnComputation<T>.Instance.<#=compMethod.MethodName#>(_columnContainer, out bool ret);

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.Sort.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ private Int64DataFrameColumn GetSortIndices(IComparer<T> comparer, out Int64Data
4545
for (int i = 0; i < sortIndices.Length; i++)
4646
{
4747
int localSortIndex = sortIndices[i];
48-
if (_columnContainer.IsValid(nullBitMapSpan, localSortIndex))
48+
if (BitmapHelper.IsValid(nullBitMapSpan, localSortIndex))
4949
{
5050
nonNullSortIndices.Add(sortIndices[i]);
5151
}

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ public override Dictionary<TKey, ICollection<long>> GroupColumnValues<TKey>(out
584584
for (int i = 0; i < readOnlySpan.Length; i++)
585585
{
586586
long currentLength = i + previousLength;
587-
if (_columnContainer.IsValid(nullBitMapSpan, i))
587+
if (BitmapHelper.IsValid(nullBitMapSpan, i))
588588
{
589589
bool containsKey = multimap.TryGetValue(readOnlySpan[i], out ICollection<long> values);
590590
if (containsKey)

0 commit comments

Comments
 (0)