Skip to content

Commit 4bf9be3

Browse files
Yash Dattaisnotinvain
authored andcommitted
PARQUET-136: NPE thrown in StatisticsFilter when all values in a string/binary column trunk are null
In case of all nulls in a binary column, statistics object read from file metadata is empty, and should return true for all nulls check for the column. Even if column has no values, it can be ignored. The other way is to fix this behaviour in the writer, but is that what we want ? Author: Yash Datta <[email protected]> Author: Alex Levenson <[email protected]> Author: Yash Datta <[email protected]> Closes apache#99 from saucam/npe and squashes the following commits: 5138e44 [Yash Datta] PARQUET-136: Remove unreachable block b17cd38 [Yash Datta] Revert "PARQUET-161: Trigger tests" 82209e6 [Yash Datta] PARQUET-161: Trigger tests aab2f81 [Yash Datta] PARQUET-161: Review comments for the test case 2217ee2 [Yash Datta] PARQUET-161: Add a test case for checking the correct statistics info is recorded in case of all nulls in a column c2f8d6f [Yash Datta] PARQUET-161: Fix the write path to write statistics object in case of only nulls in the column 97bb517 [Yash Datta] Revert "revert TestStatisticsFilter.java" a06f0d0 [Yash Datta] Merge pull request apache#1 from isnotinvain/alexlevenson/PARQUET-161-136 b1001eb [Alex Levenson] Fix statistics isEmpty, handle more edge cases in statistics filter 0c88be0 [Alex Levenson] revert TestStatisticsFilter.java 1ac9192 [Yash Datta] PARQUET-136: Its better to not filter chunks for which empty statistics object is returned. Empty statistics can be read in case of 1. pre-statistics files, 2. files written from current writer that has a bug, as it does not write the statistics if column has all nulls e5e924e [Yash Datta] Revert "PARQUET-136: In case of all nulls in a binary column, statistics object read from file metadata is empty, and should return true for all nulls check for the column" 8cc5106 [Yash Datta] Revert "PARQUET-136: fix hasNulls to cater to the case where all values are nulls" c7c126f [Yash Datta] PARQUET-136: fix hasNulls to cater to the case where all values are nulls 974a22b [Yash Datta] PARQUET-136: In case of all nulls in a binary column, statistics object read from file metadata is empty, and should return true for all nulls check for the column
1 parent d70fdbc commit 4bf9be3

File tree

11 files changed

+152
-47
lines changed

11 files changed

+152
-47
lines changed

parquet-column/src/main/java/parquet/column/statistics/BinaryStatistics.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class BinaryStatistics extends Statistics<Binary> {
2424

2525
@Override
2626
public void updateStats(Binary value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(Binary value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
BinaryStatistics binaryStats = (BinaryStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(binaryStats.getMin(), binaryStats.getMax());
3939
} else {
4040
updateStats(binaryStats.getMin(), binaryStats.getMax());
@@ -60,9 +60,11 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if (this.hasNonNullValue())
6464
return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls());
65-
else
65+
else if (!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
67+
else
6668
return "no stats for this column";
6769
}
6870

@@ -100,4 +102,4 @@ public void setMinMax(Binary min, Binary max) {
100102
this.min = min;
101103
this.markAsNotEmpty();
102104
}
103-
}
105+
}

parquet-column/src/main/java/parquet/column/statistics/BooleanStatistics.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class BooleanStatistics extends Statistics<Boolean> {
2424

2525
@Override
2626
public void updateStats(boolean value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(boolean value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
BooleanStatistics boolStats = (BooleanStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(boolStats.getMin(), boolStats.getMax());
3939
} else {
4040
updateStats(boolStats.getMin(), boolStats.getMax());
@@ -60,9 +60,11 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if (this.hasNonNullValue())
6464
return String.format("min: %b, max: %b, num_nulls: %d", min, max, this.getNumNulls());
65-
else
65+
else if(!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
67+
else
6668
return "no stats for this column";
6769
}
6870

parquet-column/src/main/java/parquet/column/statistics/DoubleStatistics.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class DoubleStatistics extends Statistics<Double> {
2424

2525
@Override
2626
public void updateStats(double value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(double value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
DoubleStatistics doubleStats = (DoubleStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(doubleStats.getMin(), doubleStats.getMax());
3939
} else {
4040
updateStats(doubleStats.getMin(), doubleStats.getMax());
@@ -60,8 +60,10 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if(this.hasNonNullValue())
6464
return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls());
65+
else if (!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
6567
else
6668
return "no stats for this column";
6769
}
@@ -100,4 +102,4 @@ public void setMinMax(double min, double max) {
100102
this.min = min;
101103
this.markAsNotEmpty();
102104
}
103-
}
105+
}

parquet-column/src/main/java/parquet/column/statistics/FloatStatistics.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class FloatStatistics extends Statistics<Float> {
2424

2525
@Override
2626
public void updateStats(float value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(float value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
FloatStatistics floatStats = (FloatStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(floatStats.getMin(), floatStats.getMax());
3939
} else {
4040
updateStats(floatStats.getMin(), floatStats.getMax());
@@ -60,8 +60,10 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if (this.hasNonNullValue())
6464
return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls());
65+
else if (!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
6567
else
6668
return "no stats for this column";
6769
}

parquet-column/src/main/java/parquet/column/statistics/IntStatistics.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class IntStatistics extends Statistics<Integer> {
2424

2525
@Override
2626
public void updateStats(int value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(int value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
IntStatistics intStats = (IntStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(intStats.getMin(), intStats.getMax());
3939
} else {
4040
updateStats(intStats.getMin(), intStats.getMax());
@@ -60,8 +60,10 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if (this.hasNonNullValue())
6464
return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls());
65+
else if (!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max is not defined", this.getNumNulls());
6567
else
6668
return "no stats for this column";
6769
}

parquet-column/src/main/java/parquet/column/statistics/LongStatistics.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class LongStatistics extends Statistics<Long> {
2424

2525
@Override
2626
public void updateStats(long value) {
27-
if (this.isEmpty()) {
27+
if (!this.hasNonNullValue()) {
2828
initializeStats(value, value);
2929
} else {
3030
updateStats(value, value);
@@ -34,7 +34,7 @@ public void updateStats(long value) {
3434
@Override
3535
public void mergeStatisticsMinMax(Statistics stats) {
3636
LongStatistics longStats = (LongStatistics)stats;
37-
if (this.isEmpty()) {
37+
if (!this.hasNonNullValue()) {
3838
initializeStats(longStats.getMin(), longStats.getMax());
3939
} else {
4040
updateStats(longStats.getMin(), longStats.getMax());
@@ -60,8 +60,10 @@ public byte[] getMinBytes() {
6060

6161
@Override
6262
public String toString() {
63-
if(!this.isEmpty())
63+
if (this.hasNonNullValue())
6464
return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls());
65+
else if (!this.isEmpty())
66+
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
6567
else
6668
return "no stats for this column";
6769
}
@@ -100,4 +102,4 @@ public void setMinMax(long min, long max) {
100102
this.min = min;
101103
this.markAsNotEmpty();
102104
}
103-
}
105+
}

parquet-column/src/main/java/parquet/column/statistics/Statistics.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@
2828
*/
2929
public abstract class Statistics<T extends Comparable<T>> {
3030

31-
private boolean firstValueAccountedFor;
31+
private boolean hasNonNullValue;
3232
private long num_nulls;
3333

3434
public Statistics() {
35-
firstValueAccountedFor = false;
35+
hasNonNullValue = false;
3636
num_nulls = 0;
3737
}
3838

@@ -142,7 +142,10 @@ public void mergeStatistics(Statistics stats) {
142142

143143
if (this.getClass() == stats.getClass()) {
144144
incrementNumNulls(stats.getNumNulls());
145-
mergeStatisticsMinMax(stats);
145+
if (stats.hasNonNullValue()) {
146+
mergeStatisticsMinMax(stats);
147+
markAsNotEmpty();
148+
}
146149
} else {
147150
throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString());
148151
}
@@ -220,11 +223,22 @@ public void setNumNulls(long nulls) {
220223
* @return true if object is empty, false otherwise
221224
*/
222225
public boolean isEmpty() {
223-
return !firstValueAccountedFor;
226+
return !hasNonNullValue && num_nulls == 0;
224227
}
225228

229+
/**
230+
* Returns whether there have been non-null values added to this statistics
231+
*/
232+
public boolean hasNonNullValue() {
233+
return hasNonNullValue;
234+
}
235+
236+
/**
237+
* Sets the page/column as having a valid non-null value
238+
* kind of misnomer here
239+
*/
226240
protected void markAsNotEmpty() {
227-
firstValueAccountedFor = true;
241+
hasNonNullValue = true;
228242
}
229243
}
230244

0 commit comments

Comments
 (0)