Skip to content

Commit c381968

Browse files
sircodesalotOfTheRound and rdblue
authored and committed
PARQUET-355: Add Statistics Test for Parquet Columns
In response to PARQUET-251, this adds an integration test that generates random values and compares the statistics against the values read back from a Parquet file. There are two tool classes, `DataGenerationContext` and `RandomValueGenerators`, which are located in the same package as the unit test. I'm sure there is a better place to put these, but I leave that to your discretion. Thanks, Reuben. Author: Reuben Kuhnert <[email protected]> Author: Ryan Blue <[email protected]> Closes apache#255 from sircodesalotOfTheRound/stats-validation and squashes the following commits: 680e96a [Reuben Kuhnert] Merge pull request apache#1 from rdblue/PARQUET-355-stats-validation-tests 9f0033f [Ryan Blue] PARQUET-355: Use ColumnReaderImpl. 7d0b4fe [Reuben Kuhnert] PARQUET-355: Add Statistics Validation Test
1 parent 0637e2f commit c381968

File tree

3 files changed

+723
-3
lines changed

3 files changed

+723
-3
lines changed

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderImpl.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
* @author Julien Le Dem
5858
*
5959
*/
60-
class ColumnReaderImpl implements ColumnReader {
60+
public class ColumnReaderImpl implements ColumnReader {
6161
private static final Log LOG = Log.getLog(ColumnReaderImpl.class);
6262

6363
/**
@@ -149,8 +149,8 @@ public double getDouble() {
149149
private int dictionaryId;
150150

151151
private long endOfPageValueCount;
152-
private int readValues;
153-
private int pageValueCount;
152+
private int readValues = 0;
153+
private int pageValueCount = 0;
154154

155155
private final PrimitiveConverter converter;
156156
private Binding binding;
Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.parquet.statistics;
21+
22+
import org.apache.parquet.io.api.Binary;
23+
import java.math.BigInteger;
24+
import java.util.Random;
25+
26+
public class RandomValues {
27+
private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890";
28+
29+
private static abstract class RandomValueGenerator<T extends Comparable<T>> {
30+
private final Random random;
31+
32+
protected RandomValueGenerator(long seed) {
33+
this.random = new Random(seed);
34+
}
35+
36+
public boolean shouldGenerateNull() {
37+
return (random.nextInt(10) == 0);
38+
}
39+
40+
public int randomInt() { return randomInt(Integer.MAX_VALUE - 1); }
41+
public int randomInt(int maximum) {
42+
// Maximum may be a random number (which may be negative).
43+
return random.nextInt(Math.abs(maximum) + 1);
44+
}
45+
46+
public long randomLong() { return random.nextLong(); }
47+
public long randomLong(long maximum) { return randomLong() % maximum; }
48+
49+
public float randomFloat() { return random.nextFloat(); }
50+
public float randomFloat(float maximum) { return random.nextFloat() % maximum; }
51+
52+
public double randomDouble() { return random.nextDouble(); }
53+
public double randomDouble(double maximum) { return random.nextDouble() % maximum; }
54+
55+
public BigInteger randomInt96() {
56+
return new BigInteger(95, random);
57+
}
58+
59+
public BigInteger randomInt96(BigInteger maximum) {
60+
BigInteger result;
61+
while ((result = randomInt96()).compareTo(maximum) > 0);
62+
return result;
63+
}
64+
65+
public char randomLetter() {
66+
return ALPHABET.charAt(randomInt() % ALPHABET.length());
67+
}
68+
69+
public String randomString(int maxLength) {
70+
return randomFixedLengthString(randomInt(maxLength));
71+
}
72+
73+
public String randomFixedLengthString(int length) {
74+
StringBuilder builder = new StringBuilder();
75+
for (int index = 0; index < length; index++) {
76+
builder.append(randomLetter());
77+
}
78+
79+
return builder.toString();
80+
}
81+
82+
protected abstract T nextValue();
83+
}
84+
85+
private static abstract class RandomBinaryBase<T extends Comparable<T>> extends RandomValueGenerator<T> {
86+
protected final int bufferLength;
87+
protected final byte[] buffer;
88+
89+
public RandomBinaryBase(long seed, int bufferLength) {
90+
super(seed);
91+
92+
this.bufferLength = bufferLength;
93+
this.buffer = new byte[bufferLength];
94+
}
95+
96+
public abstract Binary nextBinaryValue();
97+
98+
public Binary asReusedBinary(byte[] data) {
99+
int length = Math.min(data.length, bufferLength);
100+
System.arraycopy(data, 0, buffer, 0, length);
101+
return Binary.fromReusedByteArray(data, 0, length);
102+
}
103+
}
104+
105+
public static class IntGenerator extends RandomValueGenerator<Integer> {
106+
private final RandomRange<Integer> randomRange = new RandomRange<Integer>(randomInt(), randomInt());
107+
private final int minimum = randomRange.minimum();
108+
private final int maximum = randomRange.maximum();
109+
private final int range = (maximum - minimum);
110+
111+
public IntGenerator(long seed) {
112+
super(seed);
113+
}
114+
115+
@Override
116+
protected Integer nextValue() {
117+
return (minimum + randomInt(range));
118+
}
119+
}
120+
121+
public static class LongGenerator extends RandomValueGenerator<Long> {
122+
private final RandomRange<Long> randomRange = new RandomRange<Long>(randomLong(), randomLong());
123+
private final long minimum = randomRange.minimum();
124+
private final long maximum = randomRange.maximum();
125+
private final long range = (maximum - minimum);
126+
127+
public LongGenerator(long seed) {
128+
super(seed);
129+
}
130+
131+
@Override
132+
protected Long nextValue() {
133+
return (minimum + randomLong(range));
134+
}
135+
}
136+
137+
public static class Int96Generator extends RandomBinaryBase<BigInteger> {
138+
private final RandomRange<BigInteger> randomRange = new RandomRange<BigInteger>(randomInt96(), randomInt96());
139+
private final BigInteger minimum = randomRange.minimum();
140+
private final BigInteger maximum = randomRange.maximum();
141+
private final BigInteger range = maximum.subtract(minimum);
142+
143+
private static final int INT_96_LENGTH = 12;
144+
145+
public Int96Generator(long seed) {
146+
super(seed, INT_96_LENGTH);
147+
}
148+
149+
@Override
150+
protected BigInteger nextValue() {
151+
return (minimum.add(randomInt96(range)));
152+
}
153+
154+
@Override
155+
public Binary nextBinaryValue() {
156+
return asReusedBinary(nextValue().toByteArray());
157+
}
158+
}
159+
160+
public static class FloatGenerator extends RandomValueGenerator<Float> {
161+
private final RandomRange<Float> randomRange = new RandomRange<Float>(randomFloat(), randomFloat());
162+
private final float minimum = randomRange.minimum();
163+
private final float maximum = randomRange.maximum();
164+
private final float range = (maximum - minimum);
165+
166+
public FloatGenerator(long seed) {
167+
super(seed);
168+
}
169+
170+
@Override
171+
protected Float nextValue() {
172+
return (minimum + randomFloat(range));
173+
}
174+
}
175+
176+
public static class DoubleGenerator extends RandomValueGenerator<Double> {
177+
private final RandomRange<Double> randomRange = new RandomRange<Double>(randomDouble(), randomDouble());
178+
private final double minimum = randomRange.minimum();
179+
private final double maximum = randomRange.maximum();
180+
private final double range = (maximum - minimum);
181+
182+
public DoubleGenerator(long seed) {
183+
super(seed);
184+
}
185+
186+
@Override
187+
protected Double nextValue() {
188+
return (minimum + randomDouble(range));
189+
}
190+
}
191+
192+
public static class StringGenerator extends RandomBinaryBase<String> {
193+
private static final int MAX_STRING_LENGTH = 16;
194+
public StringGenerator(long seed) {
195+
super(seed, MAX_STRING_LENGTH);
196+
}
197+
198+
@Override
199+
protected String nextValue() {
200+
int stringLength = randomInt(15) + 1;
201+
return randomString(stringLength);
202+
}
203+
204+
@Override
205+
public Binary nextBinaryValue() {
206+
return asReusedBinary(nextValue().getBytes());
207+
}
208+
}
209+
210+
public static class BinaryGenerator extends RandomBinaryBase<Binary> {
211+
private static final int MAX_STRING_LENGTH = 16;
212+
public BinaryGenerator(long seed) {
213+
super(seed, MAX_STRING_LENGTH);
214+
}
215+
216+
@Override
217+
protected Binary nextValue() {
218+
// use a random length, but ensure it is at least a few bytes
219+
int length = 5 + randomInt(buffer.length - 5);
220+
for (int index = 0; index < length; index++) {
221+
buffer[index] = (byte) randomInt();
222+
}
223+
224+
return Binary.fromReusedByteArray(buffer, 0, length);
225+
}
226+
227+
@Override
228+
public Binary nextBinaryValue() {
229+
return nextValue();
230+
}
231+
}
232+
233+
public static class FixedGenerator extends RandomBinaryBase<Binary> {
234+
public FixedGenerator(long seed, int length) {
235+
super(seed, length);
236+
}
237+
238+
@Override
239+
protected Binary nextValue() {
240+
for (int index = 0; index < buffer.length; index++) {
241+
buffer[index] = (byte) randomInt();
242+
}
243+
244+
return Binary.fromReusedByteArray(buffer);
245+
}
246+
247+
@Override
248+
public Binary nextBinaryValue() {
249+
return nextValue();
250+
}
251+
}
252+
253+
private static class RandomRange<T extends Comparable<T>> {
254+
private T minimum;
255+
private T maximum;
256+
257+
public RandomRange(T lhs, T rhs) {
258+
this.minimum = lhs;
259+
this.maximum = rhs;
260+
261+
if (minimum.compareTo(rhs) > 0) {
262+
T temporary = minimum;
263+
minimum = maximum;
264+
maximum = temporary;
265+
}
266+
}
267+
268+
public T minimum() { return this.minimum; }
269+
public T maximum() { return this.maximum; }
270+
}
271+
}

0 commit comments

Comments
 (0)