{
+
+ private java.lang.CharSequence a;
+ private java.lang.Integer b;
+
+ /** Creates a new Builder bound to the AvroTestEntity schema; no field values are assigned here. */
+ private Builder() {
+ super(org.apache.spark.shuffle.parquet.avro.AvroTestEntity.SCHEMA$);
+ }
+
+ /** Creates a Builder by copying an existing Builder; each field value accepted by isValidValue is deep-copied. */
+ private Builder(org.apache.spark.shuffle.parquet.avro.AvroTestEntity.Builder other) {
+ super(other);
+ if (isValidValue(fields()[0], other.a)) {
+ this.a = data().deepCopy(fields()[0].schema(), other.a);
+ fieldSetFlags()[0] = true; // NOTE(review): flagged as set whenever the value is valid, without reading other's own flag - confirm intended
+ }
+ if (isValidValue(fields()[1], other.b)) {
+ this.b = data().deepCopy(fields()[1].schema(), other.b);
+ fieldSetFlags()[1] = true; // NOTE(review): same as for 'a' - other's set-flag for 'b' is not consulted here
+ }
+ }
+
+ /** Creates a Builder from an existing AvroTestEntity; each field value accepted by isValidValue is deep-copied and flagged as set. */
+ private Builder(org.apache.spark.shuffle.parquet.avro.AvroTestEntity other) {
+ super(org.apache.spark.shuffle.parquet.avro.AvroTestEntity.SCHEMA$);
+ if (isValidValue(fields()[0], other.a)) {
+ this.a = data().deepCopy(fields()[0].schema(), other.a); // deep copy, so the builder does not alias the record's value
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.b)) {
+ this.b = data().deepCopy(fields()[1].schema(), other.b); // deep copy, so the builder does not alias the record's value
+ fieldSetFlags()[1] = true;
+ }
+ }
+
+ /**
+ * Returns the value currently held by this builder for the 'a' field (null after clearA()).
+ */
+ public java.lang.CharSequence getA() {
+ return a;
+ }
+
+ /**
+  * Validates the value against the 'a' field, stores it, flags the field as set and returns this Builder.
+  * @param value the new value for the 'a' field.
+  */
+ public org.apache.spark.shuffle.parquet.avro.AvroTestEntity.Builder setA(java.lang.CharSequence value) {
+   validate(fields()[0], value);
+   fieldSetFlags()[0] = true;
+   this.a = value;
+   return this;
+ }
+
+ /**
+ * Reports the set-flag of the 'a' field; the flag is raised by setA and lowered by clearA.
+ */
+ public boolean hasA() {
+ return fieldSetFlags()[0];
+ }
+
+
+ /**
+  * Resets the 'a' field: lowers its set-flag, drops the stored value and returns this Builder.
+  */
+ public org.apache.spark.shuffle.parquet.avro.AvroTestEntity.Builder clearA() {
+   fieldSetFlags()[0] = false;
+   this.a = null;
+   return this;
+ }
+
+ /**
+ * Returns the value currently held by this builder for the 'b' field (null after clearB()).
+ */
+ public java.lang.Integer getB() {
+ return b;
+ }
+
+ /**
+  * Validates the value against the 'b' field, stores it, flags the field as set and returns this Builder.
+  * @param value the new value for the 'b' field.
+  */
+ public org.apache.spark.shuffle.parquet.avro.AvroTestEntity.Builder setB(java.lang.Integer value) {
+   validate(fields()[1], value);
+   fieldSetFlags()[1] = true;
+   this.b = value;
+   return this;
+ }
+
+ /**
+ * Reports the set-flag of the 'b' field; the flag is raised by setB and lowered by clearB.
+ */
+ public boolean hasB() {
+ return fieldSetFlags()[1];
+ }
+
+
+ /**
+  * Resets the 'b' field: lowers its set-flag, drops the stored value and returns this Builder.
+  */
+ public org.apache.spark.shuffle.parquet.avro.AvroTestEntity.Builder clearB() {
+   fieldSetFlags()[1] = false;
+   this.b = null;
+   return this;
+ }
+
+ @Override
+ public AvroTestEntity build() {
+   try { // any Exception raised while assembling the record is rethrown wrapped in AvroRuntimeException
+     AvroTestEntity entity = new AvroTestEntity();
+     entity.a = !fieldSetFlags()[0] ? (java.lang.CharSequence) defaultValue(fields()[0]) : this.a;
+     entity.b = !fieldSetFlags()[1] ? (java.lang.Integer) defaultValue(fields()[1]) : this.b;
+     return entity;
+   } catch (Exception cause) {
+     throw new org.apache.avro.AvroRuntimeException(cause);
+   }
+ }
+ }
+
+ private static final org.apache.avro.io.DatumWriter
+ WRITER$ = new org.apache.avro.specific.SpecificDatumWriter(SCHEMA$); // raw-typed writer built from this record's SCHEMA$
+
+ private static final org.apache.avro.io.DatumReader
+ READER$ = new org.apache.avro.specific.SpecificDatumReader(SCHEMA$); // raw-typed reader built from the same SCHEMA$
+
+}
diff --git a/core/src/test/resources/org/apache/spark/shuffle/parquet/avro/tests.avdl b/core/src/test/resources/org/apache/spark/shuffle/parquet/avro/tests.avdl
new file mode 100644
index 0000000000000..e04526774d20f
--- /dev/null
+++ b/core/src/test/resources/org/apache/spark/shuffle/parquet/avro/tests.avdl
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// avrotools idl tests.avdl > tests.json
+// avrotools compile protocol tests.json core/src/test/java/
+
+@namespace("org.apache.spark.shuffle.parquet.avro")
+protocol AvroParquetTest {
+// Test record: both fields are unions whose first branch is null, so either may be left null.
+record AvroTestEntity {
+ union {null, string} a;
+ union {null, int} b;
+}
+
+}
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index d91b799ecfc08..491a874f1cb8e 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -296,7 +296,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
assert(metrics.recordsRead === numRecords)
assert(metrics.recordsWritten === numRecords)
- assert(metrics.bytesWritten === metrics.byresRead)
+ assert(metrics.bytesWritten === metrics.bytesRead)
assert(metrics.bytesWritten > 0)
}
@@ -312,7 +312,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
assert(metrics.recordsRead === numRecords)
assert(metrics.recordsWritten === numRecords)
- assert(metrics.bytesWritten === metrics.byresRead)
+ assert(metrics.bytesWritten === metrics.bytesRead)
assert(metrics.bytesWritten > 0)
}
}
@@ -333,7 +333,7 @@ object ShuffleSuite {
recordsWritten: Long,
recordsRead: Long,
bytesWritten: Long,
- byresRead: Long)
+ bytesRead: Long)
def runAndReturnMetrics(sc: SparkContext)(job: => Unit): AggregatedShuffleMetrics = {
@volatile var recordsWritten: Long = 0
diff --git a/core/src/test/scala/org/apache/spark/shuffle/parquet/ParquetShuffleSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/parquet/ParquetShuffleSuite.scala
new file mode 100644
index 0000000000000..2036349437267
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/shuffle/parquet/ParquetShuffleSuite.scala
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.parquet
+
+import org.apache.spark.serializer.KryoSerializer
+import org.apache.spark.shuffle.parquet.avro.AvroTestEntity
+import org.apache.spark._
+
+class ParquetShuffleSuite extends SparkFunSuite with LocalSparkContext {
+  // Parquet-shuffle conf using Kryo; withFallback also registers "sort" as fallback manager.
+  def newConf(withFallback: Boolean = false): SparkConf = {
+    val sparkConf = new SparkConf()
+    ParquetShuffleConfig.enableParquetShuffle(sparkConf)
+    if (withFallback) {
+      ParquetShuffleConfig.setFallbackShuffleManager(sparkConf, "sort")
+    }
+    sparkConf.set("spark.serializer", classOf[KryoSerializer].getName)
+  }
+
+  val fallbackConf = newConf(withFallback = true)
+  val noFallbackConf = newConf(withFallback = false)
+
+  test("fallback shuffle without aggregation") {
+    sc = new SparkContext("local", "test", fallbackConf)
+    val recordCount = 10000
+    // groupByKey over (Int, 1) pairs spread across 4 partitions.
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(1 to recordCount, 4)
+        .map(k => (k, 1))
+        .groupByKey()
+        .collect()
+    }
+
+    assert(metrics.recordsRead === recordCount)
+    assert(metrics.recordsWritten === recordCount)
+    assert(metrics.bytesWritten === metrics.bytesRead)
+    assert(metrics.bytesWritten > 0)
+  }
+
+  test("fallback for shuffle with aggregation") {
+    sc = new SparkContext("local", "test", fallbackConf)
+    val recordCount = 10000
+    // Each key is emitted 100 times before countByKey.
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(1 to recordCount, 4)
+        .flatMap(k => Array.fill(100)((k, 1)))
+        .countByKey()
+    }
+
+    assert(metrics.recordsRead === recordCount)
+    assert(metrics.recordsWritten === recordCount)
+    assert(metrics.bytesWritten === metrics.bytesRead)
+    assert(metrics.bytesWritten > 0)
+  }
+
+  test("shuffle without aggregation") {
+    sc = new SparkContext("local", "test", noFallbackConf)
+    val recordCount = 10000
+    val records = (1 to recordCount).map { i =>
+      val entity = AvroTestEntity.newBuilder().setA("test").setB(i).build()
+      (entity, if (i % 10 == 0) null else entity) // every tenth value is null
+    }
+
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(records, 4)
+        .groupByKey()
+        .collect()
+    }
+
+    assert(metrics.recordsRead === recordCount)
+    assert(metrics.recordsWritten === recordCount)
+    assert(metrics.bytesWritten === metrics.bytesRead)
+    assert(metrics.bytesWritten > 0)
+  }
+
+  test("shuffle with aggregation") {
+    sc = new SparkContext("local", "test", noFallbackConf)
+    val recordCount = 10000
+    val records = (1 to recordCount).map { i =>
+      val entity = AvroTestEntity.newBuilder().setA("agg").setB(i).build()
+      (entity, if (i % 10 == 0) null else entity) // every tenth value is null
+    }
+
+    val metrics = ShuffleSuite.runAndReturnMetrics(sc) {
+      sc.parallelize(records, 4)
+        .reduceByKey((_, _) => AvroTestEntity.newBuilder().setA("agg").build())
+        .collect()
+    }
+
+    assert(metrics.recordsRead === recordCount)
+    assert(metrics.recordsWritten === recordCount)
+    assert(metrics.bytesWritten === metrics.bytesRead)
+    assert(metrics.bytesWritten > 0)
+  }
+
+}
diff --git a/docs/configuration.md b/docs/configuration.md
index 1a701f18881fe..9d29ac5dba21e 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -447,12 +447,12 @@ Apart from these, the following properties are also available, and may be useful
spark.shuffle.manager |
sort |
- Implementation to use for shuffling data. There are three implementations available:
- sort, hash and the new (1.5+) tungsten-sort.
- Sort-based shuffle is more memory-efficient and is the default option starting in 1.2.
- Tungsten-sort is similar to the sort based shuffle, with a direct binary cache-friendly
- implementation with a fall back to regular sort based shuffle if its requirements are not
- met.
+ Implementation to use for shuffling data. There are four implementations available:
+ sort, hash, the new (1.5+) tungsten-sort, and
+ parquet. Sort-based shuffle is more memory-efficient and is the default
+ option starting in 1.2. Tungsten-sort is similar to the sort-based shuffle, but uses a direct
+ binary, cache-friendly implementation and falls back to the regular sort-based shuffle if
+ its requirements are not met.
|
diff --git a/pom.xml b/pom.xml
index 88ebceca769e9..afe5b56089e19 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1601,7 +1601,7 @@
org.apache.parquet
parquet-avro
${parquet.version}
- ${parquet.test.deps.scope}
+ ${parquet.deps.scope}
com.twitter