
Commit ed154ce

Hadoop RDD needs to sort the input partitions if we are going to assume a partitioner
1 parent b828f01 commit ed154ce
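
Why the sort matters: when a pair RDD is saved with one of the Hadoop save actions, each partition becomes one part-NNNNN file, and on re-read each input split becomes one HadoopPartition. If getSplits returns splits in arbitrary order, partition 0 of the re-read RDD need not hold the keys the original partitioner assigned to partition 0, so the partitioner cannot be assumed. A minimal sketch of the round trip (not from the commit; it assumes Spark 1.3+ pair-RDD implicits, a local master, and a made-up /tmp path):

    import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

    object PartitionerRoundTripSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("sketch"))

        // Writing a hash-partitioned RDD produces part-00000 ... part-00003,
        // one file per partition.
        sc.parallelize(1 to 100)
          .map(i => (i, i))
          .partitionBy(new HashPartitioner(4))
          .saveAsSequenceFile("/tmp/partitioned")

        // Re-reading creates one partition per input split. Only if the splits
        // are sorted by file name does partition i again hold exactly the keys
        // that HashPartitioner(4) routes to i, the property that would let
        // Spark reuse the partitioner without another shuffle.
        val reread = sc.sequenceFile[Int, Int]("/tmp/partitioned")
        println(reread.partitions.length)
        sc.stop()
      }
    }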

2 files changed: +55 −1


core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 17 additions & 1 deletion
@@ -200,7 +200,10 @@ class HadoopRDD[K, V](
     if (inputFormat.isInstanceOf[Configurable]) {
       inputFormat.asInstanceOf[Configurable].setConf(jobConf)
     }
-    val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
+    // We have to sort the splits here so that part-00000 goes to partition 0, etc. This is
+    // so we can use the same partitioner after we save an RDD to HDFS and then read it back.
+    // SPARK-1061
+    val inputSplits = inputFormat.getSplits(jobConf, minPartitions).sorted(SplitOrdering)
     val array = new Array[Partition](inputSplits.size)
     for (i <- 0 until inputSplits.size) {
       array(i) = new HadoopPartition(id, i, inputSplits(i))
@@ -416,3 +419,16 @@ private[spark] object HadoopRDD extends Logging {
     out.seq
   }
 }
+
+private[spark] object SplitOrdering extends Ordering[InputSplit] {
+  def compare(x: InputSplit, y: InputSplit): Int = {
+    (x, y) match {
+      case (xSplit: FileSplit, ySplit: FileSplit) =>
+        fileSplitOrdering.compare(xSplit, ySplit)
+      case _ => 1
+    }
+  }
+
+  val fileSplitOrdering: Ordering[FileSplit] = Ordering.by { fileSplit =>
+    (fileSplit.getPath.toString, fileSplit.getStart)
+  }
+}
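
fileSplitOrdering compares by the tuple (path string, start offset): files sort lexicographically by name, which lines part files up correctly because Hadoop zero-pads them (part-00009 sorts before part-00010), and blocks within the same file sort by start offset. Note the element-wise pattern (xSplit: FileSplit, ySplit: FileSplit): a type pattern on the tuple itself would be erased at runtime and match any pair of splits. Non-FileSplit pairs fall through to case _ => 1, so the ordering is only meaningful for file-based splits. A quick standalone check of the tie-breaking (made-up paths; the snippet must live under org.apache.spark, since SplitOrdering is private[spark]):

    import org.apache.hadoop.fs.Path
    import org.apache.hadoop.mapred.FileSplit

    val a = new FileSplit(new Path("/data/part-00000"), 0L, 64L, Array[String]())
    val b = new FileSplit(new Path("/data/part-00001"), 0L, 64L, Array[String]())
    val c = new FileSplit(new Path("/data/part-00001"), 64L, 64L, Array[String]())

    assert(SplitOrdering.compare(a, b) < 0) // earlier file name sorts first
    assert(SplitOrdering.compare(b, c) < 0) // same file: lower start offset first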
core/src/test/scala/org/apache/spark/rdd/HadoopRDDSuite.scala

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.rdd
+
+import java.util
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapred.FileSplit
+import org.scalatest.{FunSuite, Matchers}
+
+class HadoopRDDSuite extends FunSuite with Matchers {
+  test("file split ordering") {
+    val splits = (0 until 10).map { idx =>
+      new FileSplit(new Path("/foo/bar/part-0000" + idx), 0L, 0L, Array[String]()) }
+
+    val javaShuffledSplits = new util.ArrayList[FileSplit]()
+    splits.foreach { s => javaShuffledSplits.add(s) }
+    java.util.Collections.shuffle(javaShuffledSplits)
+    val scalaShuffledSplits = javaShuffledSplits.asScala
+    scalaShuffledSplits.sorted(SplitOrdering) should be (splits)
+  }
+}
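
To run just this suite, something like the following should work with the sbt build Spark used at the time (the exact invocation is an assumption, not part of the commit):

    sbt/sbt "test-only org.apache.spark.rdd.HadoopRDDSuite"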
