
Commit ccdc0e6

improve comments and tests.

1 parent: 62088ca

3 files changed: +13, −12 lines


core/src/main/scala/org/apache/spark/Partitioner.scala

4 additions & 4 deletions

@@ -46,10 +46,10 @@ object Partitioner {
  * If spark.default.parallelism is set, we'll use the value of SparkContext defaultParallelism
  * as the default partitions number, otherwise we'll use the max number of upstream partitions.
  *
- * If any of the RDDs already has a partitioner, and the partitioner is an eligible one (with a
- * partitions number that is not less than the max number of upstream partitions by an order of
- * magnitude), or the number of partitions is larger than the default one, we'll choose the
- * exsiting partitioner.
+ * When available, we choose the partitioner from the RDD with the maximum number of partitions.
+ * If this partitioner is eligible (its number of partitions is within one order of magnitude of
+ * the maximum number of partitions across the RDDs), or its number of partitions is higher than
+ * the default partitions number, we use this partitioner.
  *
  * Otherwise, we'll use a new HashPartitioner with the default partitions number.
  *
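The updated comment describes a concrete selection rule. Below is a minimal, self-contained Scala sketch of that rule for illustration only; it is not Spark's implementation, and the names DefaultPartitionerSketch, isEligible, and choosePartitions, as well as the log10-based reading of "within one order of magnitude", are assumptions drawn from the comment's wording.

import scala.math.log10

object DefaultPartitionerSketch {
  // "Eligible": the candidate's partition count is within one order of
  // magnitude of the largest upstream partition count (assumed reading).
  def isEligible(candidatePartitions: Int, maxUpstreamPartitions: Int): Boolean =
    log10(maxUpstreamPartitions) - log10(candidatePartitions) < 1

  // Partition count the chosen partitioner would end up with.
  def choosePartitions(
      existingPartitions: Option[Int], // largest existing partitioner, if any
      maxUpstreamPartitions: Int,      // max partitions across the upstream RDDs
      defaultParallelism: Option[Int]  // spark.default.parallelism, if set
    ): Int = {
    val default = defaultParallelism.getOrElse(maxUpstreamPartitions)
    existingPartitions match {
      // Keep the existing partitioner when it is eligible or beats the default.
      case Some(n) if isEligible(n, maxUpstreamPartitions) || n > default => n
      // Otherwise fall back to a new HashPartitioner(default).
      case _ => default
    }
  }

  def main(args: Array[String]): Unit = {
    // 100 partitions vs. 150 upstream: within one order of magnitude, keep it.
    println(choosePartitions(Some(100), 150, Some(4))) // 100
    // 3 partitions vs. 150 upstream: not eligible and below the default of 4.
    println(choosePartitions(Some(3), 150, Some(4)))   // 4
  }
}

With these example inputs the sketch reproduces the outcomes the tests below assert: partitioner3 keeps the 100-partition partitioner, while partitioner7 falls back to the default parallelism.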

core/src/test/scala/org/apache/spark/PartitioningSuite.scala

7 additions & 6 deletions

@@ -262,14 +262,11 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
 
   test("defaultPartitioner") {
     val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150)
-    val rdd2 = sc
-      .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
+    val rdd2 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
       .partitionBy(new HashPartitioner(10))
-    val rdd3 = sc
-      .parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14)))
+    val rdd3 = sc.parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14)))
       .partitionBy(new HashPartitioner(100))
-    val rdd4 = sc
-      .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
+    val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
       .partitionBy(new HashPartitioner(9))
     val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11)
 
@@ -299,20 +296,24 @@
     val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
       .partitionBy(new HashPartitioner(9))
     val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11)
+    val rdd6 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4)))
+      .partitionBy(new HashPartitioner(3))
 
     val partitioner1 = Partitioner.defaultPartitioner(rdd1, rdd2)
     val partitioner2 = Partitioner.defaultPartitioner(rdd2, rdd3)
     val partitioner3 = Partitioner.defaultPartitioner(rdd3, rdd1)
     val partitioner4 = Partitioner.defaultPartitioner(rdd1, rdd2, rdd3)
     val partitioner5 = Partitioner.defaultPartitioner(rdd4, rdd5)
     val partitioner6 = Partitioner.defaultPartitioner(rdd5, rdd5)
+    val partitioner7 = Partitioner.defaultPartitioner(rdd1, rdd6)
 
     assert(partitioner1.numPartitions == rdd2.getNumPartitions)
     assert(partitioner2.numPartitions == rdd3.getNumPartitions)
     assert(partitioner3.numPartitions == rdd3.getNumPartitions)
     assert(partitioner4.numPartitions == rdd3.getNumPartitions)
     assert(partitioner5.numPartitions == rdd4.getNumPartitions)
     assert(partitioner6.numPartitions == sc.defaultParallelism)
+    assert(partitioner7.numPartitions == sc.defaultParallelism)
   } finally {
     sc.conf.remove("spark.default.parallelism")
   }
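The new rdd6/partitioner7 case exercises the fallback path: rdd6's HashPartitioner has only 3 partitions while rdd1 has 150, which is more than one order of magnitude apart, and 3 is also not above the configured default parallelism (the exact configured value is set earlier in this test and is not shown in this hunk), so defaultPartitioner falls back to a partitioner with sc.defaultParallelism partitions. A quick check of the arithmetic, assuming the log10 reading of the eligibility rule sketched above:

import scala.math.log10

// rdd6 vs. rdd1: 3 partitions against 150 upstream partitions.
println(log10(150) - log10(3))   // ~1.70, not < 1 => not eligible
// rdd3 vs. rdd1: 100 partitions against 150 is well within one magnitude.
println(log10(150) - log10(100)) // ~0.18, < 1 => eligible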

core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala

2 additions & 2 deletions

@@ -358,8 +358,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
     }
   }
 
-  test("cogroup between multiple RDD when defaultParallelism is set with huge number of " +
-    "partitions from upstream RDDs") {
+  test("cogroup between multiple RDD when defaultParallelism is set; with huge number of " +
+    "partitions in upstream RDDs") {
     assert(!sc.conf.contains("spark.default.parallelism"))
     try {
       sc.conf.set("spark.default.parallelism", "4")
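For context, here is a hedged sketch of the scenario such a test exercises, derived from the rule in the Partitioner.scala comment above; the RDD sizes and the final assertion are illustrative assumptions, not a copy of the actual test body beyond this hunk.

import org.apache.spark.HashPartitioner

// Inside a suite with a SparkContext `sc` and spark.default.parallelism = 4:
val bigRdd   = sc.parallelize((1 to 1000).map(x => (x, x)), 1000) // no partitioner
val smallRdd = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1)))
  .partitionBy(new HashPartitioner(10))
val joined = bigRdd.cogroup(smallRdd)
// The 10-partition partitioner is not within an order of magnitude of 1000,
// but 10 > defaultParallelism (4), so under the documented rule the existing
// partitioner should still win.
assert(joined.getNumPartitions == smallRdd.getNumPartitions)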
