@@ -43,17 +43,19 @@ object Partitioner {
4343 /**
4444 * Choose a partitioner to use for a cogroup-like operation between a number of RDDs.
4545 *
46- * If any of the RDDs already has a partitioner, and the number of partitions of the
47- * partitioner is either greater than or is less than and within a single order of
48- * magnitude of the max number of upstream partitions, choose that one.
46+ * If spark.default.parallelism is set, we'll use the value of SparkContext defaultParallelism
47+ * as the default partitions number, otherwise we'll use the max number of upstream partitions.
4948 *
50- * Otherwise, we use a default HashPartitioner. For the number of partitions, if
51- * spark.default.parallelism is set, then we'll use the value from SparkContext
52- * defaultParallelism, otherwise we'll use the max number of upstream partitions.
49+ * If any of the RDDs already has a partitioner, and the partitioner is an eligible one (with a
50+ * partitions number that is not less than the max number of upstream partitions by an order of
51+ * magnitude), or the number of partitions is larger than the default one, we'll choose the
52+ * existing partitioner.
5353 *
54- * Unless spark.default.parallelism is set, the number of partitions will be the
55- * same as the number of partitions in the largest upstream RDD, as this should
56- * be least likely to cause out-of-memory errors.
54+ * Otherwise, we'll use a new HashPartitioner with the default partitions number.
55+ *
56+ * Unless spark.default.parallelism is set, the number of partitions will be the same as the
57+ * number of partitions in the largest upstream RDD, as this should be least likely to cause
58+ * out-of-memory errors.
5759 *
5860 * We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD.
5961 */
@@ -67,31 +69,32 @@ object Partitioner {
6769 None
6870 }
6971
70- if (isEligiblePartitioner(hasMaxPartitioner, rdds)) {
72+ val defaultNumPartitions = if (rdd.context.conf.contains(" spark.default.parallelism" )) {
73+ rdd.context.defaultParallelism
74+ } else {
75+ rdds.map(_.partitions.length).max
76+ }
77+
78+ // If the existing max partitioner is an eligible one, or its partitions number is larger
79+ // than the default number of partitions, use the existing partitioner.
80+ if (hasMaxPartitioner.nonEmpty && (isEligiblePartitioner(hasMaxPartitioner.get, rdds) ||
81+ defaultNumPartitions < hasMaxPartitioner.get.getNumPartitions)) {
7182 hasMaxPartitioner.get.partitioner.get
7283 } else {
73- if (rdd.context.conf.contains(" spark.default.parallelism" )) {
74- new HashPartitioner (rdd.context.defaultParallelism)
75- } else {
76- new HashPartitioner (rdds.map(_.partitions.length).max)
77- }
84+ new HashPartitioner (defaultNumPartitions)
7885 }
7986 }
8087
8188 /**
82- * Returns true if the number of partitions of the RDD is either greater
83- * than or is less than and within a single order of magnitude of the
84- * max number of upstream partitions;
85- * otherwise, returns false
89+ * Returns true if the number of partitions of the RDD is either greater than or is less than and
90+ * within a single order of magnitude of the max number of upstream partitions, otherwise returns
91+ * false.
8692 */
8793 private def isEligiblePartitioner (
88- hasMaxPartitioner : Option [ RDD [_] ],
94+ hasMaxPartitioner : RDD [_],
8995 rdds : Seq [RDD [_]]): Boolean = {
90- if (hasMaxPartitioner.isEmpty) {
91- return false
92- }
9396 val maxPartitions = rdds.map(_.partitions.length).max
94- log10(maxPartitions) - log10(hasMaxPartitioner.get. getNumPartitions) < 1
97+ log10(maxPartitions) - log10(hasMaxPartitioner.getNumPartitions) < 1
9598 }
9699}
97100
0 commit comments