@@ -57,6 +57,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
5757 with SparkHadoopMapReduceUtil
5858 with Serializable
5959{
60+
6061 /**
6162 * Generic function to combine the elements for each key using a custom set of aggregation
6263 * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C
@@ -70,12 +71,13 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
7071 * In addition, users can control the partitioning of the output RDD, and whether to perform
7172 * map-side aggregation (if a mapper can produce multiple items with the same key).
7273 */
73- def combineByKey [C ](createCombiner : V => C ,
74+ def combineByKeyWithClassTag [C ](
75+ createCombiner : V => C ,
7476 mergeValue : (C , V ) => C ,
7577 mergeCombiners : (C , C ) => C ,
7678 partitioner : Partitioner ,
7779 mapSideCombine : Boolean = true ,
78- serializer : Serializer = null ): RDD [(K , C )] = self.withScope {
80+ serializer : Serializer = null )( implicit ct : ClassTag [ C ]) : RDD [(K , C )] = self.withScope {
7981 require(mergeCombiners != null , " mergeCombiners must be defined" ) // required as of Spark 0.9.0
8082 if (keyClass.isArray) {
8183 if (mapSideCombine) {
@@ -103,13 +105,48 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
103105 }
104106
105107 /**
106- * Simplified version of combineByKey that hash-partitions the output RDD.
108+ * This method is here for backward compatibility. It
109+ * does not provide combiner classtag information to
110+ * the shuffle.
111+ *
112+ * @see [[combineByKeyWithClassTag ]]
113+ */
114+ def combineByKey [C ](
115+ createCombiner : V => C ,
116+ mergeValue : (C , V ) => C ,
117+ mergeCombiners : (C , C ) => C ,
118+ partitioner : Partitioner ,
119+ mapSideCombine : Boolean = true ,
120+ serializer : Serializer = null ): RDD [(K , C )] = self.withScope {
121+ combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, // every argument is forwarded unchanged
122+ partitioner, mapSideCombine, serializer)(null ) // explicit null fills the implicit ClassTag[C] slot: no combiner class info is supplied
123+ }
124+
125+ /**
126+ * This method is here for backward compatibility. It
127+ * does not provide combiner classtag information to
128+ * the shuffle.
129+ *
130+ * @see [[combineByKeyWithClassTag ]]
107131 */
108- def combineByKey [C ](createCombiner : V => C ,
132+ def combineByKey [C ](
133+ createCombiner : V => C ,
109134 mergeValue : (C , V ) => C ,
110135 mergeCombiners : (C , C ) => C ,
111136 numPartitions : Int ): RDD [(K , C )] = self.withScope {
112- combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner (numPartitions))
137+ combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, numPartitions)(null ) // the numPartitions overload builds the HashPartitioner; null fills the implicit ClassTag[C] slot
138+ }
139+
140+ /**
141+ * Simplified version of combineByKeyWithClassTag that hash-partitions the output RDD.
142+ */
143+ def combineByKeyWithClassTag [C ](
144+ createCombiner : V => C ,
145+ mergeValue : (C , V ) => C ,
146+ mergeCombiners : (C , C ) => C ,
147+ numPartitions : Int )(implicit ct : ClassTag [C ]): RDD [(K , C )] = self.withScope {
148+ combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, // delegates to the Partitioner-taking overload; the implicit ct in scope is passed along
149+ new HashPartitioner (numPartitions)) // mapSideCombine and serializer are not passed, so that overload's defaults apply
113150 }
114151
115152 /**
@@ -133,7 +170,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
133170
134171 // We will clean the combiner closure later in `combineByKeyWithClassTag`
135172 val cleanedSeqOp = self.context.clean(seqOp)
136- combineByKey[U ]((v : V ) => cleanedSeqOp(createZero(), v), cleanedSeqOp, combOp, partitioner)
173+ combineByKeyWithClassTag[U ]((v : V ) => cleanedSeqOp(createZero(), v),
174+ cleanedSeqOp, combOp, partitioner)
137175 }
138176
139177 /**
@@ -182,7 +220,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
182220 val createZero = () => cachedSerializer.deserialize[V ](ByteBuffer .wrap(zeroArray))
183221
184222 val cleanedFunc = self.context.clean(func)
185- combineByKey[V ]((v : V ) => cleanedFunc(createZero(), v), cleanedFunc, cleanedFunc, partitioner)
223+ combineByKeyWithClassTag[V ]((v : V ) => cleanedFunc(createZero(), v),
224+ cleanedFunc, cleanedFunc, partitioner)
186225 }
187226
188227 /**
@@ -268,7 +307,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
268307 * "combiner" in MapReduce.
269308 */
270309 def reduceByKey (partitioner : Partitioner , func : (V , V ) => V ): RDD [(K , V )] = self.withScope {
271- combineByKey [V ]((v : V ) => v, func, func, partitioner)
310+ combineByKeyWithClassTag [V ]((v : V ) => v, func, func, partitioner) // combiner type is V itself: identity createCombiner, func serves as both mergeValue and mergeCombiners
272311 }
273312
274313 /**
@@ -392,7 +431,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
392431 h1
393432 }
394433
395- combineByKey(createHLL, mergeValueHLL, mergeHLL, partitioner).mapValues(_.cardinality())
434+ combineByKeyWithClassTag(createHLL, mergeValueHLL, mergeHLL, partitioner)
435+ .mapValues(_.cardinality())
396436 }
397437
398438 /**
@@ -466,7 +506,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
466506 val createCombiner = (v : V ) => CompactBuffer (v)
467507 val mergeValue = (buf : CompactBuffer [V ], v : V ) => buf += v
468508 val mergeCombiners = (c1 : CompactBuffer [V ], c2 : CompactBuffer [V ]) => c1 ++= c2
469- val bufs = combineByKey [CompactBuffer [V ]](
509+ val bufs = combineByKeyWithClassTag [CompactBuffer [V ]](
470510 createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine = false )
471511 bufs.asInstanceOf [RDD [(K , Iterable [V ])]]
472512 }
@@ -565,12 +605,28 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
565605 }
566606
567607 /**
568- * Simplified version of combineByKey that hash-partitions the resulting RDD using the
608+ * This method is here for backward compatibility. It
609+ * does not provide combiner classtag information to
610+ * the shuffle.
611+ *
612+ * @see [[combineByKeyWithClassTag ]]
613+ */
614+ def combineByKey [C ](
615+ createCombiner : V => C ,
616+ mergeValue : (C , V ) => C ,
617+ mergeCombiners : (C , C ) => C ): RDD [(K , C )] = self.withScope {
618+ combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners)(null ) // the three-argument overload picks the partitioner; null fills the implicit ClassTag[C] slot
619+ }
620+
621+ /**
622+ * Simplified version of combineByKeyWithClassTag that hash-partitions the resulting RDD using the
569623 * existing partitioner/parallelism level.
570624 */
571- def combineByKey [C ](createCombiner : V => C , mergeValue : (C , V ) => C , mergeCombiners : (C , C ) => C )
572- : RDD [(K , C )] = self.withScope {
573- combineByKey(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(self))
625+ def combineByKeyWithClassTag [C ](
626+ createCombiner : V => C ,
627+ mergeValue : (C , V ) => C ,
628+ mergeCombiners : (C , C ) => C )(implicit ct : ClassTag [C ]): RDD [(K , C )] = self.withScope {
629+ combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(self)) // partitioner comes from defaultPartitioner(self); the implicit ct in scope is passed along
574630 }
575631
576632 /**
0 commit comments