@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, encoderFor, Ou
 import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct, Attribute}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.QueryExecution
+import org.apache.spark.sql.expressions.Aggregator
 
 /**
  * :: Experimental ::
@@ -36,11 +37,13 @@ import org.apache.spark.sql.execution.QueryExecution
  * making this change to the class hierarchy would break some function signatures. As such, this
  * class should be considered a preview of the final API. Changes will be made to the interface
  * after Spark 1.6.
+ *
+ * @since 1.6.0
  */
 @Experimental
-class GroupedDataset[K, T] private[sql](
+class GroupedDataset[K, V] private[sql](
     kEncoder: Encoder[K],
-    tEncoder: Encoder[T],
+    tEncoder: Encoder[V],
     val queryExecution: QueryExecution,
     private val dataAttributes: Seq[Attribute],
     private val groupingAttributes: Seq[Attribute]) extends Serializable {
@@ -67,8 +70,10 @@ class GroupedDataset[K, T] private[sql](
   /**
    * Returns a new [[GroupedDataset]] where the type of the key has been mapped to the specified
    * type. The mapping of key columns to the type follows the same rules as `as` on [[Dataset]].
+   *
+   * @since 1.6.0
    */
-  def asKey[L : Encoder]: GroupedDataset[L, T] =
+  def keyAs[L : Encoder]: GroupedDataset[L, V] =
     new GroupedDataset(
       encoderFor[L],
       unresolvedTEncoder,
@@ -78,6 +83,8 @@ class GroupedDataset[K, T] private[sql](
 
   /**
    * Returns a [[Dataset]] that contains each unique key.
+   *
+   * @since 1.6.0
    */
   def keys: Dataset[K] = {
     new Dataset[K](
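Taken together, the renamed `keyAs` and the existing `keys` are easy to exercise. A minimal usage sketch (not part of the patch), assuming a Spark 1.6 `SQLContext` named `sqlContext` with `sqlContext.implicits._` in scope for `toDS()` and `$`, and made-up sample data:

```scala
import sqlContext.implicits._

// Group a Dataset of (word, count) pairs by the first tuple field.
val ds = Seq(("a", 1), ("a", 2), ("b", 3)).toDS()
val grouped = ds.groupBy(_._1)  // GroupedDataset[String, (String, Int)]

// keys yields one row per distinct grouping key: "a", "b".
grouped.keys.collect()

// keyAs re-maps the key type, following the same rules as `as` on Dataset;
// here a column-based grouping (keyed by Row) is narrowed to String.
val byColumn = ds.groupBy($"_1").keyAs[String]
```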
@@ -92,12 +99,18 @@ class GroupedDataset[K, T] private[sql](
    * function can return an iterator containing elements of an arbitrary type which will be returned
    * as a new [[Dataset]].
    *
+   * This function does not support partial aggregation, and as a result requires shuffling all
+   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
+   * key, it is best to use the reduce function or an [[Aggregator]].
+   *
    * Internally, the implementation will spill to disk if any given group is too large to fit into
    * memory. However, users must take care to avoid materializing the whole iterator for a group
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
+   *
+   * @since 1.6.0
    */
-  def flatMap[U : Encoder](f: (K, Iterator[T]) => TraversableOnce[U]): Dataset[U] = {
+  def flatMapGroup[U : Encoder](f: (K, Iterator[V]) => TraversableOnce[U]): Dataset[U] = {
     new Dataset[U](
       sqlContext,
       MapGroups(
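A brief sketch of the renamed Scala `flatMapGroup` in action, reusing the hypothetical `grouped` value from the sketch above:

```scala
// Emit one tagged string per element of each group. The group iterator is
// consumed lazily, so the whole group need not be materialized at once.
val tagged: Dataset[String] = grouped.flatMapGroup { (key, values) =>
  values.map { case (_, n) => s"$key:$n" }
}
```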
@@ -108,41 +121,88 @@ class GroupedDataset[K, T] private[sql](
         logicalPlan))
   }
 
-  def flatMap[U](f: FlatMapGroupFunction[K, T, U], encoder: Encoder[U]): Dataset[U] = {
-    flatMap((key, data) => f.call(key, data.asJava).asScala)(encoder)
+  /**
+   * Applies the given function to each group of data. For each unique group, the function will
+   * be passed the group key and an iterator that contains all of the elements in the group. The
+   * function can return an iterator containing elements of an arbitrary type which will be returned
+   * as a new [[Dataset]].
+   *
+   * This function does not support partial aggregation, and as a result requires shuffling all
+   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
+   * key, it is best to use the reduce function or an [[Aggregator]].
+   *
+   * Internally, the implementation will spill to disk if any given group is too large to fit into
+   * memory. However, users must take care to avoid materializing the whole iterator for a group
+   * (for example, by calling `toList`) unless they are sure that this is possible given the memory
+   * constraints of their cluster.
+   *
+   * @since 1.6.0
+   */
+  def flatMapGroup[U](f: FlatMapGroupFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
+    flatMapGroup((key, data) => f.call(key, data.asJava).asScala)(encoder)
   }
 
   /**
    * Applies the given function to each group of data. For each unique group, the function will
    * be passed the group key and an iterator that contains all of the elements in the group. The
    * function can return an element of arbitrary type which will be returned as a new [[Dataset]].
    *
+   * This function does not support partial aggregation, and as a result requires shuffling all
+   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
+   * key, it is best to use the reduce function or an [[Aggregator]].
+   *
    * Internally, the implementation will spill to disk if any given group is too large to fit into
    * memory. However, users must take care to avoid materializing the whole iterator for a group
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
+   *
+   * @since 1.6.0
    */
-  def map[U : Encoder](f: (K, Iterator[T]) => U): Dataset[U] = {
-    val func = (key: K, it: Iterator[T]) => Iterator(f(key, it))
-    flatMap(func)
+  def mapGroup[U : Encoder](f: (K, Iterator[V]) => U): Dataset[U] = {
+    val func = (key: K, it: Iterator[V]) => Iterator(f(key, it))
+    flatMapGroup(func)
   }
 
-  def map[U](f: MapGroupFunction[K, T, U], encoder: Encoder[U]): Dataset[U] = {
-    map((key, data) => f.call(key, data.asJava))(encoder)
+  /**
+   * Applies the given function to each group of data. For each unique group, the function will
+   * be passed the group key and an iterator that contains all of the elements in the group. The
+   * function can return an element of arbitrary type which will be returned as a new [[Dataset]].
+   *
+   * This function does not support partial aggregation, and as a result requires shuffling all
+   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
+   * key, it is best to use the reduce function or an [[Aggregator]].
+   *
+   * Internally, the implementation will spill to disk if any given group is too large to fit into
+   * memory. However, users must take care to avoid materializing the whole iterator for a group
+   * (for example, by calling `toList`) unless they are sure that this is possible given the memory
+   * constraints of their cluster.
+   *
+   * @since 1.6.0
+   */
+  def mapGroup[U](f: MapGroupFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
+    mapGroup((key, data) => f.call(key, data.asJava))(encoder)
   }
 
   /**
    * Reduces the elements of each group of data using the specified binary function.
    * The given function must be commutative and associative or the result may be non-deterministic.
+   *
+   * @since 1.6.0
    */
-  def reduce(f: (T, T) => T): Dataset[(K, T)] = {
-    val func = (key: K, it: Iterator[T]) => Iterator(key -> it.reduce(f))
+  def reduce(f: (V, V) => V): Dataset[(K, V)] = {
+    val func = (key: K, it: Iterator[V]) => Iterator((key, it.reduce(f)))
 
     implicit val resultEncoder = ExpressionEncoder.tuple(unresolvedKEncoder, unresolvedTEncoder)
-    flatMap(func)
+    flatMapGroup(func)
   }
 
-  def reduce(f: ReduceFunction[T]): Dataset[(K, T)] = {
+  /**
+   * Reduces the elements of each group of data using the specified binary function.
+   * The given function must be commutative and associative or the result may be non-deterministic.
+   *
+   * @since 1.6.0
+   */
+  def reduce(f: ReduceFunction[V]): Dataset[(K, V)] = {
     reduce(f.call _)
   }
 
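To contrast the two renamed shapes, a sketch using the same hypothetical `grouped` as before. `mapGroup` produces exactly one result per key, while `reduce` merges the group's elements pairwise; note that in this patch `reduce` is itself still implemented on top of `flatMapGroup`, so it shuffles all elements too:

```scala
// mapGroup: the function sees the key and the full group iterator once.
val sums: Dataset[(String, Int)] =
  grouped.mapGroup((key, it) => (key, it.map(_._2).sum))

// reduce: a commutative, associative merge returning (key, value) pairs.
val maxima: Dataset[(String, (String, Int))] =
  grouped.reduce((a, b) => if (a._2 >= b._2) a else b)
```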
@@ -185,41 +245,51 @@ class GroupedDataset[K, T] private[sql](
   /**
    * Computes the given aggregation, returning a [[Dataset]] of tuples for each unique key
    * and the result of computing this aggregation over all elements in the group.
+   *
+   * @since 1.6.0
    */
-  def agg[U1](col1: TypedColumn[T, U1]): Dataset[(K, U1)] =
+  def agg[U1](col1: TypedColumn[V, U1]): Dataset[(K, U1)] =
     aggUntyped(col1).asInstanceOf[Dataset[(K, U1)]]
 
   /**
    * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key
    * and the result of computing these aggregations over all elements in the group.
+   *
+   * @since 1.6.0
    */
-  def agg[U1, U2](col1: TypedColumn[T, U1], col2: TypedColumn[T, U2]): Dataset[(K, U1, U2)] =
+  def agg[U1, U2](col1: TypedColumn[V, U1], col2: TypedColumn[V, U2]): Dataset[(K, U1, U2)] =
     aggUntyped(col1, col2).asInstanceOf[Dataset[(K, U1, U2)]]
 
   /**
    * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key
    * and the result of computing these aggregations over all elements in the group.
+   *
+   * @since 1.6.0
    */
   def agg[U1, U2, U3](
-      col1: TypedColumn[T, U1],
-      col2: TypedColumn[T, U2],
-      col3: TypedColumn[T, U3]): Dataset[(K, U1, U2, U3)] =
+      col1: TypedColumn[V, U1],
+      col2: TypedColumn[V, U2],
+      col3: TypedColumn[V, U3]): Dataset[(K, U1, U2, U3)] =
     aggUntyped(col1, col2, col3).asInstanceOf[Dataset[(K, U1, U2, U3)]]
 
   /**
    * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key
    * and the result of computing these aggregations over all elements in the group.
+   *
+   * @since 1.6.0
    */
   def agg[U1, U2, U3, U4](
-      col1: TypedColumn[T, U1],
-      col2: TypedColumn[T, U2],
-      col3: TypedColumn[T, U3],
-      col4: TypedColumn[T, U4]): Dataset[(K, U1, U2, U3, U4)] =
+      col1: TypedColumn[V, U1],
+      col2: TypedColumn[V, U2],
+      col3: TypedColumn[V, U3],
+      col4: TypedColumn[V, U4]): Dataset[(K, U1, U2, U3, U4)] =
     aggUntyped(col1, col2, col3, col4).asInstanceOf[Dataset[(K, U1, U2, U3, U4)]]
 
   /**
    * Returns a [[Dataset]] that contains a tuple with each key and the number of items present
    * for that key.
+   *
+   * @since 1.6.0
    */
   def count(): Dataset[(K, Long)] = agg(functions.count("*").as(ExpressionEncoder[Long]))
 
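A sketch of the typed `agg` variants and `count`, assuming `Column.as[U]` is used to obtain the `TypedColumn` arguments and that `sum`/`avg` come from `org.apache.spark.sql.functions`:

```scala
import org.apache.spark.sql.functions.{avg, sum}

// One aggregation per key: Dataset[(String, Long)].
val totals = grouped.agg(sum($"_2").as[Long])

// Several at once; the result tuple widens with each added column.
val stats = grouped.agg(sum($"_2").as[Long], avg($"_2").as[Double])

// count() is shorthand for agg over count("*") with a Long encoder.
val counts: Dataset[(String, Long)] = grouped.count()
```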
@@ -228,10 +298,12 @@ class GroupedDataset[K, T] private[sql](
    * be passed the grouping key and 2 iterators containing all elements in the group from
    * [[Dataset]] `this` and `other`. The function can return an iterator containing elements of an
    * arbitrary type which will be returned as a new [[Dataset]].
+   *
+   * @since 1.6.0
    */
   def cogroup[U, R : Encoder](
       other: GroupedDataset[K, U])(
-      f: (K, Iterator[T], Iterator[U]) => TraversableOnce[R]): Dataset[R] = {
+      f: (K, Iterator[V], Iterator[U]) => TraversableOnce[R]): Dataset[R] = {
     implicit def uEnc: Encoder[U] = other.unresolvedTEncoder
     new Dataset[R](
       sqlContext,
@@ -243,9 +315,17 @@ class GroupedDataset[K, T] private[sql](
         other.logicalPlan))
   }
 
+  /**
+   * Applies the given function to each cogrouped data. For each unique group, the function will
+   * be passed the grouping key and 2 iterators containing all elements in the group from
+   * [[Dataset]] `this` and `other`. The function can return an iterator containing elements of an
+   * arbitrary type which will be returned as a new [[Dataset]].
+   *
+   * @since 1.6.0
+   */
   def cogroup[U, R](
       other: GroupedDataset[K, U],
-      f: CoGroupFunction[K, T, U, R],
+      f: CoGroupFunction[K, V, U, R],
       encoder: Encoder[R]): Dataset[R] = {
     cogroup(other)((key, left, right) => f.call(key, left.asJava, right.asJava).asScala)(encoder)
   }
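Last, a usage sketch for the Scala `cogroup` overload, with two made-up datasets that share a String key:

```scala
val clicks = Seq(("u1", "home"), ("u1", "cart")).toDS().groupBy(_._1)
val buys   = Seq(("u1", "book")).toDS().groupBy(_._1)

// For each key, the function receives both sides' iterators together.
val summary: Dataset[String] = clicks.cogroup(buys) { (key, cs, bs) =>
  Iterator(s"$key: ${cs.size} clicks, ${bs.size} purchases")
}
```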