@@ -133,68 +133,64 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
133133 * Return a subset of this RDD sampled by key (via stratified sampling).
134134 *
135135 * Create a sample of this RDD using variable sampling rates for different keys as specified by
136- * `fractions`, a key to sampling rate map.
137- *
138- * If `exact` is set to false, create the sample via simple random sampling, with one pass
139- * over the RDD, to produce a sample of size that's approximately equal to the sum of
140- * math.ceil(numItems * samplingRate) over all key values; otherwise, use additional passes over
141- * the RDD to create a sample size that's exactly equal to the sum of
136+ * `fractions`, a key to sampling rate map, via simple random sampling with one pass over the
137+ * RDD, to produce a sample of size that's approximately equal to the sum of
142138 * math.ceil(numItems * samplingRate) over all key values.
143139 */
144140 def sampleByKey (withReplacement : Boolean ,
145141 fractions : JMap [K , Double ],
146- exact : Boolean ,
147142 seed : Long ): JavaPairRDD [K , V ] =
148- new JavaPairRDD [K , V ](rdd.sampleByKey(withReplacement, fractions, exact, seed))
143+ new JavaPairRDD [K , V ](rdd.sampleByKey(withReplacement, fractions, seed))
149144
150145 /**
151146 * Return a subset of this RDD sampled by key (via stratified sampling).
152147 *
153148 * Create a sample of this RDD using variable sampling rates for different keys as specified by
154- * `fractions`, a key to sampling rate map.
155- *
156- * If `exact` is set to false, create the sample via simple random sampling, with one pass
157- * over the RDD, to produce a sample of size that's approximately equal to the sum of
158- * math.ceil(numItems * samplingRate) over all key values; otherwise, use additional passes over
159- * the RDD to create a sample size that's exactly equal to the sum of
149+ * `fractions`, a key to sampling rate map, via simple random sampling with one pass over the
150+ * RDD, to produce a sample of size that's approximately equal to the sum of
160151 * math.ceil(numItems * samplingRate) over all key values.
161152 *
162- * Use Utils.random.nextLong as the default seed for the random number generator
153+ * Use Utils.random.nextLong as the default seed for the random number generator.
163154 */
164155 def sampleByKey (withReplacement : Boolean ,
165- fractions : JMap [K , Double ],
166- exact : Boolean ): JavaPairRDD [K , V ] =
167- sampleByKey(withReplacement, fractions, exact, Utils .random.nextLong)
156+ fractions : JMap [K , Double ]): JavaPairRDD [K , V ] =
157+ sampleByKey(withReplacement, fractions, Utils .random.nextLong)
168158
169159 /**
170- * Return a subset of this RDD sampled by key (via stratified sampling).
160+ * ::Experimental::
171161 *
172- * Create a sample of this RDD using variable sampling rates for different keys as specified by
173- * `fractions`, a key to sampling rate map.
162+ * Return a subset of this RDD sampled by key (via stratified sampling) containing exactly
163+ * math.ceil(numItems * samplingRate) for each stratum (group of pairs with the same key).
174164 *
175- * Produce a sample of size that's approximately equal to the sum of
176- * math.ceil(numItems * samplingRate) over all key values with one pass over the RDD via
177- * simple random sampling.
165+ * This method differs from [[sampleByKey]] in that we make additional passes over the RDD to
166+ * create a sample size that's exactly equal to the sum of math.ceil(numItems * samplingRate)
167+ * over all key values with a 99.99% confidence. When sampling without replacement, we need one
168+ * additional pass over the RDD to guarantee sample size; when sampling with replacement, we need
169+ * two additional passes.
178170 */
179- def sampleByKey (withReplacement : Boolean ,
171+ @ Experimental
172+ def sampleByKeyExact (withReplacement : Boolean ,
180173 fractions : JMap [K , Double ],
181174 seed : Long ): JavaPairRDD [K , V ] =
182- sampleByKey( withReplacement, fractions, false , seed)
175+ new JavaPairRDD [ K , V ](rdd.sampleByKeyExact( withReplacement, fractions, seed) )
183176
184177 /**
185- * Return a subset of this RDD sampled by key (via stratified sampling).
178+ * ::Experimental::
186179 *
187- * Create a sample of this RDD using variable sampling rates for different keys as specified by
188- * `fractions`, a key to sampling rate map.
180+ * Return a subset of this RDD sampled by key (via stratified sampling) containing exactly
181+ * math.ceil(numItems * samplingRate) for each stratum (group of pairs with the same key).
189182 *
190- * Produce a sample of size that's approximately equal to the sum of
191- * math.ceil(numItems * samplingRate) over all key values with one pass over the RDD via
192- * simple random sampling.
183+ * This method differs from [[sampleByKey]] in that we make additional passes over the RDD to
184+ * create a sample size that's exactly equal to the sum of math.ceil(numItems * samplingRate)
185+ * over all key values with a 99.99% confidence. When sampling without replacement, we need one
186+ * additional pass over the RDD to guarantee sample size; when sampling with replacement, we need
187+ * two additional passes.
193188 *
194- * Use Utils.random.nextLong as the default seed for the random number generator
189+ * Use Utils.random.nextLong as the default seed for the random number generator.
195190 */
196- def sampleByKey (withReplacement : Boolean , fractions : JMap [K , Double ]): JavaPairRDD [K , V ] =
197- sampleByKey(withReplacement, fractions, false , Utils .random.nextLong)
191+ @ Experimental
192+ def sampleByKeyExact (withReplacement : Boolean , fractions : JMap [K , Double ]): JavaPairRDD [K , V ] =
193+ sampleByKeyExact(withReplacement, fractions, Utils .random.nextLong)
198194
199195 /**
200196 * Return the union of this RDD and another one. Any identical elements will appear multiple
0 commit comments