
Commit 0915e23

addressed comments v2.1

1 parent 3a5c177

3 files changed: +19 −19 lines

sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala

Lines changed: 15 additions & 8 deletions
@@ -17,6 +17,11 @@
 
 package org.apache.spark.sql
 
+import java.lang.{String => JavaString}
+import java.util.{List => JavaList}
+
+import scala.collection.JavaConversions._
+
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.execution.stat.FrequentItems
 
@@ -31,9 +36,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    *
-   * @param cols the names of the columns to search frequent items in
-   * @param support The minimum frequency for an item to be considered `frequent` Should be greater
+   * @param cols the names of the columns to search frequent items in.
+   * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
@@ -47,7 +53,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Returns items more frequent than 1 percent.
    *
-   * @param cols the names of the columns to search frequent items in
+   * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
   def freqItems(cols: Seq[String]): DataFrame = {
@@ -58,13 +64,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    *
-   * @param cols the names of the columns to search frequent items in
-   * @param support The minimum frequency for an item to be considered `frequent` Should be greater
+   * @param cols the names of the columns to search frequent items in.
+   * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
-  def freqItems(cols: List[String], support: Double): DataFrame = {
+  def freqItems(cols: JavaList[JavaString], support: Double): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, support)
   }
 
@@ -74,10 +81,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Returns items more frequent than 1 percent of the time.
    *
-   * @param cols the names of the columns to search frequent items in
+   * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
-  def freqItems(cols: List[String]): DataFrame = {
+  def freqItems(cols: JavaList[JavaString]): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, 0.01)
   }
 }
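
The `scala.collection.JavaConversions._` import is what lets the new `JavaList[JavaString]` overloads hand their argument straight to `FrequentItems.singlePassFreqItems`, which expects a Scala `Seq`. A minimal, self-contained sketch of that pattern follows; the object and method names below are invented for illustration and are not part of the Spark source:

// Sketch only: shows how a Java-friendly overload can delegate to a Seq-based
// implementation via the implicit java.util.List -> Buffer conversion.
import java.util.{Arrays => JArrays, List => JavaList}
import scala.collection.JavaConversions._

object FreqItemsOverloadSketch {
  // Stand-in for FrequentItems.singlePassFreqItems, which takes a Scala Seq.
  def singlePass(cols: Seq[String], support: Double): String =
    s"freqItems(${cols.mkString(",")}, support=$support)"

  // Java-friendly overload: JavaConversions converts the java.util.List
  // to a scala.collection.Seq at the call site, so no manual wrapping is needed.
  def freqItems(cols: JavaList[String], support: Double): String =
    singlePass(cols, support)

  def main(args: Array[String]): Unit = {
    val javaCols: JavaList[String] = JArrays.asList("a", "b")
    println(freqItems(javaCols, 0.2))
  }
}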

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala

Lines changed: 3 additions & 8 deletions
@@ -54,23 +54,18 @@ private[sql] object FrequentItems extends Logging {
      * @param other The map containing the counts for that partition
      */
     def merge(other: FreqItemCounter): this.type = {
-      other.toSeq.foreach { case (k, v) =>
+      other.baseMap.toSeq.foreach { case (k, v) =>
         add(k, v)
       }
       this
     }
-
-    def toSeq: Seq[(Any, Long)] = baseMap.toSeq
-
-    def foldLeft[A, B](start: A)(f: (A, (Any, Long)) => A): A = baseMap.foldLeft(start)(f)
-
-    def freqItems: Seq[Any] = baseMap.keys.toSeq
   }
 
   /**
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    * For Internal use only.
    *
    * @param df The input DataFrame
@@ -114,7 +109,7 @@ private[sql] object FrequentItems extends Logging {
         baseCounts
       }
     )
-    val justItems = freqItems.map(m => m.freqItems)
+    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
     val resultRow = Row(justItems:_*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map{ v =>
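
With the `toSeq`, `foldLeft`, and `freqItems` helpers removed, callers reach into `baseMap` directly: `other.baseMap.toSeq` inside `merge`, and `m.baseMap.keys.toSeq` when collecting the result row. A simplified stand-in for the counter is sketched below, with invented names and without the eviction logic of the real Karp, Schenker, and Papadimitriou counter, just to show the surface that survives the commit:

// Sketch only: a bare-bones counter whose callers use baseMap directly.
import scala.collection.mutable

class FreqItemCounterSketch {
  val baseMap: mutable.Map[Any, Long] = mutable.Map.empty[Any, Long]

  def add(key: Any, count: Long): this.type = {
    // Simplified: just accumulate counts. The real counter also bounds the
    // number of tracked keys and decrements/evicts when it overflows.
    baseMap(key) = baseMap.getOrElse(key, 0L) + count
    this
  }

  def merge(other: FreqItemCounterSketch): this.type = {
    other.baseMap.toSeq.foreach { case (k, v) => add(k, v) }
    this
  }
}

// Usage mirroring the diff: frequent items are read as baseMap.keys.toSeq.
object FreqItemCounterDemo extends App {
  val a = new FreqItemCounterSketch().add("x", 3L).add("y", 1L)
  val b = new FreqItemCounterSketch().add("x", 2L)
  println(a.merge(b).baseMap.keys.toSeq)  // e.g. ArrayBuffer(x, y)
}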

sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java

Lines changed: 1 addition & 3 deletions
@@ -33,7 +33,6 @@
 import scala.collection.mutable.Buffer;
 
 import java.io.Serializable;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
@@ -181,8 +180,7 @@ public void testCreateDataFrameFromJavaBeans() {
   public void testFrequentItems() {
     DataFrame df = context.table("testData2");
     List<String> cols = Arrays.asList("a");
-    DataFrame results = df.stat().freqItems(JavaConversions.asScalaIterable(cols).toList(), 0.2);
-    System.out.println(results.collect()[0].getSeq(0));
+    DataFrame results = df.stat().freqItems(cols, 0.2);
     Assert.assertTrue(results.collect()[0].getSeq(0).contains(1));
   }
 }
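
The same simplified call also works from Scala with a plain `java.util.List`, with no manual `JavaConversions.asScalaIterable(...)` at the call site. Below is a hedged end-to-end sketch assuming a Spark 1.x-style local `SQLContext`; the case class, app name, and data are made up for the example:

// Sketch only: exercising the new Java-list overload from Scala.
import java.util.{Arrays => JArrays}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object FreqItemsUsageSketch {
  case class Record(a: Int, b: Int)

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("freqItems-sketch"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(Seq(Record(1, 1), Record(1, 2), Record(2, 1))).toDF()

    // A java.util.List is accepted directly by the new overload.
    val results = df.stat.freqItems(JArrays.asList("a"), 0.2)
    println(results.collect()(0).getSeq(0))

    sc.stop()
  }
}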
