
Commit 0915e23

addressed comments v2.1

1 parent 3a5c177

3 files changed: +19 −19 lines

sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala

Lines changed: 15 additions & 8 deletions
@@ -17,6 +17,11 @@
 
 package org.apache.spark.sql
 
+import java.lang.{String => JavaString}
+import java.util.{List => JavaList}
+
+import scala.collection.JavaConversions._
+
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.execution.stat.FrequentItems
 
@@ -31,9 +36,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    *
-   * @param cols the names of the columns to search frequent items in
-   * @param support The minimum frequency for an item to be considered `frequent` Should be greater
+   * @param cols the names of the columns to search frequent items in.
+   * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
@@ -47,7 +53,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Returns items more frequent than 1 percent.
    *
-   * @param cols the names of the columns to search frequent items in
+   * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
   def freqItems(cols: Seq[String]): DataFrame = {
@@ -58,13 +64,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    *
-   * @param cols the names of the columns to search frequent items in
-   * @param support The minimum frequency for an item to be considered `frequent` Should be greater
+   * @param cols the names of the columns to search frequent items in.
+   * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
-  def freqItems(cols: List[String], support: Double): DataFrame = {
+  def freqItems(cols: JavaList[JavaString], support: Double): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, support)
   }
 
@@ -74,10 +81,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Returns items more frequent than 1 percent of the time.
    *
-   * @param cols the names of the columns to search frequent items in
+   * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    */
-  def freqItems(cols: List[String]): DataFrame = {
+  def freqItems(cols: JavaList[JavaString]): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, 0.01)
   }
 }
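
The `scala.collection.JavaConversions._` import is what lets the new `JavaList[JavaString]` overloads hand their argument straight to `FrequentItems.singlePassFreqItems`, which expects a Scala `Seq`. A minimal, self-contained sketch of that pattern follows; the object and method names below are invented for illustration and are not part of the Spark source:

// Sketch only: shows how a Java-friendly overload can delegate to a Seq-based
// implementation via the implicit java.util.List -> Buffer conversion.
import java.util.{Arrays => JArrays, List => JavaList}
import scala.collection.JavaConversions._

object FreqItemsOverloadSketch {
  // Stand-in for FrequentItems.singlePassFreqItems, which takes a Scala Seq.
  def singlePass(cols: Seq[String], support: Double): String =
    s"freqItems(${cols.mkString(",")}, support=$support)"

  // Java-friendly overload: JavaConversions converts the java.util.List
  // to a scala.collection.Seq at the call site, so no manual wrapping is needed.
  def freqItems(cols: JavaList[String], support: Double): String =
    singlePass(cols, support)

  def main(args: Array[String]): Unit = {
    val javaCols: JavaList[String] = JArrays.asList("a", "b")
    println(freqItems(javaCols, 0.2))
  }
}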

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala

Lines changed: 3 additions & 8 deletions
@@ -54,23 +54,18 @@ private[sql] object FrequentItems extends Logging {
      * @param other The map containing the counts for that partition
      */
     def merge(other: FreqItemCounter): this.type = {
-      other.toSeq.foreach { case (k, v) =>
+      other.baseMap.toSeq.foreach { case (k, v) =>
         add(k, v)
       }
       this
     }
-
-    def toSeq: Seq[(Any, Long)] = baseMap.toSeq
-
-    def foldLeft[A, B](start: A)(f: (A, (Any, Long)) => A): A = baseMap.foldLeft(start)(f)
-
-    def freqItems: Seq[Any] = baseMap.keys.toSeq
   }
 
   /**
    * Finding frequent items for columns, possibly with false positives. Using the
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * The `support` should be greater than 1e-4.
    * For Internal use only.
    *
    * @param df The input DataFrame
@@ -114,7 +109,7 @@ private[sql] object FrequentItems extends Logging {
         baseCounts
       }
     )
-    val justItems = freqItems.map(m => m.freqItems)
+    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
     val resultRow = Row(justItems:_*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map{ v =>
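
With the `toSeq`, `foldLeft`, and `freqItems` helpers removed, callers reach into `baseMap` directly: `other.baseMap.toSeq` inside `merge`, and `m.baseMap.keys.toSeq` when collecting the result row. A simplified stand-in for the counter is sketched below, with invented names and without the eviction logic of the real Karp, Schenker, and Papadimitriou counter, just to show the surface that survives the commit:

// Sketch only: a bare-bones counter whose callers use baseMap directly.
import scala.collection.mutable

class FreqItemCounterSketch {
  val baseMap: mutable.Map[Any, Long] = mutable.Map.empty[Any, Long]

  def add(key: Any, count: Long): this.type = {
    // Simplified: just accumulate counts. The real counter also bounds the
    // number of tracked keys and decrements/evicts when it overflows.
    baseMap(key) = baseMap.getOrElse(key, 0L) + count
    this
  }

  def merge(other: FreqItemCounterSketch): this.type = {
    other.baseMap.toSeq.foreach { case (k, v) => add(k, v) }
    this
  }
}

// Usage mirroring the diff: frequent items are read as baseMap.keys.toSeq.
object FreqItemCounterDemo extends App {
  val a = new FreqItemCounterSketch().add("x", 3L).add("y", 1L)
  val b = new FreqItemCounterSketch().add("x", 2L)
  println(a.merge(b).baseMap.keys.toSeq)  // e.g. ArrayBuffer(x, y)
}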

sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java

Lines changed: 1 addition & 3 deletions
@@ -33,7 +33,6 @@
 import scala.collection.mutable.Buffer;
 
 import java.io.Serializable;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
@@ -181,8 +180,7 @@ public void testCreateDataFrameFromJavaBeans() {
   public void testFrequentItems() {
     DataFrame df = context.table("testData2");
     List<String> cols = Arrays.asList("a");
-    DataFrame results = df.stat().freqItems(JavaConversions.asScalaIterable(cols).toList(), 0.2);
-    System.out.println(results.collect()[0].getSeq(0));
+    DataFrame results = df.stat().freqItems(cols, 0.2);
     Assert.assertTrue(results.collect()[0].getSeq(0).contains(1));
   }
 }
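
The same simplified call also works from Scala with a plain `java.util.List`, with no manual `JavaConversions.asScalaIterable(...)` at the call site. Below is a hedged end-to-end sketch assuming a Spark 1.x-style local `SQLContext`; the case class, app name, and data are made up for the example:

// Sketch only: exercising the new Java-list overload from Scala.
import java.util.{Arrays => JArrays}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object FreqItemsUsageSketch {
  case class Record(a: Int, b: Int)

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("freqItems-sketch"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(Seq(Record(1, 1), Record(1, 2), Record(2, 1))).toDF()

    // A java.util.List is accepted directly by the new overload.
    val results = df.stat.freqItems(JArrays.asList("a"), 0.2)
    println(results.collect()(0).getSeq(0))

    sc.stop()
  }
}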
