addressed comments v1.0

brkyvz · brkyvz · commit 38e784dc4f00 · 2015-04-29T23:36:11.000-07:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,8 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.sql.DriverManager
 
+import org.apache.spark.sql.execution.stat.FrequentItems
+
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -41,7 +43,6 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD}
 import org.apache.spark.sql.jdbc.JDBCWriteDetails
 import org.apache.spark.sql.json.JsonRDD
-import org.apache.spark.sql.ml.FrequentItems
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect}
 import org.apache.spark.util.Utils
@@ -331,6 +332,17 @@ class DataFrame private[sql](
    */
   def na: DataFrameNaFunctions = new DataFrameNaFunctions(this)
 
+  /**
+   * Returns a [[DataFrameStatFunctions]] for working with statistic functions.
+   * {{{
+   *   // Finding frequent items in column with name 'a'.
+   *   df.stat.freqItems(Seq("a"))
+   * }}}
+   *
+   * @group dfops
+   */
+  def stat: DataFrameStatFunctions = new DataFrameStatFunctions(this)
+
   /**
    * Cartesian join with another [[DataFrame]].
    *
@@ -1415,37 +1427,4 @@ class DataFrame private[sql](
     val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD()
     SerDeUtil.javaToPython(jrdd)
   }
-
-  /////////////////////////////////////////////////////////////////////////////
-  // Statistic functions
-  /////////////////////////////////////////////////////////////////////////////
-
-  // scalastyle:off
-  object stat {
-  // scalastyle:on
-
-    /**
-     * Finding frequent items for columns, possibly with false positives. Using the algorithm
-     * described in `http://www.cs.umd.edu/~samir/498/karp.pdf`.
-     *
-     * @param cols the names of the columns to search frequent items in
-     * @param support The minimum frequency for an item to be considered `frequent`
-     * @return A Local DataFrame with the Array of frequent items for each column.
-     */
-    def freqItems(cols: Array[String], support: Double): DataFrame = {
-      FrequentItems.singlePassFreqItems(toDF(), cols, support)
-    }
-
-    /**
-     * Finding frequent items for columns, possibly with false positives. Using the algorithm
-     * described in `http://www.cs.umd.edu/~samir/498/karp.pdf`.
-     * Returns items more frequent than 1/1000'th of the time.
-     *
-     * @param cols the names of the columns to search frequent items in
-     * @return A Local DataFrame with the Array of frequent items for each column.
-     */
-    def freqItems(cols: Array[String]): DataFrame = {
-      FrequentItems.singlePassFreqItems(toDF(), cols, 0.001)
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.execution.stat.FrequentItems
+
+/**
+ * :: Experimental ::
+ * Statistic functions for [[DataFrame]]s.
+ */
+@Experimental
+final class DataFrameStatFunctions private[sql](df: DataFrame) {
+
+  /**
+   * Finds frequent items for the given columns, possibly with false positives, using the
+   * frequent element count algorithm
+   * [[http://dx.doi.org/10.1145/762471.762473 proposed by Karp, Schenker, and Papadimitriou]].
+   *
+   * @param cols the names of the columns to search frequent items in
+   * @param support the minimum frequency for an item to be considered `frequent`
+   * @return a local DataFrame holding the array of frequent items for each column
+   */
+  def freqItems(cols: Seq[String], support: Double): DataFrame =
+    FrequentItems.singlePassFreqItems(df, cols, support)
+
+  /**
+   * Finds frequent items for the given columns, possibly with false positives, using the
+   * frequent element count algorithm
+   * [[http://dx.doi.org/10.1145/762471.762473 proposed by Karp, Schenker, and Papadimitriou]].
+   * Returns items more frequent than 1/1000'th of the time.
+   *
+   * @param cols the names of the columns to search frequent items in
+   * @return a local DataFrame holding the array of frequent items for each column
+   */
+  def freqItems(cols: Seq[String]): DataFrame = {
+    val defaultSupport = 0.001
+    freqItems(cols, defaultSupport)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.stat
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.{Column, DataFrame, Row}
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
+import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
+
+import scala.collection.mutable.{Map => MutableMap}
+
+private[sql] object FrequentItems extends Logging {
+
+  /** A helper class wrapping `MutableMap[Any, Long]` that tracks at most `size` items. */
+  private class FreqItemCounter(size: Int) extends Serializable {
+    val baseMap: MutableMap[Any, Long] = MutableMap.empty[Any, Long]
+
+    /**
+     * Adds `count` occurrences of `key`, inserting it while there is room. When the map is full,
+     * all counters (the new one included) are reduced equally and exhausted ones are dropped.
+     */
+    def add(key: Any, count: Long): this.type = {
+      if (baseMap.contains(key)) {
+        baseMap(key) += count
+      } else if (baseMap.size < size) {
+        baseMap += key -> count
+      } else if (baseMap.nonEmpty) {
+        // Decrement by at most the smallest counter so a heavy key coming from `merge` is kept.
+        val minCount = baseMap.values.min
+        if (count >= minCount) {
+          baseMap += key -> count
+          baseMap.retain((_, v) => v > minCount)
+          baseMap.transform((_, v) => v - minCount)
+        } else {
+          baseMap.transform((_, v) => v - count)
+        }
+      }
+      this
+    }
+
+    /** Merges the counts of another counter (e.g. from another partition) into this one. */
+    def merge(other: FreqItemCounter): this.type = {
+      other.toSeq.foreach { case (k, v) => add(k, v) }
+      this
+    }
+
+    /** The tracked items paired with their current counts. */
+    def toSeq: Seq[(Any, Long)] = baseMap.toSeq
+    /** Folds `f` over the tracked (item, count) pairs, starting from `start`. */
+    def foldLeft[A, B](start: A)(f: (A, (Any, Long)) => A): A = baseMap.foldLeft(start)(f)
+    /** The items currently tracked as frequent candidates. */
+    def freqItems: Seq[Any] = baseMap.keys.toSeq
+  }
+
+  /**
+   * Finds frequent items for columns, possibly with false positives, using the frequent element
+   * count algorithm
+   * [[http://dx.doi.org/10.1145/762471.762473 proposed by Karp, Schenker, and Papadimitriou]].
+   * For internal use only. Throws `IllegalArgumentException` if `support` is outside [0, 1].
+   *
+   * @param df the input DataFrame
+   * @param cols the names of the columns to search frequent items in
+   * @param support the minimum frequency (in [0, 1]) for an item to be considered `frequent`
+   * @return a local DataFrame with the array of frequent items for each column
+   */
+  private[sql] def singlePassFreqItems(
+      df: DataFrame,
+      cols: Seq[String],
+      support: Double): DataFrame = {
+    // NaN, negative or > 1 supports yield a non-positive map size and silently empty results.
+    require(support >= 0.0 && support <= 1.0, s"support ($support) must be within [0, 1].")
+    if (support < 1e-6) {
+      logWarning(s"The selected support ($support) is too small, and might cause memory problems.")
+    }
+    val numCols = cols.length
+    // number of max items to keep counts for
+    val sizeOfMap = (1 / support).toInt
+    val countMaps = Seq.fill(numCols)(new FreqItemCounter(sizeOfMap))
+    val originalSchema = df.schema
+    val colInfo = cols.map { name =>
+      (name, originalSchema.fields(originalSchema.fieldIndex(name)).dataType)
+    }
+
+    val freqItems = df.select(cols.map(Column(_)):_*).rdd.aggregate(countMaps)(
+      // seqOp: count each row's value in the counter of its column
+      seqOp = (counts, row) => {
+        var i = 0
+        while (i < numCols) {
+          counts(i).add(row.get(i), 1L)
+          i += 1
+        }
+        counts
+      },
+      // combOp: fold the counters of another partition into the base ones
+      combOp = (baseCounts, counts) => {
+        var i = 0
+        while (i < numCols) {
+          baseCounts(i).merge(counts(i))
+          i += 1
+        }
+        baseCounts
+      }
+    )
+    val resultRow = Row(freqItems.map(_.freqItems):_*)
+    // append frequent Items to the column name for easy debugging
+    val outputCols = colInfo.map { case (name, dataType) =>
+      StructField(name + "_freqItems", ArrayType(dataType, false))
+    }
+    val schema = StructType(outputCols).toAttributes
+    new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow)))
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/ml/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/ml/FrequentItems.scala
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -178,5 +178,4 @@ public void testCreateDataFrameFromJavaBeans() {
       Assert.assertEquals(bean.getD().get(i), d.apply(i));
     }
   }
-
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala

Original file line number	Diff line number	Diff line change
`@@ -178,5 +178,4 @@ public void testCreateDataFrameFromJavaBeans() {`
`178`	`178`	`Assert.assertEquals(bean.getD().get(i), d.apply(i));`
`179`	`179`	`}`
`180`	`180`	`}`
`181`		`-`
`182`	`181`	`}`