Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ class DataFrame private[sql](
// make it a NamedExpression.
case Column(u: UnresolvedAttribute) => UnresolvedAlias(u)
case Column(expr: NamedExpression) => expr
// Leave an unaliased explode with an empty list of names since the analzyer will generate the
// Leave an unaliased explode with an empty list of names since the analyzer will generate the
// correct defaults after the nested expression's type has been resolved.
case Column(explode: Explode) => MultiAlias(explode, Nil)
case Column(expr: Expression) => Alias(expr, expr.prettyString)()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the second column
* @return the covariance of the two columns.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.cov("rand1", "rand2")
* res1: Double = 0.065...
* }}}
*
* @since 1.4.0
*/
def cov(col1: String, col2: String): Double = {
Expand All @@ -54,6 +61,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
 *    df.stat.corr("rand1", "rand2", "pearson")
* res1: Double = 0.613...
* }}}
*
* @since 1.4.0
*/
def corr(col1: String, col2: String, method: String): Double = {
Expand All @@ -69,6 +83,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
 *    df.stat.corr("rand1", "rand2")
* res1: Double = 0.613...
* }}}
*
* @since 1.4.0
*/
def corr(col1: String, col2: String): Double = {
Expand All @@ -92,6 +113,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* of the DataFrame.
* @return A DataFrame containing for the contingency table.
*
* {{{
* val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
* (3, 3))).toDF("key", "value")
* val ct = df.stat.crosstab("key", "value")
* ct.show()
* +---------+---+---+---+
* |key_value| 1| 2| 3|
* +---------+---+---+---+
* | 2| 2| 0| 1|
* | 1| 1| 1| 0|
* | 3| 0| 1| 1|
* +---------+---+---+---+
* }}}
*
* @since 1.4.0
*/
def crosstab(col1: String, col2: String): DataFrame = {
Expand All @@ -112,6 +147,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* than 1e-4.
* @return A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* val rows = Seq.tabulate(100) { i =>
* if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
* }
* val df = sqlContext.createDataFrame(rows).toDF("a", "b")
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4)
* freqSingles.show()
* +-----------+-------------+
* |a_freqItems| b_freqItems|
* +-----------+-------------+
* | [1, 99]|[-1.0, -99.0]|
* +-----------+-------------+
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1)
* freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
* +----------+
* | freq_ab|
* +----------+
* | [1,-1.0]|
* | ... |
* +----------+
* }}}
*
* @since 1.4.0
*/
def freqItems(cols: Array[String], support: Double): DataFrame = {
Expand Down Expand Up @@ -147,6 +208,32 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* val rows = Seq.tabulate(100) { i =>
* if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
* }
* val df = sqlContext.createDataFrame(rows).toDF("a", "b")
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)
* freqSingles.show()
* +-----------+-------------+
* |a_freqItems| b_freqItems|
* +-----------+-------------+
* | [1, 99]|[-1.0, -99.0]|
* +-----------+-------------+
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1)
* freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
* +----------+
* | freq_ab|
* +----------+
* | [1,-1.0]|
* | ... |
* +----------+
* }}}
*
* @since 1.4.0
*/
def freqItems(cols: Seq[String], support: Double): DataFrame = {
Expand Down Expand Up @@ -180,6 +267,20 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @tparam T stratum type
* @return a new [[DataFrame]] that represents the stratified sample
*
* {{{
* val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
* (3, 3))).toDF("key", "value")
* val fractions = Map(1 -> 1.0, 3 -> 0.5)
* df.stat.sampleBy("key", fractions, 36L).show()
* +---+-----+
* |key|value|
* +---+-----+
* | 1| 1|
* | 1| 2|
* | 3| 2|
* +---+-----+
* }}}
*
* @since 1.5.0
*/
def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = {
Expand Down