@@ -751,6 +751,86 @@ class DataFrame private[sql](
751751 select(colNames :_* )
752752 }
753753
/**
 * Evaluates several named aggregations over the given columns of this [[DataFrame]] and
 * stacks the results into a single [[DataFrame]].
 *
 * The result has one row per aggregation. Its first column, named by `aggCol`, holds the
 * aggregation name; it is followed by one column per entry of `cols`, holding the value of
 * that aggregation for the column.
 *
 * Each aggregation is a pair of its name and a function that turns a column name into the
 * text of an aggregate expression. That text is parsed with [[SqlParser]], so only
 * expressions this parser understands can be used.
 *
 * {{{
 *   val aggregations = List(
 *     "max" -> (col => s"max($col)"),              // expression computes max
 *     "avg" -> (col => s"sum($col)/count($col)"))  // expression computes average
 *   df.multipleAggExpr("summary", aggregations, "age", "height")
 *
 *   // summary age   height
 *   // max     92.0  192.0
 *   // avg     53.0  178.0
 * }}}
 *
 * @param aggCol name of the leading column that carries the aggregation names
 * @param aggregations ordered (name, column name => expression text) pairs
 * @param cols names of the columns to aggregate
 * @return the union of the per-aggregation results, in the order of `aggregations`
 */
@scala.annotation.varargs
private def multipleAggExpr(
    aggCol: String,
    aggregations: List[(String, String => String)],
    cols: String*): DataFrame = {

  val parser = new SqlParser()
  val selectedCols = cols.toList

  // Projects `df` to the quoted label (aliased to `aggCol`) followed by the requested columns.
  def withLabel(df: DataFrame, label: String): DataFrame = {
    val labelExpr = s"'$label' as $aggCol"
    df.selectExpr(labelExpr :: selectedCols: _*)
  }

  // Applies one aggregation to every requested column, keeping each column's own name as
  // the alias of its result. With no columns there is nothing to aggregate, so the
  // context's empty DataFrame is used instead.
  def aggregateAll(toExpr: String => String): DataFrame =
    if (cols.isEmpty) {
      sqlContext.emptyDataFrame
    } else {
      val aggregated = cols.map { name =>
        Column(parser.parseExpression(toExpr(name))).as(name)
      }
      agg(aggregated.head, aggregated.tail: _*)
    }

  // Zero-row starting point that already has the target column layout, so every
  // aggregation result can simply be unioned onto it.
  val seed = withLabel(this, "").limit(0)

  aggregations.foldLeft(seed) { case (soFar, (label, toExpr)) =>
    soFar.unionAll(withLabel(aggregateAll(toExpr), label))
  }
}
803+
/**
 * Computes summary statistics for the given columns of this [[DataFrame]]:
 * count, mean, stddev, min and max.
 *
 * The result has one row per statistic. Its first column, `summary`, holds the statistic
 * name; it is followed by one column per requested column, holding the value of that
 * statistic. When no column names are passed, the numeric columns of this [[DataFrame]]
 * are used.
 *
 * The standard deviation is evaluated as `sqrt(avg(x*x) - avg(x)*avg(x))`, which is the
 * population form.
 *
 * {{{
 *   df.describe("age", "height")
 *
 *   // summary age   height
 *   // count   10.0  10.0
 *   // mean    53.3  178.05
 *   // stddev  11.6  15.7
 *   // min     18.0  163.0
 *   // max     92.0  192.0
 * }}}
 *
 * @param cols names of the columns to describe; empty means every numeric column
 * @return a [[DataFrame]] with the `summary` column and one column per described column
 */
@scala.annotation.varargs
def describe(cols: String*): DataFrame = {
  // Builds the expression text for a single-argument function applied to a column.
  def call(fn: String): String => String = name => s"$fn($name)"

  val stddev: String => String =
    name => s"sqrt(avg($name*$name) - avg($name)*avg($name))"

  val statistics = List[(String, String => String)](
    ("count", call("count")),
    ("mean", call("avg")),
    ("stddev", stddev),
    ("min", call("min")),
    ("max", call("max")))

  val targetCols = if (cols.nonEmpty) cols else numericColumns.map(_.prettyString)

  multipleAggExpr("summary", statistics, targetCols: _*)
}
833+
754834 /**
755835 * Returns the first `n` rows.
756836 * @group action
0 commit comments