@@ -19,12 +19,12 @@ license: |
1919 limitations under the License.
2020---
2121
22- Spark SQL provides build-in Aggregate functions defined in the dataset API and SQL interface. Aggregate functions
22+ Spark SQL provides built-in aggregate functions defined in the dataset API and SQL interface. Aggregate functions
2323operate on a group of rows and return a single value.
2424
25- Spark SQL Aggregate functions are grouped as <code >agg_funcs</code > in spark SQL. Below is the list of functions.
25+ Spark SQL aggregate functions are grouped as <code >agg_funcs</code > in Spark SQL. Below is the list of functions.
2626
27- ** Note:** Every below function has another signature which takes String as a column name instead of Column.
27+ **Note:** All functions below have another signature which takes a String column name instead of a Column.
2828
2929* Table of contents
3030{: toc }
@@ -34,37 +34,37 @@ Spark SQL Aggregate functions are grouped as <code>agg_funcs</code> in spark SQL
3434 </thead >
3535 <tbody >
3636 <tr>
37- <td> <b>{any | some | bool_or}</b>(<i>e : Column</i>)</td>
37+ <td> <b>{any | some | bool_or}</b>(<i>c : Column</i>)</td>
3838 <td>Column name</td>
3939 <td>Returns true if at least one value is true</td>
4040 </tr>
4141 <tr>
42- <td> <b>approx_count_distinct</b>(<i>e : Column[, relativeSD: Double]]</i>)</td>
42+ <td> <b>approx_count_distinct</b>(<i>c : Column[, relativeSD: Double]</i>)</td>
4343 <td>Column name; relativeSD: the maximum estimation error allowed.</td>
4444 <td>Returns the estimated cardinality by HyperLogLog++</td>
4545 </tr>
4646 <tr>
47- <td> <b>{avg | mean}</b>(<i>e : Column</i>)</td>
47+ <td> <b>{avg | mean}</b>(<i>c : Column</i>)</td>
4848 <td>Column name</td>
4949 <td> Returns the average of values in the input column.</td>
5050 </tr>
5151 <tr>
52- <td> <b>{bool_and | every}</b>(<i>e : Column</i>)</td>
52+ <td> <b>{bool_and | every}</b>(<i>c : Column</i>)</td>
5353 <td>Column name</td>
5454 <td>Returns true if all values are true</td>
5555 </tr>
5656 <tr>
57- <td> <b>collect_list</b>(<i>e : Column</i>)</td>
57+ <td> <b>collect_list</b>(<i>c : Column</i>)</td>
5858 <td>Column name</td>
5959 <td>Collects and returns a list of non-unique elements. The function is non-deterministic because the order of collected results depends on the order of the rows which may be non-deterministic after a shuffle</td>
6060 </tr>
6161 <tr>
62- <td> <b>collect_set</b>(<i>e : Column</i>)</td>
62+ <td> <b>collect_set</b>(<i>c : Column</i>)</td>
6363 <td>Column name</td>
6464 <td>Collects and returns a set of unique elements. The function is non-deterministic because the order of collected results depends on the order of the rows which may be non-deterministic after a shuffle.</td>
6565 </tr>
6666 <tr>
67- <td> <b>corr</b>(<i>e1 : Column, e2 : Column</i>)</td>
67+ <td> <b>corr</b>(<i>c1 : Column, c2 : Column</i>)</td>
6868 <td>Column name</td>
6969 <td>Returns Pearson coefficient of correlation between a set of number pairs</td>
7070 </tr>
@@ -74,12 +74,12 @@ Spark SQL Aggregate functions are grouped as <code>agg_funcs</code> in spark SQL
7474 <td>Returns the total number of retrieved rows, including rows containing null</td>
7575 </tr>
7676 <tr>
77- <td> <b>count</b>(<i>e : Column[, e : Column]</i>)</td>
77+ <td> <b>count</b>(<i>c : Column[, c : Column]</i>)</td>
7878 <td>Column name</td>
7979 <td>Returns the number of rows for which the supplied column(s) are all not null</td>
8080 </tr>
8181 <tr>
82- <td> <b>count</b>(<b>DISTINCT</b> <i> e : Column[, e : Column</i>])</td>
82+ <td> <b>count</b>(<b>DISTINCT</b> <i>c : Column[, c : Column]</i>)</td>
8383 <td>Column name</td>
8484 <td>Returns the number of rows for which the supplied column(s) are unique and not null</td>
8585 </tr>
@@ -89,102 +89,102 @@ Spark SQL Aggregate functions are grouped as <code>agg_funcs</code> in spark SQL
8989 <td>Returns the number of rows for which the predicate evaluates to <code>TRUE</code></td>
9090 </tr>
9191 <tr>
92- <td> <b>count_min_sketch</b>(<i>e : Column, eps: double, confidence: double, seed integer</i>)</td>
92+ <td> <b>count_min_sketch</b>(<i>c : Column, eps: double, confidence: double, seed: integer</i>)</td>
9393 <td>Column name; eps is a value between 0.0 and 1.0; confidence is a value between 0.0 and 1.0; seed is a positive integer</td>
9494 <td>Returns a count-min sketch of a column with the given eps, confidence and seed. The result is an array of bytes, which can be deserialized to a <code>CountMinSketch</code> before usage. Count-min sketch is a probabilistic data structure used for cardinality estimation using sub-linear space.</td>
9595 </tr>
9696 <tr>
97- <td> <b>covar_pop</b>(<i>e1 : Column, e2 : Column</i>)</td>
97+ <td> <b>covar_pop</b>(<i>c1 : Column, c2 : Column</i>)</td>
9898 <td>Column name</td>
9999 <td>Returns the population covariance of a set of number pairs</td>
100100 </tr>
101101 <tr>
102- <td> <b>covar_samp</b>(<i>e1 : Column, e2 : Column</i>)</td>
102+ <td> <b>covar_samp</b>(<i>c1 : Column, c2 : Column</i>)</td>
103103 <td>Column name</td>
104104 <td>Returns the sample covariance of a set of number pairs</td>
105105 </tr>
106106 <tr>
107- <td> <b>{first | first_value}</b>(<i>e : Column[, isIgnoreNull]</i>)</td>
107+ <td> <b>{first | first_value}</b>(<i>c : Column[, isIgnoreNull]</i>)</td>
108108 <td>Column name[, True/False(default)]</td>
109109 <td>Returns the first value of column for a group of rows. If <code>isIgnoreNull</code> is true, returns only non-null values, default is false. This function is non-deterministic</td>
110110 </tr>
111111 <tr>
112- <td> <b>kurtosis</b>(<i>e : Column</i>)</td>
112+ <td> <b>kurtosis</b>(<i>c : Column</i>)</td>
113113 <td>Column name</td>
114114 <td>Returns the kurtosis value calculated from values of a group</td>
115115 </tr>
116116 <tr>
117- <td> <b>{last | last_value}</b>(<i>e : Column[, isIgnoreNull]</i>)</td>
117+ <td> <b>{last | last_value}</b>(<i>c : Column[, isIgnoreNull]</i>)</td>
118118 <td>Column name[, True/False(default)]</td>
119119 <td>Returns the last value of column for a group of rows. If <code>isIgnoreNull</code> is true, returns only non-null values, default is false. This function is non-deterministic</td>
120120 </tr>
121121 <tr>
122- <td> <b>max</b>(<i>e : Column</i>)</td>
122+ <td> <b>max</b>(<i>c : Column</i>)</td>
123123 <td>Column name</td>
124124 <td>Returns the maximum value of the column.</td>
125125 </tr>
126126 <tr>
127- <td> <b>max_by</b>(<i>e1 : Column, e2 : Column</i>)</td>
127+ <td> <b>max_by</b>(<i>c1 : Column, c2 : Column</i>)</td>
128128 <td>Column name</td>
129- <td>Returns the value of column e1 associated with the maximum value of column e2 .</td>
129+ <td>Returns the value of column c1 associated with the maximum value of column c2.</td>
130130 </tr>
131131 <tr>
132- <td> <b>min</b>(<i>e : Column</i>)</td>
132+ <td> <b>min</b>(<i>c : Column</i>)</td>
133133 <td>Column name</td>
134134 <td>Returns the minimum value of the column.</td>
135135 </tr>
136136 <tr>
137- <td> <b>min_by</b>(<i>e1 : Column, e2 : Column</i>)</td>
137+ <td> <b>min_by</b>(<i>c1 : Column, c2 : Column</i>)</td>
138138 <td>Column name</td>
139- <td>Returns the value of column e1 associated with the minimum value of column e2 .</td>
139+ <td>Returns the value of column c1 associated with the minimum value of column c2.</td>
140140 </tr>
141141 <tr>
142- <td> <b>percentile</b>(<i>e : Column, percentage [, frequency]</i>)</td>
142+ <td> <b>percentile</b>(<i>c : Column, percentage [, frequency]</i>)</td>
143143 <td>Column name; percentage is a number between 0 and 1; frequency is a positive integer</td>
144144 <td>Returns the exact percentile value of numeric column at the given percentage.</td>
145145 </tr>
146146 <tr>
147- <td> <b>percentile</b>(<i>e : Column, <b>array</b>(percentage1 [, percentage2]...) [, frequency]</i>)</td>
147+ <td> <b>percentile</b>(<i>c : Column, <b>array</b>(percentage1 [, percentage2]...) [, frequency]</i>)</td>
148148 <td>Column name; percentage array is an array of numbers between 0 and 1; frequency is a positive integer</td>
149149 <td>Returns the exact percentile value array of numeric column at the given percentage(s).</td>
150150 </tr>
151151 <tr>
152- <td> <b>{percentile_approx | percentile_approx}</b>(<i>e : Column, percentage [, frequency]</i>)</td>
152+ <td> <b>{percentile_approx | approx_percentile}</b>(<i>c : Column, percentage [, accuracy]</i>)</td>
153153 <td>Column name; percentage is a number between 0 and 1; accuracy is a positive numeric literal that controls approximation accuracy at the cost of memory</td>
154154 <td>Returns the approximate percentile value of numeric column at the given percentage.</td>
155155 </tr>
156156 <tr>
157- <td> <b>{percentile_approx | percentile_approx}</b>(<i>e : Column, <b>array</b>(percentage1 [, percentage2]...) [, frequency]</i>)</td>
157+ <td> <b>{percentile_approx | approx_percentile}</b>(<i>c : Column, <b>array</b>(percentage1 [, percentage2]...) [, accuracy]</i>)</td>
158158 <td>Column name; percentage array is an array of numbers between 0 and 1; accuracy is a positive numeric literal that controls approximation accuracy at the cost of memory</td>
159159 <td>Returns the approximate percentile value array of numeric column at the given percentage(s).</td>
160160 </tr>
161161 <tr>
162- <td> <b>skewness</b>(<i>e : Column</i>)</td>
162+ <td> <b>skewness</b>(<i>c : Column</i>)</td>
163163 <td>Column name</td>
164164 <td>Returns the skewness value calculated from values of a group</td>
165165 </tr>
166166 <tr>
167- <td> <b>{stddev_samp | stddev | std}</b>(<i>e : Column</i>)</td>
167+ <td> <b>{stddev_samp | stddev | std}</b>(<i>c : Column</i>)</td>
168168 <td>Column name</td>
169169 <td>Returns the sample standard deviation calculated from values of a group</td>
170170 </tr>
171171 <tr>
172- <td> <b>stddev_pop</b>(<i>e : Column</i>)</td>
172+ <td> <b>stddev_pop</b>(<i>c : Column</i>)</td>
173173 <td>Column name</td>
174174 <td>Returns the population standard deviation calculated from values of a group</td>
175175 </tr>
176176 <tr>
177- <td> <b>sum</b>(<i>e : Column</i>)</td>
177+ <td> <b>sum</b>(<i>c : Column</i>)</td>
178178 <td>Column name</td>
179179 <td>Returns the sum calculated from values of a group.</td>
180180 </tr>
181181 <tr>
182- <td> <b>{variance | var_samp}</b>(<i>e : Column</i>)</td>
182+ <td> <b>{variance | var_samp}</b>(<i>c : Column</i>)</td>
183183 <td>Column name</td>
184184 <td>Returns the sample variance calculated from values of a group</td>
185185 </tr>
186186 <tr>
187- <td> <b>var_pop</b>(<i>e : Column</i>)</td>
187+ <td> <b>var_pop</b>(<i>c : Column</i>)</td>
188188 <td>Column name</td>
189189 <td>Returns the population variance calculated from values of a group</td>
190190 </tr>
@@ -362,11 +362,11 @@ SELECT c1 FROM buildin_agg GROUP BY c1 HAVING COUNT_IF(c2 % 2 = 0);
362362
363363--COUNT_MIN_SKETCH
364364SELECT COUNT_MIN_SKETCH(c1, 0.9D, 0.2D, 3) FROM buildin_agg;
365- +------------------------------------------------------------------------------------------------------------------------------------------------------------- +
366- | count_min_sketch(c1, 0.9, 0.2, 3) |
367- +------------------------------------------------------------------------------------------------------------------------------------------------------------- +
368- | [ 00 00 00 01 00 00 00 00 00 00 00 07 00 00 00 01 00 00 00 03 00 00 00 00 5D 93 49 A6 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 06 ] |
369- +------------------------------------------------------------------------------------------------------------------------------------------------------------- +
365+ +----------------------------------------------------------+
366+ | count_min_sketch(c1, 0.9, 0.2, 3) |
367+ +----------------------------------------------------------+
368+ | [ 00 00 00 01 00 00 00 00 00 00 00 07 00 00 00 01 00 00... ] |
369+ +----------------------------------------------------------+
370370
371371--COVAR_POP
372372SELECT COVAR_POP(c1, c2) FROM buildin_agg;
0 commit comments