From 3382bf55695927a3e5cca9bcc9ee1c0de60c8333 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 15 Apr 2020 17:53:10 +0900 Subject: [PATCH 1/6] Fix --- docs/_data/menu-sql.yaml | 2 + docs/sql-ref-functions-builtin.md | 1960 +++++++++++++++++ docs/sql-ref-functions.md | 11 + .../expressions/ExpressionDescription.java | 14 +- .../catalyst/expressions/ExpressionInfo.java | 27 +- .../catalyst/analysis/FunctionRegistry.scala | 3 +- .../aggregate/ApproximatePercentile.scala | 1 + .../expressions/aggregate/Average.scala | 1 + .../aggregate/CentralMomentAgg.scala | 6 + .../catalyst/expressions/aggregate/Corr.scala | 1 + .../expressions/aggregate/Count.scala | 1 + .../expressions/aggregate/CountIf.scala | 1 + .../aggregate/CountMinSketchAgg.scala | 1 + .../expressions/aggregate/Covariance.scala | 2 + .../expressions/aggregate/First.scala | 1 + .../aggregate/HyperLogLogPlusPlus.scala | 1 + .../catalyst/expressions/aggregate/Last.scala | 1 + .../catalyst/expressions/aggregate/Max.scala | 1 + .../expressions/aggregate/MaxByAndMinBy.scala | 2 + .../catalyst/expressions/aggregate/Min.scala | 1 + .../expressions/aggregate/Percentile.scala | 1 + .../catalyst/expressions/aggregate/Sum.scala | 1 + .../aggregate/UnevaluableAggs.scala | 2 + .../aggregate/bitwiseAggregates.scala | 2 + .../expressions/aggregate/collect.scala | 2 + .../expressions/collectionOperations.scala | 59 +- .../expressions/complexTypeCreator.scala | 1 + .../expressions/datetimeExpressions.scala | 32 + .../expressions/jsonExpressions.scala | 11 +- .../sql/SparkSessionExtensionSuite.scala | 3 + .../scala/org/apache/spark/sql/UDFSuite.scala | 20 + sql/create-docs.sh | 3 + sql/gen-sql-builtin-functions-docs.py | 242 ++ 33 files changed, 2394 insertions(+), 23 deletions(-) create mode 100644 docs/sql-ref-functions-builtin.md create mode 100644 sql/gen-sql-builtin-functions-docs.py diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index badb98d2e593b..1120dd20e6f39 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -242,6 +242,8 @@ - text: Functions url: sql-ref-functions.html subitems: + - text: Built-in Functions + url: sql-ref-functions-builtin.html - text: Scalar UDFs (User-Defined Functions) url: sql-ref-functions-udf-scalar.html - text: UDAFs (User-Defined Aggregate Functions) diff --git a/docs/sql-ref-functions-builtin.md b/docs/sql-ref-functions-builtin.md new file mode 100644 index 0000000000000..99d9fe5cb9536 --- /dev/null +++ b/docs/sql-ref-functions-builtin.md @@ -0,0 +1,1960 @@ +--- +layout: global +title: Built-in Functions +displayTitle: Built-in Functions +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--- + + + +### Aggregate Functions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescription
any(expr)Returns true if at least one value of `expr` is true.
approx_count_distinct(expr[, relativeSD])Returns the estimated cardinality by HyperLogLog++. + `relativeSD` defines the maximum estimation error allowed.
approx_percentile(col, percentage [, accuracy])Returns the approximate percentile value of numeric
+ column `col` at the given percentage. The value of percentage must be between 0.0
+ and 1.0. The `accuracy` parameter (default: 10000) is a positive numeric literal which
+ controls approximation accuracy at the cost of memory. A higher value of `accuracy` yields
+ better accuracy; `1.0/accuracy` is the relative error of the approximation.
+ When `percentage` is an array, each value of the percentage array must be between 0.0 and 1.0.
+ In this case, returns the approximate percentile array of column `col` at the given
+ percentage array.
avg(expr)Returns the mean calculated from values of a group.
bit_or(expr)Returns the bitwise OR of all non-null input values, or null if none.
bit_xor(expr)Returns the bitwise XOR of all non-null input values, or null if none.
bool_and(expr)Returns true if all values of `expr` are true.
bool_or(expr)Returns true if at least one value of `expr` is true.
collect_list(expr)Collects and returns a list of non-unique elements.
collect_set(expr)Collects and returns a set of unique elements.
corr(expr1, expr2)Returns Pearson coefficient of correlation between a set of number pairs.
count(*)Returns the total number of retrieved rows, including rows containing null.
count(expr[, expr...])Returns the number of rows for which the supplied expression(s) are all non-null.
count(DISTINCT expr[, expr...])Returns the number of rows for which the supplied expression(s) are unique and non-null.
count_if(expr)Returns the number of `TRUE` values for the expression.
count_min_sketch(col, eps, confidence, seed)Returns a count-min sketch of a column with the given eps,
+ confidence and seed. The result is an array of bytes, which can be deserialized to a
+ `CountMinSketch` before usage. Count-min sketch is a probabilistic data structure used for
+ cardinality estimation using sub-linear space. A usage sketch appears just after this table.
covar_pop(expr1, expr2)Returns the population covariance of a set of number pairs.
covar_samp(expr1, expr2)Returns the sample covariance of a set of number pairs.
every(expr)Returns true if all values of `expr` are true.
first(expr[, isIgnoreNull])Returns the first value of `expr` for a group of rows. + If `isIgnoreNull` is true, returns only non-null values.
first_value(expr[, isIgnoreNull])Returns the first value of `expr` for a group of rows. + If `isIgnoreNull` is true, returns only non-null values.
kurtosis(expr)Returns the kurtosis value calculated from values of a group.
last(expr[, isIgnoreNull])Returns the last value of `expr` for a group of rows.
+ If `isIgnoreNull` is true, returns only non-null values.
last_value(expr[, isIgnoreNull])Returns the last value of `expr` for a group of rows.
+ If `isIgnoreNull` is true, returns only non-null values.
max(expr)Returns the maximum value of `expr`.
max_by(x, y)Returns the value of `x` associated with the maximum value of `y`.
mean(expr)Returns the mean calculated from values of a group.
min(expr)Returns the minimum value of `expr`.
min_by(x, y)Returns the value of `x` associated with the minimum value of `y`.
percentile(col, percentage [, frequency])Returns the exact percentile value of numeric column
+ `col` at the given percentage. The value of percentage must be between 0.0 and 1.0. The
+ value of `frequency` should be a positive integer.
percentile(col, array(percentage1 [, percentage2]...) [, frequency])Returns the exact
+ percentile value array of numeric column `col` at the given percentage(s). Each value
+ of the percentage array must be between 0.0 and 1.0. The value of `frequency` should be
+ a positive integer.
percentile_approx(col, percentage [, accuracy])Returns the approximate percentile value of numeric
+ column `col` at the given percentage. The value of percentage must be between 0.0
+ and 1.0. The `accuracy` parameter (default: 10000) is a positive numeric literal which
+ controls approximation accuracy at the cost of memory. A higher value of `accuracy` yields
+ better accuracy; `1.0/accuracy` is the relative error of the approximation.
+ When `percentage` is an array, each value of the percentage array must be between 0.0 and 1.0.
+ In this case, returns the approximate percentile array of column `col` at the given
+ percentage array.
skewness(expr)Returns the skewness value calculated from values of a group.
some(expr)Returns true if at least one value of `expr` is true.
std(expr)Returns the sample standard deviation calculated from values of a group.
stddev(expr)Returns the sample standard deviation calculated from values of a group.
stddev_pop(expr)Returns the population standard deviation calculated from values of a group.
stddev_samp(expr)Returns the sample standard deviation calculated from values of a group.
sum(expr)Returns the sum calculated from values of a group.
var_pop(expr)Returns the population variance calculated from values of a group.
var_samp(expr)Returns the sample variance calculated from values of a group.
variance(expr)Returns the sample variance calculated from values of a group.
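+
+`count_min_sketch` has no generated example in the section below because its result is a binary
+sketch. The following is an illustrative usage sketch rather than generated output: the parameter
+values (`eps` = 0.5d, `confidence` = 0.5d, `seed` = 1) are arbitrary, and the result is rendered
+with `hex` because the raw sketch bytes are not printable.
+
+{% highlight sql %}
+-- Illustrative sketch: the exact hex output depends on the input data and the seed.
+SELECT hex(count_min_sketch(col, 0.5d, 0.5d, 1)) FROM VALUES (1), (2), (1) AS tab(col);
+{% endhighlight %}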
+ +#### Examples + +{% highlight sql %} +-- any +SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col); + +--------+ + |any(col)| + +--------+ + | true| + +--------+ + +SELECT any(col) FROM VALUES (NULL), (true), (false) AS tab(col); + +--------+ + |any(col)| + +--------+ + | true| + +--------+ + +SELECT any(col) FROM VALUES (false), (false), (NULL) AS tab(col); + +--------+ + |any(col)| + +--------+ + | false| + +--------+ + +-- approx_count_distinct +SELECT approx_count_distinct(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1); + +---------------------------+ + |approx_count_distinct(col1)| + +---------------------------+ + | 3| + +---------------------------+ + +-- approx_percentile +SELECT approx_percentile(10.0, array(0.5, 0.4, 0.1), 100); + +--------------------------------------------------+ + |approx_percentile(10.0, array(0.5, 0.4, 0.1), 100)| + +--------------------------------------------------+ + | [10.0, 10.0, 10.0]| + +--------------------------------------------------+ + +SELECT approx_percentile(10.0, 0.5, 100); + +-------------------------------------------------+ + |approx_percentile(10.0, CAST(0.5 AS DOUBLE), 100)| + +-------------------------------------------------+ + | 10.0| + +-------------------------------------------------+ + +-- avg +SELECT avg(col) FROM VALUES (1), (2), (3) AS tab(col); + +--------+ + |avg(col)| + +--------+ + | 2.0| + +--------+ + +SELECT avg(col) FROM VALUES (1), (2), (NULL) AS tab(col); + +--------+ + |avg(col)| + +--------+ + | 1.5| + +--------+ + +-- bit_or +SELECT bit_or(col) FROM VALUES (3), (5) AS tab(col); + +-----------+ + |bit_or(col)| + +-----------+ + | 7| + +-----------+ + +-- bit_xor +SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col); + +------------+ + |bit_xor(col)| + +------------+ + | 6| + +------------+ + +-- bool_and +SELECT bool_and(col) FROM VALUES (true), (true), (true) AS tab(col); + +-------------+ + |bool_and(col)| + +-------------+ + | true| + +-------------+ + +SELECT bool_and(col) FROM VALUES (NULL), (true), (true) AS tab(col); + +-------------+ + |bool_and(col)| + +-------------+ + | true| + +-------------+ + +SELECT bool_and(col) FROM VALUES (true), (false), (true) AS tab(col); + +-------------+ + |bool_and(col)| + +-------------+ + | false| + +-------------+ + +-- bool_or +SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col); + +------------+ + |bool_or(col)| + +------------+ + | true| + +------------+ + +SELECT bool_or(col) FROM VALUES (NULL), (true), (false) AS tab(col); + +------------+ + |bool_or(col)| + +------------+ + | true| + +------------+ + +SELECT bool_or(col) FROM VALUES (false), (false), (NULL) AS tab(col); + +------------+ + |bool_or(col)| + +------------+ + | false| + +------------+ + +-- collect_list +SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col); + +-----------------+ + |collect_list(col)| + +-----------------+ + | [1, 2, 1]| + +-----------------+ + +-- collect_set +SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col); + +----------------+ + |collect_set(col)| + +----------------+ + | [1, 2]| + +----------------+ + +-- corr +SELECT corr(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2); + +--------------------------------------------+ + |corr(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| + +--------------------------------------------+ + | 0.8660254037844387| + +--------------------------------------------+ + +-- count +SELECT count(*) FROM VALUES (NULL), (5), (5), (20) AS tab(col); + +--------+ + |count(1)| + +--------+ + | 4| + 
+--------+ + +SELECT count(col) FROM VALUES (NULL), (5), (5), (20) AS tab(col); + +----------+ + |count(col)| + +----------+ + | 3| + +----------+ + +SELECT count(DISTINCT col) FROM VALUES (NULL), (5), (5), (10) AS tab(col); + +-------------------+ + |count(DISTINCT col)| + +-------------------+ + | 2| + +-------------------+ + +-- count_if +SELECT count_if(col % 2 = 0) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col); + +-------------------------+ + |count_if(((col % 2) = 0))| + +-------------------------+ + | 2| + +-------------------------+ + +SELECT count_if(col IS NULL) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col); + +-----------------------+ + |count_if((col IS NULL))| + +-----------------------+ + | 1| + +-----------------------+ + +-- count_min_sketch +-- covar_pop +SELECT covar_pop(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2); + +-------------------------------------------------+ + |covar_pop(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| + +-------------------------------------------------+ + | 0.6666666666666666| + +-------------------------------------------------+ + +-- covar_samp +SELECT covar_samp(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2); + +--------------------------------------------------+ + |covar_samp(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| + +--------------------------------------------------+ + | 1.0| + +--------------------------------------------------+ + +-- every +SELECT every(col) FROM VALUES (true), (true), (true) AS tab(col); + +----------+ + |every(col)| + +----------+ + | true| + +----------+ + +SELECT every(col) FROM VALUES (NULL), (true), (true) AS tab(col); + +----------+ + |every(col)| + +----------+ + | true| + +----------+ + +SELECT every(col) FROM VALUES (true), (false), (true) AS tab(col); + +----------+ + |every(col)| + +----------+ + | false| + +----------+ + +-- first +SELECT first(col) FROM VALUES (10), (5), (20) AS tab(col); + +-----------------+ + |first(col, false)| + +-----------------+ + | 10| + +-----------------+ + +SELECT first(col) FROM VALUES (NULL), (5), (20) AS tab(col); + +-----------------+ + |first(col, false)| + +-----------------+ + | null| + +-----------------+ + +SELECT first(col, true) FROM VALUES (NULL), (5), (20) AS tab(col); + +----------------+ + |first(col, true)| + +----------------+ + | 5| + +----------------+ + +-- first_value +SELECT first_value(col) FROM VALUES (10), (5), (20) AS tab(col); + +-----------------------+ + |first_value(col, false)| + +-----------------------+ + | 10| + +-----------------------+ + +SELECT first_value(col) FROM VALUES (NULL), (5), (20) AS tab(col); + +-----------------------+ + |first_value(col, false)| + +-----------------------+ + | null| + +-----------------------+ + +SELECT first_value(col, true) FROM VALUES (NULL), (5), (20) AS tab(col); + +----------------------+ + |first_value(col, true)| + +----------------------+ + | 5| + +----------------------+ + +-- kurtosis +SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); + +-----------------------------+ + |kurtosis(CAST(col AS DOUBLE))| + +-----------------------------+ + | -0.7014368047529618| + +-----------------------------+ + +SELECT kurtosis(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col); + +-----------------------------+ + |kurtosis(CAST(col AS DOUBLE))| + +-----------------------------+ + | 0.19432323191699075| + +-----------------------------+ + +-- last +SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col); + +----------------+ + |last(col, false)| + 
+----------------+ + | 20| + +----------------+ + +SELECT last(col) FROM VALUES (10), (5), (NULL) AS tab(col); + +----------------+ + |last(col, false)| + +----------------+ + | null| + +----------------+ + +SELECT last(col, true) FROM VALUES (10), (5), (NULL) AS tab(col); + +---------------+ + |last(col, true)| + +---------------+ + | 5| + +---------------+ + +-- last_value +SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col); + +----------------------+ + |last_value(col, false)| + +----------------------+ + | 20| + +----------------------+ + +SELECT last_value(col) FROM VALUES (10), (5), (NULL) AS tab(col); + +----------------------+ + |last_value(col, false)| + +----------------------+ + | null| + +----------------------+ + +SELECT last_value(col, true) FROM VALUES (10), (5), (NULL) AS tab(col); + +---------------------+ + |last_value(col, true)| + +---------------------+ + | 5| + +---------------------+ + +-- max +SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col); + +--------+ + |max(col)| + +--------+ + | 50| + +--------+ + +-- max_by +SELECT max_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y); + +-----------+ + |maxby(x, y)| + +-----------+ + | b| + +-----------+ + +-- mean +SELECT mean(col) FROM VALUES (1), (2), (3) AS tab(col); + +---------+ + |mean(col)| + +---------+ + | 2.0| + +---------+ + +SELECT mean(col) FROM VALUES (1), (2), (NULL) AS tab(col); + +---------+ + |mean(col)| + +---------+ + | 1.5| + +---------+ + +-- min +SELECT min(col) FROM VALUES (10), (-1), (20) AS tab(col); + +--------+ + |min(col)| + +--------+ + | -1| + +--------+ + +-- min_by +SELECT min_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y); + +-----------+ + |minby(x, y)| + +-----------+ + | a| + +-----------+ + +-- percentile +SELECT percentile(col, 0.3) FROM VALUES (0), (10) AS tab(col); + +---------------------------------------+ + |percentile(col, CAST(0.3 AS DOUBLE), 1)| + +---------------------------------------+ + | 3.0| + +---------------------------------------+ + +SELECT percentile(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col); + +-------------------------------------+ + |percentile(col, array(0.25, 0.75), 1)| + +-------------------------------------+ + | [2.5, 7.5]| + +-------------------------------------+ + +-- percentile_approx +SELECT percentile_approx(10.0, array(0.5, 0.4, 0.1), 100); + +--------------------------------------------------+ + |percentile_approx(10.0, array(0.5, 0.4, 0.1), 100)| + +--------------------------------------------------+ + | [10.0, 10.0, 10.0]| + +--------------------------------------------------+ + +SELECT percentile_approx(10.0, 0.5, 100); + +-------------------------------------------------+ + |percentile_approx(10.0, CAST(0.5 AS DOUBLE), 100)| + +-------------------------------------------------+ + | 10.0| + +-------------------------------------------------+ + +-- skewness +SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); + +-----------------------------+ + |skewness(CAST(col AS DOUBLE))| + +-----------------------------+ + | 1.1135657469022013| + +-----------------------------+ + +SELECT skewness(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col); + +-----------------------------+ + |skewness(CAST(col AS DOUBLE))| + +-----------------------------+ + | -1.1135657469022011| + +-----------------------------+ + +-- some +SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col); + +---------+ + |some(col)| + +---------+ + | true| + +---------+ + 
+SELECT some(col) FROM VALUES (NULL), (true), (false) AS tab(col); + +---------+ + |some(col)| + +---------+ + | true| + +---------+ + +SELECT some(col) FROM VALUES (false), (false), (NULL) AS tab(col); + +---------+ + |some(col)| + +---------+ + | false| + +---------+ + +-- std +SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col); + +------------------------+ + |std(CAST(col AS DOUBLE))| + +------------------------+ + | 1.0| + +------------------------+ + +-- stddev +SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col); + +---------------------------+ + |stddev(CAST(col AS DOUBLE))| + +---------------------------+ + | 1.0| + +---------------------------+ + +-- stddev_pop +SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col); + +-------------------------------+ + |stddev_pop(CAST(col AS DOUBLE))| + +-------------------------------+ + | 0.816496580927726| + +-------------------------------+ + +-- stddev_samp +SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col); + +--------------------------------+ + |stddev_samp(CAST(col AS DOUBLE))| + +--------------------------------+ + | 1.0| + +--------------------------------+ + +-- sum +SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col); + +--------+ + |sum(col)| + +--------+ + | 30| + +--------+ + +SELECT sum(col) FROM VALUES (NULL), (10), (15) AS tab(col); + +--------+ + |sum(col)| + +--------+ + | 25| + +--------+ + +SELECT sum(col) FROM VALUES (NULL), (NULL) AS tab(col); + +--------+ + |sum(col)| + +--------+ + | null| + +--------+ + +-- var_pop +SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col); + +----------------------------+ + |var_pop(CAST(col AS DOUBLE))| + +----------------------------+ + | 0.6666666666666666| + +----------------------------+ + +-- var_samp +SELECT var_samp(col) FROM VALUES (1), (2), (3) AS tab(col); + +-----------------------------+ + |var_samp(CAST(col AS DOUBLE))| + +-----------------------------+ + | 1.0| + +-----------------------------+ + +-- variance +SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col); + +-----------------------------+ + |variance(CAST(col AS DOUBLE))| + +-----------------------------+ + | 1.0| + +-----------------------------+ + +{% endhighlight %} + +### Array Functions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescription
array_contains(array, value)Returns true if the array contains the value.
array_distinct(array)Removes duplicate values from the array.
array_except(array1, array2)Returns an array of the elements in array1 but not in array2, + without duplicates.
array_intersect(array1, array2)Returns an array of the elements in the intersection of array1 and + array2, without duplicates.
array_join(array, delimiter[, nullReplacement])Concatenates the elements of the given array + using the delimiter and an optional string to replace nulls. If no value is set for + nullReplacement, any null value is filtered.
array_max(array)Returns the maximum value in the array. NULL elements are skipped.
array_min(array)Returns the minimum value in the array. NULL elements are skipped.
array_position(array, element)Returns the (1-based) index of the first occurrence of `element` in the array, as a long.
array_remove(array, element)Removes all elements equal to `element` from `array`.
array_repeat(element, count)Returns an array containing `element` repeated `count` times.
array_union(array1, array2)Returns an array of the elements in the union of array1 and array2, + without duplicates.
arrays_overlap(a1, a2)Returns true if `a1` contains at least one non-null element that is also present in `a2`. If the arrays have no common element, both are non-empty, and either of them contains a null element, null is returned; otherwise, false is returned.
arrays_zip(a1, a2, ...)Returns a merged array of structs in which the N-th struct contains all + N-th values of input arrays.
concat(col1, col2, ..., colN)Returns the concatenation of col1, col2, ..., colN.
flatten(arrayOfArrays)Transforms an array of arrays into a single array.
reverse(array)Returns a reversed string or an array with reverse order of elements.
sequence(start, stop, step)Generates an array of elements from start to stop (inclusive), + incrementing by step. The type of the returned elements is the same as the type of argument + expressions. + + Supported types are: byte, short, integer, long, date, timestamp. + + The start and stop expressions must resolve to the same type. + If start and stop expressions resolve to the 'date' or 'timestamp' type + then the step expression must resolve to the 'interval' type, otherwise to the same type + as the start and stop expressions.
shuffle(array)Returns a random permutation of the given array.
slice(x, start, length)Subsets array `x` starting from index `start` (array indices start at 1; a negative `start` counts from the end) with the specified `length`.
sort_array(array[, ascendingOrder])Sorts the input array in ascending or descending order + according to the natural ordering of the array elements. Null elements will be placed + at the beginning of the returned array in ascending order or at the end of the returned + array in descending order.
+ +#### Examples + +{% highlight sql %} +-- array_contains +SELECT array_contains(array(1, 2, 3), 2); + +---------------------------------+ + |array_contains(array(1, 2, 3), 2)| + +---------------------------------+ + | true| + +---------------------------------+ + +-- array_distinct +SELECT array_distinct(array(1, 2, 3, null, 3)); + +----------------------------------------------------+ + |array_distinct(array(1, 2, 3, CAST(NULL AS INT), 3))| + +----------------------------------------------------+ + | [1, 2, 3,]| + +----------------------------------------------------+ + +-- array_except +SELECT array_except(array(1, 2, 3), array(1, 3, 5)); + +--------------------------------------------+ + |array_except(array(1, 2, 3), array(1, 3, 5))| + +--------------------------------------------+ + | [2]| + +--------------------------------------------+ + +-- array_intersect +SELECT array_intersect(array(1, 2, 3), array(1, 3, 5)); + +-----------------------------------------------+ + |array_intersect(array(1, 2, 3), array(1, 3, 5))| + +-----------------------------------------------+ + | [1, 3]| + +-----------------------------------------------+ + +-- array_join +SELECT array_join(array('hello', 'world'), ' '); + +----------------------------------+ + |array_join(array(hello, world), )| + +----------------------------------+ + | hello world| + +----------------------------------+ + +SELECT array_join(array('hello', null ,'world'), ' '); + +--------------------------------------------------------+ + |array_join(array(hello, CAST(NULL AS STRING), world), )| + +--------------------------------------------------------+ + | hello world| + +--------------------------------------------------------+ + +SELECT array_join(array('hello', null ,'world'), ' ', ','); + +-----------------------------------------------------------+ + |array_join(array(hello, CAST(NULL AS STRING), world), , ,)| + +-----------------------------------------------------------+ + | hello , world| + +-----------------------------------------------------------+ + +-- array_max +SELECT array_max(array(1, 20, null, 3)); + +---------------------------------------------+ + |array_max(array(1, 20, CAST(NULL AS INT), 3))| + +---------------------------------------------+ + | 20| + +---------------------------------------------+ + +-- array_min +SELECT array_min(array(1, 20, null, 3)); + +---------------------------------------------+ + |array_min(array(1, 20, CAST(NULL AS INT), 3))| + +---------------------------------------------+ + | 1| + +---------------------------------------------+ + +-- array_position +SELECT array_position(array(3, 2, 1), 1); + +---------------------------------+ + |array_position(array(3, 2, 1), 1)| + +---------------------------------+ + | 3| + +---------------------------------+ + +-- array_remove +SELECT array_remove(array(1, 2, 3, null, 3), 3); + +-----------------------------------------------------+ + |array_remove(array(1, 2, 3, CAST(NULL AS INT), 3), 3)| + +-----------------------------------------------------+ + | [1, 2,]| + +-----------------------------------------------------+ + +-- array_repeat +SELECT array_repeat('123', 2); + +--------------------+ + |array_repeat(123, 2)| + +--------------------+ + | [123, 123]| + +--------------------+ + +-- array_union +SELECT array_union(array(1, 2, 3), array(1, 3, 5)); + +-------------------------------------------+ + |array_union(array(1, 2, 3), array(1, 3, 5))| + +-------------------------------------------+ + | [1, 2, 3, 5]| + 
+-------------------------------------------+ + +-- arrays_overlap +SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5)); + +----------------------------------------------+ + |arrays_overlap(array(1, 2, 3), array(3, 4, 5))| + +----------------------------------------------+ + | true| + +----------------------------------------------+ + +-- arrays_zip +SELECT arrays_zip(array(1, 2, 3), array(2, 3, 4)); + +------------------------------------------+ + |arrays_zip(array(1, 2, 3), array(2, 3, 4))| + +------------------------------------------+ + | [[1, 2], [2, 3], ...| + +------------------------------------------+ + +SELECT arrays_zip(array(1, 2), array(2, 3), array(3, 4)); + +-------------------------------------------------+ + |arrays_zip(array(1, 2), array(2, 3), array(3, 4))| + +-------------------------------------------------+ + | [[1, 2, 3], [2, 3...| + +-------------------------------------------------+ + +-- concat +SELECT concat('Spark', 'SQL'); + +------------------+ + |concat(Spark, SQL)| + +------------------+ + | SparkSQL| + +------------------+ + +SELECT concat(array(1, 2, 3), array(4, 5), array(6)); + +---------------------------------------------+ + |concat(array(1, 2, 3), array(4, 5), array(6))| + +---------------------------------------------+ + | [1, 2, 3, 4, 5, 6]| + +---------------------------------------------+ + +-- flatten +SELECT flatten(array(array(1, 2), array(3, 4))); + +----------------------------------------+ + |flatten(array(array(1, 2), array(3, 4)))| + +----------------------------------------+ + | [1, 2, 3, 4]| + +----------------------------------------+ + +-- reverse +SELECT reverse('Spark SQL'); + +------------------+ + |reverse(Spark SQL)| + +------------------+ + | LQS krapS| + +------------------+ + +SELECT reverse(array(2, 1, 4, 3)); + +--------------------------+ + |reverse(array(2, 1, 4, 3))| + +--------------------------+ + | [3, 4, 1, 2]| + +--------------------------+ + +-- sequence +SELECT sequence(1, 5); + +---------------+ + | sequence(1, 5)| + +---------------+ + |[1, 2, 3, 4, 5]| + +---------------+ + +SELECT sequence(5, 1); + +---------------+ + | sequence(5, 1)| + +---------------+ + |[5, 4, 3, 2, 1]| + +---------------+ + +SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month); + +---------------------------------------------------------------------------+ + |sequence(to_date('2018-01-01'), to_date('2018-03-01'), INTERVAL '1 months')| + +---------------------------------------------------------------------------+ + | [2018-01-01, 2018...| + +---------------------------------------------------------------------------+ + +-- shuffle +SELECT shuffle(array(1, 20, 3, 5)); + +---------------------------+ + |shuffle(array(1, 20, 3, 5))| + +---------------------------+ + | [1, 3, 20, 5]| + +---------------------------+ + +SELECT shuffle(array(1, 20, null, 3)); + +-------------------------------------------+ + |shuffle(array(1, 20, CAST(NULL AS INT), 3))| + +-------------------------------------------+ + | [3,, 20, 1]| + +-------------------------------------------+ + +-- slice +SELECT slice(array(1, 2, 3, 4), 2, 2); + +------------------------------+ + |slice(array(1, 2, 3, 4), 2, 2)| + +------------------------------+ + | [2, 3]| + +------------------------------+ + +SELECT slice(array(1, 2, 3, 4), -2, 2); + +-------------------------------+ + |slice(array(1, 2, 3, 4), -2, 2)| + +-------------------------------+ + | [3, 4]| + +-------------------------------+ + +-- sort_array +SELECT sort_array(array('b', 'd', null, 
'c', 'a'), true); + +---------------------------------------------------------+ + |sort_array(array(b, d, CAST(NULL AS STRING), c, a), true)| + +---------------------------------------------------------+ + | [, a, b, c, d]| + +---------------------------------------------------------+ + +{% endhighlight %} + +### Date and Timestamp Functions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescription
add_months(start_date, num_months)Returns the date that is `num_months` after `start_date`.
current_date()Returns the current date at the start of query evaluation.
current_timestamp()Returns the current timestamp at the start of query evaluation.
date_add(start_date, num_days)Returns the date that is `num_days` after `start_date`.
date_format(timestamp, fmt)Converts `timestamp` to a string value in the format specified by the date format `fmt`.
date_part(field, source)Extracts a part of the date/timestamp or interval source.
date_sub(start_date, num_days)Returns the date that is `num_days` before `start_date`.
date_trunc(fmt, ts)Returns timestamp `ts` truncated to the unit specified by the format model `fmt`. + `fmt` should be one of ["MILLENNIUM", "CENTURY", "DECADE", "YEAR", "YYYY", "YY", + "QUARTER", "MON", "MONTH", "MM", "WEEK", "DAY", "DD", + "HOUR", "MINUTE", "SECOND", "MILLISECOND", "MICROSECOND"]
datediff(endDate, startDate)Returns the number of days from `startDate` to `endDate`.
dayofweek(date)Returns the day of the week for date/timestamp (1 = Sunday, 2 = Monday, ..., 7 = Saturday).
dayofyear(date)Returns the day of year of the date/timestamp.
from_unixtime(unix_time, format)Returns `unix_time` in the specified `format`.
from_utc_timestamp(timestamp, timezone)Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 03:40:00.0'.
hour(timestamp)Returns the hour component of the string/timestamp.
last_day(date)Returns the last day of the month which the date belongs to.
make_date(year, month, day)Creates a date from year, month and day fields.
make_timestamp(year, month, day, hour, min, sec[, timezone])Creates a timestamp from year, month, day, hour, min, sec and timezone fields.
minute(timestamp)Returns the minute component of the string/timestamp.
month(date)Returns the month component of the date/timestamp.
months_between(timestamp1, timestamp2[, roundOff])If `timestamp1` is later than `timestamp2`, then the result + is positive. If `timestamp1` and `timestamp2` are on the same day of month, or both + are the last day of month, time of day will be ignored. Otherwise, the difference is + calculated based on 31 days per month, and rounded to 8 digits unless roundOff=false.
next_day(start_date, day_of_week)Returns the first date which is later than `start_date` and named as indicated.
now()Returns the current timestamp at the start of query evaluation.
quarter(date)Returns the quarter of the year for date, in the range 1 to 4.
second(timestamp)Returns the second component of the string/timestamp.
to_date(date_str[, fmt])Parses the `date_str` expression with the `fmt` expression to
+ a date. Returns null on invalid input. By default, it follows casting rules to a date if
+ `fmt` is omitted.
to_timestamp(timestamp_str[, fmt])Parses the `timestamp_str` expression with the `fmt` expression
+ to a timestamp. Returns null on invalid input. By default, it follows casting rules to
+ a timestamp if `fmt` is omitted.
to_unix_timestamp(timeExp[, format])Returns the UNIX timestamp of the given time.
to_utc_timestamp(timestamp, timezone)Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield '2017-07-14 01:40:00.0'.
trunc(date, fmt)Returns `date` with the time portion of the day truncated to the unit specified by the format model `fmt`. + `fmt` should be one of ["week", "mon", "month", "mm", "quarter", "year", "yyyy", "yy", "decade", "century", "millennium"]
unix_timestamp([timeExp[, format]])Returns the UNIX timestamp of current or specified time.
weekday(date)Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
weekofyear(date)Returns the week of the year of the given date. A week is considered to start on a Monday and week 1 is the first week with >3 days.
year(date)Returns the year component of the date/timestamp.
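+
+`current_date`, `current_timestamp`, and `now` carry no generated output in the examples below
+because their results depend on when the query is evaluated. A minimal sketch (the values returned
+vary with the clock and the session time zone):
+
+{% highlight sql %}
+-- Non-deterministic: results depend on evaluation time and the session time zone.
+SELECT current_date(), current_timestamp();
+SELECT now();
+{% endhighlight %}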
+ +#### Examples + +{% highlight sql %} +-- add_months +SELECT add_months('2016-08-31', 1); + +---------------------------------------+ + |add_months(CAST(2016-08-31 AS DATE), 1)| + +---------------------------------------+ + | 2016-09-30| + +---------------------------------------+ + +-- current_date +-- current_timestamp +-- date_add +SELECT date_add('2016-07-30', 1); + +-------------------------------------+ + |date_add(CAST(2016-07-30 AS DATE), 1)| + +-------------------------------------+ + | 2016-07-31| + +-------------------------------------+ + +-- date_format +SELECT date_format('2016-04-08', 'y'); + +---------------------------------------------+ + |date_format(CAST(2016-04-08 AS TIMESTAMP), y)| + +---------------------------------------------+ + | 2016| + +---------------------------------------------+ + +-- date_part +SELECT date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456'); + +---------------------------------------------------------+ + |date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456')| + +---------------------------------------------------------+ + | 2019| + +---------------------------------------------------------+ + +SELECT date_part('week', timestamp'2019-08-12 01:00:00.123456'); + +---------------------------------------------------------+ + |date_part('week', TIMESTAMP '2019-08-12 01:00:00.123456')| + +---------------------------------------------------------+ + | 33| + +---------------------------------------------------------+ + +SELECT date_part('doy', DATE'2019-08-12'); + +-----------------------------------+ + |date_part('doy', DATE '2019-08-12')| + +-----------------------------------+ + | 224| + +-----------------------------------+ + +SELECT date_part('SECONDS', timestamp'2019-10-01 00:00:01.000001'); + +------------------------------------------------------------+ + |date_part('SECONDS', TIMESTAMP '2019-10-01 00:00:01.000001')| + +------------------------------------------------------------+ + | 1.000001| + +------------------------------------------------------------+ + +SELECT date_part('days', interval 1 year 10 months 5 days); + +------------------------------------------------------+ + |date_part('days', INTERVAL '1 years 10 months 5 days')| + +------------------------------------------------------+ + | 5| + +------------------------------------------------------+ + +SELECT date_part('seconds', interval 5 hours 30 seconds 1 milliseconds 1 microseconds); + +----------------------------------------------------------+ + |date_part('seconds', INTERVAL '5 hours 30.001001 seconds')| + +----------------------------------------------------------+ + | 30.001001| + +----------------------------------------------------------+ + +-- date_sub +SELECT date_sub('2016-07-30', 1); + +-------------------------------------+ + |date_sub(CAST(2016-07-30 AS DATE), 1)| + +-------------------------------------+ + | 2016-07-29| + +-------------------------------------+ + +-- date_trunc +SELECT date_trunc('YEAR', '2015-03-05T09:32:05.359'); + +------------------------------------------------------------+ + |date_trunc(YEAR, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| + +------------------------------------------------------------+ + | 2015-01-01 00:00:00| + +------------------------------------------------------------+ + +SELECT date_trunc('MM', '2015-03-05T09:32:05.359'); + +----------------------------------------------------------+ + |date_trunc(MM, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| + +----------------------------------------------------------+ + | 
2015-03-01 00:00:00| + +----------------------------------------------------------+ + +SELECT date_trunc('DD', '2015-03-05T09:32:05.359'); + +----------------------------------------------------------+ + |date_trunc(DD, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| + +----------------------------------------------------------+ + | 2015-03-05 00:00:00| + +----------------------------------------------------------+ + +SELECT date_trunc('HOUR', '2015-03-05T09:32:05.359'); + +------------------------------------------------------------+ + |date_trunc(HOUR, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| + +------------------------------------------------------------+ + | 2015-03-05 09:00:00| + +------------------------------------------------------------+ + +SELECT date_trunc('MILLISECOND', '2015-03-05T09:32:05.123456'); + +----------------------------------------------------------------------+ + |date_trunc(MILLISECOND, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))| + +----------------------------------------------------------------------+ + | 2015-03-05 09:32:...| + +----------------------------------------------------------------------+ + +SELECT date_trunc('DECADE', '2015-03-05T09:32:05.123456'); + +-----------------------------------------------------------------+ + |date_trunc(DECADE, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))| + +-----------------------------------------------------------------+ + | 2010-01-01 00:00:00| + +-----------------------------------------------------------------+ + +SELECT date_trunc('CENTURY', '2015-03-05T09:32:05.123456'); + +------------------------------------------------------------------+ + |date_trunc(CENTURY, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))| + +------------------------------------------------------------------+ + | 2001-01-01 00:00:00| + +------------------------------------------------------------------+ + +-- datediff +SELECT datediff('2009-07-31', '2009-07-30'); + +------------------------------------------------------------+ + |datediff(CAST(2009-07-31 AS DATE), CAST(2009-07-30 AS DATE))| + +------------------------------------------------------------+ + | 1| + +------------------------------------------------------------+ + +SELECT datediff('2009-07-30', '2009-07-31'); + +------------------------------------------------------------+ + |datediff(CAST(2009-07-30 AS DATE), CAST(2009-07-31 AS DATE))| + +------------------------------------------------------------+ + | -1| + +------------------------------------------------------------+ + +-- dayofweek +SELECT dayofweek('2009-07-30'); + +-----------------------------------+ + |dayofweek(CAST(2009-07-30 AS DATE))| + +-----------------------------------+ + | 5| + +-----------------------------------+ + +-- dayofyear +SELECT dayofyear('2016-04-09'); + +-----------------------------------+ + |dayofyear(CAST(2016-04-09 AS DATE))| + +-----------------------------------+ + | 100| + +-----------------------------------+ + +-- from_unixtime +SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss'); + +-----------------------------------------------------+ + |from_unixtime(CAST(0 AS BIGINT), yyyy-MM-dd HH:mm:ss)| + +-----------------------------------------------------+ + | 1970-01-01 09:00:00| + +-----------------------------------------------------+ + +-- from_utc_timestamp +SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul'); + +-------------------------------------------------------------+ + |from_utc_timestamp(CAST(2016-08-31 AS TIMESTAMP), Asia/Seoul)| + 
+-------------------------------------------------------------+ + | 2016-08-31 09:00:00| + +-------------------------------------------------------------+ + +-- hour +SELECT hour('2009-07-30 12:58:59'); + +--------------------------------------------+ + |hour(CAST(2009-07-30 12:58:59 AS TIMESTAMP))| + +--------------------------------------------+ + | 12| + +--------------------------------------------+ + +-- last_day +SELECT last_day('2009-01-12'); + +----------------------------------+ + |last_day(CAST(2009-01-12 AS DATE))| + +----------------------------------+ + | 2009-01-31| + +----------------------------------+ + +-- make_date +SELECT make_date(2013, 7, 15); + +----------------------+ + |make_date(2013, 7, 15)| + +----------------------+ + | 2013-07-15| + +----------------------+ + +SELECT make_date(2019, 13, 1); + +----------------------+ + |make_date(2019, 13, 1)| + +----------------------+ + | null| + +----------------------+ + +SELECT make_date(2019, 7, NULL); + +-------------------------------------+ + |make_date(2019, 7, CAST(NULL AS INT))| + +-------------------------------------+ + | null| + +-------------------------------------+ + +SELECT make_date(2019, 2, 30); + +----------------------+ + |make_date(2019, 2, 30)| + +----------------------+ + | null| + +----------------------+ + +-- make_timestamp +SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887); + +-----------------------------------------------------------------+ + |make_timestamp(2014, 12, 28, 6, 30, CAST(45.887 AS DECIMAL(8,6)))| + +-----------------------------------------------------------------+ + | 2014-12-28 06:30:...| + +-----------------------------------------------------------------+ + +SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET'); + +----------------------------------------------------------------------+ + |make_timestamp(2014, 12, 28, 6, 30, CAST(45.887 AS DECIMAL(8,6)), CET)| + +----------------------------------------------------------------------+ + | 2014-12-28 14:30:...| + +----------------------------------------------------------------------+ + +SELECT make_timestamp(2019, 6, 30, 23, 59, 60); + +-------------------------------------------------------------+ + |make_timestamp(2019, 6, 30, 23, 59, CAST(60 AS DECIMAL(8,6)))| + +-------------------------------------------------------------+ + | 2019-07-01 00:00:00| + +-------------------------------------------------------------+ + +SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 'PST'); + +------------------------------------------------------------------+ + |make_timestamp(2019, 13, 1, 10, 11, CAST(12 AS DECIMAL(8,6)), PST)| + +------------------------------------------------------------------+ + | null| + +------------------------------------------------------------------+ + +SELECT make_timestamp(null, 7, 22, 15, 30, 0); + +-------------------------------------------------------------------------+ + |make_timestamp(CAST(NULL AS INT), 7, 22, 15, 30, CAST(0 AS DECIMAL(8,6)))| + +-------------------------------------------------------------------------+ + | null| + +-------------------------------------------------------------------------+ + +-- minute +SELECT minute('2009-07-30 12:58:59'); + +----------------------------------------------+ + |minute(CAST(2009-07-30 12:58:59 AS TIMESTAMP))| + +----------------------------------------------+ + | 58| + +----------------------------------------------+ + +-- month +SELECT month('2016-07-30'); + +-------------------------------+ + |month(CAST(2016-07-30 AS DATE))| + 
+-------------------------------+ + | 7| + +-------------------------------+ + +-- months_between +SELECT months_between('1997-02-28 10:30:00', '1996-10-30'); + +-------------------------------------------------------------------------------------------+ + |months_between(CAST(1997-02-28 10:30:00 AS TIMESTAMP), CAST(1996-10-30 AS TIMESTAMP), true)| + +-------------------------------------------------------------------------------------------+ + | 3.94959677| + +-------------------------------------------------------------------------------------------+ + +SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false); + +--------------------------------------------------------------------------------------------+ + |months_between(CAST(1997-02-28 10:30:00 AS TIMESTAMP), CAST(1996-10-30 AS TIMESTAMP), false)| + +--------------------------------------------------------------------------------------------+ + | 3.9495967741935485| + +--------------------------------------------------------------------------------------------+ + +-- next_day +SELECT next_day('2015-01-14', 'TU'); + +--------------------------------------+ + |next_day(CAST(2015-01-14 AS DATE), TU)| + +--------------------------------------+ + | 2015-01-20| + +--------------------------------------+ + +-- now +-- quarter +SELECT quarter('2016-08-31'); + +---------------------------------+ + |quarter(CAST(2016-08-31 AS DATE))| + +---------------------------------+ + | 3| + +---------------------------------+ + +-- second +SELECT second('2009-07-30 12:58:59'); + +----------------------------------------------+ + |second(CAST(2009-07-30 12:58:59 AS TIMESTAMP))| + +----------------------------------------------+ + | 59| + +----------------------------------------------+ + +-- to_date +SELECT to_date('2009-07-30 04:17:52'); + +------------------------------+ + |to_date('2009-07-30 04:17:52')| + +------------------------------+ + | 2009-07-30| + +------------------------------+ + +SELECT to_date('2016-12-31', 'yyyy-MM-dd'); + +-----------------------------------+ + |to_date('2016-12-31', 'yyyy-MM-dd')| + +-----------------------------------+ + | 2016-12-31| + +-----------------------------------+ + +-- to_timestamp +SELECT to_timestamp('2016-12-31 00:12:00'); + +-----------------------------------+ + |to_timestamp('2016-12-31 00:12:00')| + +-----------------------------------+ + | 2016-12-31 00:12:00| + +-----------------------------------+ + +SELECT to_timestamp('2016-12-31', 'yyyy-MM-dd'); + +----------------------------------------+ + |to_timestamp('2016-12-31', 'yyyy-MM-dd')| + +----------------------------------------+ + | 2016-12-31 00:00:00| + +----------------------------------------+ + +-- to_unix_timestamp +SELECT to_unix_timestamp('2016-04-08', 'yyyy-MM-dd'); + +-----------------------------------------+ + |to_unix_timestamp(2016-04-08, yyyy-MM-dd)| + +-----------------------------------------+ + | 1460041200| + +-----------------------------------------+ + +-- to_utc_timestamp +SELECT to_utc_timestamp('2016-08-31', 'Asia/Seoul'); + +-----------------------------------------------------------+ + |to_utc_timestamp(CAST(2016-08-31 AS TIMESTAMP), Asia/Seoul)| + +-----------------------------------------------------------+ + | 2016-08-30 15:00:00| + +-----------------------------------------------------------+ + +-- trunc +SELECT trunc('2019-08-04', 'week'); + +-------------------------------------+ + |trunc(CAST(2019-08-04 AS DATE), week)| + +-------------------------------------+ + | 2019-07-29| + 
+-------------------------------------+ + +SELECT trunc('2019-08-04', 'quarter'); + +----------------------------------------+ + |trunc(CAST(2019-08-04 AS DATE), quarter)| + +----------------------------------------+ + | 2019-07-01| + +----------------------------------------+ + +SELECT trunc('2009-02-12', 'MM'); + +-----------------------------------+ + |trunc(CAST(2009-02-12 AS DATE), MM)| + +-----------------------------------+ + | 2009-02-01| + +-----------------------------------+ + +SELECT trunc('2015-10-27', 'YEAR'); + +-------------------------------------+ + |trunc(CAST(2015-10-27 AS DATE), YEAR)| + +-------------------------------------+ + | 2015-01-01| + +-------------------------------------+ + +SELECT trunc('2015-10-27', 'DECADE'); + +---------------------------------------+ + |trunc(CAST(2015-10-27 AS DATE), DECADE)| + +---------------------------------------+ + | 2010-01-01| + +---------------------------------------+ + +SELECT trunc('1981-01-19', 'century'); + +----------------------------------------+ + |trunc(CAST(1981-01-19 AS DATE), century)| + +----------------------------------------+ + | 1901-01-01| + +----------------------------------------+ + +SELECT trunc('1981-01-19', 'millennium'); + +-------------------------------------------+ + |trunc(CAST(1981-01-19 AS DATE), millennium)| + +-------------------------------------------+ + | 1001-01-01| + +-------------------------------------------+ + +-- unix_timestamp +SELECT unix_timestamp(); + +--------------------------------------------------------+ + |unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)| + +--------------------------------------------------------+ + | 1586941183| + +--------------------------------------------------------+ + +SELECT unix_timestamp('2016-04-08', 'yyyy-MM-dd'); + +--------------------------------------+ + |unix_timestamp(2016-04-08, yyyy-MM-dd)| + +--------------------------------------+ + | 1460041200| + +--------------------------------------+ + +-- weekday +SELECT weekday('2009-07-30'); + +---------------------------------+ + |weekday(CAST(2009-07-30 AS DATE))| + +---------------------------------+ + | 3| + +---------------------------------+ + +-- weekofyear +SELECT weekofyear('2008-02-20'); + +------------------------------------+ + |weekofyear(CAST(2008-02-20 AS DATE))| + +------------------------------------+ + | 8| + +------------------------------------+ + +-- year +SELECT year('2016-07-30'); + +------------------------------+ + |year(CAST(2016-07-30 AS DATE))| + +------------------------------+ + | 2016| + +------------------------------+ + +{% endhighlight %} + +### JSON Functions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionDescription
from_json(jsonStr, schema[, options])Returns a struct value with the given `jsonStr` and `schema`.
get_json_object(json_txt, path)Extracts a JSON object from `path`.
json_array_length(jsonArray)Returns the number of elements in the outermost JSON array.
json_object_keys(json_object)Returns all the keys of the outermost JSON object as an array.
json_tuple(jsonStr, p1, p2, ..., pn)Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.
schema_of_json(json[, options])Returns the schema of a JSON string in DDL format.
to_json(expr[, options])Returns a JSON string with the given struct value.
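+
+The example block below has no entry for `to_json`; a minimal illustrative example (this result is
+what Spark returns for the given input):
+
+{% highlight sql %}
+-- Renders a struct value as a JSON string.
+SELECT to_json(named_struct('a', 1, 'b', 2));
+-- returns {"a":1,"b":2}
+{% endhighlight %}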
+
+#### Examples
+
+{% highlight sql %}
+-- from_json
+SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE');
+  +---------------------------+
+  |from_json({"a":1, "b":0.8})|
+  +---------------------------+
+  |                   [1, 0.8]|
+  +---------------------------+
+
+SELECT from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'));
+  +--------------------------------+
+  |from_json({"time":"26/08/2015"})|
+  +--------------------------------+
+  |            [2015-08-26 00:00...|
+  +--------------------------------+
+
+-- get_json_object
+SELECT get_json_object('{"a":"b"}', '$.a');
+  +-------------------------------+
+  |get_json_object({"a":"b"}, $.a)|
+  +-------------------------------+
+  |                              b|
+  +-------------------------------+
+
+-- json_array_length
+SELECT json_array_length('[1,2,3,4]');
+  +----------------------------+
+  |json_array_length([1,2,3,4])|
+  +----------------------------+
+  |                           4|
+  +----------------------------+
+
+SELECT json_array_length('[1,2,3,{"f1":1,"f2":[5,6]},4]');
+  +------------------------------------------------+
+  |json_array_length([1,2,3,{"f1":1,"f2":[5,6]},4])|
+  +------------------------------------------------+
+  |                                               5|
+  +------------------------------------------------+
+
+SELECT json_array_length('[1,2');
+  +-----------------------+
+  |json_array_length([1,2)|
+  +-----------------------+
+  |                   null|
+  +-----------------------+
+
+-- json_object_keys
+SELECT json_object_keys('{}');
+  +--------------------+
+  |json_object_keys({})|
+  +--------------------+
+  |                  []|
+  +--------------------+
+
+SELECT json_object_keys('{"key": "value"}');
+  +----------------------------------+
+  |json_object_keys({"key": "value"})|
+  +----------------------------------+
+  |                             [key]|
+  +----------------------------------+
+
+SELECT json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
+  +--------------------------------------------------------+
+  |json_object_keys({"f1":"abc","f2":{"f3":"a", "f4":"b"}})|
+  +--------------------------------------------------------+
+  |                                                [f1, f2]|
+  +--------------------------------------------------------+
+
+-- json_tuple
+SELECT json_tuple('{"a":1, "b":2}', 'a', 'b');
+  +---+---+
+  | c0| c1|
+  +---+---+
+  |  1|  2|
+  +---+---+
+
+-- schema_of_json
+SELECT schema_of_json('[{"col":0}]');
+  +---------------------------+
+  |schema_of_json([{"col":0}])|
+  +---------------------------+
+  |  array<struct<col:bigint>>|
+  +---------------------------+
+
+{% endhighlight %}
+
+### Map Functions
+
+    Function
+    Description
+
+    map_concat(map, ...)
+    Returns the union of all the given maps.
+
+    map_entries(map)
+    Returns an unordered array of all entries in the given map.
+
+    map_from_entries(arrayOfEntries)
+    Returns a map created from the given array of entries.
+
+    map_keys(map)
+    Returns an unordered array containing the keys of the map.
+
+    map_values(map)
+    Returns an unordered array containing the values of the map.
+
+#### Examples
+
+{% highlight sql %}
+-- map_concat
+SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c'));
+  +--------------------------------------+
+  |map_concat(map(1, a, 2, b), map(3, c))|
+  +--------------------------------------+
+  |                  [1 -> a, 2 -> b, ...|
+  +--------------------------------------+
+
+-- map_entries
+SELECT map_entries(map(1, 'a', 2, 'b'));
+  +----------------------------+
+  |map_entries(map(1, a, 2, b))|
+  +----------------------------+
+  |            [[1, a], [2, b]]|
+  +----------------------------+
+
+-- map_from_entries
+SELECT map_from_entries(array(struct(1, 'a'), struct(2, 'b')));
+  +---------------------------------------------------------------------------------------+
+  |map_from_entries(array(named_struct(col1, 1, col2, a), named_struct(col1, 2, col2, b)))|
+  +---------------------------------------------------------------------------------------+
+  |                                                                       [1 -> a, 2 -> b]|
+  +---------------------------------------------------------------------------------------+
+
+-- map_keys
+SELECT map_keys(map(1, 'a', 2, 'b'));
+  +-------------------------+
+  |map_keys(map(1, a, 2, b))|
+  +-------------------------+
+  |                   [1, 2]|
+  +-------------------------+
+
+-- map_values
+SELECT map_values(map(1, 'a', 2, 'b'));
+  +---------------------------+
+  |map_values(map(1, a, 2, b))|
+  +---------------------------+
+  |                     [a, b]|
+  +---------------------------+
+
+{% endhighlight %}
diff --git a/docs/sql-ref-functions.md b/docs/sql-ref-functions.md
index e8a0353579301..a51e04515f894 100644
--- a/docs/sql-ref-functions.md
+++ b/docs/sql-ref-functions.md
@@ -22,6 +22,17 @@ license: |
 
 Spark SQL provides two function features to meet a wide range of user needs: built-in functions and user-defined functions (UDFs). Built-in functions are commonly used routines that Spark SQL predefines and a complete list of the functions can be found in the [Built-in Functions](api/sql/) API document. UDFs allow users to define their own functions when the system’s built-in functions are not enough to perform the desired task.
 
+### Built-in Functions
+
+Spark SQL provides several categories of frequently used built-in functions for aggregation, arrays/maps, date/timestamp, and JSON data.
+This subsection presents the usage and descriptions of these functions.
+
+ * [Aggregate Functions](sql-ref-functions-builtin.html#aggregate-functions)
+ * [Array Functions](sql-ref-functions-builtin.html#array-functions)
+ * [Map Functions](sql-ref-functions-builtin.html#map-functions)
+ * [Date and Timestamp Functions](sql-ref-functions-builtin.html#date-and-timestamp-functions)
+ * [JSON Functions](sql-ref-functions-builtin.html#json-functions)
+
 ### UDFs (User-Defined Functions)
 
 User-Defined Functions (UDFs) are a feature of Spark SQL that allows users to define their own functions when the system's built-in functions are not enough to perform the desired task. To use UDFs in Spark SQL, users must first define the function, then register the function with Spark, and finally call the registered function. The User-Defined Functions can act on a single row or act on multiple rows at once. Spark SQL also supports integration of existing Hive implementations of UDFs, UDAFs and UDTFs.
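As a quick cross-check of the documentation added above, here is a small, self-contained Scala sketch (not part of this patch) that runs two of the documented functions; it assumes a Spark build with this patch applied, since `json_object_keys` is new in 3.1.0:

    import org.apache.spark.sql.SparkSession

    object BuiltinFunctionsDemo {
      def main(args: Array[String]): Unit = {
        // Assumed local session; any SparkSession works.
        val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
        // One function each from the Map and JSON categories documented above.
        spark.sql("SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c'))").show(truncate = false)
        spark.sql("SELECT json_object_keys('{\"key\": \"value\"}')").show(truncate = false)
        spark.stop()
      }
    }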
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
index acdf6afe10ce5..c48a38a3654e6 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
@@ -31,7 +31,7 @@
  * `usage()` will be used for the function usage in brief way.
  *
  * These below are concatenated and used for the function usage in verbose way, suppose arguments,
- * examples, note, since and deprecated will be provided.
+ * examples, note, group, since and deprecated will be provided.
  *
  * `arguments()` describes arguments for the expression.
  *
@@ -39,13 +39,16 @@
  *
  * `note()` contains some notes for the expression optionally.
  *
+ * `group()` describes the category that the expression belongs to. The valid values are
+ * `agg_funcs`, `array_funcs`, `datetime_funcs`, `json_funcs`, and `map_funcs`.
+ *
  * `since()` contains version information for the expression. Version is specified by,
  * for example, "2.2.0".
  *
  * `deprecated()` contains deprecation information for the expression optionally, for example,
  * "Deprecated since 2.2.0. Use something else instead".
  *
- * The format, in particular for `arguments()`, `examples()`,`note()`, `since()` and
+ * The format, in particular for `arguments()`, `examples()`, `note()`, `group()`, `since()` and
  * `deprecated()`, should strictly be as follows.
  *
  * <pre>
@@ -68,6 +71,7 @@
  *   note = """
  *     ...
  *   """,
+ *   group = "agg_funcs",
  *   since = "3.0.0",
  *   deprecated = """
  *     ...
@@ -78,8 +82,9 @@
  *  We can refer the function name by `_FUNC_`, in `usage()`, `arguments()` and `examples()` as
  *  it is registered in `FunctionRegistry`.
  *
- *  Note that, if `extended()` is defined, `arguments()`, `examples()`, `note()`, `since()` and
- *  `deprecated()` should be not defined together. `extended()` exists for backward compatibility.
+ *  Note that, if `extended()` is defined, `arguments()`, `examples()`, `note()`, `group()`,
+ *  `since()` and `deprecated()` should not be defined together. `extended()` exists
+ *  for backward compatibility.
  *
  *  Note this contents are used in the SparkSQL documentation for built-in functions. The contents
  *  here are considered as a Markdown text and then rendered.
@@ -98,6 +103,7 @@
     String arguments() default "";
     String examples() default "";
     String note() default "";
+    String group() default "";
     String since() default "";
     String deprecated() default "";
 }
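For illustration only, a hypothetical expression written against the layout documented above; the class and its semantics are invented here, but the field order matches the annotation and `group` uses one of the values accepted by ExpressionInfo:

    import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription, UnaryExpression}
    import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
    import org.apache.spark.sql.types.DataType

    // Hypothetical pass-through expression used only to show the annotation shape.
    @ExpressionDescription(
      usage = "_FUNC_(expr) - Returns `expr` unchanged.",
      examples = """
        Examples:
          > SELECT _FUNC_(map(1, 'a'));
           {1:"a"}
      """,
      group = "map_funcs",
      since = "3.1.0")
    case class PassThrough(child: Expression) extends UnaryExpression with CodegenFallback {
      override def dataType: DataType = child.dataType
      override protected def nullSafeEval(input: Any): Any = input
    }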
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
index 8ee90ed6f4c3b..c31f95a5c2feb 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
@@ -19,6 +19,10 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
 /**
  * Expression information, will be used to describe a expression.
  */
@@ -31,9 +35,14 @@ public class ExpressionInfo {
     private String arguments;
     private String examples;
     private String note;
+    private String group;
     private String since;
     private String deprecated;
 
+    private static final Set<String> validGroups =
+        new HashSet<>(Arrays.asList("agg_funcs", "array_funcs", "datetime_funcs",
+            "json_funcs", "map_funcs"));
+
     public String getClassName() {
         return className;
     }
@@ -75,6 +84,10 @@ public String getDeprecated() {
         return deprecated;
     }
 
+    public String getGroup() {
+        return group;
+    }
+
     public String getDb() {
         return db;
     }
@@ -87,6 +100,7 @@ public ExpressionInfo(
             String arguments,
             String examples,
             String note,
+            String group,
             String since,
             String deprecated) {
         assert name != null;
@@ -94,6 +108,7 @@ public ExpressionInfo(
         assert examples != null;
         assert examples.isEmpty() || examples.contains("    Examples:");
         assert note != null;
+        assert group != null;
         assert since != null;
         assert deprecated != null;
 
@@ -104,6 +119,7 @@ public ExpressionInfo(
         this.arguments = arguments;
         this.examples = examples;
         this.note = note;
+        this.group = group;
         this.since = since;
         this.deprecated = deprecated;
 
@@ -120,6 +136,11 @@ public ExpressionInfo(
             }
             this.extended += "\n    Note:\n      " + note.trim() + "\n";
         }
+        if (!group.isEmpty() && !validGroups.contains(group)) {
+            throw new IllegalArgumentException("'group' is malformed in the expression [" +
+                this.name + "]. It should be a value in " + validGroups + "; however, " +
+                "got [" + group + "].");
+        }
         if (!since.isEmpty()) {
             if (Integer.parseInt(since.split("\\.")[0]) < 0) {
                 throw new IllegalArgumentException("'since' is malformed in the expression [" +
@@ -140,11 +161,11 @@ public ExpressionInfo(
     }
 
     public ExpressionInfo(String className, String name) {
-        this(className, null, name, null, "", "", "", "", "");
+        this(className, null, name, null, "", "", "", "", "", "");
     }
 
     public ExpressionInfo(String className, String db, String name) {
-        this(className, db, name, null, "", "", "", "", "");
+        this(className, db, name, null, "", "", "", "", "", "");
     }
 
     /**
@@ -155,7 +176,7 @@ public ExpressionInfo(String className, String db, String name) {
     public ExpressionInfo(String className, String db, String name, String usage, String extended) {
         // `arguments` and `examples` are concatenated for the extended description. So, here
         // simply pass the `extended` as `arguments` and an empty string for `examples`.
-        this(className, db, name, usage, extended, "", "", "", "");
+        this(className, db, name, usage, extended, "", "", "", "", "");
     }
 
     private String replaceFunctionName(String usage) {
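A short sketch of the widened constructor and the new validation, using the same argument order as above (className, db, name, usage, arguments, examples, note, group, since, deprecated); the class and function names are placeholders:

    import org.apache.spark.sql.catalyst.expressions.ExpressionInfo

    object GroupValidationSketch extends App {
      // A valid group: the instance is constructed without error.
      val ok = new ExpressionInfo(
        "testClass", null, "testName", null, "", "", "", "map_funcs", "", "")
      println(ok.getGroup) // map_funcs

      // An unknown group makes the constructor throw IllegalArgumentException.
      try {
        new ExpressionInfo("testClass", null, "testName", null, "", "", "", "bogus", "", "")
      } catch {
        case e: IllegalArgumentException => println(e.getMessage)
      }
    }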
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index aba755cec8990..bedc73484ee58 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -654,7 +654,7 @@ object FunctionRegistry {
     val clazz = scala.reflect.classTag[Cast].runtimeClass
     val usage = "_FUNC_(expr) - Casts the value `expr` to the target data type `_FUNC_`."
     val expressionInfo =
-      new ExpressionInfo(clazz.getCanonicalName, null, name, usage, "", "", "", "", "")
+      new ExpressionInfo(clazz.getCanonicalName, null, name, usage, "", "", "", "", "", "")
     (name, (expressionInfo, builder))
   }
 
@@ -674,6 +674,7 @@ object FunctionRegistry {
           df.arguments(),
           df.examples(),
           df.note(),
+          df.group(),
           df.since(),
           df.deprecated())
       } else {
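With `group()` now threaded through the registry's `expressionInfo`, the value is observable from a session catalog lookup; a minimal sketch assuming a running SparkSession (this mirrors the UDFSuite test added later in this patch):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.catalyst.FunctionIdentifier

    object GroupLookupSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[1]").getOrCreate()
        val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum"))
        // With this patch applied, `sum` is tagged as an aggregate function.
        println(info.getGroup) // agg_funcs
        spark.stop()
      }
    }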
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
index f8060956df875..d06eeeef23936 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
@@ -65,6 +65,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(10.0, 0.5, 100);
        10.0
   """,
+  group = "agg_funcs",
   since = "2.1.0")
 case class ApproximatePercentile(
     child: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
index 17f906c698de2..d3ce1f8d331ab 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(col) FROM VALUES (1), (2), (NULL) AS tab(col);
        1.5
   """,
+  group = "agg_funcs",
   since = "1.0.0")
 case class Average(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
index bf402807d62d3..53759ca3d9165 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
@@ -142,6 +142,7 @@ abstract class CentralMomentAgg(child: Expression)
       > SELECT _FUNC_(col) FROM VALUES (1), (2), (3) AS tab(col);
        0.816496580927726
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 // scalastyle:on line.size.limit
 case class StddevPop(child: Expression) extends CentralMomentAgg(child) {
@@ -164,6 +165,7 @@ case class StddevPop(child: Expression) extends CentralMomentAgg(child) {
       > SELECT _FUNC_(col) FROM VALUES (1), (2), (3) AS tab(col);
        1.0
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 // scalastyle:on line.size.limit
 case class StddevSamp(child: Expression) extends CentralMomentAgg(child) {
@@ -187,6 +189,7 @@ case class StddevSamp(child: Expression) extends CentralMomentAgg(child) {
       > SELECT _FUNC_(col) FROM VALUES (1), (2), (3) AS tab(col);
        0.6666666666666666
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 case class VariancePop(child: Expression) extends CentralMomentAgg(child) {
 
@@ -207,6 +210,7 @@ case class VariancePop(child: Expression) extends CentralMomentAgg(child) {
       > SELECT _FUNC_(col) FROM VALUES (1), (2), (3) AS tab(col);
        1.0
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) {
 
@@ -229,6 +233,7 @@ case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) {
       > SELECT _FUNC_(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col);
        -1.1135657469022011
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 case class Skewness(child: Expression) extends CentralMomentAgg(child) {
 
@@ -251,6 +256,7 @@ case class Skewness(child: Expression) extends CentralMomentAgg(child) {
       > SELECT _FUNC_(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col);
        0.19432323191699075
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 case class Kurtosis(child: Expression) extends CentralMomentAgg(child) {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala
index 91446e05d853d..9ef05bb5d4fec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala
@@ -99,6 +99,7 @@ abstract class PearsonCorrelation(x: Expression, y: Expression)
       > SELECT _FUNC_(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2);
        0.8660254037844387
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 // scalastyle:on line.size.limit
 case class Corr(x: Expression, y: Expression)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala
index 2a8edac502c0f..e043c81975066 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala
@@ -39,6 +39,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(DISTINCT col) FROM VALUES (NULL), (5), (5), (10) AS tab(col);
        2
   """,
+  group = "agg_funcs",
   since = "1.0.0")
 // scalastyle:on line.size.limit
 case class Count(children: Seq[Expression]) extends DeclarativeAggregate {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountIf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountIf.scala
index d31355cd022fa..5bb95ead3f715 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountIf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountIf.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.types.{AbstractDataType, BooleanType, DataType, Long
       > SELECT _FUNC_(col IS NULL) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col);
        1
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class CountIf(predicate: Expression) extends UnevaluableAggregate with ImplicitCastInputTypes {
   override def prettyName: String = "count_if"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAgg.scala
index 4bd13cf284935..787b21859c6da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAgg.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAgg.scala
@@ -44,6 +44,7 @@ import org.apache.spark.util.sketch.CountMinSketch
       `CountMinSketch` before usage. Count-min sketch is a probabilistic data structure used for
       cardinality estimation using sub-linear space.
   """,
+  group = "agg_funcs",
   since = "2.2.0")
 case class CountMinSketchAgg(
     child: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala
index ac99fa8049f93..f03c2f2710a04 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala
@@ -86,6 +86,7 @@ abstract class Covariance(x: Expression, y: Expression)
       > SELECT _FUNC_(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);
        0.6666666666666666
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class CovPopulation(left: Expression, right: Expression) extends Covariance(left, right) {
   override val evaluateExpression: Expression = {
@@ -102,6 +103,7 @@ case class CovPopulation(left: Expression, right: Expression) extends Covariance
       > SELECT _FUNC_(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);
        1.0
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class CovSample(left: Expression, right: Expression) extends Covariance(left, right) {
   override val evaluateExpression: Expression = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala
index 2c0060c22a865..df806edbfda05 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala
@@ -47,6 +47,7 @@ import org.apache.spark.sql.types._
     The function is non-deterministic because its results depends on the order of the rows
     which may be non-deterministic after a shuffle.
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class First(child: Expression, ignoreNullsExpr: Expression)
   extends DeclarativeAggregate with ExpectsInputTypes {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlus.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlus.scala
index e3c628e70d11b..aed36902b1567 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlus.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlus.scala
@@ -53,6 +53,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1);
        3
   """,
+  group = "agg_funcs",
   since = "1.6.0")
 case class HyperLogLogPlusPlus(
     child: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala
index 6793ac7632ffd..e55bced192f34 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala
@@ -47,6 +47,7 @@ import org.apache.spark.sql.types._
     The function is non-deterministic because its results depends on the order of the rows
     which may be non-deterministic after a shuffle.
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class Last(child: Expression, ignoreNullsExpr: Expression)
   extends DeclarativeAggregate with ExpectsInputTypes {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala
index 7520db146ba6a..9bba6604c84ac 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(col) FROM VALUES (10), (50), (20) AS tab(col);
        50
   """,
+  group = "agg_funcs",
   since = "1.0.0")
 case class Max(child: Expression) extends DeclarativeAggregate {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
index b69b341b0ee3e..2e202240923c3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
@@ -98,6 +98,7 @@ abstract class MaxMinBy extends DeclarativeAggregate {
       > SELECT _FUNC_(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y);
        b
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
   override protected def funcName: String = "max_by"
@@ -116,6 +117,7 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin
       > SELECT _FUNC_(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y);
        a
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
   override protected def funcName: String = "min_by"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala
index 106eb968e3917..1d861aa0dd8cf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(col) FROM VALUES (10), (-1), (20) AS tab(col);
        -1
   """,
+  group = "agg_funcs",
   since = "1.0.0")
 case class Min(child: Expression) extends DeclarativeAggregate {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala
index 0f1c0fb5fcb69..0eba61c741133 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala
@@ -62,6 +62,7 @@ import org.apache.spark.util.collection.OpenHashMap
       > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col);
        [2.5,7.5]
   """,
+  group = "agg_funcs",
   since = "2.1.0")
 case class Percentile(
     child: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala
index 8bfd889ea0563..d2daaac72fc85 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala
@@ -35,6 +35,7 @@ import org.apache.spark.sql.types._
       > SELECT _FUNC_(col) FROM VALUES (NULL), (NULL) AS tab(col);
        NULL
   """,
+  group = "agg_funcs",
   since = "1.0.0")
 case class Sum(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/UnevaluableAggs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/UnevaluableAggs.scala
index a1cd4a77d0445..cb77ded3372a2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/UnevaluableAggs.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/UnevaluableAggs.scala
@@ -51,6 +51,7 @@ abstract class UnevaluableBooleanAggBase(arg: Expression)
       > SELECT _FUNC_(col) FROM VALUES (true), (false), (true) AS tab(col);
        false
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class BoolAnd(arg: Expression) extends UnevaluableBooleanAggBase(arg) {
   override def nodeName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("bool_and")
@@ -67,6 +68,7 @@ case class BoolAnd(arg: Expression) extends UnevaluableBooleanAggBase(arg) {
       > SELECT _FUNC_(col) FROM VALUES (false), (false), (NULL) AS tab(col);
        false
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class BoolOr(arg: Expression) extends UnevaluableBooleanAggBase(arg) {
   override def nodeName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("bool_or")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/bitwiseAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/bitwiseAggregates.scala
index b77c3bd9cbde4..b4c1b2c708fb2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/bitwiseAggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/bitwiseAggregates.scala
@@ -77,6 +77,7 @@ case class BitAndAgg(child: Expression) extends BitAggregate {
       > SELECT _FUNC_(col) FROM VALUES (3), (5) AS tab(col);
        7
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class BitOrAgg(child: Expression) extends BitAggregate {
 
@@ -94,6 +95,7 @@ case class BitOrAgg(child: Expression) extends BitAggregate {
       > SELECT _FUNC_(col) FROM VALUES (3), (5) AS tab(col);
        6
   """,
+  group = "agg_funcs",
   since = "3.0.0")
 case class BitXorAgg(child: Expression) extends BitAggregate {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index 29f89989b4961..5848aa3f840c6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -96,6 +96,7 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper
     The function is non-deterministic because the order of collected results depends
     on the order of the rows which may be non-deterministic after a shuffle.
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class CollectList(
     child: Expression,
@@ -129,6 +130,7 @@ case class CollectList(
     The function is non-deterministic because the order of collected results depends
     on the order of the rows which may be non-deterministic after a shuffle.
   """,
+  group = "agg_funcs",
   since = "2.0.0")
 case class CollectSet(
     child: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 8b61bc4f22b94..4fd68dcfe5156 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -138,7 +138,8 @@ object Size {
     Examples:
       > SELECT _FUNC_(map(1, 'a', 2, 'b'));
        [1,2]
-  """)
+  """,
+  group = "map_funcs")
 case class MapKeys(child: Expression)
   extends UnaryExpression with ExpectsInputTypes {
 
@@ -169,6 +170,7 @@ case class MapKeys(child: Expression)
       > SELECT _FUNC_(array(1, 2), array(2, 3), array(3, 4));
        [{"0":1,"1":2,"2":3},{"0":2,"1":3,"2":4}]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsInputTypes {
 
@@ -327,7 +329,8 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI
     Examples:
       > SELECT _FUNC_(map(1, 'a', 2, 'b'));
        ["a","b"]
-  """)
+  """,
+  group = "map_funcs")
 case class MapValues(child: Expression)
   extends UnaryExpression with ExpectsInputTypes {
 
@@ -356,6 +359,7 @@ case class MapValues(child: Expression)
       > SELECT _FUNC_(map(1, 'a', 2, 'b'));
        [{"key":1,"value":"a"},{"key":2,"value":"b"}]
   """,
+  group = "map_funcs",
   since = "3.0.0")
 case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInputTypes {
 
@@ -523,7 +527,9 @@ case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInp
     Examples:
       > SELECT _FUNC_(map(1, 'a', 2, 'b'), map(3, 'c'));
        {1:"a",2:"b",3:"c"}
-  """, since = "2.4.0")
+  """,
+  group = "map_funcs",
+  since = "2.4.0")
 case class MapConcat(children: Seq[Expression]) extends ComplexTypeMergingExpression {
 
   override def checkInputDataTypes(): TypeCheckResult = {
@@ -641,6 +647,7 @@ case class MapConcat(children: Seq[Expression]) extends ComplexTypeMergingExpres
       > SELECT _FUNC_(array(struct(1, 'a'), struct(2, 'b')));
        {1:"a",2:"b"}
   """,
+  group = "map_funcs",
   since = "2.4.0")
 case class MapFromEntries(child: Expression) extends UnaryExpression {
 
@@ -862,7 +869,8 @@ object ArraySortLike {
     Examples:
       > SELECT _FUNC_(array('b', 'd', null, 'c', 'a'), true);
        [null,"a","b","c","d"]
-  """)
+  """,
+  group = "array_funcs")
 // scalastyle:on line.size.limit
 case class SortArray(base: Expression, ascendingOrder: Expression)
   extends BinaryExpression with ArraySortLike {
@@ -920,6 +928,7 @@ case class SortArray(base: Expression, ascendingOrder: Expression)
   note = """
     The function is non-deterministic.
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class Shuffle(child: Expression, randomSeed: Option[Long] = None)
   extends UnaryExpression with ExpectsInputTypes with Stateful with ExpressionWithRandomSeed {
@@ -1002,6 +1011,7 @@ case class Shuffle(child: Expression, randomSeed: Option[Long] = None)
       > SELECT _FUNC_(array(2, 1, 4, 3));
        [3,4,1,2]
   """,
+  group = "array_funcs",
   since = "1.5.0",
   note = """
     Reverse logic for arrays is available since 2.4.0.
@@ -1073,7 +1083,8 @@ case class Reverse(child: Expression) extends UnaryExpression with ImplicitCastI
     Examples:
       > SELECT _FUNC_(array(1, 2, 3), 2);
        true
-  """)
+  """,
+  group = "array_funcs")
 case class ArrayContains(left: Expression, right: Expression)
   extends BinaryExpression with ImplicitCastInputTypes {
 
@@ -1169,7 +1180,9 @@ case class ArrayContains(left: Expression, right: Expression)
     Examples:
       > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5));
        true
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 // scalastyle:off line.size.limit
 case class ArraysOverlap(left: Expression, right: Expression)
   extends BinaryArrayExpressionWithImplicitCast {
@@ -1392,7 +1405,9 @@ case class ArraysOverlap(left: Expression, right: Expression)
        [2,3]
       > SELECT _FUNC_(array(1, 2, 3, 4), -2, 2);
        [3,4]
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 // scalastyle:on line.size.limit
 case class Slice(x: Expression, start: Expression, length: Expression)
   extends TernaryExpression with ImplicitCastInputTypes {
@@ -1505,7 +1520,9 @@ case class Slice(x: Expression, start: Expression, length: Expression)
        hello world
       > SELECT _FUNC_(array('hello', null ,'world'), ' ', ',');
        hello , world
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 case class ArrayJoin(
     array: Expression,
     delimiter: Expression,
@@ -1668,7 +1685,9 @@ case class ArrayJoin(
     Examples:
       > SELECT _FUNC_(array(1, 20, null, 3));
        1
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 case class ArrayMin(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def nullable: Boolean = true
@@ -1733,7 +1752,9 @@ case class ArrayMin(child: Expression) extends UnaryExpression with ImplicitCast
     Examples:
       > SELECT _FUNC_(array(1, 20, null, 3));
        20
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 case class ArrayMax(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def nullable: Boolean = true
@@ -1807,6 +1828,7 @@ case class ArrayMax(child: Expression) extends UnaryExpression with ImplicitCast
       > SELECT _FUNC_(array(3, 2, 1), 1);
        3
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArrayPosition(left: Expression, right: Expression)
   extends BinaryExpression with ImplicitCastInputTypes {
@@ -2021,7 +2043,8 @@ case class ElementAt(left: Expression, right: Expression)
   """,
   note = """
     Concat logic for arrays is available since 2.4.0.
-  """)
+  """,
+  group = "array_funcs")
 case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpression {
 
   private def allowedTypes: Seq[AbstractDataType] = Seq(StringType, BinaryType, ArrayType)
@@ -2220,6 +2243,7 @@ case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpressio
       > SELECT _FUNC_(array(array(1, 2), array(3, 4)));
        [1,2,3,4]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class Flatten(child: Expression) extends UnaryExpression {
 
@@ -2352,6 +2376,7 @@ case class Flatten(child: Expression) extends UnaryExpression {
       > SELECT _FUNC_(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month);
        [2018-01-01,2018-02-01,2018-03-01]
   """,
+  group = "array_funcs",
   since = "2.4.0"
 )
 case class Sequence(
@@ -2734,6 +2759,7 @@ object Sequence {
       > SELECT _FUNC_('123', 2);
        ["123","123"]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArrayRepeat(left: Expression, right: Expression)
   extends BinaryExpression with ExpectsInputTypes {
@@ -2854,7 +2880,9 @@ case class ArrayRepeat(left: Expression, right: Expression)
     Examples:
       > SELECT _FUNC_(array(1, 2, 3, null, 3), 3);
        [1,2,null]
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 case class ArrayRemove(left: Expression, right: Expression)
   extends BinaryExpression with ImplicitCastInputTypes {
 
@@ -3049,7 +3077,9 @@ trait ArraySetLike {
     Examples:
       > SELECT _FUNC_(array(1, 2, 3, null, 3));
        [1,2,3,null]
-  """, since = "2.4.0")
+  """,
+  group = "array_funcs",
+  since = "2.4.0")
 case class ArrayDistinct(child: Expression)
   extends UnaryExpression with ArraySetLike with ExpectsInputTypes {
 
@@ -3226,6 +3256,7 @@ object ArrayBinaryLike {
       > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5));
        [1,2,3,5]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLike
   with ComplexTypeMergingExpression {
@@ -3437,6 +3468,7 @@ object ArrayUnion {
       > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5));
        [1,3]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArrayIntersect(left: Expression, right: Expression) extends ArrayBinaryLike
   with ComplexTypeMergingExpression {
@@ -3678,6 +3710,7 @@ case class ArrayIntersect(left: Expression, right: Expression) extends ArrayBina
       > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5));
        [2]
   """,
+  group = "array_funcs",
   since = "2.4.0")
 case class ArrayExcept(left: Expression, right: Expression) extends ArrayBinaryLike
   with ComplexTypeMergingExpression {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 6c31511571387..858c91a4d8e86 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -333,6 +333,7 @@ object CreateStruct extends FunctionBuilder {
       "",
       "",
       "",
+      "",
       "")
     ("struct", (info, this))
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 44601b4b8db91..d38165b936ecc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -64,6 +64,7 @@ trait TimeZoneAwareExpression extends Expression {
  */
 @ExpressionDescription(
   usage = "_FUNC_() - Returns the current date at the start of query evaluation.",
+  group = "datetime_funcs",
   since = "1.5.0")
 case class CurrentDate(timeZoneId: Option[String] = None)
   extends LeafExpression with TimeZoneAwareExpression with CodegenFallback {
@@ -91,6 +92,7 @@ case class CurrentDate(timeZoneId: Option[String] = None)
  */
 @ExpressionDescription(
   usage = "_FUNC_() - Returns the current timestamp at the start of query evaluation.",
+  group = "datetime_funcs",
   since = "1.5.0")
 case class CurrentTimestamp() extends LeafExpression with CodegenFallback {
   override def foldable: Boolean = true
@@ -153,6 +155,7 @@ case class CurrentBatchTimestamp(
       > SELECT _FUNC_('2016-07-30', 1);
        2016-07-31
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class DateAdd(startDate: Expression, days: Expression)
   extends BinaryExpression with ExpectsInputTypes {
@@ -188,6 +191,7 @@ case class DateAdd(startDate: Expression, days: Expression)
       > SELECT _FUNC_('2016-07-30', 1);
        2016-07-29
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class DateSub(startDate: Expression, days: Expression)
   extends BinaryExpression with ExpectsInputTypes {
@@ -219,6 +223,7 @@ case class DateSub(startDate: Expression, days: Expression)
       > SELECT _FUNC_('2009-07-30 12:58:59');
        12
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Hour(child: Expression, timeZoneId: Option[String] = None)
   extends UnaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes {
@@ -250,6 +255,7 @@ case class Hour(child: Expression, timeZoneId: Option[String] = None)
       > SELECT _FUNC_('2009-07-30 12:58:59');
        58
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Minute(child: Expression, timeZoneId: Option[String] = None)
   extends UnaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes {
@@ -281,6 +287,7 @@ case class Minute(child: Expression, timeZoneId: Option[String] = None)
       > SELECT _FUNC_('2009-07-30 12:58:59');
        59
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Second(child: Expression, timeZoneId: Option[String] = None)
   extends UnaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes {
@@ -378,6 +385,7 @@ case class Microseconds(child: Expression, timeZoneId: Option[String] = None)
       > SELECT _FUNC_('2016-04-09');
        100
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class DayOfYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
@@ -402,6 +410,7 @@ case class DayOfYear(child: Expression) extends UnaryExpression with ImplicitCas
       > SELECT _FUNC_('2016-07-30');
        2016
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Year(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
@@ -442,6 +451,7 @@ case class IsoYear(child: Expression) extends UnaryExpression with ImplicitCastI
       > SELECT _FUNC_('2016-08-31');
        3
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Quarter(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
@@ -466,6 +476,7 @@ case class Quarter(child: Expression) extends UnaryExpression with ImplicitCastI
       > SELECT _FUNC_('2016-07-30');
        7
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class Month(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
@@ -515,6 +526,7 @@ case class DayOfMonth(child: Expression) extends UnaryExpression with ImplicitCa
       > SELECT _FUNC_('2009-07-30');
        5
   """,
+  group = "datetime_funcs",
   since = "2.3.0")
 // scalastyle:on line.size.limit
 case class DayOfWeek(child: Expression) extends DayWeek {
@@ -541,6 +553,7 @@ case class DayOfWeek(child: Expression) extends DayWeek {
       > SELECT _FUNC_('2009-07-30');
        3
   """,
+  group = "datetime_funcs",
   since = "2.4.0")
 // scalastyle:on line.size.limit
 case class WeekDay(child: Expression) extends DayWeek {
@@ -574,6 +587,7 @@ abstract class DayWeek extends UnaryExpression with ImplicitCastInputTypes {
       > SELECT _FUNC_('2008-02-20');
        8
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
@@ -611,6 +625,7 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
       > SELECT _FUNC_('2016-04-08', 'y');
        2016
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Option[String] = None)
@@ -693,6 +708,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
       > SELECT _FUNC_('2016-04-08', 'yyyy-MM-dd');
        1460098800
   """,
+  group = "datetime_funcs",
   since = "1.6.0")
 // scalastyle:on line.size.limit
 case class ToUnixTimestamp(
@@ -742,6 +758,7 @@ case class ToUnixTimestamp(
       > SELECT _FUNC_('2016-04-08', 'yyyy-MM-dd');
        1460041200
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class UnixTimestamp(timeExp: Expression, format: Expression, timeZoneId: Option[String] = None)
@@ -935,6 +952,7 @@ abstract class UnixTime extends ToTimestamp {
       > SELECT _FUNC_(0, 'yyyy-MM-dd HH:mm:ss');
        1969-12-31 16:00:00
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[String] = None)
@@ -1055,6 +1073,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
       > SELECT _FUNC_('2009-01-12');
        2009-01-31
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def child: Expression = startDate
@@ -1090,6 +1109,7 @@ case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitC
       > SELECT _FUNC_('2015-01-14', 'TU');
        2015-01-20
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class NextDay(startDate: Expression, dayOfWeek: Expression)
@@ -1202,6 +1222,7 @@ case class TimeAdd(start: Expression, interval: Expression, timeZoneId: Option[S
       > SELECT _FUNC_('2016-08-31', 'Asia/Seoul');
        2016-08-31 09:00:00
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class FromUTCTimestamp(left: Expression, right: Expression)
@@ -1296,6 +1317,7 @@ case class TimeSub(start: Expression, interval: Expression, timeZoneId: Option[S
       > SELECT _FUNC_('2016-08-31', 1);
        2016-09-30
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class AddMonths(startDate: Expression, numMonths: Expression)
@@ -1345,6 +1367,7 @@ case class AddMonths(startDate: Expression, numMonths: Expression)
       > SELECT _FUNC_('1997-02-28 10:30:00', '1996-10-30', false);
        3.9495967741935485
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class MonthsBetween(
@@ -1406,6 +1429,7 @@ case class MonthsBetween(
       > SELECT _FUNC_('2016-08-31', 'Asia/Seoul');
        2016-08-30 15:00:00
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class ToUTCTimestamp(left: Expression, right: Expression)
@@ -1477,6 +1501,7 @@ case class ToUTCTimestamp(left: Expression, right: Expression)
       > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd');
        2016-12-31
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class ParseToDate(left: Expression, format: Option[Expression], child: Expression)
@@ -1527,6 +1552,7 @@ case class ParseToDate(left: Expression, format: Option[Expression], child: Expr
       > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd');
        2016-12-31 00:00:00
   """,
+  group = "datetime_funcs",
   since = "2.2.0")
 // scalastyle:on line.size.limit
 case class ParseToTimestamp(left: Expression, format: Option[Expression], child: Expression)
@@ -1658,6 +1684,7 @@ trait TruncInstant extends BinaryExpression with ImplicitCastInputTypes {
       > SELECT _FUNC_('1981-01-19', 'millennium');
        1001-01-01
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 // scalastyle:on line.size.limit
 case class TruncDate(date: Expression, format: Expression)
@@ -1711,6 +1738,7 @@ case class TruncDate(date: Expression, format: Expression)
       > SELECT _FUNC_('CENTURY', '2015-03-05T09:32:05.123456');
        2001-01-01 00:00:00
   """,
+  group = "datetime_funcs",
   since = "2.3.0")
 // scalastyle:on line.size.limit
 case class TruncTimestamp(
@@ -1758,6 +1786,7 @@ case class TruncTimestamp(
       > SELECT _FUNC_('2009-07-30', '2009-07-31');
        -1
   """,
+  group = "datetime_funcs",
   since = "1.5.0")
 case class DateDiff(endDate: Expression, startDate: Expression)
   extends BinaryExpression with ImplicitCastInputTypes {
@@ -1811,6 +1840,7 @@ private case class GetTimestamp(
       > SELECT _FUNC_(2019, 2, 30);
        NULL
   """,
+  group = "datetime_funcs",
   since = "3.0.0")
 case class MakeDate(year: Expression, month: Expression, day: Expression)
   extends TernaryExpression with ImplicitCastInputTypes {
@@ -1872,6 +1902,7 @@ case class MakeDate(year: Expression, month: Expression, day: Expression)
       > SELECT _FUNC_(null, 7, 22, 15, 30, 0);
        NULL
   """,
+  group = "datetime_funcs",
   since = "3.0.0")
 // scalastyle:on line.size.limit
 case class MakeTimestamp(
@@ -2161,6 +2192,7 @@ object DatePart {
       > SELECT _FUNC_('seconds', interval 5 hours 30 seconds 1 milliseconds 1 microseconds);
        30.001001
   """,
+  group = "datetime_funcs",
   since = "3.0.0")
 case class DatePart(field: Expression, source: Expression, child: Expression)
   extends RuntimeReplaceable {
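As a quick sanity check of the expressions tagged with `datetime_funcs` above (their behavior is unchanged; only the documentation group is new), a spark-shell snippet, where the `spark` session is predefined:

    // Exercises a few of the datetime functions documented by this patch.
    spark.sql(
      "SELECT make_date(2019, 6, 30), weekday('2009-07-30'), trunc('2019-08-04', 'quarter')"
    ).show()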
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 98068360183ff..4ef6f7fab7df6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -118,7 +118,8 @@ private[this] object SharedFactory {
     Examples:
       > SELECT _FUNC_('{"a":"b"}', '$.a');
        b
-  """)
+  """,
+  group = "json_funcs")
 case class GetJsonObject(json: Expression, path: Expression)
   extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
 
@@ -341,7 +342,8 @@ case class GetJsonObject(json: Expression, path: Expression)
     Examples:
       > SELECT _FUNC_('{"a":1, "b":2}', 'a', 'b');
        1	2
-  """)
+  """,
+  group = "json_funcs")
 // scalastyle:on line.size.limit line.contains.tab
 case class JsonTuple(children: Seq[Expression])
   extends Generator with CodegenFallback {
@@ -509,6 +511,7 @@ case class JsonTuple(children: Seq[Expression])
       > SELECT _FUNC_('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'));
        {"time":2015-08-26 00:00:00}
   """,
+  group = "json_funcs",
   since = "2.2.0")
 // scalastyle:on line.size.limit
 case class JsonToStructs(
@@ -628,6 +631,7 @@ case class JsonToStructs(
       > SELECT _FUNC_(array((map('a', 1))));
        [{"a":1}]
   """,
+  group = "json_funcs",
   since = "2.2.0")
 // scalastyle:on line.size.limit
 case class StructsToJson(
@@ -737,6 +741,7 @@ case class StructsToJson(
       > SELECT _FUNC_('[{"col":01}]', map('allowNumericLeadingZeros', 'true'));
        array<struct<col:bigint>>
   """,
+  group = "json_funcs",
   since = "2.4.0")
 case class SchemaOfJson(
     child: Expression,
@@ -817,6 +822,7 @@ case class SchemaOfJson(
       > SELECT _FUNC_('[1,2');
         NULL
   """,
+  group = "json_funcs",
   since = "3.1.0"
 )
 case class LengthOfJsonArray(child: Expression) extends UnaryExpression
@@ -886,6 +892,7 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression
       > Select _FUNC_('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
         ["f1","f2"]
   """,
+  group = "json_funcs",
   since = "3.1.0"
 )
 case class JsonObjectKeys(child: Expression) extends UnaryExpression with CodegenFallback
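The metadata carried by these annotations is also what `DESCRIBE FUNCTION EXTENDED` prints, which is one way to eyeball the text the new doc generator will consume; a spark-shell sketch, where `spark` is predefined:

    // Prints usage, examples, and (where present) note/since for the function.
    spark.sql("DESCRIBE FUNCTION EXTENDED get_json_object").show(50, truncate = false)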
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala
index 2d48f8df23e65..d9c90c7dbd085 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala
@@ -345,6 +345,7 @@ object MyExtensions {
       """
        note
       """,
+      "",
       "3.0.0",
       """
        deprecated
@@ -755,6 +756,7 @@ object MyExtensions2 {
       """
        note
       """,
+      "",
       "3.0.0",
       """
        deprecated
@@ -787,6 +789,7 @@ object MyExtensions2Duplicate {
       """
        note
       """,
+      "",
       "3.0.0",
       """
        deprecated
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 08f41f6819a0e..497f8622ee9cd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -21,6 +21,7 @@ import java.math.BigDecimal
 
 import org.apache.spark.sql.api.java._
 import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
 import org.apache.spark.sql.catalyst.plans.logical.Project
 import org.apache.spark.sql.execution.{QueryExecution, SimpleMode}
 import org.apache.spark.sql.execution.columnar.InMemoryRelation
@@ -544,6 +545,25 @@ class UDFSuite extends QueryTest with SharedSparkSession {
     assert(info.getExtended.contains("> SELECT upper('SparkSql');"))
   }
 
+  test("group info in ExpressionInfo") {
+    val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum"))
+    assert(info.getGroup === "agg_funcs")
+
+    Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs")
+        .foreach { groupName =>
+      val info = new ExpressionInfo(
+        "testClass", null, "testName", null, "", "", "", groupName, "", "")
+      assert(info.getGroup === groupName)
+    }
+
+    val errMsg = intercept[IllegalArgumentException] {
+      val invalidGroupName = "invalidGroupName"
+      new ExpressionInfo("testClass", null, "testName", null, "", "", "", invalidGroupName, "", "")
+    }.getMessage
+    assert(errMsg.contains("'group' is malformed in the expression [testName]."))
+    assert(errMsg.contains("got [invalidGroupName]"))
+  }
+
   test("SPARK-28521 error message for CAST(parameter types contains DataType)") {
     val e = intercept[AnalysisException] {
       spark.sql("SELECT CAST(1)")
diff --git a/sql/create-docs.sh b/sql/create-docs.sh
index 44aa877332fd5..334c269cc630e 100755
--- a/sql/create-docs.sh
+++ b/sql/create-docs.sh
@@ -48,6 +48,9 @@ echo "Generating SQL API Markdown files."
 echo "Generating SQL configuration table HTML file."
 "$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py
 
+echo "Generating SQL document Makrdown files for bult-in functions."
+"$SPARK_HOME/bin/spark-submit" gen-sql-builtin-functions-docs.py
+
 echo "Generating HTML files for SQL API documentation."
 mkdocs build --clean
 rm -fr docs
diff --git a/sql/gen-sql-builtin-functions-docs.py b/sql/gen-sql-builtin-functions-docs.py
new file mode 100644
index 0000000000000..ab69be7ca12c4
--- /dev/null
+++ b/sql/gen-sql-builtin-functions-docs.py
@@ -0,0 +1,242 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import itertools
+import os
+import re
+from collections import namedtuple
+
+from pyspark.java_gateway import launch_gateway
+from pyspark.sql import SparkSession
+
+ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")
+
+markdown_header = \
+    "---\n"\
+    "layout: global\n"\
+    "title: Built-in Functions\n"\
+    "displayTitle: Built-in Functions\n"\
+    "license: |\n"\
+    "  Licensed to the Apache Software Foundation (ASF) under one or more\n"\
+    "  contributor license agreements.  See the NOTICE file distributed with\n"\
+    "  this work for additional information regarding copyright ownership.\n"\
+    "  The ASF licenses this file to You under the Apache License, Version 2.0\n"\
+    "  (the \"License\"); you may not use this file except in compliance with\n"\
+    "  the License.  You may obtain a copy of the License at\n"\
+    "\n"\
+    "  http://www.apache.org/licenses/LICENSE-2.0\n"\
+    "\n"\
+    "  Unless required by applicable law or agreed to in writing, software\n"\
+    "  distributed under the License is distributed on an \"AS IS\" BASIS,\n"\
+    "  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"\
+    "  See the License for the specific language governing permissions and\n"\
+    "  limitations under the License.\n"\
+    "---"
+
+group_titles = {
+    "agg_funcs": "Aggregate Functions",
+    "array_funcs": "Array Functions",
+    "datetime_funcs": "Date and Timestamp Functions",
+    "json_funcs": "JSON Functions",
+    "map_funcs": "Map Functions"
+}
+
+
+def _list_grouped_function_infos(jvm):
+    """
+    Returns a list of function information, grouped by group value, fetched via the JVM.
+    The expression infos within each group are sorted by name.
+    """
+
+    jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
+    expected_groups = group_titles.keys()
+    infos = []
+
+    for jinfo in filter(lambda x: x.getGroup() in expected_groups, jinfos):
+        name = jinfo.getName()
+        usage = jinfo.getUsage()
+        usage = usage.replace("_FUNC_", name) if usage is not None else usage
+        infos.append(ExpressionInfo(
+            name=name,
+            usage=usage,
+            examples=jinfo.getExamples().replace("_FUNC_", name),
+            group=jinfo.getGroup()))
+
+    # Group the expression infos by their group value; `itertools.groupby` only
+    # merges adjacent items, so the infos must be sorted by group first.
+    grouped_infos = itertools.groupby(sorted(infos, key=lambda x: x.group), key=lambda x: x.group)
+    # Then sort the expression infos within each group by name
+    return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos]
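The sort-before-group step above is load-bearing: `itertools.groupby` only merges adjacent items. A standalone sketch with toy pairs standing in for the wrapped infos:

{% highlight python %}
import itertools

infos = [("b_funcs", "y"), ("a_funcs", "x"), ("b_funcs", "x")]  # (group, name) pairs
grouped = itertools.groupby(sorted(infos, key=lambda x: x[0]), key=lambda x: x[0])
print([(k, sorted(g, key=lambda x: x[1])) for k, g in grouped])
# [('a_funcs', [('a_funcs', 'x')]), ('b_funcs', [('b_funcs', 'x'), ('b_funcs', 'y')])]
{% endhighlight %}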
+
+
+# TODO(maropu): Add a column to describe arguments and their types
+def _make_pretty_usage(infos):
+    """
+    Makes the usage description pretty and returns a formatted string.
+
+    Expected input:
+
+        func(*) - ...
+
+        func(expr[, expr...]) - ...
+
+    Expected output:
+
+    <table class="table">
+      <thead>
+        <tr>
+          <th style="width:25%">Function</th>
+          <th>Description</th>
+        </tr>
+      </thead>
+      <tbody>
+        <tr>
+          <td>func(*)</td>
+          <td>...</td>
+        </tr>
+        <tr>
+          <td>func(expr[, expr...])</td>
+          <td>...</td>
+        </tr>
+      </tbody>
+    </table>
+
+    """
+
+    result = []
+    result.append("<table class=\"table\">")
+    result.append("  <thead>")
+    result.append("    <tr>")
+    result.append("      <th style=\"width:25%\">Function</th>")
+    result.append("      <th>Description</th>")
+    result.append("    </tr>")
+    result.append("  </thead>")
+    result.append("  <tbody>")
+
+    for info in infos:
+        # Extracts (signature, description) pairs from `info.usage`, e.g., the
+        # signature is `func(expr)` and the description is `...` in a usage
+        # like `func(expr) - ...`.
+        usages = iter(re.split(r"(%s\(.*\)) - " % info.name, info.usage.strip())[1:])
+        for (sig, description) in zip(usages, usages):
+            result.append("    <tr>")
+            result.append("      <td>%s</td>" % sig)
+            result.append("      <td>%s</td>" % description.strip())
+            result.append("    </tr>")
+
+    result.append("  </tbody>")
+    result.append("</table>\n")
+    return "\n".join(result)
+
+
+def _make_pretty_query_example(jspark, query):
+    result = []
+    query_output = jspark.sql(query).showString(20, 20, False)
+    result.append(query)
+    result.extend(map(lambda x: "  %s" % x, query_output.split("\n")))
+    return "\n".join(result)
+
+
+def _make_pretty_examples(jspark, infos):
+    """
+    Makes the examples description pretty and returns a formatted string.
+
+    Expected input:
+
+        Examples:
+          > SELECT func(col)...;
+           ...
+          > SELECT func(col)...;
+           ...
+
+    Expected output:
+
+    -- group_value
+    SELECT func(col)...;
+      +---------+
+      |func(col)|
+      +---------+
+      |      ...|
+      +---------+
+
+    SELECT func(col)...;
+      +---------+
+      |func(col)|
+      +---------+
+      |      ...|
+      +---------+
+
+    """
+
+    result = []
+    result.append("\n#### Examples\n")
+    result.append("{% highlight sql %}")
+
+    for info in infos:
+        result.append("-- %s" % info.name)
+        query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
+        for query_example in query_examples:
+            query = query_example.lstrip("      > ")
+            result.append(_make_pretty_query_example(jspark, query))
+
+    result.append("{% endhighlight %}\n")
+    return "\n".join(result)
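Two idioms in `_make_pretty_usage` above deserve a note: `re.split` with a capturing group keeps the matched signatures in its output, and zipping an iterator with itself pairs each signature with the description that follows it. A standalone sketch on a made-up usage string:

{% highlight python %}
import re

usage = "func(expr) - Does a thing.\n\nfunc(expr1, expr2) - Does two things."
# [1:] drops the empty string that precedes the first signature.
parts = iter(re.split(r"(func\(.*\)) - ", usage)[1:])
print([(sig, desc.strip()) for sig, desc in zip(parts, parts)])
# [('func(expr)', 'Does a thing.'), ('func(expr1, expr2)', 'Does two things.')]
{% endhighlight %}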
+
+
+def generate_sql_markdown(jvm, jspark, path):
+    """
+    Generates a markdown file after listing the function information. The output file
+    is created in `path`.
+
+    Expected output:
+
+    ---
+    layout: global
+    title: Built-in Functions
+    displayTitle: Built-in Functions
+    license: |
+      ...
+    ---
+
+    ### Aggregate Functions
+
+    <table class="table">
+      ...
+    </table>
+
+    #### Examples
+
+    {% highlight sql %}
+    ...
+    {% endhighlight %}
+
+    """
+
+    with open(path, 'w') as mdfile:
+        filename = os.path.basename(__file__)
+        mdfile.write("%s\n\n" % markdown_header)
+        mdfile.write("<!-- Automatically generated by `%s` -->\n" % filename)
+
+        for key, infos in _list_grouped_function_infos(jvm):
+            mdfile.write("\n### %s\n\n" % group_titles[key])
+            function_table = _make_pretty_usage(infos)
+            examples = _make_pretty_examples(jspark, infos)
+            mdfile.write(function_table)
+            mdfile.write(examples)
+
+
+if __name__ == "__main__":
+    jvm = launch_gateway().jvm
+    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
+    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
+    markdown_file_path = os.path.join(spark_root_dir, "docs/sql-ref-functions-builtin.md")
+    generate_sql_markdown(jvm, jspark, markdown_file_path)

From b9407ca6691b0e3fdaa9e84a9ed24c8233e79001 Mon Sep 17 00:00:00 2001
From: Takeshi Yamamuro
Date: Thu, 16 Apr 2020 11:07:15 +0900
Subject: [PATCH 2/6] Fix

---
 docs/sql-ref-functions-builtin.md             | 63 ++++++++++++++++++-
 docs/sql-ref-functions.md                     |  1 +
 .../catalyst/expressions/ExpressionInfo.java  |  2 +-
 .../scala/org/apache/spark/sql/UDFSuite.scala |  2 +-
 sql/gen-sql-builtin-functions-docs.py         | 32 +++++-----
 5 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/docs/sql-ref-functions-builtin.md b/docs/sql-ref-functions-builtin.md
index 99d9fe5cb9536..cf8a895f2543b 100644
--- a/docs/sql-ref-functions-builtin.md
+++ b/docs/sql-ref-functions-builtin.md
@@ -1060,7 +1060,7 @@ SELECT shuffle(array(1, 20, null, 3));
   +-------------------------------------------+
   |shuffle(array(1, 20, CAST(NULL AS INT), 3))|
   +-------------------------------------------+
-  |                                [3,, 20, 1]|
+  |                                [1,, 20, 3]|
   +-------------------------------------------+
 
 -- slice
@@ -1659,7 +1659,7 @@ SELECT unix_timestamp();
   +--------------------------------------------------------+
   |unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)|
   +--------------------------------------------------------+
-  |                                              1586941183|
+  |                                              1587002726|
   +--------------------------------------------------------+
 
 SELECT unix_timestamp('2016-04-08', 'yyyy-MM-dd');
@@ -1958,3 +1958,62 @@ SELECT map_values(map(1, 'a', 2, 'b'));
   +---------------------------+
 
 {% endhighlight %}
+
+### Window Functions
+
+<table class="table">
+  <thead>
+    <tr>
+      <th style="width:25%">Function</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>cume_dist()</td>
+      <td>Computes the position of a value relative to all values in the partition.</td>
+    </tr>
+    <tr>
+      <td>dense_rank()</td>
+      <td>Computes the rank of a value in a group of values. The result is one plus the
+        previously assigned rank value. Unlike the function rank, dense_rank will not produce
+        gaps in the ranking sequence.</td>
+    </tr>
+    <tr>
+      <td>lag(input[, offset[, default]])</td>
+      <td>Returns the value of `input` at the `offset`th row before the current row in the
+        window. The default value of `offset` is 1 and the default value of `default` is null.
+        If the value of `input` at the `offset`th row is null, null is returned. If there is no
+        such offset row (e.g., when the offset is 1, the first row of the window does not have
+        any previous row), `default` is returned.</td>
+    </tr>
+    <tr>
+      <td>lead(input[, offset[, default]])</td>
+      <td>Returns the value of `input` at the `offset`th row after the current row in the
+        window. The default value of `offset` is 1 and the default value of `default` is null.
+        If the value of `input` at the `offset`th row is null, null is returned. If there is no
+        such offset row (e.g., when the offset is 1, the last row of the window does not have
+        any subsequent row), `default` is returned.</td>
+    </tr>
+    <tr>
+      <td>ntile(n)</td>
+      <td>Divides the rows for each window partition into `n` buckets ranging from 1 to at
+        most `n`.</td>
+    </tr>
+    <tr>
+      <td>percent_rank()</td>
+      <td>Computes the percentage ranking of a value in a group of values.</td>
+    </tr>
+    <tr>
+      <td>rank()</td>
+      <td>Computes the rank of a value in a group of values. The result is one plus the number
+        of rows preceding or equal to the current row in the ordering of the partition. The
+        values will produce gaps in the sequence.</td>
+    </tr>
+    <tr>
+      <td>row_number()</td>
+      <td>Assigns a unique, sequential number to each row, starting with one, according to
+        the ordering of rows within the window partition.</td>
+    </tr>
+  </tbody>
+</table>
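Window functions carry no `examples` metadata, which is why the generated section stops at the table (and why `_make_pretty_examples` learns to return None below). A hand-written illustration, assuming a live SparkSession named `spark`:

{% highlight python %}
# Sketch: rank() within each dept partition, over made-up inline data.
spark.sql("""
    SELECT name, dept, salary,
           rank() OVER (PARTITION BY dept ORDER BY salary DESC) AS r
    FROM VALUES ('a', 1, 10), ('b', 1, 20), ('c', 2, 30) AS tab(name, dept, salary)
""").show()
# dept 1: 'b' gets rank 1 and 'a' gets rank 2; dept 2: 'c' gets rank 1.
{% endhighlight %}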
diff --git a/docs/sql-ref-functions.md b/docs/sql-ref-functions.md index a51e04515f894..6368fb705f893 100644 --- a/docs/sql-ref-functions.md +++ b/docs/sql-ref-functions.md @@ -28,6 +28,7 @@ Spark SQL has some categories of frequently-used built-in functions for aggregti This subsection presents the usages and descriptions of these functions. * [Aggregate Functions](sql-ref-functions-builtin.html#aggregate-functions) + * [Window Functions](sql-ref-functions-builtin.html#window-functions) * [Array Functions](sql-ref-functions-builtin.html#array-functions) * [Map Functions](sql-ref-functions-builtin.html#map-functions) * [Date and Timestamp Functions](sql-ref-functions-builtin.html#date-and-timestamp-functions) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java index c31f95a5c2feb..a500822b21f02 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java @@ -41,7 +41,7 @@ public class ExpressionInfo { private static final Set validGroups = new HashSet<>(Arrays.asList("agg_funcs", "array_funcs", "datetime_funcs", - "json_funcs", "map_funcs")); + "json_funcs", "map_funcs", "window_funcs")); public String getClassName() { return className; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 497f8622ee9cd..18d6a1b710297 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -549,7 +549,7 @@ class UDFSuite extends QueryTest with SharedSparkSession { val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum")) assert(info.getGroup === "agg_funcs") - Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs") + Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs", "window_funcs") .foreach { groupName => val info = new ExpressionInfo( "testClass", null, "testName", null, "", "", "", groupName, "", "") diff --git a/sql/gen-sql-builtin-functions-docs.py b/sql/gen-sql-builtin-functions-docs.py index ab69be7ca12c4..5a5a4bd6d0169 100644 --- a/sql/gen-sql-builtin-functions-docs.py +++ b/sql/gen-sql-builtin-functions-docs.py @@ -52,7 +52,8 @@ "array_funcs": "Array Functions", "datetime_funcs": "Date and Timestamp Functions", "json_funcs": "JSON Functions", - "map_funcs": "Map Functions" + "map_funcs": "Map Functions", + "window_funcs": "Window Functions", } @@ -150,7 +151,8 @@ def _make_pretty_query_example(jspark, query): def _make_pretty_examples(jspark, infos): """ - Makes the examples description pretty and returns a formatted string. + Makes the examples description pretty and returns a formatted string if `infos` + has any `examples` starting with the example prefix. Otherwise, returns None. 
 
     Expected input:
 
@@ -178,19 +180,20 @@ def _make_pretty_examples(jspark, infos):
 
     """
 
-    result = []
-    result.append("\n#### Examples\n")
-    result.append("{% highlight sql %}")
+    if any(info.examples.startswith("\n    Examples:") for info in infos):
+        result = []
+        result.append("\n#### Examples\n")
+        result.append("{% highlight sql %}")
 
-    for info in infos:
-        result.append("-- %s" % info.name)
-        query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
-        for query_example in query_examples:
-            query = query_example.lstrip("      > ")
-            result.append(_make_pretty_query_example(jspark, query))
+        for info in infos:
+            result.append("-- %s" % info.name)
+            query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
+            for query_example in query_examples:
+                query = query_example.lstrip("      > ")
+                result.append(_make_pretty_query_example(jspark, query))
 
-    result.append("{% endhighlight %}\n")
-    return "\n".join(result)
+        result.append("{% endhighlight %}\n")
+        return "\n".join(result)
 
 
 def generate_sql_markdown(jvm, jspark, path):
@@ -231,7 +234,8 @@ def generate_sql_markdown(jvm, jspark, path):
             function_table = _make_pretty_usage(infos)
             examples = _make_pretty_examples(jspark, infos)
             mdfile.write(function_table)
-            mdfile.write(examples)
+            if examples is not None:
+                mdfile.write(examples)

From d66a9aad77d95af679e3df1f2b6808a89ddd4a81 Mon Sep 17 00:00:00 2001
From: Takeshi Yamamuro
Date: Thu, 16 Apr 2020 13:45:34 +0900
Subject: [PATCH 3/6] Fix

---
 sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 18d6a1b710297..92ea0ce246d4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -557,11 +557,10 @@ class UDFSuite extends QueryTest with SharedSparkSession {
     }
 
     val errMsg = intercept[IllegalArgumentException] {
-      val invalidGroupName = "invalidGroupName"
+      val invalidGroupName = "invalid_group_funcs"
       new ExpressionInfo("testClass", null, "testName", null, "", "", "", invalidGroupName, "", "")
     }.getMessage
-    assert(errMsg === "'group' is malformed in the expression [testName].
It should be a value " + - "in [agg_funcs, collection_funcs, datetime_funcs]; however, got [invalidGroupName].") + assert(errMsg.contains("'group' is malformed in the expression [testName].")) } test("SPARK-28521 error message for CAST(parameter types contains DataType)") { From 7e9ebdf701a7dc251d1120641f2aa7268ced7b16 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Fri, 17 Apr 2020 22:53:09 +0900 Subject: [PATCH 4/6] Address nits and add window_funcs --- docs/sql-ref-functions-builtin.md | 2019 ----------------- .../expressions/ExpressionDescription.java | 2 +- .../expressions/windowExpressions.scala | 24 +- sql/create-docs.sh | 2 +- sql/gen-sql-builtin-functions-docs.py | 4 + sql/gen-sql-config-docs.py | 2 + 6 files changed, 24 insertions(+), 2029 deletions(-) delete mode 100644 docs/sql-ref-functions-builtin.md diff --git a/docs/sql-ref-functions-builtin.md b/docs/sql-ref-functions-builtin.md deleted file mode 100644 index cf8a895f2543b..0000000000000 --- a/docs/sql-ref-functions-builtin.md +++ /dev/null @@ -1,2019 +0,0 @@ ---- -layout: global -title: Built-in Functions -displayTitle: Built-in Functions -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -### Aggregate Functions - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-<table class="table">
-  <thead>
-    <tr>
-      <th style="width:25%">Function</th>
-      <th>Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>any(expr)</td>
-      <td>Returns true if at least one value of `expr` is true.</td>
-    </tr>
-    <tr>
-      <td>approx_count_distinct(expr[, relativeSD])</td>
-      <td>Returns the estimated cardinality by HyperLogLog++. `relativeSD` defines the
-        maximum estimation error allowed.</td>
-    </tr>
-    <tr>
-      <td>approx_percentile(col, percentage [, accuracy])</td>
-      <td>Returns the approximate percentile value of numeric column `col` at the given
-        percentage. The value of percentage must be between 0.0 and 1.0. The `accuracy`
-        parameter (default: 10000) is a positive numeric literal which controls approximation
-        accuracy at the cost of memory. A higher value of `accuracy` yields better accuracy;
-        `1.0/accuracy` is the relative error of the approximation. When `percentage` is an
-        array, each value of the percentage array must be between 0.0 and 1.0. In this case,
-        returns the approximate percentile array of column `col` at the given percentage
-        array.</td>
-    </tr>
-    <tr>
-      <td>avg(expr)</td>
-      <td>Returns the mean calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>bit_or(expr)</td>
-      <td>Returns the bitwise OR of all non-null input values, or null if none.</td>
-    </tr>
-    <tr>
-      <td>bit_xor(expr)</td>
-      <td>Returns the bitwise XOR of all non-null input values, or null if none.</td>
-    </tr>
-    <tr>
-      <td>bool_and(expr)</td>
-      <td>Returns true if all values of `expr` are true.</td>
-    </tr>
-    <tr>
-      <td>bool_or(expr)</td>
-      <td>Returns true if at least one value of `expr` is true.</td>
-    </tr>
-    <tr>
-      <td>collect_list(expr)</td>
-      <td>Collects and returns a list of non-unique elements.</td>
-    </tr>
-    <tr>
-      <td>collect_set(expr)</td>
-      <td>Collects and returns a set of unique elements.</td>
-    </tr>
-    <tr>
-      <td>corr(expr1, expr2)</td>
-      <td>Returns Pearson coefficient of correlation between a set of number pairs.</td>
-    </tr>
-    <tr>
-      <td>count(*)</td>
-      <td>Returns the total number of retrieved rows, including rows containing null.</td>
-    </tr>
-    <tr>
-      <td>count(expr[, expr...])</td>
-      <td>Returns the number of rows for which the supplied expression(s) are all non-null.</td>
-    </tr>
-    <tr>
-      <td>count(DISTINCT expr[, expr...])</td>
-      <td>Returns the number of rows for which the supplied expression(s) are unique and
-        non-null.</td>
-    </tr>
-    <tr>
-      <td>count_if(expr)</td>
-      <td>Returns the number of `TRUE` values for the expression.</td>
-    </tr>
-    <tr>
-      <td>count_min_sketch(col, eps, confidence, seed)</td>
-      <td>Returns a count-min sketch of a column with the given eps, confidence and seed.
-        The result is an array of bytes, which can be deserialized to a `CountMinSketch`
-        before usage. Count-min sketch is a probabilistic data structure used for cardinality
-        estimation using sub-linear space.</td>
-    </tr>
-    <tr>
-      <td>covar_pop(expr1, expr2)</td>
-      <td>Returns the population covariance of a set of number pairs.</td>
-    </tr>
-    <tr>
-      <td>covar_samp(expr1, expr2)</td>
-      <td>Returns the sample covariance of a set of number pairs.</td>
-    </tr>
-    <tr>
-      <td>every(expr)</td>
-      <td>Returns true if all values of `expr` are true.</td>
-    </tr>
-    <tr>
-      <td>first(expr[, isIgnoreNull])</td>
-      <td>Returns the first value of `expr` for a group of rows. If `isIgnoreNull` is true,
-        returns only non-null values.</td>
-    </tr>
-    <tr>
-      <td>first_value(expr[, isIgnoreNull])</td>
-      <td>Returns the first value of `expr` for a group of rows. If `isIgnoreNull` is true,
-        returns only non-null values.</td>
-    </tr>
-    <tr>
-      <td>kurtosis(expr)</td>
-      <td>Returns the kurtosis value calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>last(expr[, isIgnoreNull])</td>
-      <td>Returns the last value of `expr` for a group of rows. If `isIgnoreNull` is true,
-        returns only non-null values.</td>
-    </tr>
-    <tr>
-      <td>last_value(expr[, isIgnoreNull])</td>
-      <td>Returns the last value of `expr` for a group of rows. If `isIgnoreNull` is true,
-        returns only non-null values.</td>
-    </tr>
-    <tr>
-      <td>max(expr)</td>
-      <td>Returns the maximum value of `expr`.</td>
-    </tr>
-    <tr>
-      <td>max_by(x, y)</td>
-      <td>Returns the value of `x` associated with the maximum value of `y`.</td>
-    </tr>
-    <tr>
-      <td>mean(expr)</td>
-      <td>Returns the mean calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>min(expr)</td>
-      <td>Returns the minimum value of `expr`.</td>
-    </tr>
-    <tr>
-      <td>min_by(x, y)</td>
-      <td>Returns the value of `x` associated with the minimum value of `y`.</td>
-    </tr>
-    <tr>
-      <td>percentile(col, percentage [, frequency])</td>
-      <td>Returns the exact percentile value of numeric column `col` at the given percentage.
-        The value of percentage must be between 0.0 and 1.0. The value of frequency should be
-        a positive integer.</td>
-    </tr>
-    <tr>
-      <td>percentile(col, array(percentage1 [, percentage2]...) [, frequency])</td>
-      <td>Returns the exact percentile value array of numeric column `col` at the given
-        percentage(s). Each value of the percentage array must be between 0.0 and 1.0. The
-        value of frequency should be a positive integer.</td>
-    </tr>
-    <tr>
-      <td>percentile_approx(col, percentage [, accuracy])</td>
-      <td>Returns the approximate percentile value of numeric column `col` at the given
-        percentage. The value of percentage must be between 0.0 and 1.0. The `accuracy`
-        parameter (default: 10000) is a positive numeric literal which controls approximation
-        accuracy at the cost of memory. A higher value of `accuracy` yields better accuracy;
-        `1.0/accuracy` is the relative error of the approximation. When `percentage` is an
-        array, each value of the percentage array must be between 0.0 and 1.0. In this case,
-        returns the approximate percentile array of column `col` at the given percentage
-        array.</td>
-    </tr>
-    <tr>
-      <td>skewness(expr)</td>
-      <td>Returns the skewness value calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>some(expr)</td>
-      <td>Returns true if at least one value of `expr` is true.</td>
-    </tr>
-    <tr>
-      <td>std(expr)</td>
-      <td>Returns the sample standard deviation calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>stddev(expr)</td>
-      <td>Returns the sample standard deviation calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>stddev_pop(expr)</td>
-      <td>Returns the population standard deviation calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>stddev_samp(expr)</td>
-      <td>Returns the sample standard deviation calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>sum(expr)</td>
-      <td>Returns the sum calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>var_pop(expr)</td>
-      <td>Returns the population variance calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>var_samp(expr)</td>
-      <td>Returns the sample variance calculated from values of a group.</td>
-    </tr>
-    <tr>
-      <td>variance(expr)</td>
-      <td>Returns the sample variance calculated from values of a group.</td>
-    </tr>
-  </tbody>
-</table>
- -#### Examples - -{% highlight sql %} --- any -SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col); - +--------+ - |any(col)| - +--------+ - | true| - +--------+ - -SELECT any(col) FROM VALUES (NULL), (true), (false) AS tab(col); - +--------+ - |any(col)| - +--------+ - | true| - +--------+ - -SELECT any(col) FROM VALUES (false), (false), (NULL) AS tab(col); - +--------+ - |any(col)| - +--------+ - | false| - +--------+ - --- approx_count_distinct -SELECT approx_count_distinct(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1); - +---------------------------+ - |approx_count_distinct(col1)| - +---------------------------+ - | 3| - +---------------------------+ - --- approx_percentile -SELECT approx_percentile(10.0, array(0.5, 0.4, 0.1), 100); - +--------------------------------------------------+ - |approx_percentile(10.0, array(0.5, 0.4, 0.1), 100)| - +--------------------------------------------------+ - | [10.0, 10.0, 10.0]| - +--------------------------------------------------+ - -SELECT approx_percentile(10.0, 0.5, 100); - +-------------------------------------------------+ - |approx_percentile(10.0, CAST(0.5 AS DOUBLE), 100)| - +-------------------------------------------------+ - | 10.0| - +-------------------------------------------------+ - --- avg -SELECT avg(col) FROM VALUES (1), (2), (3) AS tab(col); - +--------+ - |avg(col)| - +--------+ - | 2.0| - +--------+ - -SELECT avg(col) FROM VALUES (1), (2), (NULL) AS tab(col); - +--------+ - |avg(col)| - +--------+ - | 1.5| - +--------+ - --- bit_or -SELECT bit_or(col) FROM VALUES (3), (5) AS tab(col); - +-----------+ - |bit_or(col)| - +-----------+ - | 7| - +-----------+ - --- bit_xor -SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col); - +------------+ - |bit_xor(col)| - +------------+ - | 6| - +------------+ - --- bool_and -SELECT bool_and(col) FROM VALUES (true), (true), (true) AS tab(col); - +-------------+ - |bool_and(col)| - +-------------+ - | true| - +-------------+ - -SELECT bool_and(col) FROM VALUES (NULL), (true), (true) AS tab(col); - +-------------+ - |bool_and(col)| - +-------------+ - | true| - +-------------+ - -SELECT bool_and(col) FROM VALUES (true), (false), (true) AS tab(col); - +-------------+ - |bool_and(col)| - +-------------+ - | false| - +-------------+ - --- bool_or -SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col); - +------------+ - |bool_or(col)| - +------------+ - | true| - +------------+ - -SELECT bool_or(col) FROM VALUES (NULL), (true), (false) AS tab(col); - +------------+ - |bool_or(col)| - +------------+ - | true| - +------------+ - -SELECT bool_or(col) FROM VALUES (false), (false), (NULL) AS tab(col); - +------------+ - |bool_or(col)| - +------------+ - | false| - +------------+ - --- collect_list -SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col); - +-----------------+ - |collect_list(col)| - +-----------------+ - | [1, 2, 1]| - +-----------------+ - --- collect_set -SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col); - +----------------+ - |collect_set(col)| - +----------------+ - | [1, 2]| - +----------------+ - --- corr -SELECT corr(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2); - +--------------------------------------------+ - |corr(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| - +--------------------------------------------+ - | 0.8660254037844387| - +--------------------------------------------+ - --- count -SELECT count(*) FROM VALUES (NULL), (5), (5), (20) AS tab(col); - +--------+ - |count(1)| - +--------+ - | 4| - 
+--------+ - -SELECT count(col) FROM VALUES (NULL), (5), (5), (20) AS tab(col); - +----------+ - |count(col)| - +----------+ - | 3| - +----------+ - -SELECT count(DISTINCT col) FROM VALUES (NULL), (5), (5), (10) AS tab(col); - +-------------------+ - |count(DISTINCT col)| - +-------------------+ - | 2| - +-------------------+ - --- count_if -SELECT count_if(col % 2 = 0) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col); - +-------------------------+ - |count_if(((col % 2) = 0))| - +-------------------------+ - | 2| - +-------------------------+ - -SELECT count_if(col IS NULL) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col); - +-----------------------+ - |count_if((col IS NULL))| - +-----------------------+ - | 1| - +-----------------------+ - --- count_min_sketch --- covar_pop -SELECT covar_pop(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2); - +-------------------------------------------------+ - |covar_pop(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| - +-------------------------------------------------+ - | 0.6666666666666666| - +-------------------------------------------------+ - --- covar_samp -SELECT covar_samp(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2); - +--------------------------------------------------+ - |covar_samp(CAST(c1 AS DOUBLE), CAST(c2 AS DOUBLE))| - +--------------------------------------------------+ - | 1.0| - +--------------------------------------------------+ - --- every -SELECT every(col) FROM VALUES (true), (true), (true) AS tab(col); - +----------+ - |every(col)| - +----------+ - | true| - +----------+ - -SELECT every(col) FROM VALUES (NULL), (true), (true) AS tab(col); - +----------+ - |every(col)| - +----------+ - | true| - +----------+ - -SELECT every(col) FROM VALUES (true), (false), (true) AS tab(col); - +----------+ - |every(col)| - +----------+ - | false| - +----------+ - --- first -SELECT first(col) FROM VALUES (10), (5), (20) AS tab(col); - +-----------------+ - |first(col, false)| - +-----------------+ - | 10| - +-----------------+ - -SELECT first(col) FROM VALUES (NULL), (5), (20) AS tab(col); - +-----------------+ - |first(col, false)| - +-----------------+ - | null| - +-----------------+ - -SELECT first(col, true) FROM VALUES (NULL), (5), (20) AS tab(col); - +----------------+ - |first(col, true)| - +----------------+ - | 5| - +----------------+ - --- first_value -SELECT first_value(col) FROM VALUES (10), (5), (20) AS tab(col); - +-----------------------+ - |first_value(col, false)| - +-----------------------+ - | 10| - +-----------------------+ - -SELECT first_value(col) FROM VALUES (NULL), (5), (20) AS tab(col); - +-----------------------+ - |first_value(col, false)| - +-----------------------+ - | null| - +-----------------------+ - -SELECT first_value(col, true) FROM VALUES (NULL), (5), (20) AS tab(col); - +----------------------+ - |first_value(col, true)| - +----------------------+ - | 5| - +----------------------+ - --- kurtosis -SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); - +-----------------------------+ - |kurtosis(CAST(col AS DOUBLE))| - +-----------------------------+ - | -0.7014368047529618| - +-----------------------------+ - -SELECT kurtosis(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col); - +-----------------------------+ - |kurtosis(CAST(col AS DOUBLE))| - +-----------------------------+ - | 0.19432323191699075| - +-----------------------------+ - --- last -SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col); - +----------------+ - |last(col, false)| - 
+----------------+ - | 20| - +----------------+ - -SELECT last(col) FROM VALUES (10), (5), (NULL) AS tab(col); - +----------------+ - |last(col, false)| - +----------------+ - | null| - +----------------+ - -SELECT last(col, true) FROM VALUES (10), (5), (NULL) AS tab(col); - +---------------+ - |last(col, true)| - +---------------+ - | 5| - +---------------+ - --- last_value -SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col); - +----------------------+ - |last_value(col, false)| - +----------------------+ - | 20| - +----------------------+ - -SELECT last_value(col) FROM VALUES (10), (5), (NULL) AS tab(col); - +----------------------+ - |last_value(col, false)| - +----------------------+ - | null| - +----------------------+ - -SELECT last_value(col, true) FROM VALUES (10), (5), (NULL) AS tab(col); - +---------------------+ - |last_value(col, true)| - +---------------------+ - | 5| - +---------------------+ - --- max -SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col); - +--------+ - |max(col)| - +--------+ - | 50| - +--------+ - --- max_by -SELECT max_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y); - +-----------+ - |maxby(x, y)| - +-----------+ - | b| - +-----------+ - --- mean -SELECT mean(col) FROM VALUES (1), (2), (3) AS tab(col); - +---------+ - |mean(col)| - +---------+ - | 2.0| - +---------+ - -SELECT mean(col) FROM VALUES (1), (2), (NULL) AS tab(col); - +---------+ - |mean(col)| - +---------+ - | 1.5| - +---------+ - --- min -SELECT min(col) FROM VALUES (10), (-1), (20) AS tab(col); - +--------+ - |min(col)| - +--------+ - | -1| - +--------+ - --- min_by -SELECT min_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y); - +-----------+ - |minby(x, y)| - +-----------+ - | a| - +-----------+ - --- percentile -SELECT percentile(col, 0.3) FROM VALUES (0), (10) AS tab(col); - +---------------------------------------+ - |percentile(col, CAST(0.3 AS DOUBLE), 1)| - +---------------------------------------+ - | 3.0| - +---------------------------------------+ - -SELECT percentile(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col); - +-------------------------------------+ - |percentile(col, array(0.25, 0.75), 1)| - +-------------------------------------+ - | [2.5, 7.5]| - +-------------------------------------+ - --- percentile_approx -SELECT percentile_approx(10.0, array(0.5, 0.4, 0.1), 100); - +--------------------------------------------------+ - |percentile_approx(10.0, array(0.5, 0.4, 0.1), 100)| - +--------------------------------------------------+ - | [10.0, 10.0, 10.0]| - +--------------------------------------------------+ - -SELECT percentile_approx(10.0, 0.5, 100); - +-------------------------------------------------+ - |percentile_approx(10.0, CAST(0.5 AS DOUBLE), 100)| - +-------------------------------------------------+ - | 10.0| - +-------------------------------------------------+ - --- skewness -SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); - +-----------------------------+ - |skewness(CAST(col AS DOUBLE))| - +-----------------------------+ - | 1.1135657469022013| - +-----------------------------+ - -SELECT skewness(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col); - +-----------------------------+ - |skewness(CAST(col AS DOUBLE))| - +-----------------------------+ - | -1.1135657469022011| - +-----------------------------+ - --- some -SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col); - +---------+ - |some(col)| - +---------+ - | true| - +---------+ - 
-SELECT some(col) FROM VALUES (NULL), (true), (false) AS tab(col); - +---------+ - |some(col)| - +---------+ - | true| - +---------+ - -SELECT some(col) FROM VALUES (false), (false), (NULL) AS tab(col); - +---------+ - |some(col)| - +---------+ - | false| - +---------+ - --- std -SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col); - +------------------------+ - |std(CAST(col AS DOUBLE))| - +------------------------+ - | 1.0| - +------------------------+ - --- stddev -SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col); - +---------------------------+ - |stddev(CAST(col AS DOUBLE))| - +---------------------------+ - | 1.0| - +---------------------------+ - --- stddev_pop -SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col); - +-------------------------------+ - |stddev_pop(CAST(col AS DOUBLE))| - +-------------------------------+ - | 0.816496580927726| - +-------------------------------+ - --- stddev_samp -SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col); - +--------------------------------+ - |stddev_samp(CAST(col AS DOUBLE))| - +--------------------------------+ - | 1.0| - +--------------------------------+ - --- sum -SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col); - +--------+ - |sum(col)| - +--------+ - | 30| - +--------+ - -SELECT sum(col) FROM VALUES (NULL), (10), (15) AS tab(col); - +--------+ - |sum(col)| - +--------+ - | 25| - +--------+ - -SELECT sum(col) FROM VALUES (NULL), (NULL) AS tab(col); - +--------+ - |sum(col)| - +--------+ - | null| - +--------+ - --- var_pop -SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col); - +----------------------------+ - |var_pop(CAST(col AS DOUBLE))| - +----------------------------+ - | 0.6666666666666666| - +----------------------------+ - --- var_samp -SELECT var_samp(col) FROM VALUES (1), (2), (3) AS tab(col); - +-----------------------------+ - |var_samp(CAST(col AS DOUBLE))| - +-----------------------------+ - | 1.0| - +-----------------------------+ - --- variance -SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col); - +-----------------------------+ - |variance(CAST(col AS DOUBLE))| - +-----------------------------+ - | 1.0| - +-----------------------------+ - -{% endhighlight %} - -### Array Functions - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-<table class="table">
-  <thead>
-    <tr>
-      <th style="width:25%">Function</th>
-      <th>Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>array_contains(array, value)</td>
-      <td>Returns true if the array contains the value.</td>
-    </tr>
-    <tr>
-      <td>array_distinct(array)</td>
-      <td>Removes duplicate values from the array.</td>
-    </tr>
-    <tr>
-      <td>array_except(array1, array2)</td>
-      <td>Returns an array of the elements in array1 but not in array2, without
-        duplicates.</td>
-    </tr>
-    <tr>
-      <td>array_intersect(array1, array2)</td>
-      <td>Returns an array of the elements in the intersection of array1 and array2,
-        without duplicates.</td>
-    </tr>
-    <tr>
-      <td>array_join(array, delimiter[, nullReplacement])</td>
-      <td>Concatenates the elements of the given array using the delimiter and an optional
-        string to replace nulls. If no value is set for nullReplacement, any null value is
-        filtered.</td>
-    </tr>
-    <tr>
-      <td>array_max(array)</td>
-      <td>Returns the maximum value in the array. NULL elements are skipped.</td>
-    </tr>
-    <tr>
-      <td>array_min(array)</td>
-      <td>Returns the minimum value in the array. NULL elements are skipped.</td>
-    </tr>
-    <tr>
-      <td>array_position(array, element)</td>
-      <td>Returns the (1-based) index of the first element of the array as a long.</td>
-    </tr>
-    <tr>
-      <td>array_remove(array, element)</td>
-      <td>Removes all elements that are equal to element from array.</td>
-    </tr>
-    <tr>
-      <td>array_repeat(element, count)</td>
-      <td>Returns the array containing element count times.</td>
-    </tr>
-    <tr>
-      <td>array_union(array1, array2)</td>
-      <td>Returns an array of the elements in the union of array1 and array2, without
-        duplicates.</td>
-    </tr>
-    <tr>
-      <td>arrays_overlap(a1, a2)</td>
-      <td>Returns true if a1 contains at least a non-null element present also in a2. If the
-        arrays have no common element, they are both non-empty, and either of them contains a
-        null element, null is returned; false otherwise.</td>
-    </tr>
-    <tr>
-      <td>arrays_zip(a1, a2, ...)</td>
-      <td>Returns a merged array of structs in which the N-th struct contains all N-th values
-        of input arrays.</td>
-    </tr>
-    <tr>
-      <td>concat(col1, col2, ..., colN)</td>
-      <td>Returns the concatenation of col1, col2, ..., colN.</td>
-    </tr>
-    <tr>
-      <td>flatten(arrayOfArrays)</td>
-      <td>Transforms an array of arrays into a single array.</td>
-    </tr>
-    <tr>
-      <td>reverse(array)</td>
-      <td>Returns a reversed string or an array with reverse order of elements.</td>
-    </tr>
-    <tr>
-      <td>sequence(start, stop, step)</td>
-      <td>Generates an array of elements from start to stop (inclusive), incrementing by step.
-        The type of the returned elements is the same as the type of the argument expressions.
-        Supported types are: byte, short, integer, long, date, timestamp. The start and stop
-        expressions must resolve to the same type. If the start and stop expressions resolve to
-        the 'date' or 'timestamp' type, then the step expression must resolve to the 'interval'
-        type; otherwise it must resolve to the same type as the start and stop expressions.</td>
-    </tr>
-    <tr>
-      <td>shuffle(array)</td>
-      <td>Returns a random permutation of the given array.</td>
-    </tr>
-    <tr>
-      <td>slice(x, start, length)</td>
-      <td>Subsets array x starting from index start (array indices start at 1, or starting
-        from the end if start is negative) with the specified length.</td>
-    </tr>
-    <tr>
-      <td>sort_array(array[, ascendingOrder])</td>
-      <td>Sorts the input array in ascending or descending order according to the natural
-        ordering of the array elements. Null elements will be placed at the beginning of the
-        returned array in ascending order or at the end of the returned array in descending
-        order.</td>
-    </tr>
-  </tbody>
-</table>
- -#### Examples - -{% highlight sql %} --- array_contains -SELECT array_contains(array(1, 2, 3), 2); - +---------------------------------+ - |array_contains(array(1, 2, 3), 2)| - +---------------------------------+ - | true| - +---------------------------------+ - --- array_distinct -SELECT array_distinct(array(1, 2, 3, null, 3)); - +----------------------------------------------------+ - |array_distinct(array(1, 2, 3, CAST(NULL AS INT), 3))| - +----------------------------------------------------+ - | [1, 2, 3,]| - +----------------------------------------------------+ - --- array_except -SELECT array_except(array(1, 2, 3), array(1, 3, 5)); - +--------------------------------------------+ - |array_except(array(1, 2, 3), array(1, 3, 5))| - +--------------------------------------------+ - | [2]| - +--------------------------------------------+ - --- array_intersect -SELECT array_intersect(array(1, 2, 3), array(1, 3, 5)); - +-----------------------------------------------+ - |array_intersect(array(1, 2, 3), array(1, 3, 5))| - +-----------------------------------------------+ - | [1, 3]| - +-----------------------------------------------+ - --- array_join -SELECT array_join(array('hello', 'world'), ' '); - +----------------------------------+ - |array_join(array(hello, world), )| - +----------------------------------+ - | hello world| - +----------------------------------+ - -SELECT array_join(array('hello', null ,'world'), ' '); - +--------------------------------------------------------+ - |array_join(array(hello, CAST(NULL AS STRING), world), )| - +--------------------------------------------------------+ - | hello world| - +--------------------------------------------------------+ - -SELECT array_join(array('hello', null ,'world'), ' ', ','); - +-----------------------------------------------------------+ - |array_join(array(hello, CAST(NULL AS STRING), world), , ,)| - +-----------------------------------------------------------+ - | hello , world| - +-----------------------------------------------------------+ - --- array_max -SELECT array_max(array(1, 20, null, 3)); - +---------------------------------------------+ - |array_max(array(1, 20, CAST(NULL AS INT), 3))| - +---------------------------------------------+ - | 20| - +---------------------------------------------+ - --- array_min -SELECT array_min(array(1, 20, null, 3)); - +---------------------------------------------+ - |array_min(array(1, 20, CAST(NULL AS INT), 3))| - +---------------------------------------------+ - | 1| - +---------------------------------------------+ - --- array_position -SELECT array_position(array(3, 2, 1), 1); - +---------------------------------+ - |array_position(array(3, 2, 1), 1)| - +---------------------------------+ - | 3| - +---------------------------------+ - --- array_remove -SELECT array_remove(array(1, 2, 3, null, 3), 3); - +-----------------------------------------------------+ - |array_remove(array(1, 2, 3, CAST(NULL AS INT), 3), 3)| - +-----------------------------------------------------+ - | [1, 2,]| - +-----------------------------------------------------+ - --- array_repeat -SELECT array_repeat('123', 2); - +--------------------+ - |array_repeat(123, 2)| - +--------------------+ - | [123, 123]| - +--------------------+ - --- array_union -SELECT array_union(array(1, 2, 3), array(1, 3, 5)); - +-------------------------------------------+ - |array_union(array(1, 2, 3), array(1, 3, 5))| - +-------------------------------------------+ - | [1, 2, 3, 5]| - 
+-------------------------------------------+ - --- arrays_overlap -SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5)); - +----------------------------------------------+ - |arrays_overlap(array(1, 2, 3), array(3, 4, 5))| - +----------------------------------------------+ - | true| - +----------------------------------------------+ - --- arrays_zip -SELECT arrays_zip(array(1, 2, 3), array(2, 3, 4)); - +------------------------------------------+ - |arrays_zip(array(1, 2, 3), array(2, 3, 4))| - +------------------------------------------+ - | [[1, 2], [2, 3], ...| - +------------------------------------------+ - -SELECT arrays_zip(array(1, 2), array(2, 3), array(3, 4)); - +-------------------------------------------------+ - |arrays_zip(array(1, 2), array(2, 3), array(3, 4))| - +-------------------------------------------------+ - | [[1, 2, 3], [2, 3...| - +-------------------------------------------------+ - --- concat -SELECT concat('Spark', 'SQL'); - +------------------+ - |concat(Spark, SQL)| - +------------------+ - | SparkSQL| - +------------------+ - -SELECT concat(array(1, 2, 3), array(4, 5), array(6)); - +---------------------------------------------+ - |concat(array(1, 2, 3), array(4, 5), array(6))| - +---------------------------------------------+ - | [1, 2, 3, 4, 5, 6]| - +---------------------------------------------+ - --- flatten -SELECT flatten(array(array(1, 2), array(3, 4))); - +----------------------------------------+ - |flatten(array(array(1, 2), array(3, 4)))| - +----------------------------------------+ - | [1, 2, 3, 4]| - +----------------------------------------+ - --- reverse -SELECT reverse('Spark SQL'); - +------------------+ - |reverse(Spark SQL)| - +------------------+ - | LQS krapS| - +------------------+ - -SELECT reverse(array(2, 1, 4, 3)); - +--------------------------+ - |reverse(array(2, 1, 4, 3))| - +--------------------------+ - | [3, 4, 1, 2]| - +--------------------------+ - --- sequence -SELECT sequence(1, 5); - +---------------+ - | sequence(1, 5)| - +---------------+ - |[1, 2, 3, 4, 5]| - +---------------+ - -SELECT sequence(5, 1); - +---------------+ - | sequence(5, 1)| - +---------------+ - |[5, 4, 3, 2, 1]| - +---------------+ - -SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month); - +---------------------------------------------------------------------------+ - |sequence(to_date('2018-01-01'), to_date('2018-03-01'), INTERVAL '1 months')| - +---------------------------------------------------------------------------+ - | [2018-01-01, 2018...| - +---------------------------------------------------------------------------+ - --- shuffle -SELECT shuffle(array(1, 20, 3, 5)); - +---------------------------+ - |shuffle(array(1, 20, 3, 5))| - +---------------------------+ - | [1, 3, 20, 5]| - +---------------------------+ - -SELECT shuffle(array(1, 20, null, 3)); - +-------------------------------------------+ - |shuffle(array(1, 20, CAST(NULL AS INT), 3))| - +-------------------------------------------+ - | [1,, 20, 3]| - +-------------------------------------------+ - --- slice -SELECT slice(array(1, 2, 3, 4), 2, 2); - +------------------------------+ - |slice(array(1, 2, 3, 4), 2, 2)| - +------------------------------+ - | [2, 3]| - +------------------------------+ - -SELECT slice(array(1, 2, 3, 4), -2, 2); - +-------------------------------+ - |slice(array(1, 2, 3, 4), -2, 2)| - +-------------------------------+ - | [3, 4]| - +-------------------------------+ - --- sort_array -SELECT sort_array(array('b', 'd', null, 
'c', 'a'), true); - +---------------------------------------------------------+ - |sort_array(array(b, d, CAST(NULL AS STRING), c, a), true)| - +---------------------------------------------------------+ - | [, a, b, c, d]| - +---------------------------------------------------------+ - -{% endhighlight %} - -### Date and Timestamp Functions - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-<table class="table">
-  <thead>
-    <tr>
-      <th style="width:25%">Function</th>
-      <th>Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>add_months(start_date, num_months)</td>
-      <td>Returns the date that is `num_months` after `start_date`.</td>
-    </tr>
-    <tr>
-      <td>current_date()</td>
-      <td>Returns the current date at the start of query evaluation.</td>
-    </tr>
-    <tr>
-      <td>current_timestamp()</td>
-      <td>Returns the current timestamp at the start of query evaluation.</td>
-    </tr>
-    <tr>
-      <td>date_add(start_date, num_days)</td>
-      <td>Returns the date that is `num_days` after `start_date`.</td>
-    </tr>
-    <tr>
-      <td>date_format(timestamp, fmt)</td>
-      <td>Converts `timestamp` to a value of string in the format specified by the date
-        format `fmt`.</td>
-    </tr>
-    <tr>
-      <td>date_part(field, source)</td>
-      <td>Extracts a part of the date/timestamp or interval source.</td>
-    </tr>
-    <tr>
-      <td>date_sub(start_date, num_days)</td>
-      <td>Returns the date that is `num_days` before `start_date`.</td>
-    </tr>
-    <tr>
-      <td>date_trunc(fmt, ts)</td>
-      <td>Returns timestamp `ts` truncated to the unit specified by the format model `fmt`.
-        `fmt` should be one of ["MILLENNIUM", "CENTURY", "DECADE", "YEAR", "YYYY", "YY",
-        "QUARTER", "MON", "MONTH", "MM", "WEEK", "DAY", "DD", "HOUR", "MINUTE", "SECOND",
-        "MILLISECOND", "MICROSECOND"].</td>
-    </tr>
-    <tr>
-      <td>datediff(endDate, startDate)</td>
-      <td>Returns the number of days from `startDate` to `endDate`.</td>
-    </tr>
-    <tr>
-      <td>dayofweek(date)</td>
-      <td>Returns the day of the week for date/timestamp (1 = Sunday, 2 = Monday, ...,
-        7 = Saturday).</td>
-    </tr>
-    <tr>
-      <td>dayofyear(date)</td>
-      <td>Returns the day of year of the date/timestamp.</td>
-    </tr>
-    <tr>
-      <td>from_unixtime(unix_time, format)</td>
-      <td>Returns `unix_time` in the specified `format`.</td>
-    </tr>
-    <tr>
-      <td>from_utc_timestamp(timestamp, timezone)</td>
-      <td>Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and
-        renders that time as a timestamp in the given time zone. For example, 'GMT+1' would
-        yield '2017-07-14 03:40:00.0'.</td>
-    </tr>
-    <tr>
-      <td>hour(timestamp)</td>
-      <td>Returns the hour component of the string/timestamp.</td>
-    </tr>
-    <tr>
-      <td>last_day(date)</td>
-      <td>Returns the last day of the month which the date belongs to.</td>
-    </tr>
-    <tr>
-      <td>make_date(year, month, day)</td>
-      <td>Create date from year, month and day fields.</td>
-    </tr>
-    <tr>
-      <td>make_timestamp(year, month, day, hour, min, sec[, timezone])</td>
-      <td>Create timestamp from year, month, day, hour, min, sec and timezone fields.</td>
-    </tr>
-    <tr>
-      <td>minute(timestamp)</td>
-      <td>Returns the minute component of the string/timestamp.</td>
-    </tr>
-    <tr>
-      <td>month(date)</td>
-      <td>Returns the month component of the date/timestamp.</td>
-    </tr>
-    <tr>
-      <td>months_between(timestamp1, timestamp2[, roundOff])</td>
-      <td>If `timestamp1` is later than `timestamp2`, then the result is positive. If
-        `timestamp1` and `timestamp2` are on the same day of month, or both are the last day
-        of month, time of day will be ignored. Otherwise, the difference is calculated based
-        on 31 days per month, and rounded to 8 digits unless roundOff=false.</td>
-    </tr>
-    <tr>
-      <td>next_day(start_date, day_of_week)</td>
-      <td>Returns the first date which is later than `start_date` and named as indicated.</td>
-    </tr>
-    <tr>
-      <td>now()</td>
-      <td>Returns the current timestamp at the start of query evaluation.</td>
-    </tr>
-    <tr>
-      <td>quarter(date)</td>
-      <td>Returns the quarter of the year for date, in the range 1 to 4.</td>
-    </tr>
-    <tr>
-      <td>second(timestamp)</td>
-      <td>Returns the second component of the string/timestamp.</td>
-    </tr>
-    <tr>
-      <td>to_date(date_str[, fmt])</td>
-      <td>Parses the `date_str` expression with the `fmt` expression to a date. Returns null
-        with invalid input. By default, it follows casting rules to a date if the `fmt` is
-        omitted.</td>
-    </tr>
-    <tr>
-      <td>to_timestamp(timestamp_str[, fmt])</td>
-      <td>Parses the `timestamp_str` expression with the `fmt` expression to a timestamp.
-        Returns null with invalid input. By default, it follows casting rules to a timestamp
-        if the `fmt` is omitted.</td>
-    </tr>
-    <tr>
-      <td>to_unix_timestamp(timeExp[, format])</td>
-      <td>Returns the UNIX timestamp of the given time.</td>
-    </tr>
-    <tr>
-      <td>to_utc_timestamp(timestamp, timezone)</td>
-      <td>Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the
-        given time zone, and renders that time as a timestamp in UTC. For example, 'GMT+1'
-        would yield '2017-07-14 01:40:00.0'.</td>
-    </tr>
-    <tr>
-      <td>trunc(date, fmt)</td>
-      <td>Returns `date` with the time portion of the day truncated to the unit specified by
-        the format model `fmt`. `fmt` should be one of ["week", "mon", "month", "mm",
-        "quarter", "year", "yyyy", "yy", "decade", "century", "millennium"].</td>
-    </tr>
-    <tr>
-      <td>unix_timestamp([timeExp[, format]])</td>
-      <td>Returns the UNIX timestamp of current or specified time.</td>
-    </tr>
-    <tr>
-      <td>weekday(date)</td>
-      <td>Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ...,
-        6 = Sunday).</td>
-    </tr>
-    <tr>
-      <td>weekofyear(date)</td>
-      <td>Returns the week of the year of the given date. A week is considered to start on a
-        Monday and week 1 is the first week with >3 days.</td>
-    </tr>
-    <tr>
-      <td>year(date)</td>
-      <td>Returns the year component of the date/timestamp.</td>
-    </tr>
-  </tbody>
-</table>
- -#### Examples - -{% highlight sql %} --- add_months -SELECT add_months('2016-08-31', 1); - +---------------------------------------+ - |add_months(CAST(2016-08-31 AS DATE), 1)| - +---------------------------------------+ - | 2016-09-30| - +---------------------------------------+ - --- current_date --- current_timestamp --- date_add -SELECT date_add('2016-07-30', 1); - +-------------------------------------+ - |date_add(CAST(2016-07-30 AS DATE), 1)| - +-------------------------------------+ - | 2016-07-31| - +-------------------------------------+ - --- date_format -SELECT date_format('2016-04-08', 'y'); - +---------------------------------------------+ - |date_format(CAST(2016-04-08 AS TIMESTAMP), y)| - +---------------------------------------------+ - | 2016| - +---------------------------------------------+ - --- date_part -SELECT date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456'); - +---------------------------------------------------------+ - |date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456')| - +---------------------------------------------------------+ - | 2019| - +---------------------------------------------------------+ - -SELECT date_part('week', timestamp'2019-08-12 01:00:00.123456'); - +---------------------------------------------------------+ - |date_part('week', TIMESTAMP '2019-08-12 01:00:00.123456')| - +---------------------------------------------------------+ - | 33| - +---------------------------------------------------------+ - -SELECT date_part('doy', DATE'2019-08-12'); - +-----------------------------------+ - |date_part('doy', DATE '2019-08-12')| - +-----------------------------------+ - | 224| - +-----------------------------------+ - -SELECT date_part('SECONDS', timestamp'2019-10-01 00:00:01.000001'); - +------------------------------------------------------------+ - |date_part('SECONDS', TIMESTAMP '2019-10-01 00:00:01.000001')| - +------------------------------------------------------------+ - | 1.000001| - +------------------------------------------------------------+ - -SELECT date_part('days', interval 1 year 10 months 5 days); - +------------------------------------------------------+ - |date_part('days', INTERVAL '1 years 10 months 5 days')| - +------------------------------------------------------+ - | 5| - +------------------------------------------------------+ - -SELECT date_part('seconds', interval 5 hours 30 seconds 1 milliseconds 1 microseconds); - +----------------------------------------------------------+ - |date_part('seconds', INTERVAL '5 hours 30.001001 seconds')| - +----------------------------------------------------------+ - | 30.001001| - +----------------------------------------------------------+ - --- date_sub -SELECT date_sub('2016-07-30', 1); - +-------------------------------------+ - |date_sub(CAST(2016-07-30 AS DATE), 1)| - +-------------------------------------+ - | 2016-07-29| - +-------------------------------------+ - --- date_trunc -SELECT date_trunc('YEAR', '2015-03-05T09:32:05.359'); - +------------------------------------------------------------+ - |date_trunc(YEAR, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| - +------------------------------------------------------------+ - | 2015-01-01 00:00:00| - +------------------------------------------------------------+ - -SELECT date_trunc('MM', '2015-03-05T09:32:05.359'); - +----------------------------------------------------------+ - |date_trunc(MM, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))| - +----------------------------------------------------------+ - | 
2015-03-01 00:00:00|
-  +----------------------------------------------------------+
-
-SELECT date_trunc('DD', '2015-03-05T09:32:05.359');
-  +----------------------------------------------------------+
-  |date_trunc(DD, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))|
-  +----------------------------------------------------------+
-  |                                       2015-03-05 00:00:00|
-  +----------------------------------------------------------+
-
-SELECT date_trunc('HOUR', '2015-03-05T09:32:05.359');
-  +------------------------------------------------------------+
-  |date_trunc(HOUR, CAST(2015-03-05T09:32:05.359 AS TIMESTAMP))|
-  +------------------------------------------------------------+
-  |                                         2015-03-05 09:00:00|
-  +------------------------------------------------------------+
-
-SELECT date_trunc('MILLISECOND', '2015-03-05T09:32:05.123456');
-  +----------------------------------------------------------------------+
-  |date_trunc(MILLISECOND, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))|
-  +----------------------------------------------------------------------+
-  |                                                  2015-03-05 09:32:...|
-  +----------------------------------------------------------------------+
-
-SELECT date_trunc('DECADE', '2015-03-05T09:32:05.123456');
-  +-----------------------------------------------------------------+
-  |date_trunc(DECADE, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))|
-  +-----------------------------------------------------------------+
-  |                                              2010-01-01 00:00:00|
-  +-----------------------------------------------------------------+
-
-SELECT date_trunc('CENTURY', '2015-03-05T09:32:05.123456');
-  +------------------------------------------------------------------+
-  |date_trunc(CENTURY, CAST(2015-03-05T09:32:05.123456 AS TIMESTAMP))|
-  +------------------------------------------------------------------+
-  |                                               2001-01-01 00:00:00|
-  +------------------------------------------------------------------+
-
--- datediff
-SELECT datediff('2009-07-31', '2009-07-30');
-  +------------------------------------------------------------+
-  |datediff(CAST(2009-07-31 AS DATE), CAST(2009-07-30 AS DATE))|
-  +------------------------------------------------------------+
-  |                                                           1|
-  +------------------------------------------------------------+
-
-SELECT datediff('2009-07-30', '2009-07-31');
-  +------------------------------------------------------------+
-  |datediff(CAST(2009-07-30 AS DATE), CAST(2009-07-31 AS DATE))|
-  +------------------------------------------------------------+
-  |                                                          -1|
-  +------------------------------------------------------------+
-
--- dayofweek
-SELECT dayofweek('2009-07-30');
-  +-----------------------------------+
-  |dayofweek(CAST(2009-07-30 AS DATE))|
-  +-----------------------------------+
-  |                                  5|
-  +-----------------------------------+
-
--- dayofyear
-SELECT dayofyear('2016-04-09');
-  +-----------------------------------+
-  |dayofyear(CAST(2016-04-09 AS DATE))|
-  +-----------------------------------+
-  |                                100|
-  +-----------------------------------+
-
--- from_unixtime
-SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss');
-  +-----------------------------------------------------+
-  |from_unixtime(CAST(0 AS BIGINT), yyyy-MM-dd HH:mm:ss)|
-  +-----------------------------------------------------+
-  |                                  1970-01-01 09:00:00|
-  +-----------------------------------------------------+
-
--- from_utc_timestamp
-SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul');
-  +-------------------------------------------------------------+
-  |from_utc_timestamp(CAST(2016-08-31 AS TIMESTAMP), Asia/Seoul)|
-  +-------------------------------------------------------------+
-  |                                          2016-08-31 09:00:00|
-  +-------------------------------------------------------------+
-
--- hour
-SELECT hour('2009-07-30 12:58:59');
-  +--------------------------------------------+
-  |hour(CAST(2009-07-30 12:58:59 AS TIMESTAMP))|
-  +--------------------------------------------+
-  |                                          12|
-  +--------------------------------------------+
-
--- last_day
-SELECT last_day('2009-01-12');
-  +----------------------------------+
-  |last_day(CAST(2009-01-12 AS DATE))|
-  +----------------------------------+
-  |                        2009-01-31|
-  +----------------------------------+
-
--- make_date
-SELECT make_date(2013, 7, 15);
-  +----------------------+
-  |make_date(2013, 7, 15)|
-  +----------------------+
-  |            2013-07-15|
-  +----------------------+
-
-SELECT make_date(2019, 13, 1);
-  +----------------------+
-  |make_date(2019, 13, 1)|
-  +----------------------+
-  |                  null|
-  +----------------------+
-
-SELECT make_date(2019, 7, NULL);
-  +-------------------------------------+
-  |make_date(2019, 7, CAST(NULL AS INT))|
-  +-------------------------------------+
-  |                                 null|
-  +-------------------------------------+
-
-SELECT make_date(2019, 2, 30);
-  +----------------------+
-  |make_date(2019, 2, 30)|
-  +----------------------+
-  |                  null|
-  +----------------------+
-
--- make_timestamp
-SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887);
-  +-----------------------------------------------------------------+
-  |make_timestamp(2014, 12, 28, 6, 30, CAST(45.887 AS DECIMAL(8,6)))|
-  +-----------------------------------------------------------------+
-  |                                             2014-12-28 06:30:...|
-  +-----------------------------------------------------------------+
-
-SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET');
-  +----------------------------------------------------------------------+
-  |make_timestamp(2014, 12, 28, 6, 30, CAST(45.887 AS DECIMAL(8,6)), CET)|
-  +----------------------------------------------------------------------+
-  |                                                  2014-12-28 14:30:...|
-  +----------------------------------------------------------------------+
-
-SELECT make_timestamp(2019, 6, 30, 23, 59, 60);
-  +-------------------------------------------------------------+
-  |make_timestamp(2019, 6, 30, 23, 59, CAST(60 AS DECIMAL(8,6)))|
-  +-------------------------------------------------------------+
-  |                                          2019-07-01 00:00:00|
-  +-------------------------------------------------------------+
-
-SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 'PST');
-  +------------------------------------------------------------------+
-  |make_timestamp(2019, 13, 1, 10, 11, CAST(12 AS DECIMAL(8,6)), PST)|
-  +------------------------------------------------------------------+
-  |                                                              null|
-  +------------------------------------------------------------------+
-
-SELECT make_timestamp(null, 7, 22, 15, 30, 0);
-  +-------------------------------------------------------------------------+
-  |make_timestamp(CAST(NULL AS INT), 7, 22, 15, 30, CAST(0 AS DECIMAL(8,6)))|
-  +-------------------------------------------------------------------------+
-  |                                                                     null|
-  +-------------------------------------------------------------------------+
-
--- minute
-SELECT minute('2009-07-30 12:58:59');
-  +----------------------------------------------+
-  |minute(CAST(2009-07-30 12:58:59 AS TIMESTAMP))|
-  +----------------------------------------------+
-  |                                            58|
-  +----------------------------------------------+
-
--- month
-SELECT month('2016-07-30');
-  +-------------------------------+
-  |month(CAST(2016-07-30 AS DATE))|
-  +-------------------------------+
-  |                              7|
-  +-------------------------------+
-
--- months_between
-SELECT months_between('1997-02-28 10:30:00', '1996-10-30');
-  +-------------------------------------------------------------------------------------------+
-  |months_between(CAST(1997-02-28 10:30:00 AS TIMESTAMP), CAST(1996-10-30 AS TIMESTAMP), true)|
-  +-------------------------------------------------------------------------------------------+
-  |                                                                                 3.94959677|
-  +-------------------------------------------------------------------------------------------+
-
-SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false);
-  +--------------------------------------------------------------------------------------------+
-  |months_between(CAST(1997-02-28 10:30:00 AS TIMESTAMP), CAST(1996-10-30 AS TIMESTAMP), false)|
-  +--------------------------------------------------------------------------------------------+
-  |                                                                          3.9495967741935485|
-  +--------------------------------------------------------------------------------------------+
-
--- next_day
-SELECT next_day('2015-01-14', 'TU');
-  +--------------------------------------+
-  |next_day(CAST(2015-01-14 AS DATE), TU)|
-  +--------------------------------------+
-  |                            2015-01-20|
-  +--------------------------------------+
-
--- now
--- quarter
-SELECT quarter('2016-08-31');
-  +---------------------------------+
-  |quarter(CAST(2016-08-31 AS DATE))|
-  +---------------------------------+
-  |                                3|
-  +---------------------------------+
-
--- second
-SELECT second('2009-07-30 12:58:59');
-  +----------------------------------------------+
-  |second(CAST(2009-07-30 12:58:59 AS TIMESTAMP))|
-  +----------------------------------------------+
-  |                                            59|
-  +----------------------------------------------+
-
--- to_date
-SELECT to_date('2009-07-30 04:17:52');
-  +------------------------------+
-  |to_date('2009-07-30 04:17:52')|
-  +------------------------------+
-  |                    2009-07-30|
-  +------------------------------+
-
-SELECT to_date('2016-12-31', 'yyyy-MM-dd');
-  +-----------------------------------+
-  |to_date('2016-12-31', 'yyyy-MM-dd')|
-  +-----------------------------------+
-  |                         2016-12-31|
-  +-----------------------------------+
-
--- to_timestamp
-SELECT to_timestamp('2016-12-31 00:12:00');
-  +-----------------------------------+
-  |to_timestamp('2016-12-31 00:12:00')|
-  +-----------------------------------+
-  |                2016-12-31 00:12:00|
-  +-----------------------------------+
-
-SELECT to_timestamp('2016-12-31', 'yyyy-MM-dd');
-  +----------------------------------------+
-  |to_timestamp('2016-12-31', 'yyyy-MM-dd')|
-  +----------------------------------------+
-  |                     2016-12-31 00:00:00|
-  +----------------------------------------+
-
--- to_unix_timestamp
-SELECT to_unix_timestamp('2016-04-08', 'yyyy-MM-dd');
-  +-----------------------------------------+
-  |to_unix_timestamp(2016-04-08, yyyy-MM-dd)|
-  +-----------------------------------------+
-  |                               1460041200|
-  +-----------------------------------------+
-
--- to_utc_timestamp
-SELECT to_utc_timestamp('2016-08-31', 'Asia/Seoul');
-  +-----------------------------------------------------------+
-  |to_utc_timestamp(CAST(2016-08-31 AS TIMESTAMP), Asia/Seoul)|
-  +-----------------------------------------------------------+
-  |                                        2016-08-30 15:00:00|
-  +-----------------------------------------------------------+
-
--- trunc
-SELECT trunc('2019-08-04', 'week');
-  +-------------------------------------+
-  |trunc(CAST(2019-08-04 AS DATE), week)|
-  +-------------------------------------+
-  |                           2019-07-29|
-  +-------------------------------------+
-
-SELECT trunc('2019-08-04', 'quarter');
-  +----------------------------------------+
-  |trunc(CAST(2019-08-04 AS DATE), quarter)|
-  +----------------------------------------+
-  |                              2019-07-01|
-  +----------------------------------------+
-
-SELECT trunc('2009-02-12', 'MM');
-  +-----------------------------------+
-  |trunc(CAST(2009-02-12 AS DATE), MM)|
-  +-----------------------------------+
-  |                         2009-02-01|
-  +-----------------------------------+
-
-SELECT trunc('2015-10-27', 'YEAR');
-  +-------------------------------------+
-  |trunc(CAST(2015-10-27 AS DATE), YEAR)|
-  +-------------------------------------+
-  |                           2015-01-01|
-  +-------------------------------------+
-
-SELECT trunc('2015-10-27', 'DECADE');
-  +---------------------------------------+
-  |trunc(CAST(2015-10-27 AS DATE), DECADE)|
-  +---------------------------------------+
-  |                             2010-01-01|
-  +---------------------------------------+
-
-SELECT trunc('1981-01-19', 'century');
-  +----------------------------------------+
-  |trunc(CAST(1981-01-19 AS DATE), century)|
-  +----------------------------------------+
-  |                              1901-01-01|
-  +----------------------------------------+
-
-SELECT trunc('1981-01-19', 'millennium');
-  +-------------------------------------------+
-  |trunc(CAST(1981-01-19 AS DATE), millennium)|
-  +-------------------------------------------+
-  |                                 1001-01-01|
-  +-------------------------------------------+
-
--- unix_timestamp
-SELECT unix_timestamp();
-  +--------------------------------------------------------+
-  |unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)|
-  +--------------------------------------------------------+
-  |                                              1587002726|
-  +--------------------------------------------------------+
-
-SELECT unix_timestamp('2016-04-08', 'yyyy-MM-dd');
-  +--------------------------------------+
-  |unix_timestamp(2016-04-08, yyyy-MM-dd)|
-  +--------------------------------------+
-  |                            1460041200|
-  +--------------------------------------+
-
--- weekday
-SELECT weekday('2009-07-30');
-  +---------------------------------+
-  |weekday(CAST(2009-07-30 AS DATE))|
-  +---------------------------------+
-  |                                3|
-  +---------------------------------+
-
--- weekofyear
-SELECT weekofyear('2008-02-20');
-  +------------------------------------+
-  |weekofyear(CAST(2008-02-20 AS DATE))|
-  +------------------------------------+
-  |                                   8|
-  +------------------------------------+
-
--- year
-SELECT year('2016-07-30');
-  +------------------------------+
-  |year(CAST(2016-07-30 AS DATE))|
-  +------------------------------+
-  |                          2016|
-  +------------------------------+
-
-{% endhighlight %}
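[Editor's note: several outputs above are session-time-zone dependent — `from_unixtime(0, ...)` prints `1970-01-01 09:00:00` and `unix_timestamp('2016-04-08', ...)` prints `1460041200`, both consistent with the docs having been generated in an Asia/Seoul (UTC+9) session. A minimal sketch of how to pin this down when regenerating; the `SET` value here is an illustrative choice, not part of the patch:]

{% highlight sql %}
-- Force a deterministic session time zone before running the examples.
SET spark.sql.session.timeZone=UTC;

-- In a UTC session this yields 1970-01-01 00:00:00 instead of the
-- 09:00:00 shown in the generated output above.
SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss');
{% endhighlight %}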
-
-### JSON Functions
-
-<table class="table">
-  <thead>
-    <tr><th style="width:25%">Function</th><th>Description</th></tr>
-  </thead>
-  <tbody>
-  <tr>
-    <td>from_json(jsonStr, schema[, options])</td>
-    <td>Returns a struct value with the given `jsonStr` and `schema`.</td>
-  </tr>
-  <tr>
-    <td>get_json_object(json_txt, path)</td>
-    <td>Extracts a json object from `path`.</td>
-  </tr>
-  <tr>
-    <td>json_array_length(jsonArray)</td>
-    <td>Returns the number of elements in the outmost JSON array.</td>
-  </tr>
-  <tr>
-    <td>json_object_keys(json_object)</td>
-    <td>Returns all the keys of the outmost JSON object as an array.</td>
-  </tr>
-  <tr>
-    <td>json_tuple(jsonStr, p1, p2, ..., pn)</td>
-    <td>Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.</td>
-  </tr>
-  <tr>
-    <td>schema_of_json(json[, options])</td>
-    <td>Returns schema in the DDL format of JSON string.</td>
-  </tr>
-  <tr>
-    <td>to_json(expr[, options])</td>
-    <td>Returns a JSON string with a given struct value</td>
-  </tr>
-  </tbody>
-</table>
-
-#### Examples
-
-{% highlight sql %}
--- from_json
-SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE');
-  +---------------------------+
-  |from_json({"a":1, "b":0.8})|
-  +---------------------------+
-  |                   [1, 0.8]|
-  +---------------------------+
-
-SELECT from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'));
-  +--------------------------------+
-  |from_json({"time":"26/08/2015"})|
-  +--------------------------------+
-  |            [2015-08-26 00:00...|
-  +--------------------------------+
-
--- get_json_object
-SELECT get_json_object('{"a":"b"}', '$.a');
-  +-------------------------------+
-  |get_json_object({"a":"b"}, $.a)|
-  +-------------------------------+
-  |                              b|
-  +-------------------------------+
-
--- json_array_length
-SELECT json_array_length('[1,2,3,4]');
-  +----------------------------+
-  |json_array_length([1,2,3,4])|
-  +----------------------------+
-  |                           4|
-  +----------------------------+
-
-SELECT json_array_length('[1,2,3,{"f1":1,"f2":[5,6]},4]');
-  +------------------------------------------------+
-  |json_array_length([1,2,3,{"f1":1,"f2":[5,6]},4])|
-  +------------------------------------------------+
-  |                                               5|
-  +------------------------------------------------+
-
-SELECT json_array_length('[1,2');
-  +-----------------------+
-  |json_array_length([1,2)|
-  +-----------------------+
-  |                   null|
-  +-----------------------+
-
--- json_object_keys
-Select json_object_keys('{}');
-  +--------------------+
-  |json_object_keys({})|
-  +--------------------+
-  |                  []|
-  +--------------------+
-
-Select json_object_keys('{"key": "value"}');
-  +----------------------------------+
-  |json_object_keys({"key": "value"})|
-  +----------------------------------+
-  |                             [key]|
-  +----------------------------------+
-
-Select json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
-  +--------------------------------------------------------+
-  |json_object_keys({"f1":"abc","f2":{"f3":"a", "f4":"b"}})|
-  +--------------------------------------------------------+
-  |                                                [f1, f2]|
-  +--------------------------------------------------------+
-
--- json_tuple
-SELECT json_tuple('{"a":1, "b":2}', 'a', 'b');
-  +---+---+
-  | c0| c1|
-  +---+---+
-  |  1|  2|
-  +---+---+
-
--- schema_of_json
-SELECT schema_of_json('[{"col":0}]');
-  +---------------------------+
-  |schema_of_json([{"col":0}])|
-  +---------------------------+
-  |       array<struct<col:...|
-  +---------------------------+
-
-{% endhighlight %}
-
-### Map Functions
-
-<table class="table">
-  <thead>
-    <tr><th style="width:25%">Function</th><th>Description</th></tr>
-  </thead>
-  <tbody>
-  <tr>
-    <td>map_concat(map, ...)</td>
-    <td>Returns the union of all the given maps</td>
-  </tr>
-  <tr>
-    <td>map_entries(map)</td>
-    <td>Returns an unordered array of all entries in the given map.</td>
-  </tr>
-  <tr>
-    <td>map_from_entries(arrayOfEntries)</td>
-    <td>Returns a map created from the given array of entries.</td>
-  </tr>
-  <tr>
-    <td>map_keys(map)</td>
-    <td>Returns an unordered array containing the keys of the map.</td>
-  </tr>
-  <tr>
-    <td>map_values(map)</td>
-    <td>Returns an unordered array containing the values of the map.</td>
-  </tr>
-  </tbody>
-</table>
-
-#### Examples
-
-{% highlight sql %}
--- map_concat
-SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c'));
-  +--------------------------------------+
-  |map_concat(map(1, a, 2, b), map(3, c))|
-  +--------------------------------------+
-  |                  [1 -> a, 2 -> b, ...|
-  +--------------------------------------+
-
--- map_entries
-SELECT map_entries(map(1, 'a', 2, 'b'));
-  +----------------------------+
-  |map_entries(map(1, a, 2, b))|
-  +----------------------------+
-  |            [[1, a], [2, b]]|
-  +----------------------------+
-
--- map_from_entries
-SELECT map_from_entries(array(struct(1, 'a'), struct(2, 'b')));
-  +---------------------------------------------------------------------------------------+
-  |map_from_entries(array(named_struct(col1, 1, col2, a), named_struct(col1, 2, col2, b)))|
-  +---------------------------------------------------------------------------------------+
-  |                                                                        [1 -> a, 2 -> b]|
-  +---------------------------------------------------------------------------------------+
-
--- map_keys
-SELECT map_keys(map(1, 'a', 2, 'b'));
-  +-------------------------+
-  |map_keys(map(1, a, 2, b))|
-  +-------------------------+
-  |                   [1, 2]|
-  +-------------------------+
-
--- map_values
-SELECT map_values(map(1, 'a', 2, 'b'));
-  +---------------------------+
-  |map_values(map(1, a, 2, b))|
-  +---------------------------+
-  |                     [a, b]|
-  +---------------------------+
-
-{% endhighlight %}
-
-### Window Functions
-
-<table class="table">
-  <thead>
-    <tr><th style="width:25%">Function</th><th>Description</th></tr>
-  </thead>
-  <tbody>
-  <tr>
-    <td>cume_dist()</td>
-    <td>Computes the position of a value relative to all values in the partition.</td>
-  </tr>
-  <tr>
-    <td>dense_rank()</td>
-    <td>Computes the rank of a value in a group of values. The result is one plus the
-      previously assigned rank value. Unlike the function rank, dense_rank will not produce gaps
-      in the ranking sequence.</td>
-  </tr>
-  <tr>
-    <td>lag(input[, offset[, default]])</td>
-    <td>Returns the value of `input` at the `offset`th row before the current row in the window.
-      The default value of `offset` is 1 and the default value of `default` is null. If the
-      value of `input` at the `offset`th row is null, null is returned. If there is no such
-      offset row (e.g., when the offset is 1, the first row of the window does not have any
-      previous row), `default` is returned.</td>
-  </tr>
-  <tr>
-    <td>lead(input[, offset[, default]])</td>
-    <td>Returns the value of `input` at the `offset`th row after the current row in the window.
-      The default value of `offset` is 1 and the default value of `default` is null. If the
-      value of `input` at the `offset`th row is null, null is returned. If there is no such an
-      offset row (e.g., when the offset is 1, the last row of the window does not have any
-      subsequent row), `default` is returned.</td>
-  </tr>
-  <tr>
-    <td>ntile(n)</td>
-    <td>Divides the rows for each window partition into `n` buckets ranging from 1 to at most `n`.</td>
-  </tr>
-  <tr>
-    <td>percent_rank()</td>
-    <td>Computes the percentage ranking of a value in a group of values.</td>
-  </tr>
-  <tr>
-    <td>rank()</td>
-    <td>Computes the rank of a value in a group of values. The result is one plus the number
-      of rows preceding or equal to the current row in the ordering of the partition. The values
-      will produce gaps in the sequence.</td>
-  </tr>
-  <tr>
-    <td>row_number()</td>
-    <td>Assigns a unique, sequential number to each row, starting with one, according to the
-      ordering of rows within the window partition.</td>
-  </tr>
-  </tbody>
-</table>
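[Editor's note: unlike the other groups, the window-function section stops at the table — none of these expressions carry runnable `Examples:` blocks in their `ExpressionDescription`, so the example generator emits nothing for them. A short illustrative query may still help readers; this is an editor's sketch, not generated output, and the inline `emp` relation with its columns and values is invented for the example:]

{% highlight sql %}
-- Rank employees by salary within each department (hypothetical data).
SELECT name, dept, salary,
       rank() OVER (PARTITION BY dept ORDER BY salary DESC) AS rnk,
       dense_rank() OVER (PARTITION BY dept ORDER BY salary DESC) AS dense_rnk,
       lag(salary, 1) OVER (PARTITION BY dept ORDER BY salary DESC) AS prev_salary
FROM VALUES
  ('Alice', 'sales', 100), ('Bob', 'sales', 100), ('Carl', 'sales', 90),
  ('Dave', 'hr', 80), ('Eve', 'hr', 70)
  AS emp(name, dept, salary);
{% endhighlight %}

With the tied `sales` salaries, `rank` yields 1, 1, 3 (leaving a gap) while `dense_rank` yields 1, 1, 2 — exactly the distinction the two table entries above describe.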
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java index c48a38a3654e6..0f648ab9d7d26 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java @@ -40,7 +40,7 @@ * `note()` contains some notes for the expression optionally. * * `group()` describes the category that the expression belongs to. The valid value is - * `agg_funcs`, `datetime_funcs`, or `collection_funcs`. + * "agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs" and "window_funcs". * * `since()` contains version information for the expression. Version is specified by, * for example, "2.2.0". diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index 9070a1ab0b059..c8b6433207355 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -426,7 +426,8 @@ abstract class OffsetWindowFunction * default - a string expression which is to use when the offset is larger than the window. The default value is null. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class Lead(input: Expression, offset: Expression, default: Expression) extends OffsetWindowFunction { @@ -459,7 +460,8 @@ case class Lead(input: Expression, offset: Expression, default: Expression) * offset - an int expression which is rows to jump back in the partition. * default - a string expression which is to use when the offset row does not exist. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class Lag(input: Expression, offset: Expression, default: Expression) extends OffsetWindowFunction { @@ -517,7 +519,8 @@ object SizeBasedWindowFunction { _FUNC_() - Assigns a unique, sequential number to each row, starting with one, according to the ordering of rows within the window partition. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class RowNumber() extends RowNumberLike { override val evaluateExpression = rowNumber override def prettyName: String = "row_number" @@ -535,7 +538,8 @@ case class RowNumber() extends RowNumberLike { usage = """ _FUNC_() - Computes the position of a value relative to all values in the partition. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class CumeDist() extends RowNumberLike with SizeBasedWindowFunction { override def dataType: DataType = DoubleType // The frame for CUME_DIST is Range based instead of Row based, because CUME_DIST must @@ -574,7 +578,8 @@ case class CumeDist() extends RowNumberLike with SizeBasedWindowFunction { * buckets - an int expression which is number of buckets to divide the rows in. Default value is 1. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class NTile(buckets: Expression) extends RowNumberLike with SizeBasedWindowFunction { def this() = this(Literal(1)) @@ -700,7 +705,8 @@ abstract class RankLike extends AggregateWindowFunction { trigger a change in rank. This is an internal parameter and will be assigned by the Analyser. 
""", - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class Rank(children: Seq[Expression]) extends RankLike { def this() = this(Nil) override def withOrder(order: Seq[Expression]): Rank = Rank(order) @@ -725,7 +731,8 @@ case class Rank(children: Seq[Expression]) extends RankLike { trigger a change in rank. This is an internal parameter and will be assigned by the Analyser. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class DenseRank(children: Seq[Expression]) extends RankLike { def this() = this(Nil) override def withOrder(order: Seq[Expression]): DenseRank = DenseRank(order) @@ -756,7 +763,8 @@ case class DenseRank(children: Seq[Expression]) extends RankLike { trigger a change in rank. This is an internal parameter and will be assigned by the Analyser. """, - since = "2.0.0") + since = "2.0.0", + group = "window_funcs") case class PercentRank(children: Seq[Expression]) extends RankLike with SizeBasedWindowFunction { def this() = this(Nil) override def withOrder(order: Seq[Expression]): PercentRank = PercentRank(order) diff --git a/sql/create-docs.sh b/sql/create-docs.sh index 334c269cc630e..9a84b8a523ec7 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -48,7 +48,7 @@ echo "Generating SQL API Markdown files." echo "Generating SQL configuration table HTML file." "$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py -echo "Generating SQL document Makrdown files for bult-in functions." +echo "Generating SQL document Makrdown files for built-in functions." "$SPARK_HOME/bin/spark-submit" gen-sql-builtin-functions-docs.py echo "Generating HTML files for SQL API documentation." diff --git a/sql/gen-sql-builtin-functions-docs.py b/sql/gen-sql-builtin-functions-docs.py index 5a5a4bd6d0169..1f52d63085153 100644 --- a/sql/gen-sql-builtin-functions-docs.py +++ b/sql/gen-sql-builtin-functions-docs.py @@ -143,6 +143,7 @@ def _make_pretty_usage(infos): def _make_pretty_query_example(jspark, query): result = [] + print(" %s" % query) query_output = jspark.sql(query).showString(20, 20, False) result.append(query) result.extend(map(lambda x: " %s" % x, query_output.split("\n"))) @@ -229,6 +230,7 @@ def generate_sql_markdown(jvm, jspark, path): mdfile.write("%s\n\n" % markdown_header) mdfile.write("\n" % filename) + print("Running a SQL example to generate output.") for key, infos in _list_grouped_function_infos(jvm): mdfile.write("\n### %s\n\n" % group_titles[key]) function_table = _make_pretty_usage(infos) @@ -240,7 +242,9 @@ def generate_sql_markdown(jvm, jspark, path): if __name__ == "__main__": jvm = launch_gateway().jvm + print("Initializing Spark Session to generate examples.") jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate() + jspark.sparkContext().setLogLevel("ERROR") spark_root_dir = os.path.dirname(os.path.dirname(__file__)) markdown_file_path = os.path.join(spark_root_dir, "docs/sql-ref-functions-builtin.md") generate_sql_markdown(jvm, jspark, markdown_file_path) diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py index 98212ad373370..b19431eeb5487 100644 --- a/sql/gen-sql-config-docs.py +++ b/sql/gen-sql-config-docs.py @@ -22,8 +22,10 @@ # To avoid adding a new direct dependency, we import markdown from within mkdocs. 
from mkdocs.structure.pages import markdown + from pyspark.java_gateway import launch_gateway + SQLConfEntry = namedtuple( "SQLConfEntry", ["name", "default", "description", "version"]) From f58335dbf0c8e96462baf07d061cdd8851d84a6f Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Mon, 20 Apr 2020 18:00:12 +0900 Subject: [PATCH 5/6] Don't keep the generated file, and clean up --- docs/.gitignore | 13 +- docs/configuration.md | 4 +- docs/sql-ref-functions-builtin.md | 77 ++++++++ .../expressions/jsonExpressions.scala | 6 +- sql/create-docs.sh | 4 +- sql/gen-sql-api-docs.py | 5 +- sql/gen-sql-config-docs.py | 7 +- ...ions-docs.py => gen-sql-functions-docs.py} | 176 ++++++++---------- 8 files changed, 180 insertions(+), 112 deletions(-) create mode 100644 docs/sql-ref-functions-builtin.md rename sql/{gen-sql-builtin-functions-docs.py => gen-sql-functions-docs.py} (53%) diff --git a/docs/.gitignore b/docs/.gitignore index 2260493b46ab3..7d9cb5069ea4a 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1 +1,12 @@ -sql-configs.html +generated-agg-funcs-examples.html +generated-agg-funcs-table.html +generated-array-funcs-examples.html +generated-array-funcs-table.html +generated-datetime-funcs-examples.html +generated-datetime-funcs-table.html +generated-json-funcs-examples.html +generated-json-funcs-table.html +generated-map-funcs-examples.html +generated-map-funcs-table.html +generated-sql-configuration-table.html +generated-window-funcs-table.html diff --git a/docs/configuration.md b/docs/configuration.md index 676ecf5a82d48..e322247ed2975 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2623,10 +2623,10 @@ Spark subsystems. {% for static_file in site.static_files %} - {% if static_file.name == 'sql-configs.html' %} + {% if static_file.name == 'generated-sql-configuration-table.html' %} ### Spark SQL - {% include_relative sql-configs.html %} +{% include_relative generated-sql-configuration-table.html %} {% break %} {% endif %} {% endfor %} diff --git a/docs/sql-ref-functions-builtin.md b/docs/sql-ref-functions-builtin.md new file mode 100644 index 0000000000000..1bca68e5f19df --- /dev/null +++ b/docs/sql-ref-functions-builtin.md @@ -0,0 +1,77 @@ +--- +layout: global +title: Built-in Functions +displayTitle: Built-in Functions +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+---
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-agg-funcs-table.html' %}
+### Aggregate Functions
+{% include_relative generated-agg-funcs-table.html %}
+#### Examples
+{% include_relative generated-agg-funcs-examples.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-window-funcs-table.html' %}
+### Window Functions
+{% include_relative generated-window-funcs-table.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-array-funcs-table.html' %}
+### Array Functions
+{% include_relative generated-array-funcs-table.html %}
+#### Examples
+{% include_relative generated-array-funcs-examples.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-map-funcs-table.html' %}
+### Map Functions
+{% include_relative generated-map-funcs-table.html %}
+#### Examples
+{% include_relative generated-map-funcs-examples.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-datetime-funcs-table.html' %}
+### Date and Timestamp Functions
+{% include_relative generated-datetime-funcs-table.html %}
+#### Examples
+{% include_relative generated-datetime-funcs-examples.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'generated-json-funcs-table.html' %}
+### JSON Functions
+{% include_relative generated-json-funcs-table.html %}
+#### Examples
+{% include_relative generated-json-funcs-examples.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 4ef6f7fab7df6..205e5271517c3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -885,11 +885,11 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression
   """,
   examples = """
     Examples:
-      > Select _FUNC_('{}');
+      > SELECT _FUNC_('{}');
        []
-      > Select _FUNC_('{"key": "value"}');
+      > SELECT _FUNC_('{"key": "value"}');
        ["key"]
-      > Select _FUNC_('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
+      > SELECT _FUNC_('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
        ["f1","f2"]
   """,
   group = "json_funcs",
diff --git a/sql/create-docs.sh b/sql/create-docs.sh
index 9a84b8a523ec7..6614c714e90c7 100755
--- a/sql/create-docs.sh
+++ b/sql/create-docs.sh
@@ -48,8 +48,8 @@ echo "Generating SQL API Markdown files."
 echo "Generating SQL configuration table HTML file."
 "$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py
 
-echo "Generating SQL document Makrdown files for built-in functions."
-"$SPARK_HOME/bin/spark-submit" gen-sql-builtin-functions-docs.py
+echo "Generating HTML files for SQL function table and examples."
+"$SPARK_HOME/bin/spark-submit" gen-sql-functions-docs.py
 
 echo "Generating HTML files for SQL API documentation."
mkdocs build --clean diff --git a/sql/gen-sql-api-docs.py b/sql/gen-sql-api-docs.py index 4feee7ad52570..3095a51e82d76 100644 --- a/sql/gen-sql-api-docs.py +++ b/sql/gen-sql-api-docs.py @@ -20,6 +20,7 @@ from pyspark.java_gateway import launch_gateway + ExpressionInfo = namedtuple( "ExpressionInfo", "className name usage arguments examples note since deprecated") @@ -159,7 +160,7 @@ def _make_pretty_deprecated(deprecated): return "**Deprecated:**\n%s\n" % deprecated -def generate_sql_markdown(jvm, path): +def generate_sql_api_markdown(jvm, path): """ Generates a markdown file after listing the function information. The output file is created in `path`. @@ -223,4 +224,4 @@ def generate_sql_markdown(jvm, path): jvm = launch_gateway().jvm spark_root_dir = os.path.dirname(os.path.dirname(__file__)) markdown_file_path = os.path.join(spark_root_dir, "sql/docs/index.md") - generate_sql_markdown(jvm, markdown_file_path) + generate_sql_api_markdown(jvm, markdown_file_path) diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py index b19431eeb5487..0043c412fbc16 100644 --- a/sql/gen-sql-config-docs.py +++ b/sql/gen-sql-config-docs.py @@ -43,7 +43,7 @@ def get_public_sql_configs(jvm): return sql_configs -def generate_sql_configs_table(sql_configs, path): +def generate_sql_configs_table_html(sql_configs, path): """ Generates an HTML table at `path` that lists all public SQL configuration options. @@ -118,6 +118,7 @@ def generate_sql_configs_table(sql_configs, path): sql_configs = get_public_sql_configs(jvm) spark_root_dir = os.path.dirname(os.path.dirname(__file__)) - sql_configs_table_path = os.path.join(spark_root_dir, "docs/sql-configs.html") + sql_configs_table_path = os.path.join( + spark_root_dir, "docs/generated-sql-configuration-table.html") - generate_sql_configs_table(sql_configs, path=sql_configs_table_path) + generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path) diff --git a/sql/gen-sql-builtin-functions-docs.py b/sql/gen-sql-functions-docs.py similarity index 53% rename from sql/gen-sql-builtin-functions-docs.py rename to sql/gen-sql-functions-docs.py index 1f52d63085153..4a8ffb2a203c5 100644 --- a/sql/gen-sql-builtin-functions-docs.py +++ b/sql/gen-sql-functions-docs.py @@ -20,40 +20,17 @@ import re from collections import namedtuple +# To avoid adding a new direct dependency, we import markdown from within mkdocs. +from mkdocs.structure.pages import markdown + from pyspark.java_gateway import launch_gateway -from pyspark.sql import SparkSession + ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group") -markdown_header = \ - "---\n"\ - "layout: global\n"\ - "title: Built-in Functions\n"\ - "displayTitle: Built-in Functions\n"\ - "license: |\n"\ - " Licensed to the Apache Software Foundation (ASF) under one or more\n"\ - " contributor license agreements. See the NOTICE file distributed with\n"\ - " this work for additional information regarding copyright ownership.\n"\ - " The ASF licenses this file to You under the Apache License, Version 2.0\n"\ - " (the \"License\"); you may not use this file except in compliance with\n"\ - " the License. 
You may obtain a copy of the License at\n"\ - "\n"\ - " http://www.apache.org/licenses/LICENSE-2.0\n"\ - "\n"\ - " Unless required by applicable law or agreed to in writing, software\n"\ - " distributed under the License is distributed on an \"AS IS\" BASIS,\n"\ - " WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"\ - " See the License for the specific language governing permissions and\n"\ - " limitations under the License.\n"\ - "---" - -group_titles = { - "agg_funcs": "Aggregate Functions", - "array_funcs": "Array Functions", - "datetime_funcs": "Date and Timestamp Functions", - "json_funcs": "JSON Functions", - "map_funcs": "Map Functions", - "window_funcs": "Window Functions", +groups = { + "agg_funcs", "array_funcs", "datetime_funcs", + "json_funcs", "map_funcs", "window_funcs", } @@ -64,10 +41,9 @@ def _list_grouped_function_infos(jvm): """ jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos() - expected_groups = group_titles.keys() infos = [] - for jinfo in filter(lambda x: x.getGroup() in expected_groups, jinfos): + for jinfo in filter(lambda x: x.getGroup() in groups, jinfos): name = jinfo.getName() usage = jinfo.getUsage() usage = usage.replace("_FUNC_", name) if usage is not None else usage @@ -83,7 +59,7 @@ def _list_grouped_function_infos(jvm): return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos] -# TODO(maropu) Needs to add a column to describe arguments and their types +# TODO(SPARK-XXXXX): Needs to add a column to describe arguments and their types def _make_pretty_usage(infos): """ Makes the usage description pretty and returns a formatted string. @@ -112,6 +88,7 @@ def _make_pretty_usage(infos): ... + ... """ @@ -141,15 +118,6 @@ def _make_pretty_usage(infos): return "\n".join(result) -def _make_pretty_query_example(jspark, query): - result = [] - print(" %s" % query) - query_output = jspark.sql(query).showString(20, 20, False) - result.append(query) - result.extend(map(lambda x: " %s" % x, query_output.split("\n"))) - return "\n".join(result) - - def _make_pretty_examples(jspark, infos): """ Makes the examples description pretty and returns a formatted string if `infos` @@ -164,87 +132,97 @@ def _make_pretty_examples(jspark, infos): ... Expected output: - -- group_value - SELECT func(col)...; - +---------+ - |func(col)| - +---------+ - | ...| - +---------+ - - SELECT func(col)...; - +---------+ - |func(col)| - +---------+ - | ...| - +---------+ +

+    <div class="codehilite"><pre><code>
+      -- func
+      SELECT
+      ...
+    </code></pre></div>
+ ``` """ - if any(info.examples.startswith("\n Examples:") for info in infos): - result = [] - result.append("\n#### Examples\n") - result.append("{% highlight sql %}") - - for info in infos: - result.append("-- %s" % info.name) + pretty_output = "" + for info in infos: + if info.examples.startswith("\n Examples:"): + output = [] + output.append("-- %s" % info.name) query_examples = filter(lambda x: x.startswith(" > "), info.examples.split("\n")) for query_example in query_examples: query = query_example.lstrip(" > ") - result.append(_make_pretty_query_example(jspark, query)) - - result.append("{% endhighlight %}\n") - return "\n".join(result) + print(" %s" % query) + query_output = jspark.sql(query).showString(20, 20, False) + output.append(query) + output.append(query_output) + pretty_output += "\n" + "\n".join(output) + if pretty_output != "": + return markdown.markdown( + "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code']) -def generate_sql_markdown(jvm, jspark, path): +def generate_functions_table_html(jvm, html_output_dir): """ - Generates a markdown file after listing the function information. The output file - is created in `path`. + Generates a HTML file after listing the function information. The output file + is created under `html_output_dir`. Expected output: - --- - layout: global - title: Built-in Functions - displayTitle: Built-in Functions - license: - ... - --- - - ### Aggregate Functions + + + + + + + + + + + + + + + + ...
+    <table class="table">
+      <thead>
+        <tr><th style="width:25%">Function</th><th>Description</th></tr>
+      </thead>
+      <tbody>
+        <tr><td>func(*)</td><td>...</td></tr>
+        <tr><td>func(expr[, expr...])</td><td>...</td></tr>
+      </tbody>
+      ...
+    </table>
- #### Examples + """ + for key, infos in _list_grouped_function_infos(jvm): + function_table = _make_pretty_usage(infos) + key = key.replace("_", "-") + with open("%s/generated-%s-table.html" % (html_output_dir, key), 'w') as table_html: + table_html.write(function_table) - {% hightlight sql %} - ... - {% endhighlight %} +def generate_functions_examples_html(jvm, jspark, html_output_dir): """ + Generates a HTML file after listing and executing the function information. + The output file is created under `html_output_dir`. - with open(path, 'w') as mdfile: - filename = os.path.basename(__file__) - mdfile.write("%s\n\n" % markdown_header) - mdfile.write("\n" % filename) + Expected output: + +

+    <div class="codehilite"><pre><code>
+      -- func
+      SELECT
+      ...
+    </code></pre></div>
- print("Running a SQL example to generate output.") - for key, infos in _list_grouped_function_infos(jvm): - mdfile.write("\n### %s\n\n" % group_titles[key]) - function_table = _make_pretty_usage(infos) - examples = _make_pretty_examples(jspark, infos) - mdfile.write(function_table) - if examples is not None: - mdfile.write(examples) + """ + print("Running SQL examples to generate formatted output.") + for key, infos in _list_grouped_function_infos(jvm): + examples = _make_pretty_examples(jspark, infos) + key = key.replace("_", "-") + if examples is not None: + with open("%s/generated-%s-examples.html" % ( + html_output_dir, key), 'w') as examples_html: + examples_html.write(examples) if __name__ == "__main__": jvm = launch_gateway().jvm - print("Initializing Spark Session to generate examples.") jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate() - jspark.sparkContext().setLogLevel("ERROR") + jspark.sparkContext().setLogLevel("ERROR") # Make it less noisy. spark_root_dir = os.path.dirname(os.path.dirname(__file__)) - markdown_file_path = os.path.join(spark_root_dir, "docs/sql-ref-functions-builtin.md") - generate_sql_markdown(jvm, jspark, markdown_file_path) + html_output_dir = os.path.join(spark_root_dir, "docs") + generate_functions_table_html(jvm, html_output_dir) + generate_functions_examples_html(jvm, jspark, html_output_dir) From f302c91fc270fb4a7607381c3ca1af7ccbecd7c1 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Tue, 21 Apr 2020 09:28:14 +0900 Subject: [PATCH 6/6] Final cleanup --- docs/.gitignore | 13 +------------ sql/gen-sql-functions-docs.py | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/docs/.gitignore b/docs/.gitignore index 7d9cb5069ea4a..9df83f37815b7 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,12 +1 @@ -generated-agg-funcs-examples.html -generated-agg-funcs-table.html -generated-array-funcs-examples.html -generated-array-funcs-table.html -generated-datetime-funcs-examples.html -generated-datetime-funcs-table.html -generated-json-funcs-examples.html -generated-json-funcs-table.html -generated-map-funcs-examples.html -generated-map-funcs-table.html -generated-sql-configuration-table.html -generated-window-funcs-table.html +generated-*.html diff --git a/sql/gen-sql-functions-docs.py b/sql/gen-sql-functions-docs.py index 4a8ffb2a203c5..7f0b2ae582f56 100644 --- a/sql/gen-sql-functions-docs.py +++ b/sql/gen-sql-functions-docs.py @@ -59,7 +59,7 @@ def _list_grouped_function_infos(jvm): return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos] -# TODO(SPARK-XXXXX): Needs to add a column to describe arguments and their types +# TODO(SPARK-31499): Needs to add a column to describe arguments and their types def _make_pretty_usage(infos): """ Makes the usage description pretty and returns a formatted string.
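[Editor's note: to make the data flow of the final `gen-sql-functions-docs.py` concrete, here is a minimal, self-contained sketch of the table-generation step. It is an illustration, not part of the patch: the `ExpressionInfo` rows below are hard-coded stand-ins for the JVM-backed objects returned by `PythonSQLUtils.listBuiltinFunctionInfos()`, the split of `usage` on the first " - " is an assumption rather than the script's exact formatting logic, and the sketch prints instead of writing `docs/generated-<group>-table.html` files.]

{% highlight python %}
import itertools
from collections import namedtuple

# Stand-in for the JVM-backed metadata; the real script fetches these
# over the Py4J gateway from PythonSQLUtils.listBuiltinFunctionInfos().
ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")

infos = [
    ExpressionInfo("map_keys", "map_keys(map) - Returns an unordered array "
                   "containing the keys of the map.", "", "map_funcs"),
    ExpressionInfo("map_values", "map_values(map) - Returns an unordered array "
                   "containing the values of the map.", "", "map_funcs"),
    ExpressionInfo("rank", "rank() - Computes the rank of a value in a group "
                   "of values.", "", "window_funcs"),
]

def grouped(infos):
    # Mirrors _list_grouped_function_infos: bucket by group, sort by name.
    infos = sorted(infos, key=lambda x: x.group)
    return [(k, sorted(g, key=lambda x: x.name))
            for k, g in itertools.groupby(infos, key=lambda x: x.group)]

def make_table(infos):
    # A simplified _make_pretty_usage: one <tr> per function.
    rows = ['<table class="table">',
            '  <thead><tr><th style="width:25%">Function</th>'
            '<th>Description</th></tr></thead>',
            '  <tbody>']
    for info in infos:
        func, _, desc = info.usage.strip().partition(" - ")  # assumed split
        rows.append("    <tr><td>%s</td><td>%s</td></tr>" % (func, desc))
    rows.extend(["  </tbody>", "</table>"])
    return "\n".join(rows)

for group, group_infos in grouped(infos):
    # The real script writes docs/generated-<group>-table.html instead.
    print("generated-%s-table.html" % group.replace("_", "-"))
    print(make_table(group_infos))
{% endhighlight %}

The Jekyll loops in `docs/sql-ref-functions-builtin.md` then stitch these fragments in with `include_relative`, which is why the fragments must live under `docs/` and are ignored via the single `generated-*.html` pattern that PATCH 6 settles on.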