Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2317,7 +2317,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),

#' from_utc_timestamp
#'
#' Assumes given timestamp is UTC and converts to given timezone.
#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
#' that corresponds to the same time of day in the given timezone.
#'
#' @param y Column to compute on.
#' @param x time zone to use.
Expand All @@ -2340,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
#' Locate the position of the first occurrence of substr column in the given string.
#' Returns null if either of the arguments is null.
#'
#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param y column to check
Expand Down Expand Up @@ -2391,7 +2392,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),

#' to_utc_timestamp
#'
#' Assumes given timestamp is in given timezone and converts to UTC.
#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
#' another timestamp that corresponds to the same time of day in UTC.
#'
#' @param y Column to compute on
#' @param x timezone to use
Expand Down Expand Up @@ -2539,7 +2541,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),

#' shiftRight
#'
#' Shift the given value numBits right. If the given value is a long value, it will return
#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
#' a long value else it will return an integer value.
#'
#' @param y column to compute on.
Expand Down Expand Up @@ -2777,7 +2779,7 @@ setMethod("window", signature(x = "Column"),
#' locate
#'
#' Locate the position of the first occurrence of substr.
#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param substr a character string to be matched.
Expand Down Expand Up @@ -2823,7 +2825,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),

#' rand
#'
#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
#' Generate a random column with independent and identically distributed (i.i.d.) samples
#' from U[0.0, 1.0].
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
Expand Down Expand Up @@ -2852,7 +2855,8 @@ setMethod("rand", signature(seed = "numeric"),

#' randn
#'
#' Generate a column with i.i.d. samples from the standard normal distribution.
#' Generate a column with independent and identically distributed (i.i.d.) samples from
#' the standard normal distribution.
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
Expand Down Expand Up @@ -3442,8 +3446,8 @@ setMethod("size",

#' sort_array
#'
#' Sorts the input array for the given column in ascending order,
#' according to the natural ordering of the array elements.
#' Sorts the input array in ascending or descending order according
#' to the natural ordering of the array elements.
#'
#' @param x A Column to sort
#' @param asc A logical flag indicating the sorting order.
Expand Down
35 changes: 20 additions & 15 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,8 @@ def grouping_id(*cols):

(grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)

Note: the list of columns should match with grouping columns exactly, or empty (means all the
grouping columns).
.. note:: The list of columns should either match the grouping columns exactly or be empty
(meaning all the grouping columns).

>>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()
+-----+-------------+--------+
Expand Down Expand Up @@ -457,7 +457,8 @@ def nanvl(col1, col2):

@since(1.4)
def rand(seed=None):
"""Generates a random column with i.i.d. samples from U[0.0, 1.0].
"""Generates a random column with independent and identically distributed (i.i.d.) samples
from U[0.0, 1.0].
"""
sc = SparkContext._active_spark_context
if seed is not None:
Expand All @@ -469,7 +470,8 @@ def rand(seed=None):

@since(1.4)
def randn(seed=None):
"""Generates a column with i.i.d. samples from the standard normal distribution.
"""Generates a column with independent and identically distributed (i.i.d.) samples from
the standard normal distribution.
"""
sc = SparkContext._active_spark_context
if seed is not None:
Expand Down Expand Up @@ -518,7 +520,7 @@ def shiftLeft(col, numBits):

@since(1.5)
def shiftRight(col, numBits):
"""Shift the given value numBits right.
"""(Signed) shift the given value numBits right.

>>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()
[Row(r=21)]
Expand Down Expand Up @@ -777,8 +779,8 @@ def date_format(date, format):
A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

NOTE: Use when ever possible specialized functions like `year`. These benefit from a
specialized implementation.
.. note:: Whenever possible, use specialized functions like `year`. These benefit from a
specialized implementation.

>>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
>>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
Expand Down Expand Up @@ -1059,7 +1061,8 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
@since(1.5)
def from_utc_timestamp(timestamp, tz):
"""
Assumes given timestamp is UTC and converts to given timezone.
Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
that corresponds to the same time of day in the given timezone.

>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect()
Expand All @@ -1072,7 +1075,8 @@ def from_utc_timestamp(timestamp, tz):
@since(1.5)
def to_utc_timestamp(timestamp, tz):
"""
Assumes given timestamp is in given timezone and converts to UTC.
Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
another timestamp that corresponds to the same time of day in UTC.

>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect()
Expand Down Expand Up @@ -1314,8 +1318,8 @@ def instr(str, substr):
Locate the position of the first occurrence of substr column in the given string.
Returns null if either of the arguments is null.

NOTE: The position is not zero based, but 1 based index, returns 0 if substr
could not be found in str.
.. note:: The position is not zero based, but 1 based index. Returns 0 if substr
could not be found in str.

>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(instr(df.s, 'b').alias('s')).collect()
Expand Down Expand Up @@ -1379,8 +1383,8 @@ def locate(substr, str, pos=1):
"""
Locate the position of the first occurrence of substr in a string column, after position pos.

NOTE: The position is not zero based, but 1 based index. returns 0 if substr
could not be found in str.
.. note:: The position is not zero based, but 1 based index. Returns 0 if substr
could not be found in str.

:param substr: a string
:param str: a Column of :class:`pyspark.sql.types.StringType`
Expand Down Expand Up @@ -1442,7 +1446,7 @@ def split(str, pattern):
"""
Splits str around pattern (pattern is a regular expression).

NOTE: pattern is a string represent the regular expression.
.. note:: pattern is a string representing the regular expression.

>>> df = spark.createDataFrame([('ab12cd',)], ['s',])
>>> df.select(split(df.s, '[0-9]+').alias('s')).collect()
Expand Down Expand Up @@ -1785,7 +1789,8 @@ def size(col):
@since(1.5)
def sort_array(col, asc=True):
"""
Collection function: sorts the input array for the given column in ascending order.
Collection function: sorts the input array in ascending or descending order according
to the natural ordering of the array elements.

:param col: name of column or expression

Expand Down
30 changes: 18 additions & 12 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1117,7 +1117,8 @@ object functions {
def not(e: Column): Column = !e

/**
* Generate a random column with i.i.d. samples from U[0.0, 1.0].
* Generate a random column with independent and identically distributed (i.i.d.) samples
* from U[0.0, 1.0].
*
* Note that this is non-deterministic when data partitions are not fixed.
*
Expand All @@ -1127,15 +1128,17 @@ object functions {
def rand(seed: Long): Column = withExpr { Rand(seed) }

/**
* Generate a random column with i.i.d. samples from U[0.0, 1.0].
* Generate a random column with independent and identically distributed (i.i.d.) samples
* from U[0.0, 1.0].
*
* @group normal_funcs
* @since 1.4.0
*/
def rand(): Column = rand(Utils.random.nextLong)

/**
* Generate a column with i.i.d. samples from the standard normal distribution.
* Generate a column with independent and identically distributed (i.i.d.) samples from
* the standard normal distribution.
*
* Note that this is non-deterministic when data partitions are not fixed.
*
Expand All @@ -1145,15 +1148,16 @@ object functions {
def randn(seed: Long): Column = withExpr { Randn(seed) }

/**
* Generate a column with i.i.d. samples from the standard normal distribution.
* Generate a column with independent and identically distributed (i.i.d.) samples from
* the standard normal distribution.
*
* @group normal_funcs
* @since 1.4.0
*/
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one was missed in 02f2031 and a08463b

def randn(): Column = randn(Utils.random.nextLong)

/**
* Partition ID of the Spark task.
* Partition ID.
*
* Note that this is non-deterministic because it depends on data partitioning and task scheduling.
*
Expand Down Expand Up @@ -1877,8 +1881,8 @@ object functions {
def shiftLeft(e: Column, numBits: Int): Column = withExpr { ShiftLeft(e.expr, lit(numBits).expr) }

/**
* Shift the given value numBits right. If the given value is a long value, it will return
* a long value else it will return an integer value.
* (Signed) shift the given value numBits right. If the given value is a long value, it will
* return a long value else it will return an integer value.
*
* @group math_funcs
* @since 1.5.0
Expand Down Expand Up @@ -2203,7 +2207,7 @@ object functions {
* Locate the position of the first occurrence of substr column in the given string.
* Returns null if either of the arguments is null.
*
* NOTE: The position is not zero based, but 1 based index, returns 0 if substr
* NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
* could not be found in str.
*
* @group string_funcs
Expand Down Expand Up @@ -2238,7 +2242,7 @@ object functions {

/**
* Locate the position of the first occurrence of substr.
* NOTE: The position is not zero based, but 1 based index, returns 0 if substr
* NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
* could not be found in str.
*
* @group string_funcs
Expand Down Expand Up @@ -2666,7 +2670,8 @@ object functions {
}

/**
* Assumes given timestamp is UTC and converts to given timezone.
* Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
* that corresponds to the same time of day in the given timezone.
* @group datetime_funcs
* @since 1.5.0
*/
Expand All @@ -2675,7 +2680,8 @@ object functions {
}

/**
* Assumes given timestamp is in given timezone and converts to UTC.
* Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
* another timestamp that corresponds to the same time of day in UTC.
* @group datetime_funcs
* @since 1.5.0
*/
Expand Down Expand Up @@ -2996,7 +3002,7 @@ object functions {
def sort_array(e: Column): Column = sort_array(e, asc = true)

/**
* Sorts the input array for the given column in ascending / descending order,
* Sorts the input array for the given column in ascending or descending order,
* according to the natural ordering of the array elements.
*
* @group collection_funcs
Expand Down