[SPARK-20396][SQL][PySpark] groupby().apply() with pandas udf #18732
python/pyspark/sql/functions.py
@@ -2058,7 +2058,7 @@ def __init__(self, func, returnType, name=None, vectorized=False): | |
| self._name = name or ( | ||
| func.__name__ if hasattr(func, '__name__') | ||
| else func.__class__.__name__) | ||
| self._vectorized = vectorized | ||
| self.vectorized = vectorized | ||
|
|
||
| @property | ||
| def returnType(self): | ||
|
|
@@ -2090,7 +2090,7 @@ def _create_judf(self): | |
| wrapped_func = _wrap_function(sc, self.func, self.returnType) | ||
| jdt = spark._jsparkSession.parseDataType(self.returnType.json()) | ||
| judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( | ||
| self._name, wrapped_func, jdt, self._vectorized) | ||
| self._name, wrapped_func, jdt, self.vectorized) | ||
| return judf | ||
|
|
||
| def __call__(self, *cols): | ||
|
|
@@ -2118,8 +2118,10 @@ def wrapper(*args): | |
| wrapper.__name__ = self._name | ||
| wrapper.__module__ = (self.func.__module__ if hasattr(self.func, '__module__') | ||
| else self.func.__class__.__module__) | ||
|
|
||
| wrapper.func = self.func | ||
| wrapper.returnType = self.returnType | ||
| wrapper.vectorized = self.vectorized | ||
|
|
||
| return wrapper | ||
|
|
||
|
|
@@ -2129,8 +2131,12 @@ def _create_udf(f, returnType, vectorized): | |
| def _udf(f, returnType=StringType(), vectorized=vectorized): | ||
| if vectorized: | ||
| import inspect | ||
| if len(inspect.getargspec(f).args) == 0: | ||
| raise NotImplementedError("0-parameter pandas_udfs are not currently supported") | ||
| argspec = inspect.getargspec(f) | ||
| if len(argspec.args) == 0 and argspec.varargs is None: | ||
|
Member: I hadn't really thought about it, but does this mean varargs are supported? I suppose it could, but maybe best to include a test for it.
Contributor (Author): I think
Contributor (Author): Added.
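To make the varargs point concrete, here is a minimal sketch (not part of this patch; the function and column names are made up) of a pandas_udf declared with only *args, which now passes the zero-argument check because argspec.varargs is not None:

from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import LongType

# Declared with *args only: inspect.getargspec(f).args is empty, but
# argspec.varargs is set, so the relaxed zero-argument check accepts it.
@pandas_udf(returnType=LongType())
def sum_cols(*series):
    # Each element of `series` is a pandas.Series, one per column passed in.
    total = series[0]
    for s in series[1:]:
        total = total + s
    return total

# Hypothetical usage (column names are assumptions):
# df.select(sum_cols(df["a"], df["b"], df["c"])).show()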
||
| raise ValueError( | ||
| "0-arg pandas_udfs are not supported. " | ||
| "Instead, create a 1-arg pandas_udf and ignore the arg in your function." | ||
| ) | ||
| udf_obj = UserDefinedFunction(f, returnType, vectorized=vectorized) | ||
| return udf_obj._wrapped() | ||
|
|
||
|
|
@@ -2146,7 +2152,7 @@ def _udf(f, returnType=StringType(), vectorized=vectorized): | |
|
|
||
| @since(1.3) | ||
| def udf(f=None, returnType=StringType()): | ||
| """Creates a :class:`Column` expression representing a user defined function (UDF). | ||
| """Creates a user defined function (UDF). | ||
|
|
||
| .. note:: The user-defined functions must be deterministic. Due to optimization, | ||
| duplicate invocations may be eliminated or the function may even be invoked more times than | ||
|
|
@@ -2181,30 +2187,70 @@ def udf(f=None, returnType=StringType()): | |
| @since(2.3) | ||
| def pandas_udf(f=None, returnType=StringType()): | ||
| """ | ||
| Creates a :class:`Column` expression representing a user defined function (UDF) that accepts | ||
| `Pandas.Series` as input arguments and outputs a `Pandas.Series` of the same length. | ||
| Creates a vectorized user defined function (UDF). | ||
|
|
||
| :param f: python function if used as a standalone function | ||
| :param f: user-defined function. A python function if used as a standalone function | ||
| :param returnType: a :class:`pyspark.sql.types.DataType` object | ||
|
|
||
| >>> from pyspark.sql.types import IntegerType, StringType | ||
| >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) | ||
| >>> @pandas_udf(returnType=StringType()) | ||
| ... def to_upper(s): | ||
| ... return s.str.upper() | ||
| ... | ||
| >>> @pandas_udf(returnType="integer") | ||
| ... def add_one(x): | ||
| ... return x + 1 | ||
| ... | ||
| >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
| >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ | ||
| ... .show() # doctest: +SKIP | ||
| +----------+--------------+------------+ | ||
| |slen(name)|to_upper(name)|add_one(age)| | ||
| +----------+--------------+------------+ | ||
| | 8| JOHN DOE| 22| | ||
| +----------+--------------+------------+ | ||
| The user-defined function can define one of the following transformations: | ||
|
|
||
| 1. One or more `pandas.Series` -> A `pandas.Series` | ||
|
|
||
| This udf is used with :meth:`pyspark.sql.DataFrame.withColumn` and | ||
| :meth:`pyspark.sql.DataFrame.select`. | ||
| The returnType should be a primitive data type, e.g., `DoubleType()`. | ||
| The length of the returned `pandas.Series` must be the same as that of the input `pandas.Series`. | ||
|
|
||
| >>> from pyspark.sql.types import IntegerType, StringType | ||
| >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) | ||
| >>> @pandas_udf(returnType=StringType()) | ||
| ... def to_upper(s): | ||
| ... return s.str.upper() | ||
| ... | ||
| >>> @pandas_udf(returnType="integer") | ||
| ... def add_one(x): | ||
| ... return x + 1 | ||
| ... | ||
| >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
| >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ | ||
| ... .show() # doctest: +SKIP | ||
| +----------+--------------+------------+ | ||
| |slen(name)|to_upper(name)|add_one(age)| | ||
| +----------+--------------+------------+ | ||
| | 8| JOHN DOE| 22| | ||
| +----------+--------------+------------+ | ||
|
|
||
| 2. A `pandas.DataFrame` -> A `pandas.DataFrame` | ||
|
Member: This looks like it produces a warning here:
Contributor (Author): Fixed indentation.
||
|
|
||
| This udf is only used with :meth:`pyspark.sql.GroupedData.apply`. | ||
| The returnType should be a :class:`StructType` describing the schema of the returned | ||
| `pandas.DataFrame`. | ||
|
|
||
| >>> df = spark.createDataFrame( | ||
| ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], | ||
| ... ("id", "v")) | ||
| >>> @pandas_udf(returnType=df.schema) | ||
| ... def normalize(pdf): | ||
| ... v = pdf.v | ||
| ... return pdf.assign(v=(v - v.mean()) / v.std()) | ||
| >>> df.groupby('id').apply(normalize).show() # doctest: +SKIP | ||
| +---+-------------------+ | ||
| | id| v| | ||
| +---+-------------------+ | ||
| | 1|-0.7071067811865475| | ||
| | 1| 0.7071067811865475| | ||
| | 2|-0.8320502943378437| | ||
| | 2|-0.2773500981126146| | ||
| | 2| 1.1094003924504583| | ||
| +---+-------------------+ | ||
|
|
||
| .. note:: This type of udf cannot be used with functions such as `withColumn` or `select` | ||
| because it defines a `DataFrame` transformation rather than a `Column` | ||
| transformation. | ||
|
|
||
| .. seealso:: :meth:`pyspark.sql.GroupedData.apply` | ||
|
|
||
| .. note:: The user-defined function must be deterministic. | ||
| """ | ||
| return _create_udf(f, returnType=returnType, vectorized=True) | ||
|
|
||
|
|
||
python/pyspark/sql/group.py
@@ -54,9 +54,10 @@ class GroupedData(object): | |
| .. versionadded:: 1.3 | ||
| """ | ||
|
|
||
| def __init__(self, jgd, sql_ctx): | ||
| def __init__(self, jgd, df): | ||
| self._jgd = jgd | ||
| self.sql_ctx = sql_ctx | ||
| self._df = df | ||
| self.sql_ctx = df.sql_ctx | ||
|
|
||
| @ignore_unicode_prefix | ||
| @since(1.3) | ||
|
|
@@ -170,7 +171,7 @@ def sum(self, *cols): | |
| @since(1.6) | ||
| def pivot(self, pivot_col, values=None): | ||
| """ | ||
| Pivots a column of the current [[DataFrame]] and perform the specified aggregation. | ||
| Pivots a column of the current :class:`DataFrame` and perform the specified aggregation. | ||
| There are two versions of pivot function: one that requires the caller to specify the list | ||
| of distinct values to pivot on, and one that does not. The latter is more concise but less | ||
| efficient, because Spark needs to first compute the list of distinct values internally. | ||
|
|
@@ -192,7 +193,85 @@ def pivot(self, pivot_col, values=None): | |
| jgd = self._jgd.pivot(pivot_col) | ||
| else: | ||
| jgd = self._jgd.pivot(pivot_col, values) | ||
| return GroupedData(jgd, self.sql_ctx) | ||
| return GroupedData(jgd, self._df) | ||
|
|
||
| @since(2.3) | ||
| def apply(self, udf): | ||
|
Contributor (Author): @rxin, just to recap our discussion regarding naming:
You asked:
Answer is:
Does this make sense to you?
||
| """ | ||
| Maps each group of the current :class:`DataFrame` using a pandas udf and returns the result | ||
| as a `DataFrame`. | ||
|
Member: "pandas udf" to "pandas_udf"
Contributor (Author): Fixed.
Contributor (Author): I think "pandas udf" as a word is fine.
||
|
|
||
| The user-defined function should take a `pandas.DataFrame` and return another | ||
| `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame` | ||
| to the user-function and the returned `pandas.DataFrame`s are combined as a | ||
| :class:`DataFrame`. | ||
| The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the | ||
| returnType of the pandas udf. | ||
|
|
||
| This function does not support partial aggregation, and requires shuffling all the data in | ||
| the :class:`DataFrame`. | ||
|
|
||
| :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf` | ||
|
|
||
| >>> from pyspark.sql.functions import pandas_udf | ||
| >>> df = spark.createDataFrame( | ||
| ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], | ||
| ... ("id", "v")) | ||
| >>> @pandas_udf(returnType=df.schema) | ||
| ... def normalize(pdf): | ||
| ... v = pdf.v | ||
| ... return pdf.assign(v=(v - v.mean()) / v.std()) | ||
| >>> df.groupby('id').apply(normalize).show() # doctest: +SKIP | ||
|
Contributor (Author): Seems this is still not skipped by doctest. What's the best way to run doctest locally? I tried But it's giving me a different failure.
Contributor (Author): I have been using But this doesn't seem to do doctest.
Member: I think the problem is,
Member: Probably, importing
Member: Also, it looks like this file does not define `spark`:
sc = spark.sparkContext
globs['sc'] = sc
+ globs['spark'] = spark
globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
Contributor (Author): Ahh.. Thanks! Will give it a try. Still, is there an easier way to run the pyspark tests locally (the way Jenkins runs them)?
Member: Not sure.. I think what you know is what I usually do.
Member: @icexelloss, this works for me to run doctests locally
Member: Ah, cool. I misunderstood. That's the answer to the question.
Contributor (Author): Ahh neat. Thanks @HyukjinKwon @BryanCutler, doctest passes now.
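For context on the `globs` fix being discussed: a PySpark module's `_test()` helper typically builds a globals dict for doctest and injects `sc`, `spark`, and sample data into it. The sketch below is a paraphrase of that usual pattern, not the exact file contents; the missing `globs['spark'] = spark` entry is what the hunk further down adds.

def _test():
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.group

    globs = pyspark.sql.group.__dict__.copy()
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("sql.group tests") \
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark  # the missing entry discussed above; added by this PR
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        exit(-1)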
||
| +---+-------------------+ | ||
| | id| v| | ||
| +---+-------------------+ | ||
| | 1|-0.7071067811865475| | ||
| | 1| 0.7071067811865475| | ||
| | 2|-0.8320502943378437| | ||
| | 2|-0.2773500981126146| | ||
| | 2| 1.1094003924504583| | ||
| +---+-------------------+ | ||
|
|
||
| .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` | ||
|
|
||
| """ | ||
| from pyspark.sql.functions import pandas_udf | ||
|
|
||
| # Columns are special because hasattr always return True | ||
| if isinstance(udf, Column) or not hasattr(udf, 'func') or not udf.vectorized: | ||
| raise ValueError("The argument to apply must be a pandas_udf") | ||
| if not isinstance(udf.returnType, StructType): | ||
| raise ValueError("The returnType of the pandas_udf must be a StructType") | ||
|
|
||
| df = self._df | ||
| func = udf.func | ||
| returnType = udf.returnType | ||
|
Member: Is it necessary to make all these copies? I could understand maybe copying
Contributor (Author): I actually like it because I think it's more readable this way.
||
|
|
||
| # The python executors expect the function to use pd.Series as input and output, | ||
| # so we create a wrapper function that turns that into a pd.DataFrame before passing | ||
| # it down to the user function, then turn the result pd.DataFrame back into pd.Series | ||
| columns = df.columns | ||
|
|
||
| def wrapped(*cols): | ||
| from pyspark.sql.types import to_arrow_type | ||
| import pandas as pd | ||
| result = func(pd.concat(cols, axis=1, keys=columns)) | ||
| if not isinstance(result, pd.DataFrame): | ||
| raise TypeError("Return type of the user-defined function should be " | ||
| "Pandas.DataFrame, but is {}".format(type(result))) | ||
| if not len(result.columns) == len(returnType): | ||
| raise RuntimeError( | ||
| "Number of columns of the returned Pandas.DataFrame " | ||
| "doesn't match specified schema. " | ||
| "Expected: {} Actual: {}".format(len(returnType), len(result.columns))) | ||
| arrow_return_types = (to_arrow_type(field.dataType) for field in returnType) | ||
| return [(result[result.columns[i]], arrow_type) | ||
| for i, arrow_type in enumerate(arrow_return_types)] | ||
|
|
||
| wrapped_udf_obj = pandas_udf(wrapped, returnType) | ||
| udf_column = wrapped_udf_obj(*[df[col] for col in df.columns]) | ||
| jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) | ||
| return DataFrame(jdf, self.sql_ctx) | ||
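To illustrate what the `wrapped` function in this hunk does for a single group, here is a plain-pandas sketch. The `normalize` function and sample values are borrowed from the doctest, but the step-by-step staging below is illustrative only, not part of the patch.

import pandas as pd

def normalize(pdf):
    v = pdf.v
    return pdf.assign(v=(v - v.mean()) / v.std())

columns = ["id", "v"]

# The executor hands the wrapper one pandas.Series per input column for a group...
id_series = pd.Series([2, 2, 2])
v_series = pd.Series([3.0, 5.0, 10.0])

# ...the wrapper stitches them into a single pandas.DataFrame for the user function...
pdf = pd.concat([id_series, v_series], axis=1, keys=columns)
result = normalize(pdf)

# ...and then splits the returned pandas.DataFrame back into one Series per field
# of the returnType (each paired with its Arrow type in the real code).
series_per_field = [result[col] for col in result.columns]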
|
|
||
|
|
||
| def _test(): | ||
|
|
@@ -206,6 +285,7 @@ def _test(): | |
| .getOrCreate() | ||
| sc = spark.sparkContext | ||
| globs['sc'] = sc | ||
| globs['spark'] = spark | ||
| globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ | ||
| .toDF(StructType([StructField('age', IntegerType()), | ||
| StructField('name', StringType())])) | ||
|
|
||
Oh sorry, I should have been more clear. This should stay `self._vectorized` since it is a private variable to the class; it's only `wrapped.vectorized` (which you already changed below) that isn't being used as private, so it shouldn't have an underscore.

I kind of dislike the inconsistency between `UserDefinedFunction` and its wrapped function. I think they are just the same thing except that the wrapped function has a doc string. For ease of mind, I think we should make them either both private or both public.

Are we ok with having `vectorized` be a public field? I am fine with either public or private, but I do think the fields of the function returned by `UserDefinedFunction._wrapped()` should have the same field names as `UserDefinedFunction`, to avoid confusion.
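For reference, the attribute symmetry being debated looks roughly like this (a hypothetical interactive sketch, not from the patch):

from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import IntegerType

# `slen` is the plain wrapper function returned by UserDefinedFunction._wrapped();
# it carries copies of the underlying UDF object's fields.
slen = pandas_udf(lambda s: s.str.len(), IntegerType())
slen.func        # the original Python callable
slen.returnType  # IntegerType()
slen.vectorized  # True -- the field whose public vs. private (_vectorized) naming is in question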