Commit fdafb35

Revert "Restrict checking the number of arguments."
This reverts commit 122a7bc.
1 parent 122a7bc commit fdafb35

File tree: 4 files changed, +56 −96 lines

python/pyspark/sql/functions.py

Lines changed: 27 additions & 82 deletions
@@ -2132,15 +2132,35 @@ def wrapper(*args):
         return wrapper


-def _resolve_decorator(create_udf, f, returnType):
-    # decorator @udf, @udf(), @udf(dataType()), or similar with @pandas_udf, @pandas_grouped_udf
+def _create_udf(f, returnType, pythonUdfType):
+
+    def _udf(f, returnType=StringType(), pythonUdfType=pythonUdfType):
+        if pythonUdfType == PythonUdfType.PANDAS_UDF:
+            import inspect
+            argspec = inspect.getargspec(f)
+            if len(argspec.args) == 0 and argspec.varargs is None:
+                raise ValueError(
+                    "0-arg pandas_udfs are not supported. "
+                    "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
+                )
+        elif pythonUdfType == PythonUdfType.PANDAS_GROUPED_UDF:
+            import inspect
+            argspec = inspect.getargspec(f)
+            if len(argspec.args) != 1 and argspec.varargs is None:
+                raise ValueError("Only 1-arg pandas_grouped_udfs are supported.")
+
+        udf_obj = UserDefinedFunction(f, returnType, pythonUdfType=pythonUdfType)
+        return udf_obj._wrapped()
+
+    # decorator @udf, @udf(), @udf(dataType()), or similar with @pandas_udf
     if f is None or isinstance(f, (str, DataType)):
         # If DataType has been passed as a positional argument
         # for decorator use it as a returnType
         return_type = f or returnType
-        return functools.partial(create_udf, returnType=return_type)
+        return functools.partial(
+            _udf, returnType=return_type, pythonUdfType=pythonUdfType)
     else:
-        return create_udf(f=f, returnType=returnType)
+        return _udf(f=f, returnType=returnType, pythonUdfType=pythonUdfType)


 @since(1.3)
@@ -2174,11 +2194,7 @@ def udf(f=None, returnType=StringType()):
     |         8|      JOHN DOE|          22|
     +----------+--------------+------------+
     """
-    def _create_udf(f, returnType):
-        udf_obj = UserDefinedFunction(f, returnType, pythonUdfType=PythonUdfType.NORMAL_UDF)
-        return udf_obj._wrapped()
-
-    return _resolve_decorator(_create_udf, f, returnType)
+    return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.NORMAL_UDF)


 @since(2.3)
@@ -2219,19 +2235,7 @@ def pandas_udf(f=None, returnType=StringType()):

     .. note:: The user-defined function must be deterministic.
     """
-    def _create_udf(f, returnType):
-        import inspect
-        argspec = inspect.getargspec(f)
-        if len(argspec.args) == 0 and argspec.varargs is None:
-            raise ValueError(
-                "0-arg pandas_udfs are not supported. "
-                "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
-            )
-
-        udf_obj = UserDefinedFunction(f, returnType, pythonUdfType=PythonUdfType.PANDAS_UDF)
-        return udf_obj._wrapped()
-
-    return _resolve_decorator(_create_udf, f, returnType)
+    return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.PANDAS_UDF)


 @since(2.3)
@@ -2276,66 +2280,7 @@ def pandas_grouped_udf(f=None, returnType=StructType()):

     .. note:: The user-defined function must be deterministic.
     """
-    def _create_udf(f, returnType):
-        import inspect
-        argspec = inspect.getargspec(f)
-        if len(argspec.args) != 1:
-            raise ValueError("Only 1-arg pandas_grouped_udfs are supported.")
-
-        # create a dummy udf object as a placeholder.
-        _udf_obj = UserDefinedFunction(
-            f, returnType, pythonUdfType=PythonUdfType.PANDAS_GROUPED_UDF)
-
-        # It is possible for a callable instance without __name__ attribute or/and
-        # __module__ attribute to be wrapped here. For example, functools.partial. In this case,
-        # we should avoid wrapping the attributes from the wrapped function to the wrapper
-        # function. So, we take out these attribute names from the default names to set and
-        # then manually assign it after being wrapped.
-        assignments = tuple(
-            a for a in functools.WRAPPER_ASSIGNMENTS if a != '__name__' and a != '__module__')
-
-        @functools.wraps(_udf_obj.func, assigned=assignments)
-        def wrapper(df):
-
-            func = _udf_obj.func
-            returnType = _udf_obj.returnType
-
-            # The python executors expects the function to use pd.Series as input and output
-            # So we to create a wrapper function that turns that to a pd.DataFrame before passing
-            # down to the user function, then turn the result pd.DataFrame back into pd.Series
-            columns = df.columns
-
-            def wrapped(*cols):
-                from pyspark.sql.types import to_arrow_type
-                import pandas as pd
-                result = func(pd.concat(cols, axis=1, keys=columns))
-                if not isinstance(result, pd.DataFrame):
-                    raise TypeError("Return type of the user-defined function should be "
-                                    "Pandas.DataFrame, but is {}".format(type(result)))
-                if not len(result.columns) == len(returnType):
-                    raise RuntimeError(
-                        "Number of columns of the returned Pandas.DataFrame "
-                        "doesn't match specified schema. "
-                        "Expected: {} Actual: {}".format(len(returnType), len(result.columns)))
-                arrow_return_types = (to_arrow_type(field.dataType) for field in returnType)
-                return [(result[result.columns[i]], arrow_type)
-                        for i, arrow_type in enumerate(arrow_return_types)]
-
-            udf_obj = UserDefinedFunction(
-                wrapped, returnType, name=_udf_obj._name, pythonUdfType=_udf_obj.pythonUdfType)
-            return udf_obj(*[df[col] for col in df.columns])
-
-        wrapper.__name__ = _udf_obj._name
-        wrapper.__module__ = (_udf_obj.func.__module__ if hasattr(_udf_obj.func, '__module__')
-                              else _udf_obj.func.__class__.__module__)
-
-        wrapper.func = _udf_obj.func
-        wrapper.returnType = _udf_obj.returnType
-        wrapper.pythonUdfType = _udf_obj.pythonUdfType
-
-        return wrapper
-
-    return _resolve_decorator(_create_udf, f, returnType)
+    return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.PANDAS_GROUPED_UDF)


 blacklist = ['map', 'since', 'ignore_unicode_prefix']
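
The restored `_create_udf` also carries the decorator dispatch: when `f` arrives as `None` or as a `str`/`DataType`, the call site was decorator-style (`@udf`, `@udf()`, `@udf(dataType)`), so a `functools.partial` is returned that finishes the wrapping once it receives the actual function. A minimal standalone sketch of that pattern (no PySpark required; `make_udf` and its attributes are illustrative, not the real API):

import functools

def make_udf(f=None, returnType="string"):
    # Decorator-style call: f is None (bare/empty parens) or the return
    # type passed positionally; defer wrapping until we get the function.
    if f is None or isinstance(f, str):
        return_type = f or returnType
        return functools.partial(make_udf, returnType=return_type)
    # Direct call: f is the function itself; wrap it now.
    @functools.wraps(f)
    def wrapper(*args):
        return f(*args)
    wrapper.returnType = returnType
    return wrapper

@make_udf                                   # bare decorator
def plus_one(x):
    return x + 1

@make_udf("long")                           # return type as positional arg
def plus_two(x):
    return x + 2

times_two = make_udf(lambda x: x * 2, returnType="long")  # plain call

print(plus_one(1), plus_two(2), times_two(3))  # 2 4 6
print(plus_two.returnType)                     # long

Sharing one dispatcher is what lets `udf`, `pandas_udf`, and `pandas_grouped_udf` each shrink to a single `return _create_udf(...)` line while still supporting all three call shapes.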

python/pyspark/sql/group.py

Lines changed: 27 additions & 1 deletion
@@ -245,7 +245,33 @@ def apply(self, udf):
         if not isinstance(udf.returnType, StructType):
             raise ValueError("The returnType of the pandas_grouped_udf must be a StructType")

-        udf_column = udf(self._df)
+        df = self._df
+        func = udf.func
+        returnType = udf.returnType
+
+        # The python executors expects the function to use pd.Series as input and output
+        # So we to create a wrapper function that turns that to a pd.DataFrame before passing
+        # down to the user function, then turn the result pd.DataFrame back into pd.Series
+        columns = df.columns
+
+        def wrapped(*cols):
+            from pyspark.sql.types import to_arrow_type
+            import pandas as pd
+            result = func(pd.concat(cols, axis=1, keys=columns))
+            if not isinstance(result, pd.DataFrame):
+                raise TypeError("Return type of the user-defined function should be "
+                                "Pandas.DataFrame, but is {}".format(type(result)))
+            if not len(result.columns) == len(returnType):
+                raise RuntimeError(
+                    "Number of columns of the returned Pandas.DataFrame "
+                    "doesn't match specified schema. "
+                    "Expected: {} Actual: {}".format(len(returnType), len(result.columns)))
+            arrow_return_types = (to_arrow_type(field.dataType) for field in returnType)
+            return [(result[result.columns[i]], arrow_type)
+                    for i, arrow_type in enumerate(arrow_return_types)]
+
+        wrapped_udf_obj = pandas_grouped_udf(wrapped, returnType)
+        udf_column = wrapped_udf_obj(*[df[col] for col in df.columns])
         jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr())
         return DataFrame(jdf, self.sql_ctx)
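
The `wrapped` shim moved back into `apply()` does the Series-to-DataFrame plumbing: executors ship one `pd.Series` per column, the shim reassembles them into a `pd.DataFrame` for the user function, validates the result, and splits it back per column. A pandas-only sketch of that round trip (a hypothetical `normalize` stands in for the user function; the Arrow-type pairing via `to_arrow_type` is omitted):

import pandas as pd

def normalize(pdf):
    # Stand-in for the user's grouped function: DataFrame in, DataFrame out.
    return pdf.assign(v=(pdf.v - pdf.v.mean()) / pdf.v.std())

columns = ['id', 'v']
schema_len = 2  # plays the role of len(returnType)

def wrapped(*cols):
    # Reassemble the per-column Series into a named DataFrame.
    result = normalize(pd.concat(cols, axis=1, keys=columns))
    if not isinstance(result, pd.DataFrame):
        raise TypeError("expected a pandas.DataFrame, got {}".format(type(result)))
    if len(result.columns) != schema_len:
        raise RuntimeError("returned DataFrame doesn't match the schema")
    # Split back into per-column Series for serialization.
    return [result[c] for c in result.columns]

ids = pd.Series([1, 1, 2], name='id')
vals = pd.Series([1.0, 2.0, 3.0], name='v')
for series in wrapped(ids, vals):
    print(series)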

python/pyspark/sql/tests.py

Lines changed: 1 addition & 12 deletions
@@ -3516,7 +3516,7 @@ def test_wrong_return_type(self):
         with self.assertRaisesRegexp(Exception, 'Invalid.*type'):
             df.groupby('id').apply(foo).sort('id').toPandas()

-    def test_invalid_parameters(self):
+    def test_zero_or_more_than_1_parameters(self):
         from pyspark.sql.functions import pandas_grouped_udf
         error_str = 'Only 1-arg pandas_grouped_udfs are supported.'
         with QuietTest(self.sc):
@@ -3542,17 +3542,6 @@ def zero_no_type(pdf, x):
         def zero_with_type(pdf, x):
             return pdf

-        with self.assertRaisesRegexp(ValueError, error_str):
-            pandas_grouped_udf(lambda *args: args[0], 'one long')
-        with self.assertRaisesRegexp(ValueError, error_str):
-            @pandas_grouped_udf
-            def zero_no_type(*args):
-                return args[0]
-        with self.assertRaisesRegexp(ValueError, error_str):
-            @pandas_grouped_udf("one long")
-            def zero_with_type(*args):
-                return args[0]
-
     def test_wrong_args(self):
         from pyspark.sql.functions import udf, pandas_udf, pandas_grouped_udf, sum
         df = self.data
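
The deleted assertions exercised `*args` functions; under the restored check in `_create_udf`, `argspec.varargs is None` exempts a `*args` signature from the arity error, so those cases no longer raise. A quick illustration of the predicate (using `inspect.getfullargspec`, the Python 3 equivalent of the `getargspec` call in the diff; `grouped_udf_arity_ok` is a made-up name):

import inspect

def grouped_udf_arity_ok(f):
    # Mirrors the restored pandas_grouped_udf check: reject only when the
    # function declares != 1 positional args AND has no *args catch-all.
    spec = inspect.getfullargspec(f)
    return not (len(spec.args) != 1 and spec.varargs is None)

print(grouped_udf_arity_ok(lambda pdf: pdf))        # True:  exactly one arg
print(grouped_udf_arity_ok(lambda pdf, x: pdf))     # False: two args, rejected
print(grouped_udf_arity_ok(lambda *args: args[0]))  # True:  *args is exempt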

python/pyspark/worker.py

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ def read_single_udf(pickleSer, infile, eval_type):
     if eval_type == PythonEvalType.SQL_PANDAS_UDF:
         return arg_offsets, wrap_pandas_udf(row_func, return_type)
     elif eval_type == PythonEvalType.SQL_PANDAS_GROUPED_UDF:
-        # a groupby apply udf has already been wrapped
+        # a groupby apply udf has already been wrapped under apply()
         return arg_offsets, row_func
     else:
         return arg_offsets, wrap_udf(row_func, return_type)
