Restrict the number of arguments for grouped udf to only 1.

ueshin · ueshin · commit 789e642763ab · 2017-10-16T23:13:43.000+09:00
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -2135,15 +2135,20 @@ def wrapper(*args):
 def _create_udf(f, returnType, pythonUdfType):
 
     def _udf(f, returnType=StringType(), pythonUdfType=pythonUdfType):
-        if pythonUdfType == PythonUdfType.PANDAS_UDF \
-           or pythonUdfType == PythonUdfType.PANDAS_GROUPED_UDF:
+        if pythonUdfType == PythonUdfType.PANDAS_UDF:  # pandas_udf: only 0-arg is rejected
             import inspect
             argspec = inspect.getargspec(f)
             if len(argspec.args) == 0 and argspec.varargs is None:
                 raise ValueError(
-                    "0-arg pandas_udfs/pandas_grouped_udfs are not supported. "
+                    "0-arg pandas_udfs are not supported. "
                     "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
                 )
+        elif pythonUdfType == PythonUdfType.PANDAS_GROUPED_UDF:  # grouped udf: arity must be 1
+            import inspect
+            argspec = inspect.getargspec(f)
+            if len(argspec.args) != 1 and argspec.varargs is None:  # *args signatures pass
+                raise ValueError("Only 1-arg pandas_grouped_udfs are supported.")
+
         udf_obj = UserDefinedFunction(f, returnType, pythonUdfType=pythonUdfType)
         return udf_obj._wrapped()
 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -3516,6 +3516,32 @@ def test_wrong_return_type(self):
             with self.assertRaisesRegexp(Exception, 'Invalid.*type'):
                 df.groupby('id').apply(foo).sort('id').toPandas()
 
+    def test_zero_or_more_than_1_parameters(self):  # only 1-arg functions are accepted
+        from pyspark.sql.functions import pandas_grouped_udf
+        error_str = 'Only 1-arg pandas_grouped_udfs are supported.'
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(ValueError, error_str):  # zero-parameter cases
+                pandas_grouped_udf(lambda: 1, 'one long')
+            with self.assertRaisesRegexp(ValueError, error_str):
+                @pandas_grouped_udf
+                def zero_no_type():
+                    return 1
+            with self.assertRaisesRegexp(ValueError, error_str):
+                @pandas_grouped_udf("one long")
+                def zero_with_type():
+                    return 1
+            # Two-parameter functions must be rejected with the same error.
+            with self.assertRaisesRegexp(ValueError, error_str):
+                pandas_grouped_udf(lambda pdf, x: pdf, 'one long')
+            with self.assertRaisesRegexp(ValueError, error_str):
+                @pandas_grouped_udf
+                def two_no_type(pdf, x):
+                    return pdf
+            with self.assertRaisesRegexp(ValueError, error_str):
+                @pandas_grouped_udf("one long")
+                def two_with_type(pdf, x):
+                    return pdf
+
     def test_wrong_args(self):
         from pyspark.sql.functions import udf, pandas_udf, pandas_grouped_udf, sum
         df = self.data