Skip to content

Commit d206b7c

Browse files
committed
Print a warning message instead of throwing an exception.
1 parent 2fa15bd commit d206b7c

File tree

2 files changed

+17
-43
lines changed

2 files changed

+17
-43
lines changed

python/pyspark/sql/tests.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4637,44 +4637,6 @@ def foofoo(x, y):
46374637
).collect
46384638
)
46394639

4640-
def test_pandas_udf_when_input_has_none(self):
4641-
import math
4642-
from pyspark.sql.functions import pandas_udf
4643-
import pandas as pd
4644-
4645-
values = [1.0] * 10 + [None] * 10 + [2.0] * 10
4646-
pdf = pd.DataFrame({'A': values})
4647-
df = self.spark.createDataFrame(pdf).repartition(1)
4648-
4649-
@pandas_udf(returnType=DoubleType())
4650-
def gt_2_double(column):
4651-
return (column >= 2).where(column.notnull())
4652-
4653-
# This pandas udf returns Pandas.Series of dtype as float64.
4654-
# If we define the pandas udf with incorrect data type BooleanType,
4655-
# we should see an exception.
4656-
@pandas_udf(returnType=BooleanType())
4657-
def gt_2_boolean(column):
4658-
return (column >= 2).where(column.notnull())
4659-
4660-
udf_double = df.select(['A']).withColumn('udf', gt_2_double('A'))
4661-
udf_boolean = df.select(['A']).withColumn('udf', gt_2_boolean('A'))
4662-
4663-
result = udf_double.collect()
4664-
result_part1 = [x[1] for x in result if x[0] == 1.0]
4665-
self.assertEqual(set(result_part1), set([0.0]))
4666-
result_part2 = [x[1] for x in result if x[0] == 2.0]
4667-
self.assertEqual(set(result_part2), set([1.0]))
4668-
result_part3 = [x[1] for x in result if math.isnan(x[0])]
4669-
self.assertEqual(set(result_part3), set([None]))
4670-
4671-
with QuietTest(self.sc):
4672-
with self.assertRaisesRegexp(Exception, "Return Pandas.Series of the user-defined " +
4673-
"function's dtype is float64 which doesn't " +
4674-
"match the arrow type bool of defined type " +
4675-
"BooleanType"):
4676-
udf_boolean.collect()
4677-
46784640

46794641
@unittest.skipIf(
46804642
not _have_pandas or not _have_pyarrow,

python/pyspark/worker.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,23 @@ def verify_result_length(*a):
9595

9696
# Ensure return type of Pandas.Series matches the arrow return type of the user-defined
9797
# function. Otherwise, we may produce incorrect serialized data.
98-
arrow_type_of_result = pa.from_numpy_dtype(result.dtype)
99-
if arrow_return_type != arrow_type_of_result:
100-
raise TypeError("Return Pandas.Series of the user-defined function's dtype is %s "
101-
"which doesn't match the arrow type %s "
102-
"of defined type %s" % (result.dtype, arrow_return_type, return_type))
98+
# Note: for timestamp type, we only need to ensure both types are timestamp because the
99+
# serializer will do conversion.
100+
try:
101+
arrow_type_of_result = pa.from_numpy_dtype(result.dtype)
102+
both_are_timestamp = pa.types.is_timestamp(arrow_type_of_result) and \
103+
pa.types.is_timestamp(arrow_return_type)
104+
if not both_are_timestamp and arrow_return_type != arrow_type_of_result:
105+
print("WARN: Arrow type %s of return Pandas.Series of the user-defined function's "
106+
"dtype %s doesn't match the arrow type %s "
107+
"of defined return type %s" % (arrow_type_of_result, result.dtype,
108+
arrow_return_type, return_type),
109+
file=sys.stderr)
110+
except:
111+
print("WARN: Can't infer arrow type of Pandas.Series's dtype: %s, which might not match "
112+
"the arrow type %s of defined return type %s" % (result.dtype, arrow_return_type,
113+
return_type),
114+
file=sys.stderr)
103115

104116
return result
105117

0 commit comments

Comments
 (0)