Commit addd35f (parent: 4d40893)

Added a check for when a pandas_udf return value is a timestamp with tz, and added comments on the conversion functions' input and output.

File tree: 3 files changed (+36, -9 lines)

python/pyspark/sql/dataframe.py

Lines changed: 2 additions & 2 deletions
@@ -1885,8 +1885,8 @@ def toPandas(self):
                 tables = self._collectAsArrow()
                 if tables:
                     table = pyarrow.concat_tables(tables)
-                    df = table.to_pandas()
-                    return _check_dataframe_localize_timestamps(df)
+                    pdf = table.to_pandas()
+                    return _check_dataframe_localize_timestamps(pdf)
                 else:
                     return pd.DataFrame.from_records([], columns=self.columns)
             except ImportError as e:
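
For context, the localization step that _check_dataframe_localize_timestamps applies to the Arrow-collected result can be reproduced standalone in pandas. A minimal sketch, assuming pandas (with dateutil) is installed; the column name "ts" and the sample dates are illustrative, and iteritems() matches the pandas API of this era (later replaced by items()):

import pandas as pd
from pandas.api.types import is_datetime64tz_dtype

# Illustrative tz-aware column; an Arrow-collected result looks similar.
pdf = pd.DataFrame({"ts": pd.date_range("2017-01-01", periods=3, tz="UTC")})
for column, series in pdf.iteritems():  # items() in pandas >= 2.0
    if is_datetime64tz_dtype(series.dtype):
        # Same chain as the diff: shift to local time, then drop the tz info.
        pdf[column] = series.dt.tz_convert("tzlocal()").dt.tz_localize(None)
print(pdf.dtypes)  # "ts" is now tz-naive datetime64[ns]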

python/pyspark/sql/tests.py

Lines changed: 18 additions & 0 deletions
@@ -3458,6 +3458,24 @@ def check_data(idx, timestamp, timestamp_copy):
             self.assertEquals(data[i][1], result[i][1])  # "timestamp" col
             self.assertTrue(result[i][3])  # "is_equal" data in udf was as expected
 
+    def test_vectorized_udf_return_timestamp_tz(self):
+        from pyspark.sql.functions import pandas_udf, col
+        import pandas as pd
+        df = self.spark.range(10)
+
+        @pandas_udf(returnType=TimestampType())
+        def gen_timestamps(id):
+            ts = [pd.Timestamp(i, unit='D', tz='America/Los_Angeles') for i in id]
+            return pd.Series(ts)
+
+        result = df.withColumn("ts", gen_timestamps(col("id"))).collect()
+        spark_ts_t = TimestampType()
+        for r in result:
+            i, ts = r
+            ts_tz = pd.Timestamp(i, unit='D', tz='America/Los_Angeles').to_pydatetime()
+            expected = spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz))
+            self.assertEquals(expected, ts)
+
 
 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
 class GroupbyApplyTests(ReusedPySparkTestCase):
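
The expected values in this test round-trip through Spark's internal timestamp representation. A minimal sketch of that round trip, assuming pyspark and pandas are importable (the day offset 5 is arbitrary):

import pandas as pd
from pyspark.sql.types import TimestampType

spark_ts_t = TimestampType()
ts_tz = pd.Timestamp(5, unit='D', tz='America/Los_Angeles').to_pydatetime()
# toInternal() stores the timestamp as microseconds since the epoch;
# fromInternal() reads it back as a tz-naive datetime in local time,
# which is what a collected Row contains.
internal = spark_ts_t.toInternal(ts_tz)
expected = spark_ts_t.fromInternal(internal)
print(internal, expected)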

python/pyspark/sql/types.py

Lines changed: 16 additions & 7 deletions
@@ -1629,24 +1629,33 @@ def to_arrow_type(dt):
     return arrow_type
 
 
-def _check_dataframe_localize_timestamps(df):
-    """ Convert timezone aware timestamps to timezone-naive in local time
+def _check_dataframe_localize_timestamps(pdf):
+    """
+    Convert timezone aware timestamps to timezone-naive in local time
+
+    :param pdf: pandas.DataFrame
+    :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive
     """
     from pandas.api.types import is_datetime64tz_dtype
-    for column, series in df.iteritems():
+    for column, series in pdf.iteritems():
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
         if is_datetime64tz_dtype(series.dtype):
-            df[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None)
-    return df
+            pdf[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None)
+    return pdf
 
 
 def _check_series_convert_timestamps_internal(s):
-    """ Convert a tz-naive timestamp in local tz to UTC normalized for Spark internal storage
     """
-    from pandas.api.types import is_datetime64_dtype
+    Convert a tz-naive timestamp in local tz to UTC normalized for Spark internal storage
+    :param s: a pandas.Series
+    :return pandas.Series where if it is a timestamp, it has been UTC normalized without a time zone
+    """
+    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64_dtype(s.dtype):
         return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
+    elif is_datetime64tz_dtype(s.dtype):
+        return s.dt.tz_convert('UTC')
     else:
         return s
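
The new elif branch handles series that are already timezone-aware, such as the output of the pandas_udf in the test above: they are converted straight to UTC rather than localized first. A minimal sketch exercising both branches, mirroring the function body from the diff (pandas only, no Spark required):

import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

def _check_series_convert_timestamps_internal(s):
    # tz-naive: assume local time, then normalize to UTC
    if is_datetime64_dtype(s.dtype):
        return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
    # tz-aware (the new branch): convert directly to UTC
    elif is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert('UTC')
    else:
        return s

naive = pd.Series(pd.date_range('1970-01-01', periods=2))
aware = pd.Series(pd.date_range('1970-01-01', periods=2, tz='America/Los_Angeles'))
print(_check_series_convert_timestamps_internal(naive).dt.tz)  # UTC
print(_check_series_convert_timestamps_internal(aware).dt.tz)  # UTC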