From f307cbc699e964f9c3048253bf998cad7e4bdee7 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 19 Sep 2024 10:26:13 +0800 Subject: [PATCH 1/3] init init --- python/pyspark/sql/connect/expressions.py | 15 ++++++++++++++- python/pyspark/sql/tests/test_column.py | 23 ++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index 63128ef48e38..c6e1b2b9b99b 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -489,7 +489,20 @@ def __repr__(self) -> str: ts = TimestampNTZType().fromInternal(self._value) if ts is not None and isinstance(ts, datetime.datetime): return ts.strftime("%Y-%m-%d %H:%M:%S.%f") - # TODO(SPARK-49693): Refine the string representation of timedelta + elif isinstance(self._dataType, DayTimeIntervalType): + delta = DayTimeIntervalType().fromInternal(self._value) + if delta is not None and isinstance(delta, datetime.timedelta): + try: + import pandas as pd + + # Note: timedelta itself does not provide isoformat method. + # Both Pandas and java.time.Duration provide it, but the format + # is slightly different: + # java.time.Duration only applies HOURS, MINUTES, SECONDS units, + # while Pandas applies all supported units. 
+ return pd.Timedelta(delta).isoformat() + except Exception: + pass return f"{self._value}" diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index 220ecd387f7e..1972dd2804d9 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -19,12 +19,13 @@ from enum import Enum from itertools import chain import datetime +import unittest from pyspark.sql import Column, Row from pyspark.sql import functions as sf from pyspark.sql.types import StructType, StructField, IntegerType, LongType from pyspark.errors import AnalysisException, PySparkTypeError, PySparkValueError -from pyspark.testing.sqlutils import ReusedSQLTestCase +from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, pandas_requirement_message class ColumnTestsMixin: @@ -289,6 +290,26 @@ def test_lit_time_representation(self): ts = datetime.datetime(2021, 3, 4, 12, 34, 56, 1234) self.assertEqual(str(sf.lit(ts)), "Column<'2021-03-04 12:34:56.001234'>") + @unittest.skipIf(not have_pandas, pandas_requirement_message) + def test_lit_delta_representation(self): + for delta in [ + datetime.timedelta(days=1), + datetime.timedelta(hours=2), + datetime.timedelta(minutes=3), + datetime.timedelta(seconds=4), + datetime.timedelta(microseconds=5), + datetime.timedelta(days=2, hours=21, microseconds=908), + datetime.timedelta(days=1, minutes=-3, microseconds=-1001), + datetime.timedelta(days=1, hours=2, minutes=3, seconds=4, microseconds=5), + ]: + import pandas as pd + + # Column<'PT69H0.000908S'> or Column<'P2DT21H0M0.000908S'> + s = str(sf.lit(delta)) + + # Parse the ISO string representation and compare + self.assertTrue(pd.Timedelta(s[8:-2]).to_pytimedelta() == delta) + def test_enum_literals(self): class IntEnum(Enum): X = 1 From c0585319b5a5f3d07d18cbeba235ae0cc1fc6caa Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 19 Sep 2024 11:52:12 +0800 Subject: [PATCH 2/3] fix --- 
python/pyspark/sql/connect/expressions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index c6e1b2b9b99b..bbc9c3d65afe 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -500,7 +500,7 @@ def __repr__(self) -> str: # is slightly different: # java.time.Duration only applies HOURS, MINUTES, SECONDS units, # while Pandas applies all supported units. - return pd.Timedelta(delta).isoformat() + return pd.Timedelta(delta).isoformat() # type: ignore[attr-defined] except Exception: pass return f"{self._value}" From ed8222fc59b48a5558d81be4ed731d3c3a3f7c4d Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 19 Sep 2024 12:33:21 +0800 Subject: [PATCH 3/3] address comments --- python/pyspark/sql/connect/expressions.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index bbc9c3d65afe..0b5512b61925 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -492,17 +492,14 @@ def __repr__(self) -> str: elif isinstance(self._dataType, DayTimeIntervalType): delta = DayTimeIntervalType().fromInternal(self._value) if delta is not None and isinstance(delta, datetime.timedelta): - try: - import pandas as pd - - # Note: timedelta itself does not provide isoformat method. 
+ # Both Pandas and java.time.Duration provide it, but the format + # is slightly different: + # java.time.Duration only applies HOURS, MINUTES, SECONDS units, + # while Pandas applies all supported units. + return pd.Timedelta(delta).isoformat() # type: ignore[attr-defined] return f"{self._value}"