Skip to content

Commit 94dca78

Browse files
committed
[SPARK-49693][PYTHON][CONNECT] Refine the string representation of timedelta
### What changes were proposed in this pull request? Refine the string representation of `timedelta` by following the ISO format. Note that the units used on the JVM side (`Duration`) and in Pandas are different. ### Why are the changes needed? We should not leak the raw data. ### Does this PR introduce _any_ user-facing change? yes PySpark Classic: ``` In [1]: from pyspark.sql import functions as sf In [2]: import datetime In [3]: sf.lit(datetime.timedelta(1, 1)) Out[3]: Column<'PT24H1S'> ``` PySpark Connect (before): ``` In [1]: from pyspark.sql import functions as sf In [2]: import datetime In [3]: sf.lit(datetime.timedelta(1, 1)) Out[3]: Column<'86401000000'> ``` PySpark Connect (after): ``` In [1]: from pyspark.sql import functions as sf In [2]: import datetime In [3]: sf.lit(datetime.timedelta(1, 1)) Out[3]: Column<'P1DT0H0M1S'> ``` ### How was this patch tested? added test ### Was this patch authored or co-authored using generative AI tooling? no Closes #48159 from zhengruifeng/pc_lit_delta. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 398457a commit 94dca78

File tree

2 files changed

+33
-2
lines changed

2 files changed

+33
-2
lines changed

python/pyspark/sql/connect/expressions.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,17 @@ def __repr__(self) -> str:
489489
ts = TimestampNTZType().fromInternal(self._value)
490490
if ts is not None and isinstance(ts, datetime.datetime):
491491
return ts.strftime("%Y-%m-%d %H:%M:%S.%f")
492-
# TODO(SPARK-49693): Refine the string representation of timedelta
492+
elif isinstance(self._dataType, DayTimeIntervalType):
493+
delta = DayTimeIntervalType().fromInternal(self._value)
494+
if delta is not None and isinstance(delta, datetime.timedelta):
495+
import pandas as pd
496+
497+
# Note: timedelta itself does not provide isoformat method.
498+
# Both Pandas and java.time.Duration provide it, but the format
499+
# is slightly different:
500+
# java.time.Duration only applies HOURS, MINUTES, SECONDS units,
501+
# while Pandas applies all supported units.
502+
return pd.Timedelta(delta).isoformat() # type: ignore[attr-defined]
493503
return f"{self._value}"
494504

495505

python/pyspark/sql/tests/test_column.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
from enum import Enum
2020
from itertools import chain
2121
import datetime
22+
import unittest
2223

2324
from pyspark.sql import Column, Row
2425
from pyspark.sql import functions as sf
2526
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
2627
from pyspark.errors import AnalysisException, PySparkTypeError, PySparkValueError
27-
from pyspark.testing.sqlutils import ReusedSQLTestCase
28+
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, pandas_requirement_message
2829

2930

3031
class ColumnTestsMixin:
@@ -289,6 +290,26 @@ def test_lit_time_representation(self):
289290
ts = datetime.datetime(2021, 3, 4, 12, 34, 56, 1234)
290291
self.assertEqual(str(sf.lit(ts)), "Column<'2021-03-04 12:34:56.001234'>")
291292

293+
@unittest.skipIf(not have_pandas, pandas_requirement_message)
294+
def test_lit_delta_representation(self):
295+
for delta in [
296+
datetime.timedelta(days=1),
297+
datetime.timedelta(hours=2),
298+
datetime.timedelta(minutes=3),
299+
datetime.timedelta(seconds=4),
300+
datetime.timedelta(microseconds=5),
301+
datetime.timedelta(days=2, hours=21, microseconds=908),
302+
datetime.timedelta(days=1, minutes=-3, microseconds=-1001),
303+
datetime.timedelta(days=1, hours=2, minutes=3, seconds=4, microseconds=5),
304+
]:
305+
import pandas as pd
306+
307+
# Column<'PT69H0.000908S'> or Column<'P2DT21H0M0.000908S'>
308+
s = str(sf.lit(delta))
309+
310+
# Parse the ISO string representation and compare
311+
self.assertTrue(pd.Timedelta(s[8:-2]).to_pytimedelta() == delta)
312+
292313
def test_enum_literals(self):
293314
class IntEnum(Enum):
294315
X = 1

0 commit comments

Comments
 (0)