From f1e8be4552f2c50fc83def272921950648a85e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 23 Jan 2025 09:20:45 +0100 Subject: [PATCH 1/7] fix mean for datetime-like by using the respective dtype time resolution unit, adapting tests --- xarray/core/duck_array_ops.py | 15 +++---- xarray/tests/test_duck_array_ops.py | 64 +++++++++++++++++++---------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 7e7333fd8ea..9e702fcd6c8 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -666,12 +666,10 @@ def np_timedelta64_to_float(array, datetime_unit): Notes ----- - The array is first converted to microseconds, which is less likely to - cause overflow errors. + The array is first converted to datetimeunit. """ - array = array.astype("timedelta64[ns]").astype(np.float64) - conversion_factor = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit) - return conversion_factor * array + # todo: should we check for overflow here? + return array.astype(f"timedelta64[{datetime_unit}]").astype(np.float64) def pd_timedelta_to_float(value, datetime_unit): @@ -715,8 +713,11 @@ def mean(array, axis=None, skipna=None, **kwargs): if dtypes.is_datetime_like(array.dtype): offset = _datetime_nanmin(array) - # xarray always uses np.datetime64[ns] for np.datetime64 data - dtype = "timedelta64[ns]" + # from version 2025.01.2 xarray uses np.datetime64[unit] where unit + # is one of "s", "ms", "us", "ns" + # the respective unit is used for the timedelta representation + unit, _ = np.datetime_data(offset.dtype) + dtype = f"timedelta64[{unit}]" return ( _mean( datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index e1306964757..e26fe25e04c 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -9,6 +9,7 @@ from numpy import array, nan from xarray import DataArray, Dataset, cftime_range, concat +from xarray.coding.times import _NS_PER_TIME_DELTA from xarray.core import dtypes, duck_array_ops from xarray.core.duck_array_ops import ( array_notnull_equiv, @@ -28,6 +29,7 @@ where, ) from xarray.core.extension_array import PandasExtensionArray +from xarray.core.types import NPDatetimeUnitOptions, PDDatetimeUnitOptions from xarray.namedarray.pycompat import array_type from xarray.testing import assert_allclose, assert_equal, assert_identical from xarray.tests import ( @@ -411,10 +413,11 @@ def assert_dask_array(da, dask): @arm_xfail @pytest.mark.filterwarnings("ignore:All-NaN .* encountered:RuntimeWarning") @pytest.mark.parametrize("dask", [False, True] if has_dask else [False]) -def test_datetime_mean(dask: bool) -> None: +def test_datetime_mean(dask: bool, time_unit: PDDatetimeUnitOptions) -> None: # Note: only testing numpy, as dask is broken upstream + dtype = f"M8[{time_unit}]" da = DataArray( - np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8[ns]"), + np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype=dtype), dims=["time"], ) if dask: @@ -846,11 +849,11 @@ def test_multiple_dims(dtype, dask, skipna, func): @pytest.mark.parametrize("dask", [True, False]) -def test_datetime_to_numeric_datetime64(dask): +def test_datetime_to_numeric_datetime64(dask, time_unit: PDDatetimeUnitOptions): if dask and not has_dask: pytest.skip("requires dask") - times = pd.date_range("2000", periods=5, freq="7D").values + times = pd.date_range("2000", periods=5, freq="7D").as_unit(time_unit).values if dask: import dask.array @@ -923,15 +926,18 @@ def test_datetime_to_numeric_cftime(dask): @requires_cftime -def test_datetime_to_numeric_potential_overflow(): +def test_datetime_to_numeric_potential_overflow(time_unit: PDDatetimeUnitOptions): import cftime - times = pd.date_range("2000", periods=5, freq="7D").values.astype("datetime64[us]") + if time_unit == "ns": + pytest.skip("out-of-bounds datetime64 overflow") + dtype = f"M8[{time_unit}]" + times = pd.date_range("2000", periods=5, freq="7D").values.astype(dtype) cftimes = cftime_range( "2000", periods=5, freq="7D", calendar="proleptic_gregorian" ).values - offset = np.datetime64("0001-01-01") + offset = np.datetime64("0001-01-01", time_unit) cfoffset = cftime.DatetimeProlepticGregorian(1, 1, 1) result = duck_array_ops.datetime_to_numeric( @@ -957,24 +963,37 @@ def test_py_timedelta_to_float(): assert py_timedelta_to_float(dt.timedelta(days=1e6), "D") == 1e6 -@pytest.mark.parametrize( - "td, expected", - ([np.timedelta64(1, "D"), 86400 * 1e9], [np.timedelta64(1, "ns"), 1.0]), -) -def test_np_timedelta64_to_float(td, expected): - out = np_timedelta64_to_float(td, datetime_unit="ns") +@pytest.mark.parametrize("np_dt_unit", ["D", "h", "m", "s", "ms", "us", "ns"]) +def test_np_timedelta64_to_float( + np_dt_unit: NPDatetimeUnitOptions, time_unit: PDDatetimeUnitOptions +): + # tests any combination of source np.timedelta64 (NPDatetimeUnitOptions) with + # np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions) + td = np.timedelta64(1, np_dt_unit) + if _NS_PER_TIME_DELTA[np_dt_unit] < _NS_PER_TIME_DELTA[time_unit]: + pytest.skip("combination cannot be calculated without precision loss") + expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit] + + out = np_timedelta64_to_float(td, datetime_unit=time_unit) np.testing.assert_allclose(out, expected) assert isinstance(out, float) - out = np_timedelta64_to_float(np.atleast_1d(td), datetime_unit="ns") + out = np_timedelta64_to_float(np.atleast_1d(td), datetime_unit=time_unit) np.testing.assert_allclose(out, expected) -@pytest.mark.parametrize( - "td, expected", ([pd.Timedelta(1, "D"), 86400 * 1e9], [pd.Timedelta(1, "ns"), 1.0]) -) -def test_pd_timedelta_to_float(td, expected): - out = pd_timedelta_to_float(td, datetime_unit="ns") +@pytest.mark.parametrize("np_dt_unit", ["D", "h", "m", "s", "ms", "us", "ns"]) +def test_pd_timedelta_to_float( + np_dt_unit: NPDatetimeUnitOptions, time_unit: PDDatetimeUnitOptions +): + # tests any combination of source pd.Timedelta (NPDatetimeUnitOptions) with + # np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions) + td = pd.Timedelta(1, np_dt_unit) + if _NS_PER_TIME_DELTA[np_dt_unit] < _NS_PER_TIME_DELTA[time_unit]: + pytest.skip("combination cannot be calculated without precision loss") + expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit] + + out = pd_timedelta_to_float(td, datetime_unit=time_unit) np.testing.assert_allclose(out, expected) assert isinstance(out, float) @@ -982,10 +1001,11 @@ def test_pd_timedelta_to_float(td, expected): @pytest.mark.parametrize( "td", [dt.timedelta(days=1), np.timedelta64(1, "D"), pd.Timedelta(1, "D"), "1 day"] ) -def test_timedelta_to_numeric(td): +def test_timedelta_to_numeric(td, time_unit: PDDatetimeUnitOptions): # Scalar input - out = timedelta_to_numeric(td, "ns") - np.testing.assert_allclose(out, 86400 * 1e9) + out = timedelta_to_numeric(td, time_unit) + expected = _NS_PER_TIME_DELTA["D"] / _NS_PER_TIME_DELTA[time_unit] + np.testing.assert_allclose(out, expected) assert isinstance(out, float) From a66656bdb88059d63f8e9c49cec28dd9f1b3c089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 23 Jan 2025 10:06:20 +0100 Subject: [PATCH 2/7] fix mypy --- xarray/tests/test_duck_array_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index e26fe25e04c..736b00a4f20 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -877,8 +877,8 @@ def test_datetime_to_numeric_datetime64(dask, time_unit: PDDatetimeUnitOptions): result = duck_array_ops.datetime_to_numeric( times, datetime_unit="h", dtype=dtype ) - expected = 24 * np.arange(0, 35, 7).astype(dtype) - np.testing.assert_array_equal(result, expected) + expected2 = 24 * np.arange(0, 35, 7).astype(dtype) + np.testing.assert_array_equal(result, expected2) @requires_cftime From 618760d9b539d1a89b29d1cc76fcfd7bd337e9b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 23 Jan 2025 11:53:29 +0100 Subject: [PATCH 3/7] add PR to existing entry for non-nanosecond datetimes --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9b40a323f39..8a9ba2d2ea3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -50,7 +50,7 @@ eventually be deprecated. New Features ~~~~~~~~~~~~ -- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`). +- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`, :pull:`9977`). By `Kai Mühlbauer `_ and `Spencer Clark `_. - Improve the error message raised when no key is matching the available variables in a dataset. (:pull:`9943`) By `Jimmy Westling `_. From 5d919a092ba134eaa136408c89fdee3e4d156f8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 25 Jan 2025 19:00:32 +0100 Subject: [PATCH 4/7] Update xarray/core/duck_array_ops.py Co-authored-by: Spencer Clark --- xarray/core/duck_array_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 9e702fcd6c8..4f9f079467c 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -668,8 +668,9 @@ def np_timedelta64_to_float(array, datetime_unit): ----- The array is first converted to datetimeunit. """ - # todo: should we check for overflow here? - return array.astype(f"timedelta64[{datetime_unit}]").astype(np.float64) + unit, _ = np.datetime_data(array.dtype) + conversion_factor = np.timedelta64(1, unit) / np.timedelta64(1, datetime_unit) + return conversion_factor * array.astype(np.float64) def pd_timedelta_to_float(value, datetime_unit): From dfc9e03c88eb7d97cb3eff6d57f60384e6098585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 25 Jan 2025 19:03:11 +0100 Subject: [PATCH 5/7] cast to "int64" in calculation of datime-like mean --- xarray/core/duck_array_ops.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 4f9f079467c..46dc81c62c6 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -662,12 +662,7 @@ def _to_pytimedelta(array, unit="us"): def np_timedelta64_to_float(array, datetime_unit): - """Convert numpy.timedelta64 to float. - - Notes - ----- - The array is first converted to datetimeunit. - """ + """Convert numpy.timedelta64 to float, possibly at a loss of resolution.""" unit, _ = np.datetime_data(array.dtype) conversion_factor = np.timedelta64(1, unit) / np.timedelta64(1, datetime_unit) return conversion_factor * array.astype(np.float64) @@ -714,15 +709,16 @@ def mean(array, axis=None, skipna=None, **kwargs): if dtypes.is_datetime_like(array.dtype): offset = _datetime_nanmin(array) - # from version 2025.01.2 xarray uses np.datetime64[unit] where unit - # is one of "s", "ms", "us", "ns" - # the respective unit is used for the timedelta representation - unit, _ = np.datetime_data(offset.dtype) - dtype = f"timedelta64[{unit}]" + # From version 2025.01.2 xarray uses np.datetime64[unit], where unit + # is one of "s", "ms", "us", "ns". + # To not have to worry about the resolution, we just convert the output + # to "int64" and let the dtype of offset take precedence. + # This is fully backwards compatible with datetime64[ns]. For other + # this might lead to imprecise output, where fractional parts are truncated. return ( _mean( datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs - ).astype(dtype) + ).astype("int64") + offset ) elif _contains_cftime_datetimes(array): From 180aa0a843d80e977257c54179026bda45f1c096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 26 Jan 2025 11:30:48 +0100 Subject: [PATCH 6/7] Apply suggestions from code review --- xarray/core/duck_array_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 46dc81c62c6..31391854208 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -712,13 +712,13 @@ def mean(array, axis=None, skipna=None, **kwargs): # From version 2025.01.2 xarray uses np.datetime64[unit], where unit # is one of "s", "ms", "us", "ns". # To not have to worry about the resolution, we just convert the output - # to "int64" and let the dtype of offset take precedence. + # to "timedelta64" (without unit) and let the dtype of offset take precedence. # This is fully backwards compatible with datetime64[ns]. For other # this might lead to imprecise output, where fractional parts are truncated. return ( _mean( datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs - ).astype("int64") + ).astype("timedelta64") + offset ) elif _contains_cftime_datetimes(array): From 295d8167ff5a45701f06b219ec5dac0a8f3c87ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 26 Jan 2025 17:08:20 +0100 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Spencer Clark --- xarray/core/duck_array_ops.py | 3 +-- xarray/tests/test_duck_array_ops.py | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 31391854208..c3f1598050a 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -713,8 +713,7 @@ def mean(array, axis=None, skipna=None, **kwargs): # is one of "s", "ms", "us", "ns". # To not have to worry about the resolution, we just convert the output # to "timedelta64" (without unit) and let the dtype of offset take precedence. - # This is fully backwards compatible with datetime64[ns]. For other - # this might lead to imprecise output, where fractional parts are truncated. + # This is fully backwards compatible with datetime64[ns]. return ( _mean( datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 736b00a4f20..9b8b50d99a1 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -970,8 +970,6 @@ def test_np_timedelta64_to_float( # tests any combination of source np.timedelta64 (NPDatetimeUnitOptions) with # np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions) td = np.timedelta64(1, np_dt_unit) - if _NS_PER_TIME_DELTA[np_dt_unit] < _NS_PER_TIME_DELTA[time_unit]: - pytest.skip("combination cannot be calculated without precision loss") expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit] out = np_timedelta64_to_float(td, datetime_unit=time_unit) @@ -989,8 +987,6 @@ def test_pd_timedelta_to_float( # tests any combination of source pd.Timedelta (NPDatetimeUnitOptions) with # np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions) td = pd.Timedelta(1, np_dt_unit) - if _NS_PER_TIME_DELTA[np_dt_unit] < _NS_PER_TIME_DELTA[time_unit]: - pytest.skip("combination cannot be calculated without precision loss") expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit] out = pd_timedelta_to_float(td, datetime_unit=time_unit)