Skip to content

Commit bdda733

Browse files
committed
Unify timedelta64 coding logic between the old and new approaches
Always write a dtype attribute to disk regardless of how the timedeltas were decoded.
1 parent 4a581f4 commit bdda733

File tree

5 files changed

+159
-170
lines changed

5 files changed

+159
-170
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ Bug fixes
121121
(:pull:`10352`). By `Spencer Clark <https://github.com/spencerkclark>`_.
122122
- Avoid unsafe casts from float to unsigned int in CFMaskCoder (:issue:`9815`, :pull:`9964`).
123123
By ` Elliott Sales de Andrade <https://github.com/QuLogic>`_.
124-
- Fix attribute overwriting bug when decoding literally encoded
125-
:py:class:`numpy.timedelta64` values from disk (:issue:`10468`,
126-
:pull:`10469`). By `Spencer Clark <https://github.com/spencerkclark>`_.
127-
- Fix default ``"_FillValue"`` dtype coercion bug when literally encoding
124+
- Fix attribute overwriting bug when decoding encoded
125+
:py:class:`numpy.timedelta64` values from disk with a dtype attribute
126+
(:issue:`10468`, :pull:`10469`). By `Spencer Clark
127+
<https://github.com/spencerkclark>`_.
128+
- Fix default ``"_FillValue"`` dtype coercion bug when encoding
128129
:py:class:`numpy.timedelta64` values to an on-disk format that only supports
129130
32-bit integers (:issue:`10466`, :pull:`10469`). By `Spencer Clark
130131
<https://github.com/spencerkclark>`_.

xarray/backends/netcdf3.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def encode_nc3_attrs(attrs):
103103
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}
104104

105105

106-
def _maybe_prepare_times(var, name=None):
106+
def _maybe_prepare_times(var):
107107
# checks for integer-based time-like and
108108
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
109109
# this keeps backwards compatibility
@@ -112,21 +112,7 @@ def _maybe_prepare_times(var, name=None):
112112
if data.dtype.kind in "iu":
113113
units = var.attrs.get("units", None)
114114
if units is not None and coding.variables._is_time_like(units):
115-
default_int64_fill_value = np.iinfo(np.int64).min
116-
default_int32_fill_value = np.iinfo(np.int32).min
117-
mask = data == default_int64_fill_value
118-
119-
if var.attrs.get("_FillValue") == default_int64_fill_value:
120-
if (data == default_int32_fill_value).any():
121-
raise ValueError(
122-
f"Could not safely coerce default int64 _FillValue "
123-
f"({default_int64_fill_value}) to the analogous int32 "
124-
f"value ({default_int32_fill_value}), since it "
125-
f"already exists as non-missing within variable "
126-
f"{name!r}. Try explicitly setting "
127-
f"encoding['_FillValue'] to another int32 value."
128-
)
129-
var.attrs["_FillValue"] = default_int32_fill_value
115+
mask = data == np.iinfo(np.int64).min
130116
if mask.any():
131117
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
132118
return data
@@ -138,7 +124,7 @@ def encode_nc3_variable(var, name=None):
138124
coding.strings.CharacterArrayCoder(),
139125
]:
140126
var = coder.encode(var, name=name)
141-
data = _maybe_prepare_times(var, name=name)
127+
data = _maybe_prepare_times(var)
142128
data = coerce_nc3_dtype(data)
143129
attrs = encode_nc3_attrs(var.attrs)
144130
return Variable(var.dims, data, attrs, var.encoding)

xarray/coding/times.py

Lines changed: 67 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,6 +1410,43 @@ def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
14101410
return isinstance(dtype, str) and dtype.startswith("timedelta64")
14111411

14121412

1413+
def resolve_time_unit_from_attrs_dtype(
1414+
attrs_dtype: str, name: T_Name
1415+
) -> PDDatetimeUnitOptions:
1416+
dtype = np.dtype(attrs_dtype)
1417+
resolution, _ = np.datetime_data(dtype)
1418+
resolution = cast(NPDatetimeUnitOptions, resolution)
1419+
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1420+
time_unit = cast(PDDatetimeUnitOptions, "s")
1421+
message = (
1422+
f"Following pandas, xarray only supports decoding to timedelta64 "
1423+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1424+
f"values for variable {name!r} have a resolution of "
1425+
f"{resolution!r}. Attempting to decode to a resolution of 's'. "
1426+
f"Note, depending on the encoded values, this may lead to an "
1427+
f"OverflowError. Additionally, data will not be identically round "
1428+
f"tripped; xarray will choose an encoding dtype of "
1429+
f"'timedelta64[s]' when re-encoding."
1430+
)
1431+
emit_user_level_warning(message)
1432+
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1433+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1434+
message = (
1435+
f"Following pandas, xarray only supports decoding to timedelta64 "
1436+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1437+
f"values for variable {name!r} have a resolution of "
1438+
f"{resolution!r}. Attempting to decode to a resolution of 'ns'. "
1439+
f"Note, depending on the encoded values, this may lead to loss of "
1440+
f"precision. Additionally, data will not be identically round "
1441+
f"tripped; xarray will choose an encoding dtype of "
1442+
f"'timedelta64[ns]' when re-encoding."
1443+
)
1444+
emit_user_level_warning(message)
1445+
else:
1446+
time_unit = cast(PDDatetimeUnitOptions, resolution)
1447+
return time_unit
1448+
1449+
14131450
class CFTimedeltaCoder(VariableCoder):
14141451
"""Coder for CF Timedelta coding.
14151452
@@ -1430,7 +1467,7 @@ class CFTimedeltaCoder(VariableCoder):
14301467

14311468
def __init__(
14321469
self,
1433-
time_unit: PDDatetimeUnitOptions = "ns",
1470+
time_unit: PDDatetimeUnitOptions | None = None,
14341471
decode_via_units: bool = True,
14351472
decode_via_dtype: bool = True,
14361473
) -> None:
@@ -1442,45 +1479,18 @@ def __init__(
14421479
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
14431480
if np.issubdtype(variable.data.dtype, np.timedelta64):
14441481
dims, data, attrs, encoding = unpack_for_encoding(variable)
1445-
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
1446-
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
1447-
dtype = encoding.get("dtype", None)
1448-
units = encoding.pop("units", None)
1482+
dtype = encoding.get("dtype", None)
1483+
units = encoding.pop("units", None)
14491484

1450-
# in the case of packed data we need to encode into
1451-
# float first, the correct dtype will be established
1452-
# via CFScaleOffsetCoder/CFMaskCoder
1453-
if "add_offset" in encoding or "scale_factor" in encoding:
1454-
dtype = data.dtype if data.dtype.kind == "f" else "float64"
1485+
# in the case of packed data we need to encode into
1486+
# float first, the correct dtype will be established
1487+
# via CFScaleOffsetCoder/CFMaskCoder
1488+
if "add_offset" in encoding or "scale_factor" in encoding:
1489+
dtype = data.dtype if data.dtype.kind == "f" else "float64"
14551490

1456-
else:
1457-
resolution, _ = np.datetime_data(variable.dtype)
1458-
dtype = np.int64
1459-
attrs_dtype = f"timedelta64[{resolution}]"
1460-
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
1461-
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
1462-
# Remove dtype encoding if it exists to prevent it from
1463-
# interfering downstream in NonStringCoder.
1464-
encoding.pop("dtype", None)
1465-
1466-
if any(
1467-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1468-
):
1469-
raise ValueError(
1470-
f"Specifying 'add_offset' or 'scale_factor' is not "
1471-
f"supported when encoding the timedelta64 values of "
1472-
f"variable {name!r} with xarray's new default "
1473-
f"timedelta64 encoding approach. To encode {name!r} "
1474-
f"with xarray's previous timedelta64 encoding "
1475-
f"approach, which supports the 'add_offset' and "
1476-
f"'scale_factor' parameters, additionally set "
1477-
f"encoding['units'] to a unit of time, e.g. "
1478-
f"'seconds'. To proceed with encoding of {name!r} "
1479-
f"via xarray's new approach, remove any encoding "
1480-
f"entries for 'add_offset' or 'scale_factor'."
1481-
)
1482-
if "_FillValue" not in encoding and "missing_value" not in encoding:
1483-
encoding["_FillValue"] = np.iinfo(np.int64).min
1491+
resolution, _ = np.datetime_data(variable.dtype)
1492+
attrs_dtype = f"timedelta64[{resolution}]"
1493+
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
14841494

14851495
data, units = encode_cf_timedelta(data, units, dtype)
14861496
safe_setitem(attrs, "units", units, name=name)
@@ -1499,57 +1509,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
14991509
):
15001510
dims, data, attrs, encoding = unpack_for_decoding(variable)
15011511
units = pop_to(attrs, encoding, "units")
1502-
if is_dtype_decodable and self.decode_via_dtype:
1503-
if any(
1504-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1505-
):
1506-
raise ValueError(
1507-
f"Decoding timedelta64 values via dtype is not "
1508-
f"supported when 'add_offset', or 'scale_factor' are "
1509-
f"present in encoding. Check the encoding parameters "
1510-
f"of variable {name!r}."
1511-
)
1512-
# Overwrite the on-disk dtype encoding, which is numeric, with
1513-
# the dtype attribute stored on disk, which corresponds to
1514-
# a timedelta64 dtype.
1515-
encoding["dtype"] = attrs.pop("dtype")
1516-
dtype = np.dtype(encoding["dtype"])
1517-
resolution, _ = np.datetime_data(dtype)
1518-
resolution = cast(NPDatetimeUnitOptions, resolution)
1519-
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1520-
time_unit = cast(PDDatetimeUnitOptions, "s")
1521-
dtype = np.dtype("timedelta64[s]")
1522-
message = (
1523-
f"Following pandas, xarray only supports decoding to "
1524-
f"timedelta64 values with a resolution of 's', 'ms', "
1525-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1526-
f"have a resolution of {resolution!r}. Attempting to "
1527-
f"decode to a resolution of 's'. Note, depending on "
1528-
f"the encoded values, this may lead to an "
1529-
f"OverflowError. Additionally, data will not be "
1530-
f"identically round tripped; xarray will choose an "
1531-
f"encoding dtype of 'timedelta64[s]' when re-encoding."
1532-
)
1533-
emit_user_level_warning(message)
1534-
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1535-
time_unit = cast(PDDatetimeUnitOptions, "ns")
1536-
dtype = np.dtype("timedelta64[ns]")
1537-
message = (
1538-
f"Following pandas, xarray only supports decoding to "
1539-
f"timedelta64 values with a resolution of 's', 'ms', "
1540-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1541-
f"have a resolution of {resolution!r}. Attempting to "
1542-
f"decode to a resolution of 'ns'. Note, depending on "
1543-
f"the encoded values, this may lead to loss of "
1544-
f"precision. Additionally, data will not be "
1545-
f"identically round tripped; xarray will choose an "
1546-
f"encoding dtype of 'timedelta64[ns]' "
1547-
f"when re-encoding."
1548-
)
1549-
emit_user_level_warning(message)
1512+
if is_dtype_decodable:
1513+
attrs_dtype = attrs.pop("dtype")
1514+
if self.time_unit is None:
1515+
time_unit = resolve_time_unit_from_attrs_dtype(attrs_dtype, name)
15501516
else:
1551-
time_unit = cast(PDDatetimeUnitOptions, resolution)
1552-
elif self.decode_via_units:
1517+
time_unit = self.time_unit
1518+
else:
15531519
if self._emit_decode_timedelta_future_warning:
15541520
emit_user_level_warning(
15551521
"In a future version, xarray will not decode "
@@ -1567,8 +1533,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
15671533
"'CFTimedeltaCoder' instance.",
15681534
FutureWarning,
15691535
)
1570-
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
1571-
time_unit = self.time_unit
1536+
if self.time_unit is None:
1537+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1538+
else:
1539+
time_unit = self.time_unit
1540+
1541+
# Handle edge case that decode_via_dtype=False and
1542+
# decode_via_units=True, and timedeltas were encoded with a
1543+
# dtype attribute. We need to remove the dtype attribute
1544+
# to prevent an error during round tripping.
1545+
if has_timedelta_dtype:
1546+
attrs.pop("dtype")
1547+
1548+
dtype = np.dtype(f"timedelta64[{time_unit}]")
15721549
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
15731550
data = lazy_elemwise_func(data, transform, dtype=dtype)
15741551
return Variable(dims, data, attrs, encoding, fastpath=True)

xarray/tests/test_backends.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from xarray.conventions import encode_dataset_coordinates
5757
from xarray.core import indexing
5858
from xarray.core.options import set_options
59+
from xarray.core.types import PDDatetimeUnitOptions
5960
from xarray.core.utils import module_available
6061
from xarray.namedarray.pycompat import array_type
6162
from xarray.tests import (
@@ -351,16 +352,6 @@ def test_dtype_coercion_error(self) -> None:
351352
with pytest.raises(ValueError, match="could not safely cast"):
352353
ds.to_netcdf(path, format=format)
353354

354-
def test_literal_timedelta_fill_value_coercion_error(self) -> None:
355-
for format in self.netcdf3_formats:
356-
timedeltas = np.array(
357-
[0, np.iinfo(np.int32).min, np.iinfo(np.int64).min]
358-
).astype("timedelta64[s]")
359-
ds = Dataset({"timedeltas": ("timedeltas", timedeltas)})
360-
with create_tmp_file(allow_cleanup_failure=False) as path:
361-
with pytest.raises(ValueError, match="_FillValue"):
362-
ds.to_netcdf(path, format=format)
363-
364355

365356
class DatasetIOBase:
366357
engine: T_NetcdfEngine | None = None
@@ -652,8 +643,10 @@ def test_roundtrip_timedelta_data(self) -> None:
652643
) as actual:
653644
assert_identical(expected, actual)
654645

655-
def test_roundtrip_literal_timedelta_data(self) -> None:
656-
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
646+
def test_roundtrip_timedelta_data_via_dtype(
647+
self, time_unit: PDDatetimeUnitOptions
648+
) -> None:
649+
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit(time_unit) # type: ignore[arg-type, unused-ignore]
657650
expected = Dataset(
658651
{"td": ("td", time_deltas), "td0": time_deltas[0].to_numpy()}
659652
)

0 commit comments

Comments
 (0)