Skip to content
36 changes: 36 additions & 0 deletions doc/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,42 @@ converting ``NaN`` to ``-9999``, we would use
``encoding={'foo': {'dtype': 'int16', 'scale_factor': 0.1, '_FillValue': -9999}}``.
Compression and decompression with such discretization is extremely fast.

.. _io.string-encoding:

String encoding
...............

xarray can write unicode strings to netCDF files in two ways:

- As variable length strings. This is only supported on netCDF4 (HDF5) files.
- By encoding strings into bytes, and writing encoded bytes as a character
array. The default encoding is UTF-8.

By default, we use variable length strings for compatible files and fall back
to using encoded character arrays. Character arrays can be selected even for
netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1``
(corresponding to NumPy's single-character bytes dtype).

If character arrays are used, the string encoding that was used is stored on
disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
`adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
At the time of this writing (October 2017), a standard convention for indicating
string encoding for character arrays in netCDF files was
`still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
Technically, you can use
`any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
by setting the ``_Encoding`` field in ``encoding``. But
`we don't recommend it <http://utf8everywhere.org/>`_.

.. warning::

Missing values in bytes or unicode string arrays (represented by ``NaN`` in
xarray) are currently written to disk as empty strings ``''``. This means
missing values will not be restored when data is loaded from disk.
This behavior is likely to change in the future (:issue:`1647`).
Unfortunately, explicitly setting a ``_FillValue`` for string arrays to handle
missing values doesn't work yet either, though we also hope to fix this in the
future.

Chunk based compression
.......................
Expand Down
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ Breaking changes
produce a warning encouraging users to adopt the new syntax.
By `Daniel Rothenberg <https://github.com/darothen>`_.

- Unicode strings (``str`` on Python 3) are now round-tripped successfully even
when written as character arrays (e.g., as netCDF3 files or when using
``engine='scipy'``) (:issue:`1638`). This is controlled by the ``_Encoding``
attribute convention, which is also understood directly by the netCDF4-Python
interface. See :ref:`io.string-encoding` for full details.
By `Stephan Hoyer <https://github.com/shoyer>`_.

- ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
Datasets loaded with ``open_dataset`` won't automatically read coords from
disk when calling ``repr`` (:issue:`1522`).
Expand Down
1 change: 1 addition & 0 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def set_necessary_dimensions(self, variable, unlimited_dims=None):


class WritableCFDataStore(AbstractWritableDataStore):

def store(self, variables, attributes, *args, **kwargs):
# All NetCDF files get CF encoded by default, without this attempting
# to write times, for example, would fail.
Expand Down
15 changes: 11 additions & 4 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,21 @@ def prepare_variable(self, name, variable, check_encoding=False,

attrs = variable.attrs.copy()
variable, dtype = _nc4_values_and_dtype(variable)
if dtype is str:
dtype = h5py.special_dtype(vlen=unicode_type)

self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

fill_value = attrs.pop('_FillValue', None)
if fill_value in ['\x00']:
fill_value = None
if dtype is str and fill_value is not None:
raise NotImplementedError(
'h5netcdf does not yet support setting a fill value for '
'variable-length strings '
'(https://github.com/shoyer/h5netcdf/issues/37). '
"Either remove '_FillValue' from encoding on variable %r "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
'NC_CHAR type.' % name)

if dtype is str:
dtype = h5py.special_dtype(vlen=unicode_type)

encoding = _extract_h5nc_encoding(variable,
raise_on_invalid=check_encoding)
Expand Down
58 changes: 33 additions & 25 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@

import numpy as np

from .. import conventions
from .. import Variable
from ..conventions import pop_to
from ..core import indexing
from ..core.utils import (FrozenOrderedDict, NdimSizeLenMixin,
DunderArrayMixin, close_on_error,
is_remote_uri)
from ..core.pycompat import iteritems, basestring, OrderedDict, PY3
from ..core.pycompat import iteritems, basestring, OrderedDict, PY3, suppress

from .common import (WritableCFDataStore, robust_getitem,
DataStorePickleMixin, find_root)
from .netcdf3 import (encode_nc3_attr_value, encode_nc3_variable,
maybe_convert_to_char_array)
from .netcdf3 import (encode_nc3_attr_value, encode_nc3_variable)

# This lookup table maps from dtype.byteorder to a readable endian
# string used by netCDF4.
Expand Down Expand Up @@ -72,25 +72,16 @@ def __getitem__(self, key):
msg += '\n\nOriginal traceback:\n' + traceback.format_exc()
raise IndexError(msg)

if self.ndim == 0:
# work around for netCDF4-python's broken handling of 0-d
# arrays (slicing them always returns a 1-dimensional array):
# https://github.com/Unidata/netcdf4-python/pull/220
data = np.asscalar(data)
return data


def _nc4_values_and_dtype(var):
if var.dtype.kind == 'U':
# this entire clause should not be necessary with netCDF4>=1.0.9
if len(var) > 0:
var = var.astype('O')
dtype = str
elif var.dtype.kind == 'S':
# use character arrays instead of unicode, because unicode support in
# netCDF4 is still rather buggy
data, dims = maybe_convert_to_char_array(var.data, var.dims)
var = Variable(dims, data, var.attrs, var.encoding)
var = conventions.maybe_encode_as_char_array(var)
dtype = var.dtype
elif var.dtype.kind in ['i', 'u', 'f', 'c']:
dtype = var.dtype
Expand Down Expand Up @@ -189,15 +180,27 @@ def _open_netcdf4_group(filename, mode, group=None, **kwargs):
with close_on_error(ds):
ds = _nc4_group(ds, group, mode)

_disable_mask_and_scale(ds)
_disable_auto_decode_group(ds)

return ds


def _disable_mask_and_scale(ds):
def _disable_auto_decode_variable(var):
"""Disable automatic decoding on a netCDF4.Variable.

We handle these types of decoding ourselves.
"""
var.set_auto_maskandscale(False)

# only added in netCDF4-python v1.2.8
with suppress(AttributeError):
var.set_auto_chartostring(False)


def _disable_auto_decode_group(ds):
    """Turn off automatic decoding for every variable in a netCDF4.Group.

    Delegates to ``_disable_auto_decode_variable`` so each variable hands
    xarray the raw on-disk values for decoding.
    """
    for _name, variable in ds.variables.items():
        _disable_auto_decode_variable(variable)


class NetCDF4DataStore(WritableCFDataStore, DataStorePickleMixin):
Expand All @@ -211,7 +214,7 @@ def __init__(self, netcdf4_dataset, mode='r', writer=None, opener=None,
if autoclose and opener is None:
raise ValueError('autoclose requires an opener')

_disable_mask_and_scale(netcdf4_dataset)
_disable_auto_decode_group(netcdf4_dataset)

self.ds = netcdf4_dataset
self._autoclose = autoclose
Expand Down Expand Up @@ -313,8 +316,6 @@ def set_variables(self, *args, **kwargs):

def prepare_variable(self, name, variable, check_encoding=False,
unlimited_dims=None):
attrs = variable.attrs.copy()

variable = _force_native_endianness(variable)

if self.format == 'NETCDF4':
Expand All @@ -325,11 +326,18 @@ def prepare_variable(self, name, variable, check_encoding=False,

self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

attrs = variable.attrs.copy()

fill_value = attrs.pop('_FillValue', None)
if fill_value in ['', '\x00']:
# these are equivalent to the default FillValue, but netCDF4
# doesn't like setting fill_value to an empty string
fill_value = None

if datatype is str and fill_value is not None:
raise NotImplementedError(
'netCDF4 does not yet support setting a fill value for '
'variable-length strings '
'(https://github.com/Unidata/netcdf4-python/issues/730). '
"Either remove '_FillValue' from encoding on variable %r "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
'NC_CHAR type.' % name)

encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding)
Expand All @@ -346,7 +354,7 @@ def prepare_variable(self, name, variable, check_encoding=False,
endian='native',
least_significant_digit=encoding.get('least_significant_digit'),
fill_value=fill_value)
nc4_var.set_auto_maskandscale(False)
_disable_auto_decode_variable(nc4_var)

for k, v in iteritems(attrs):
# set attributes one-by-one since netCDF4<1.0.10 can't handle
Expand Down
30 changes: 10 additions & 20 deletions xarray/backends/netcdf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,16 @@
# coerced instead as indicated by the "coerce_nc3_dtype" function
_nc3_dtype_coercions = {'int64': 'int32', 'bool': 'int8'}

# encode all strings as UTF-8
STRING_ENCODING = 'utf-8'


def coerce_nc3_dtype(arr):
"""Coerce an array to a data type that can be stored in a netCDF-3 file

This function performs the following dtype conversions:
int64 -> int32
float64 -> float32
bool -> int8
unicode -> string

Data is checked for equality, or equivalence (non-NaN values) with
`np.allclose` with the default keyword arguments.
Expand All @@ -42,29 +43,18 @@ def coerce_nc3_dtype(arr):
new_dtype = _nc3_dtype_coercions[dtype]
# TODO: raise a warning whenever casting the data-type instead?
cast_arr = arr.astype(new_dtype)
if ((('int' in dtype or 'U' in dtype) and
not (cast_arr == arr).all()) or
('float' in dtype and
not duck_array_ops.allclose_or_equiv(cast_arr, arr))):
if not (cast_arr == arr).all():
raise ValueError('could not safely cast array from dtype %s to %s'
% (dtype, new_dtype))
arr = cast_arr
elif arr.dtype.kind == 'U':
arr = np.core.defchararray.encode(arr, 'utf-8')
return arr


def maybe_convert_to_char_array(data, dims):
if data.dtype.kind == 'S' and data.dtype.itemsize > 1:
data = conventions.string_to_char(data)
dims = dims + ('string%s' % data.shape[-1],)
return data, dims


def encode_nc3_attr_value(value):
if isinstance(value, basestring):
if not isinstance(value, unicode_type):
value = value.decode('utf-8')
if isinstance(value, bytes):
pass
elif isinstance(value, unicode_type):
value = value.encode(STRING_ENCODING)
else:
value = coerce_nc3_dtype(np.atleast_1d(value))
if value.ndim > 1:
Expand All @@ -78,10 +68,10 @@ def encode_nc3_attrs(attrs):


def encode_nc3_variable(var):
    """Encode a Variable so it can be stored in a netCDF-3 file.

    Strings are first packed into fixed-width character arrays, then the
    data is coerced to netCDF-3 compatible dtypes and the attribute values
    are encoded.
    """
    char_var = conventions.maybe_encode_as_char_array(var)
    coerced_data = coerce_nc3_dtype(char_var.data)
    encoded_attrs = encode_nc3_attrs(char_var.attrs)
    return Variable(char_var.dims, coerced_data, encoded_attrs,
                    char_var.encoding)


def _isalnumMUTF8(c):
Expand Down
Loading