Skip to content
36 changes: 36 additions & 0 deletions doc/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,42 @@ converting ``NaN`` to ``-9999``, we would use
``encoding={'foo': {'dtype': 'int16', 'scale_factor': 0.1, '_FillValue': -9999}}``.
Compression and decompression with such discretization is extremely fast.

.. _io.string-encoding:

String encoding
...............

xarray can write unicode strings to netCDF files in two ways:

- As variable length strings. This is only supported on netCDF4 (HDF5) files.
- By encoding strings into bytes, and writing encoded bytes as a character
array. The default encoding is UTF-8.

By default, we use variable length strings for compatible files and fall back
to using encoded character arrays. Character arrays can be selected even for
netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1``
(corresponding to NumPy's single-character bytes dtype).

If character arrays are used, the string encoding that was used is stored on
disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
`adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
At the time of this writing (October 2017), a standard convention for indicating
string encoding for character arrays in netCDF files was
`still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
Technically, you can use
`any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
by setting the ``_Encoding`` field in ``encoding``. But
`we don't recommend it <http://utf8everywhere.org/>`_.

.. warning::

Missing values in bytes or unicode string arrays (represented by ``NaN`` in
xarray) are currently written to disk as empty strings ``''``. This means
missing values will not be restored when data is loaded from disk.
This behavior is likely to change in the future (:issue:`1647`).
Unfortunately, explicitly setting a ``_FillValue`` for string arrays to handle
missing values doesn't work yet either, though we also hope to fix this in the
future.

Chunk based compression
.......................
Expand Down
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ Breaking changes
produce a warning encouraging users to adopt the new syntax.
By `Daniel Rothenberg <https://github.com/darothen>`_.

- Unicode strings (``str`` on Python 3) are now round-tripped successfully even
when written as character arrays (e.g., as netCDF3 files or when using
``engine='scipy'``) (:issue:`1638`). This is controlled by the ``_Encoding``
attribute convention, which is also understood directly by the netCDF4-Python
interface. See :ref:`io.string-encoding` for full details.
By `Stephan Hoyer <https://github.com/shoyer>`_.

- ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
Datasets loaded with ``open_dataset`` won't automatically read coords from
disk when calling ``repr`` (:issue:`1522`).
Expand Down
1 change: 1 addition & 0 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def set_necessary_dimensions(self, variable, unlimited_dims=None):


class WritableCFDataStore(AbstractWritableDataStore):

def store(self, variables, attributes, *args, **kwargs):
# All NetCDF files get CF encoded by default, without this attempting
# to write times, for example, would fail.
Expand Down
15 changes: 11 additions & 4 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,21 @@ def prepare_variable(self, name, variable, check_encoding=False,

attrs = variable.attrs.copy()
variable, dtype = _nc4_values_and_dtype(variable)
if dtype is str:
dtype = h5py.special_dtype(vlen=unicode_type)

self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

fill_value = attrs.pop('_FillValue', None)
if fill_value in ['\x00']:
fill_value = None
if dtype is str and fill_value is not None:
raise NotImplementedError(
'h5netcdf does not yet support setting a fill value for '
'variable-length strings '
'(https://github.com/shoyer/h5netcdf/issues/37). '
"Either remove '_FillValue' from encoding on variable %r "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
'NC_CHAR type.' % name)

if dtype is str:
dtype = h5py.special_dtype(vlen=unicode_type)

encoding = _extract_h5nc_encoding(variable,
raise_on_invalid=check_encoding)
Expand Down
58 changes: 33 additions & 25 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@

import numpy as np

from .. import conventions
from .. import Variable
from ..conventions import pop_to
from ..core import indexing
from ..core.utils import (FrozenOrderedDict, NdimSizeLenMixin,
DunderArrayMixin, close_on_error,
is_remote_uri)
from ..core.pycompat import iteritems, basestring, OrderedDict, PY3
from ..core.pycompat import iteritems, basestring, OrderedDict, PY3, suppress

from .common import (WritableCFDataStore, robust_getitem,
DataStorePickleMixin, find_root)
from .netcdf3 import (encode_nc3_attr_value, encode_nc3_variable,
maybe_convert_to_char_array)
from .netcdf3 import (encode_nc3_attr_value, encode_nc3_variable)

# This lookup table maps from dtype.byteorder to a readable endian
# string used by netCDF4.
Expand Down Expand Up @@ -72,25 +72,16 @@ def __getitem__(self, key):
msg += '\n\nOriginal traceback:\n' + traceback.format_exc()
raise IndexError(msg)

if self.ndim == 0:
# work around for netCDF4-python's broken handling of 0-d
# arrays (slicing them always returns a 1-dimensional array):
# https://github.com/Unidata/netcdf4-python/pull/220
data = np.asscalar(data)
return data


def _nc4_values_and_dtype(var):
if var.dtype.kind == 'U':
# this entire clause should not be necessary with netCDF4>=1.0.9
if len(var) > 0:
var = var.astype('O')
dtype = str
elif var.dtype.kind == 'S':
# use character arrays instead of unicode, because unicode support in
# netCDF4 is still rather buggy
data, dims = maybe_convert_to_char_array(var.data, var.dims)
var = Variable(dims, data, var.attrs, var.encoding)
var = conventions.maybe_encode_as_char_array(var)
dtype = var.dtype
elif var.dtype.kind in ['i', 'u', 'f', 'c']:
dtype = var.dtype
Expand Down Expand Up @@ -189,15 +180,27 @@ def _open_netcdf4_group(filename, mode, group=None, **kwargs):
with close_on_error(ds):
ds = _nc4_group(ds, group, mode)

_disable_mask_and_scale(ds)
_disable_auto_decode_group(ds)

return ds


def _disable_mask_and_scale(ds):
def _disable_auto_decode_variable(var):
"""Disable automatic decoding on a netCDF4.Variable.

We handle these types of decoding ourselves.
"""
var.set_auto_maskandscale(False)

# only added in netCDF4-python v1.2.8
with suppress(AttributeError):
var.set_auto_chartostring(False)


def _disable_auto_decode_group(ds):
    """Turn off automatic decoding for every variable in a netCDF4.Group.

    Delegates to ``_disable_auto_decode_variable`` so each variable hands
    xarray the raw on-disk values for decoding.
    """
    for _name, variable in ds.variables.items():
        _disable_auto_decode_variable(variable)


class NetCDF4DataStore(WritableCFDataStore, DataStorePickleMixin):
Expand All @@ -211,7 +214,7 @@ def __init__(self, netcdf4_dataset, mode='r', writer=None, opener=None,
if autoclose and opener is None:
raise ValueError('autoclose requires an opener')

_disable_mask_and_scale(netcdf4_dataset)
_disable_auto_decode_group(netcdf4_dataset)

self.ds = netcdf4_dataset
self._autoclose = autoclose
Expand Down Expand Up @@ -313,8 +316,6 @@ def set_variables(self, *args, **kwargs):

def prepare_variable(self, name, variable, check_encoding=False,
unlimited_dims=None):
attrs = variable.attrs.copy()

variable = _force_native_endianness(variable)

if self.format == 'NETCDF4':
Expand All @@ -325,11 +326,18 @@ def prepare_variable(self, name, variable, check_encoding=False,

self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

attrs = variable.attrs.copy()

fill_value = attrs.pop('_FillValue', None)
if fill_value in ['', '\x00']:
# these are equivalent to the default FillValue, but netCDF4
# doesn't like setting fill_value to an empty string
fill_value = None

if datatype is str and fill_value is not None:
raise NotImplementedError(
'netCDF4 does not yet support setting a fill value for '
'variable-length strings '
'(https://github.com/Unidata/netcdf4-python/issues/730). '
"Either remove '_FillValue' from encoding on variable %r "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
'NC_CHAR type.' % name)

encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding)
Expand All @@ -346,7 +354,7 @@ def prepare_variable(self, name, variable, check_encoding=False,
endian='native',
least_significant_digit=encoding.get('least_significant_digit'),
fill_value=fill_value)
nc4_var.set_auto_maskandscale(False)
_disable_auto_decode_variable(nc4_var)

for k, v in iteritems(attrs):
# set attributes one-by-one since netCDF4<1.0.10 can't handle
Expand Down
30 changes: 10 additions & 20 deletions xarray/backends/netcdf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,16 @@
# coerced instead as indicated by the "coerce_nc3_dtype" function
_nc3_dtype_coercions = {'int64': 'int32', 'bool': 'int8'}

# encode all strings as UTF-8
STRING_ENCODING = 'utf-8'


def coerce_nc3_dtype(arr):
"""Coerce an array to a data type that can be stored in a netCDF-3 file

This function performs the following dtype conversions:
int64 -> int32
float64 -> float32
bool -> int8
unicode -> string

Data is checked for equality, or equivalence (non-NaN values) with
`np.allclose` with the default keyword arguments.
Expand All @@ -42,29 +43,18 @@ def coerce_nc3_dtype(arr):
new_dtype = _nc3_dtype_coercions[dtype]
# TODO: raise a warning whenever casting the data-type instead?
cast_arr = arr.astype(new_dtype)
if ((('int' in dtype or 'U' in dtype) and
not (cast_arr == arr).all()) or
('float' in dtype and
not duck_array_ops.allclose_or_equiv(cast_arr, arr))):
if not (cast_arr == arr).all():
raise ValueError('could not safely cast array from dtype %s to %s'
% (dtype, new_dtype))
arr = cast_arr
elif arr.dtype.kind == 'U':
arr = np.core.defchararray.encode(arr, 'utf-8')
return arr


def maybe_convert_to_char_array(data, dims):
if data.dtype.kind == 'S' and data.dtype.itemsize > 1:
data = conventions.string_to_char(data)
dims = dims + ('string%s' % data.shape[-1],)
return data, dims


def encode_nc3_attr_value(value):
if isinstance(value, basestring):
if not isinstance(value, unicode_type):
value = value.decode('utf-8')
if isinstance(value, bytes):
pass
elif isinstance(value, unicode_type):
value = value.encode(STRING_ENCODING)
else:
value = coerce_nc3_dtype(np.atleast_1d(value))
if value.ndim > 1:
Expand All @@ -78,10 +68,10 @@ def encode_nc3_attrs(attrs):


def encode_nc3_variable(var):
    """Encode a Variable so it can be stored in a netCDF-3 file.

    Strings are first packed into fixed-width character arrays, then the
    data is coerced to netCDF-3 compatible dtypes and the attribute values
    are encoded.
    """
    char_var = conventions.maybe_encode_as_char_array(var)
    coerced_data = coerce_nc3_dtype(char_var.data)
    encoded_attrs = encode_nc3_attrs(char_var.attrs)
    return Variable(char_var.dims, coerced_data, encoded_attrs,
                    char_var.encoding)


def _isalnumMUTF8(c):
Expand Down
Loading