diff --git a/doc/io.rst b/doc/io.rst
index 682fbf5202e..151f5eb740f 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -197,24 +197,30 @@ turn this decoding off manually.
 .. _CF conventions: http://cfconventions.org/
 
 You can view this encoding information (among others) in the
-:py:attr:`DataArray.encoding <xarray.DataArray.encoding>` attribute:
+:py:attr:`DataArray.encoding <xarray.DataArray.encoding>` and
+:py:attr:`Dataset.encoding <xarray.Dataset.encoding>` attributes:
 
 .. ipython::
     :verbatim:
 
     In [1]: ds_disk['y'].encoding
     Out[1]:
-    {'calendar': u'proleptic_gregorian',
-     'chunksizes': None,
+    {'zlib': False,
+     'shuffle': False,
      'complevel': 0,
-     'contiguous': True,
-     'dtype': dtype('float64'),
      'fletcher32': False,
-     'least_significant_digit': None,
-     'shuffle': False,
+     'contiguous': True,
+     'chunksizes': None,
      'source': 'saved_on_disk.nc',
-     'units': u'days since 2000-01-01 00:00:00',
-     'zlib': False}
+     'original_shape': (5,),
+     'dtype': dtype('int64'),
+     'units': 'days since 2000-01-01 00:00:00',
+     'calendar': 'proleptic_gregorian'}
+
+    In [9]: ds_disk.encoding
+    Out[9]:
+    {'unlimited_dims': set(),
+     'source': 'saved_on_disk.nc'}
 
 Note that all operations that manipulate variables other than indexing
 will remove encoding information.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index a19eaa8b0bc..ab61852a785 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -67,6 +67,9 @@ Enhancements
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now supports the
   ``loffset`` kwarg just like Pandas.
   By `Deepak Cherian `_
+- Datasets are now guaranteed to have a ``'source'`` encoding, so the source
+  file name is always stored (:issue:`2550`).
+  By `Tom Nicholas `_.
 - The `apply` methods for `DatasetGroupBy`, `DataArrayGroupBy`,
   `DatasetResample` and `DataArrayResample` can now pass positional arguments to
   the applied function.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0ba2e94028c..244b540d0ca 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -299,6 +299,7 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
+        ds = maybe_decode_store(store)
     elif isinstance(filename_or_obj, basestring):
 
         if (isinstance(filename_or_obj, bytes) and
@@ -339,15 +340,21 @@ def maybe_decode_store(store, lock=False):
                              % engine)
 
         with close_on_error(store):
-            return maybe_decode_store(store)
+            ds = maybe_decode_store(store)
     else:
         if engine is not None and engine != 'scipy':
             raise ValueError('can only read file-like objects with '
                              "default engine or engine='scipy'")
         # assume filename_or_obj is a file-like object
         store = backends.ScipyDataStore(filename_or_obj)
+        ds = maybe_decode_store(store)
 
-    return maybe_decode_store(store)
+    # Ensure source filename always stored in dataset object (GH issue #2550)
+    if 'source' not in ds.encoding:
+        if isinstance(filename_or_obj, basestring):
+            ds.encoding['source'] = filename_or_obj
+
+    return ds
 
 
 def open_dataarray(filename_or_obj, group=None, decode_cf=True,
@@ -484,6 +491,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    lock=None, data_vars='all', coords='different',
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
+
     Requires dask to be installed. See documentation for details on dask [1].
     Attributes from the first dataset file are used for the combined dataset.
 
@@ -523,6 +531,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
           of all non-null values.
     preprocess : callable, optional
         If provided, call this function on each dataset prior to concatenation.
+        You can find the file-name from which each dataset was loaded in
+        ``ds.encoding['source']``.
     engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib'}, optional
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 993db79a66e..48c2f64c8db 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -3426,3 +3426,14 @@ def test_no_warning_from_dask_effective_get():
         ds = Dataset()
         ds.to_netcdf(tmpfile)
         assert len(record) == 0
+
+
+@requires_scipy_or_netCDF4
+def test_source_encoding_always_present():
+    # Test for GH issue #2550.
+    rnddata = np.random.randn(10)
+    original = Dataset({'foo': ('x', rnddata)})
+    with create_tmp_file() as tmp:
+        original.to_netcdf(tmp)
+        with open_dataset(tmp) as ds:
+            assert ds.encoding['source'] == tmp
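
Not part of the patch itself, just an illustrative sketch of the behaviour the change guarantees: after `open_dataset` the file name is always available under `ds.encoding['source']`, and an `open_mfdataset` `preprocess` callback can read it for each input file. The file name `example_0.nc` and the `add_source_coord` helper below are made up for the example; writing netCDF requires netCDF4 or scipy, and `open_mfdataset` additionally requires dask.

```python
import numpy as np
import xarray as xr

# Hypothetical file name, written out here only so the sketch is
# self-contained (writing netCDF requires netCDF4 or scipy).
xr.Dataset({'foo': ('x', np.random.randn(10))}).to_netcdf('example_0.nc')

# The source file name is now always recorded in the dataset's encoding.
with xr.open_dataset('example_0.nc') as ds:
    print(ds.encoding['source'])  # -> 'example_0.nc'

# Each per-file dataset handed to an open_mfdataset preprocess callback also
# carries its source, so it can be attached to the data, e.g. as a coordinate.
def add_source_coord(ds):
    return ds.assign_coords(source_file=ds.encoding['source'])

# Requires dask to be installed.
combined = xr.open_mfdataset(['example_0.nc'], preprocess=add_source_coord)
```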