From 113427e53fce40cc076ca843aebcab2d0c7f7e83 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 30 Oct 2021 06:32:18 -0600 Subject: [PATCH 01/11] Add groupby & resample benchmarks --- asv_bench/benchmarks/groupby.py | 60 +++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index fa8deaf572f..a63e8fcaf5a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import xarray as xr @@ -7,33 +8,78 @@ class GroupBy: def setup(self, *args, **kwargs): - self.ds = xr.Dataset( + self.ds1d = xr.Dataset( { "a": xr.DataArray(np.r_[np.arange(500.0), np.arange(500.0)]), "b": xr.DataArray(np.arange(1000.0)), } ) + self.ds2d = self.ds1d.expand_dims(z=10) - @parameterized(["method"], [("sum", "mean")]) - def time_agg(self, method): - return getattr(self.ds.groupby("a"), method)() + @parameterized(["ndim"], [(1, 2)]) + def time_init(self, ndim): + getattr(self, f"ds{ndim}d").groupby("b") + + @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) + def time_agg_small_num_groups(self, method, ndim): + ds = getattr(self, f"ds{ndim}d") + getattr(ds.groupby("a"), method)() + + @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) + def time_agg_large_num_groups(self, method, ndim): + ds = getattr(self, f"ds{ndim}d") + getattr(ds.groupby("b"), method)() class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds = self.ds.chunk({"dim_0": 50}) + self.ds1d = self.ds1d.sel(dim_0=slice(250)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(250)).chunk({"dim_0": 50, "z": 4}) class GroupByDataFrame(GroupBy): def setup(self, *args, **kwargs): super().setup(**kwargs) - self.ds = self.ds.to_dataframe() + self.ds1d = self.ds1d.to_dataframe() class GroupByDaskDataFrame(GroupBy): def setup(self, *args, **kwargs): requires_dask() 
super().setup(**kwargs) - self.ds = self.ds.chunk({"dim_0": 50}).to_dataframe() + self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() + + +class Resample: + def setup(self, *args, **kwargs): + self.ds1d = xr.Dataset( + { + "b": ("time", np.arange(365.0 * 24)), + }, + coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)}, + ) + self.ds2d = self.ds1d.expand_dims(z=10) + + @parameterized(["ndim"], [(1, 2)]) + def time_init(self, ndim): + getattr(self, f"ds{ndim}d").resample(time="D") + + @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) + def time_agg_small_num_groups(self, method, ndim): + ds = getattr(self, f"ds{ndim}d") + getattr(ds.resample(time="3M"), method)() + + @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) + def time_agg_large_num_groups(self, method, ndim): + ds = getattr(self, f"ds{ndim}d") + getattr(ds.resample(time="6H"), method)() + + +class ResampleDask(Resample): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.ds1d = self.ds1d.chunk({"time": 50}) + self.ds2d = self.ds2d.chunk({"time": 50, "z": 4}) From 3b3ca773ddf1f1adaaf915ca6d778f29167202a6 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 30 Oct 2021 08:12:04 -0600 Subject: [PATCH 02/11] [skip-ci] From 3df1015b2bbb6c4cd038b5d14dde3f62a782b70c Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:04:04 +0100 Subject: [PATCH 03/11] temporarily remove other benchmarks for ci testing --- asv_bench/benchmarks/combine.py | 38 -- asv_bench/benchmarks/dataarray_missing.py | 80 ---- asv_bench/benchmarks/dataset_io.py | 478 ---------------------- asv_bench/benchmarks/import_xarray.py | 9 - asv_bench/benchmarks/indexing.py | 149 ------- asv_bench/benchmarks/interp.py | 51 --- asv_bench/benchmarks/pandas.py | 26 -- asv_bench/benchmarks/reindexing.py | 52 --- asv_bench/benchmarks/repr.py | 40 -- asv_bench/benchmarks/rolling.py | 110 ----- 
asv_bench/benchmarks/unstacking.py | 29 -- 11 files changed, 1062 deletions(-) delete mode 100644 asv_bench/benchmarks/combine.py delete mode 100644 asv_bench/benchmarks/dataarray_missing.py delete mode 100644 asv_bench/benchmarks/dataset_io.py delete mode 100644 asv_bench/benchmarks/import_xarray.py delete mode 100644 asv_bench/benchmarks/indexing.py delete mode 100644 asv_bench/benchmarks/interp.py delete mode 100644 asv_bench/benchmarks/pandas.py delete mode 100644 asv_bench/benchmarks/reindexing.py delete mode 100644 asv_bench/benchmarks/repr.py delete mode 100644 asv_bench/benchmarks/rolling.py delete mode 100644 asv_bench/benchmarks/unstacking.py diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py deleted file mode 100644 index a4f8db2786b..00000000000 --- a/asv_bench/benchmarks/combine.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np - -import xarray as xr - - -class Combine: - """Benchmark concatenating and merging large datasets""" - - def setup(self): - """Create 4 datasets with two different variables""" - - t_size, x_size, y_size = 50, 450, 400 - t = np.arange(t_size) - data = np.random.randn(t_size, x_size, y_size) - - self.dsA0 = xr.Dataset( - {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} - ) - self.dsA1 = xr.Dataset( - {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} - ) - self.dsB0 = xr.Dataset( - {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} - ) - self.dsB1 = xr.Dataset( - {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} - ) - - def time_combine_nested(self): - datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] - - xr.combine_nested(datasets, concat_dim=[None, "T"]) - - def time_combine_by_coords(self): - """Also has to load and arrange t coordinate""" - datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] - - xr.combine_by_coords(datasets) diff --git a/asv_bench/benchmarks/dataarray_missing.py 
b/asv_bench/benchmarks/dataarray_missing.py deleted file mode 100644 index f89fe7f8eb9..00000000000 --- a/asv_bench/benchmarks/dataarray_missing.py +++ /dev/null @@ -1,80 +0,0 @@ -import pandas as pd - -import xarray as xr - -from . import parameterized, randn, requires_dask - - -def make_bench_data(shape, frac_nan, chunks): - vals = randn(shape, frac_nan) - coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])} - da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords) - - if chunks is not None: - da = da.chunk(chunks) - - return da - - -def requires_bottleneck(): - try: - import bottleneck # noqa: F401 - except ImportError: - raise NotImplementedError() - - -class DataArrayMissingInterpolateNA: - def setup(self, shape, chunks, limit): - if chunks is not None: - requires_dask() - self.da = make_bench_data(shape, 0.1, chunks) - - @parameterized( - ["shape", "chunks", "limit"], - ( - [(365, 75, 75)], - [None, {"x": 25, "y": 25}], - [None, 3], - ), - ) - def time_interpolate_na(self, shape, chunks, limit): - actual = self.da.interpolate_na(dim="time", method="linear", limit=limit) - - if chunks is not None: - actual = actual.compute() - - -class DataArrayMissingBottleneck: - def setup(self, shape, chunks, limit): - requires_bottleneck() - if chunks is not None: - requires_dask() - self.da = make_bench_data(shape, 0.1, chunks) - - @parameterized( - ["shape", "chunks", "limit"], - ( - [(365, 75, 75)], - [None, {"x": 25, "y": 25}], - [None, 3], - ), - ) - def time_ffill(self, shape, chunks, limit): - actual = self.da.ffill(dim="time", limit=limit) - - if chunks is not None: - actual = actual.compute() - - @parameterized( - ["shape", "chunks", "limit"], - ( - [(365, 75, 75)], - [None, {"x": 25, "y": 25}], - [None, 3], - ), - ) - def time_bfill(self, shape, chunks, limit): - actual = self.da.ffill(dim="time", limit=limit) - - if chunks is not None: - actual = actual.compute() diff --git a/asv_bench/benchmarks/dataset_io.py 
b/asv_bench/benchmarks/dataset_io.py deleted file mode 100644 index 6c2e15c54e9..00000000000 --- a/asv_bench/benchmarks/dataset_io.py +++ /dev/null @@ -1,478 +0,0 @@ -import os - -import numpy as np -import pandas as pd - -import xarray as xr - -from . import _skip_slow, randint, randn, requires_dask - -try: - import dask - import dask.multiprocessing -except ImportError: - pass - - -os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" - - -class IOSingleNetCDF: - """ - A few examples that benchmark reading/writing a single netCDF file with - xarray - """ - - timeout = 300.0 - repeat = 1 - number = 5 - - def make_ds(self): - # TODO: Lazily skipped in CI as it is very demanding and slow. - # Improve times and remove errors. - _skip_slow() - - # single Dataset - self.ds = xr.Dataset() - self.nt = 1000 - self.nx = 90 - self.ny = 45 - - self.block_chunks = { - "time": self.nt / 4, - "lon": self.nx / 3, - "lat": self.ny / 3, - } - - self.time_chunks = {"time": int(self.nt / 36)} - - times = pd.date_range("1970-01-01", periods=self.nt, freq="D") - lons = xr.DataArray( - np.linspace(0, 360, self.nx), - dims=("lon",), - attrs={"units": "degrees east", "long_name": "longitude"}, - ) - lats = xr.DataArray( - np.linspace(-90, 90, self.ny), - dims=("lat",), - attrs={"units": "degrees north", "long_name": "latitude"}, - ) - self.ds["foo"] = xr.DataArray( - randn((self.nt, self.nx, self.ny), frac_nan=0.2), - coords={"lon": lons, "lat": lats, "time": times}, - dims=("time", "lon", "lat"), - name="foo", - attrs={"units": "foo units", "description": "a description"}, - ) - self.ds["bar"] = xr.DataArray( - randn((self.nt, self.nx, self.ny), frac_nan=0.2), - coords={"lon": lons, "lat": lats, "time": times}, - dims=("time", "lon", "lat"), - name="bar", - attrs={"units": "bar units", "description": "a description"}, - ) - self.ds["baz"] = xr.DataArray( - randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), - coords={"lon": lons, "lat": lats}, - dims=("lon", "lat"), - name="baz", - 
attrs={"units": "baz units", "description": "a description"}, - ) - - self.ds.attrs = {"history": "created for xarray benchmarking"} - - self.oinds = { - "time": randint(0, self.nt, 120), - "lon": randint(0, self.nx, 20), - "lat": randint(0, self.ny, 10), - } - self.vinds = { - "time": xr.DataArray(randint(0, self.nt, 120), dims="x"), - "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"), - "lat": slice(3, 20), - } - - -class IOWriteSingleNetCDF3(IOSingleNetCDF): - def setup(self): - self.format = "NETCDF3_64BIT" - self.make_ds() - - def time_write_dataset_netcdf4(self): - self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format) - - def time_write_dataset_scipy(self): - self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format) - - -class IOReadSingleNetCDF4(IOSingleNetCDF): - def setup(self): - - self.make_ds() - - self.filepath = "test_single_file.nc4.nc" - self.format = "NETCDF4" - self.ds.to_netcdf(self.filepath, format=self.format) - - def time_load_dataset_netcdf4(self): - xr.open_dataset(self.filepath, engine="netcdf4").load() - - def time_orthogonal_indexing(self): - ds = xr.open_dataset(self.filepath, engine="netcdf4") - ds = ds.isel(**self.oinds).load() - - def time_vectorized_indexing(self): - ds = xr.open_dataset(self.filepath, engine="netcdf4") - ds = ds.isel(**self.vinds).load() - - -class IOReadSingleNetCDF3(IOReadSingleNetCDF4): - def setup(self): - - self.make_ds() - - self.filepath = "test_single_file.nc3.nc" - self.format = "NETCDF3_64BIT" - self.ds.to_netcdf(self.filepath, format=self.format) - - def time_load_dataset_scipy(self): - xr.open_dataset(self.filepath, engine="scipy").load() - - def time_orthogonal_indexing(self): - ds = xr.open_dataset(self.filepath, engine="scipy") - ds = ds.isel(**self.oinds).load() - - def time_vectorized_indexing(self): - ds = xr.open_dataset(self.filepath, engine="scipy") - ds = ds.isel(**self.vinds).load() - - -class IOReadSingleNetCDF4Dask(IOSingleNetCDF): - def 
setup(self): - - requires_dask() - - self.make_ds() - - self.filepath = "test_single_file.nc4.nc" - self.format = "NETCDF4" - self.ds.to_netcdf(self.filepath, format=self.format) - - def time_load_dataset_netcdf4_with_block_chunks(self): - xr.open_dataset( - self.filepath, engine="netcdf4", chunks=self.block_chunks - ).load() - - def time_load_dataset_netcdf4_with_block_chunks_oindexing(self): - ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) - ds = ds.isel(**self.oinds).load() - - def time_load_dataset_netcdf4_with_block_chunks_vindexing(self): - ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) - ds = ds.isel(**self.vinds).load() - - def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_dataset( - self.filepath, engine="netcdf4", chunks=self.block_chunks - ).load() - - def time_load_dataset_netcdf4_with_time_chunks(self): - xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load() - - def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_dataset( - self.filepath, engine="netcdf4", chunks=self.time_chunks - ).load() - - -class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask): - def setup(self): - - requires_dask() - - self.make_ds() - - self.filepath = "test_single_file.nc3.nc" - self.format = "NETCDF3_64BIT" - self.ds.to_netcdf(self.filepath, format=self.format) - - def time_load_dataset_scipy_with_block_chunks(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_dataset( - self.filepath, engine="scipy", chunks=self.block_chunks - ).load() - - def time_load_dataset_scipy_with_block_chunks_oindexing(self): - ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) - ds = ds.isel(**self.oinds).load() - - def time_load_dataset_scipy_with_block_chunks_vindexing(self): - ds = 
xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) - ds = ds.isel(**self.vinds).load() - - def time_load_dataset_scipy_with_time_chunks(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_dataset( - self.filepath, engine="scipy", chunks=self.time_chunks - ).load() - - -class IOMultipleNetCDF: - """ - A few examples that benchmark reading/writing multiple netCDF files with - xarray - """ - - timeout = 300.0 - repeat = 1 - number = 5 - - def make_ds(self, nfiles=10): - # TODO: Lazily skipped in CI as it is very demanding and slow. - # Improve times and remove errors. - _skip_slow() - - # multiple Dataset - self.ds = xr.Dataset() - self.nt = 1000 - self.nx = 90 - self.ny = 45 - self.nfiles = nfiles - - self.block_chunks = { - "time": self.nt / 4, - "lon": self.nx / 3, - "lat": self.ny / 3, - } - - self.time_chunks = {"time": int(self.nt / 36)} - - self.time_vars = np.split( - pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles - ) - - self.ds_list = [] - self.filenames_list = [] - for i, times in enumerate(self.time_vars): - ds = xr.Dataset() - nt = len(times) - lons = xr.DataArray( - np.linspace(0, 360, self.nx), - dims=("lon",), - attrs={"units": "degrees east", "long_name": "longitude"}, - ) - lats = xr.DataArray( - np.linspace(-90, 90, self.ny), - dims=("lat",), - attrs={"units": "degrees north", "long_name": "latitude"}, - ) - ds["foo"] = xr.DataArray( - randn((nt, self.nx, self.ny), frac_nan=0.2), - coords={"lon": lons, "lat": lats, "time": times}, - dims=("time", "lon", "lat"), - name="foo", - attrs={"units": "foo units", "description": "a description"}, - ) - ds["bar"] = xr.DataArray( - randn((nt, self.nx, self.ny), frac_nan=0.2), - coords={"lon": lons, "lat": lats, "time": times}, - dims=("time", "lon", "lat"), - name="bar", - attrs={"units": "bar units", "description": "a description"}, - ) - ds["baz"] = xr.DataArray( - randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), - coords={"lon": lons, 
"lat": lats}, - dims=("lon", "lat"), - name="baz", - attrs={"units": "baz units", "description": "a description"}, - ) - - ds.attrs = {"history": "created for xarray benchmarking"} - - self.ds_list.append(ds) - self.filenames_list.append("test_netcdf_%i.nc" % i) - - -class IOWriteMultipleNetCDF3(IOMultipleNetCDF): - def setup(self): - self.make_ds() - self.format = "NETCDF3_64BIT" - - def time_write_dataset_netcdf4(self): - xr.save_mfdataset( - self.ds_list, self.filenames_list, engine="netcdf4", format=self.format - ) - - def time_write_dataset_scipy(self): - xr.save_mfdataset( - self.ds_list, self.filenames_list, engine="scipy", format=self.format - ) - - -class IOReadMultipleNetCDF4(IOMultipleNetCDF): - def setup(self): - - requires_dask() - - self.make_ds() - self.format = "NETCDF4" - xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) - - def time_load_dataset_netcdf4(self): - xr.open_mfdataset(self.filenames_list, engine="netcdf4").load() - - def time_open_dataset_netcdf4(self): - xr.open_mfdataset(self.filenames_list, engine="netcdf4") - - -class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4): - def setup(self): - - requires_dask() - - self.make_ds() - self.format = "NETCDF3_64BIT" - xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) - - def time_load_dataset_scipy(self): - xr.open_mfdataset(self.filenames_list, engine="scipy").load() - - def time_open_dataset_scipy(self): - xr.open_mfdataset(self.filenames_list, engine="scipy") - - -class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF): - def setup(self): - - requires_dask() - - self.make_ds() - self.format = "NETCDF4" - xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) - - def time_load_dataset_netcdf4_with_block_chunks(self): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.block_chunks - ).load() - - def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): - with 
dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.block_chunks - ).load() - - def time_load_dataset_netcdf4_with_time_chunks(self): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.time_chunks - ).load() - - def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.time_chunks - ).load() - - def time_open_dataset_netcdf4_with_block_chunks(self): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.block_chunks - ) - - def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.block_chunks - ) - - def time_open_dataset_netcdf4_with_time_chunks(self): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.time_chunks - ) - - def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="netcdf4", chunks=self.time_chunks - ) - - -class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask): - def setup(self): - - requires_dask() - - self.make_ds() - self.format = "NETCDF3_64BIT" - xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) - - def time_load_dataset_scipy_with_block_chunks(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="scipy", chunks=self.block_chunks - ).load() - - def time_load_dataset_scipy_with_time_chunks(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="scipy", chunks=self.time_chunks - ).load() - - def time_open_dataset_scipy_with_block_chunks(self): - with 
dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="scipy", chunks=self.block_chunks - ) - - def time_open_dataset_scipy_with_time_chunks(self): - with dask.config.set(scheduler="multiprocessing"): - xr.open_mfdataset( - self.filenames_list, engine="scipy", chunks=self.time_chunks - ) - - -def create_delayed_write(): - import dask.array as da - - # TODO: Lazily skipped in CI as it is very demanding and slow. - # Improve times and remove errors. - _skip_slow() - - vals = da.random.random(300, chunks=(1,)) - ds = xr.Dataset({"vals": (["a"], vals)}) - return ds.to_netcdf("file.nc", engine="netcdf4", compute=False) - - -class IOWriteNetCDFDask: - timeout = 60 - repeat = 1 - number = 5 - - def setup(self): - requires_dask() - self.write = create_delayed_write() - - def time_write(self): - self.write.compute() - - -class IOWriteNetCDFDaskDistributed: - def setup(self): - try: - import distributed - except ImportError: - raise NotImplementedError() - - # TODO: Lazily skipped in CI as it is very demanding and slow. - # Improve times and remove errors. 
- _skip_slow() - - self.client = distributed.Client() - self.write = create_delayed_write() - - def cleanup(self): - self.client.shutdown() - - def time_write(self): - self.write.compute() diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py deleted file mode 100644 index 94652e3b82a..00000000000 --- a/asv_bench/benchmarks/import_xarray.py +++ /dev/null @@ -1,9 +0,0 @@ -class ImportXarray: - def setup(self, *args, **kwargs): - def import_xr(): - import xarray # noqa: F401 - - self._import_xr = import_xr - - def time_import_xarray(self): - self._import_xr() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py deleted file mode 100644 index 15212ec0c61..00000000000 --- a/asv_bench/benchmarks/indexing.py +++ /dev/null @@ -1,149 +0,0 @@ -import os - -import numpy as np -import pandas as pd - -import xarray as xr - -from . import parameterized, randint, randn, requires_dask - -nx = 2000 -ny = 1000 -nt = 500 - -basic_indexes = { - "1slice": {"x": slice(0, 3)}, - "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)}, - "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)}, -} - -basic_assignment_values = { - "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]), - "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]), - "2slicess-1scalar": xr.DataArray( - randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"] - ), -} - -outer_indexes = { - "1d": {"x": randint(0, nx, 400)}, - "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)}, - "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)}, -} - -outer_assignment_values = { - "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]), - "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]), - "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]), -} - -vectorized_indexes = { - "1-1d": {"x": xr.DataArray(randint(0, nx, 400), 
dims="a")}, - "2-1d": { - "x": xr.DataArray(randint(0, nx, 400), dims="a"), - "y": xr.DataArray(randint(0, ny, 400), dims="a"), - }, - "3-2d": { - "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]), - "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]), - "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]), - }, -} - -vectorized_assignment_values = { - "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}), - "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}), - "3-2d": xr.DataArray( - randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)} - ), -} - - -class Base: - def setup(self, key): - self.ds = xr.Dataset( - { - "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)), - "var2": (("x", "t"), randn((nx, nt))), - "var3": (("t",), randn(nt)), - }, - coords={ - "x": np.arange(nx), - "y": np.linspace(0, 1, ny), - "t": pd.date_range("1970-01-01", periods=nt, freq="D"), - "x_coords": ("x", np.linspace(1.1, 2.1, nx)), - }, - ) - - -class Indexing(Base): - @parameterized(["key"], [list(basic_indexes.keys())]) - def time_indexing_basic(self, key): - self.ds.isel(**basic_indexes[key]).load() - - @parameterized(["key"], [list(outer_indexes.keys())]) - def time_indexing_outer(self, key): - self.ds.isel(**outer_indexes[key]).load() - - @parameterized(["key"], [list(vectorized_indexes.keys())]) - def time_indexing_vectorized(self, key): - self.ds.isel(**vectorized_indexes[key]).load() - - -class Assignment(Base): - @parameterized(["key"], [list(basic_indexes.keys())]) - def time_assignment_basic(self, key): - ind = basic_indexes[key] - val = basic_assignment_values[key] - self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val - - @parameterized(["key"], [list(outer_indexes.keys())]) - def time_assignment_outer(self, key): - ind = outer_indexes[key] - val = outer_assignment_values[key] - self.ds["var1"][ind.get("x", slice(None)), 
ind.get("y", slice(None))] = val - - @parameterized(["key"], [list(vectorized_indexes.keys())]) - def time_assignment_vectorized(self, key): - ind = vectorized_indexes[key] - val = vectorized_assignment_values[key] - self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val - - -class IndexingDask(Indexing): - def setup(self, key): - requires_dask() - super().setup(key) - self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) - - -class BooleanIndexing: - # https://github.com/pydata/xarray/issues/2227 - def setup(self): - self.ds = xr.Dataset( - {"a": ("time", np.arange(10_000_000))}, - coords={"time": np.arange(10_000_000)}, - ) - self.time_filter = self.ds.time > 50_000 - - def time_indexing(self): - self.ds.isel(time=self.time_filter) - - -class HugeAxisSmallSliceIndexing: - # https://github.com/pydata/xarray/pull/4560 - def setup(self): - self.filepath = "test_indexing_huge_axis_small_slice.nc" - if not os.path.isfile(self.filepath): - xr.Dataset( - {"a": ("x", np.arange(10_000_000))}, - coords={"x": np.arange(10_000_000)}, - ).to_netcdf(self.filepath, format="NETCDF4") - - self.ds = xr.open_dataset(self.filepath) - - def time_indexing(self): - self.ds.isel(x=slice(100)) - - def cleanup(self): - self.ds.close() diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py deleted file mode 100644 index 4b6691bcc0a..00000000000 --- a/asv_bench/benchmarks/interp.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -import pandas as pd - -import xarray as xr - -from . 
import parameterized, randn, requires_dask - -nx = 1500 -ny = 1000 -nt = 500 - -randn_xy = randn((nx, ny), frac_nan=0.1) -randn_xt = randn((nx, nt)) -randn_t = randn((nt,)) - -new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100) -new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500) -new_y_long = np.linspace(0.1, 0.9, 500) - - -class Interpolation: - def setup(self, *args, **kwargs): - self.ds = xr.Dataset( - { - "var1": (("x", "y"), randn_xy), - "var2": (("x", "t"), randn_xt), - "var3": (("t",), randn_t), - }, - coords={ - "x": np.arange(nx), - "y": np.linspace(0, 1, ny), - "t": pd.date_range("1970-01-01", periods=nt, freq="D"), - "x_coords": ("x", np.linspace(1.1, 2.1, nx)), - }, - ) - - @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False])) - def time_interpolation(self, method, is_short): - new_x = new_x_short if is_short else new_x_long - self.ds.interp(x=new_x, method=method).load() - - @parameterized(["method"], (["linear", "nearest"])) - def time_interpolation_2d(self, method): - self.ds.interp(x=new_x_long, y=new_y_long, method=method).load() - - -class InterpolationDask(Interpolation): - def setup(self, *args, **kwargs): - requires_dask() - super().setup(**kwargs) - self.ds = self.ds.chunk({"t": 50}) diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py deleted file mode 100644 index 8aaa515d417..00000000000 --- a/asv_bench/benchmarks/pandas.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np -import pandas as pd - -import xarray as xr - -from . 
import parameterized - - -class MultiIndexSeries: - def setup(self, dtype, subset): - data = np.random.rand(100000).astype(dtype) - index = pd.MultiIndex.from_product( - [ - list("abcdefhijk"), - list("abcdefhijk"), - pd.date_range(start="2000-01-01", periods=1000, freq="B"), - ] - ) - series = pd.Series(data, index) - if subset: - series = series[::3] - self.series = series - - @parameterized(["dtype", "subset"], ([int, float], [True, False])) - def time_from_series(self, dtype, subset): - xr.DataArray.from_series(self.series) diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py deleted file mode 100644 index 9d0767fc3b3..00000000000 --- a/asv_bench/benchmarks/reindexing.py +++ /dev/null @@ -1,52 +0,0 @@ -import numpy as np - -import xarray as xr - -from . import requires_dask - -ntime = 500 -nx = 50 -ny = 50 - - -class Reindex: - def setup(self): - data = np.random.RandomState(0).randn(ntime, nx, ny) - self.ds = xr.Dataset( - {"temperature": (("time", "x", "y"), data)}, - coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)}, - ) - - def time_1d_coarse(self): - self.ds.reindex(time=np.arange(0, ntime, 5)).load() - - def time_1d_fine_all_found(self): - self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load() - - def time_1d_fine_some_missing(self): - self.ds.reindex( - time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1 - ).load() - - def time_2d_coarse(self): - self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load() - - def time_2d_fine_all_found(self): - self.ds.reindex( - x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest" - ).load() - - def time_2d_fine_some_missing(self): - self.ds.reindex( - x=np.arange(0, nx, 0.5), - y=np.arange(0, ny, 0.5), - method="nearest", - tolerance=0.1, - ).load() - - -class ReindexDask(Reindex): - def setup(self): - requires_dask() - super().setup() - self.ds = self.ds.chunk({"time": 100}) diff --git 
a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py deleted file mode 100644 index 4bf2ace352d..00000000000 --- a/asv_bench/benchmarks/repr.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -import pandas as pd - -import xarray as xr - - -class Repr: - def setup(self): - a = np.arange(0, 100) - data_vars = dict() - for i in a: - data_vars[f"long_variable_name_{i}"] = xr.DataArray( - name=f"long_variable_name_{i}", - data=np.arange(0, 20), - dims=[f"long_coord_name_{i}_x"], - coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2}, - ) - self.ds = xr.Dataset(data_vars) - self.ds.attrs = {f"attr_{k}": 2 for k in a} - - def time_repr(self): - repr(self.ds) - - def time_repr_html(self): - self.ds._repr_html_() - - -class ReprMultiIndex: - def setup(self): - index = pd.MultiIndex.from_product( - [range(1000), range(1000)], names=("level_0", "level_1") - ) - series = pd.Series(range(1000 * 1000), index=index) - self.da = xr.DataArray(series) - - def time_repr(self): - repr(self.da) - - def time_repr_html(self): - self.da._repr_html_() diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py deleted file mode 100644 index f0e18bf2153..00000000000 --- a/asv_bench/benchmarks/rolling.py +++ /dev/null @@ -1,110 +0,0 @@ -import numpy as np -import pandas as pd - -import xarray as xr - -from . 
import parameterized, randn, requires_dask - -nx = 300 -long_nx = 30000 -ny = 200 -nt = 100 -window = 20 - -randn_xy = randn((nx, ny), frac_nan=0.1) -randn_xt = randn((nx, nt)) -randn_t = randn((nt,)) -randn_long = randn((long_nx,), frac_nan=0.1) - - -class Rolling: - def setup(self, *args, **kwargs): - self.ds = xr.Dataset( - { - "var1": (("x", "y"), randn_xy), - "var2": (("x", "t"), randn_xt), - "var3": (("t",), randn_t), - }, - coords={ - "x": np.arange(nx), - "y": np.linspace(0, 1, ny), - "t": pd.date_range("1970-01-01", periods=nt, freq="D"), - "x_coords": ("x", np.linspace(1.1, 2.1, nx)), - }, - ) - self.da_long = xr.DataArray( - randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1} - ) - - @parameterized(["func", "center"], (["mean", "count"], [True, False])) - def time_rolling(self, func, center): - getattr(self.ds.rolling(x=window, center=center), func)().load() - - @parameterized(["func", "pandas"], (["mean", "count"], [True, False])) - def time_rolling_long(self, func, pandas): - if pandas: - se = self.da_long.to_series() - getattr(se.rolling(window=window, min_periods=window), func)() - else: - getattr(self.da_long.rolling(x=window, min_periods=window), func)().load() - - @parameterized(["window_", "min_periods"], ([20, 40], [5, 5])) - def time_rolling_np(self, window_, min_periods): - self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce( - getattr(np, "nansum") - ).load() - - @parameterized(["center", "stride"], ([True, False], [1, 1])) - def time_rolling_construct(self, center, stride): - self.ds.rolling(x=window, center=center).construct( - "window_dim", stride=stride - ).sum(dim="window_dim").load() - - -class RollingDask(Rolling): - def setup(self, *args, **kwargs): - requires_dask() - super().setup(**kwargs) - self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) - self.da_long = self.da_long.chunk({"x": 10000}) - - -class RollingMemory: - def setup(self, *args, **kwargs): - self.ds = xr.Dataset( - { - "var1": (("x", 
"y"), randn_xy), - "var2": (("x", "t"), randn_xt), - "var3": (("t",), randn_t), - }, - coords={ - "x": np.arange(nx), - "y": np.linspace(0, 1, ny), - "t": pd.date_range("1970-01-01", periods=nt, freq="D"), - "x_coords": ("x", np.linspace(1.1, 2.1, nx)), - }, - ) - - -class DataArrayRollingMemory(RollingMemory): - @parameterized("func", ["sum", "max", "mean"]) - def peakmem_ndrolling_reduce(self, func): - roll = self.ds.var1.rolling(x=10, y=4) - getattr(roll, func)() - - @parameterized("func", ["sum", "max", "mean"]) - def peakmem_1drolling_reduce(self, func): - roll = self.ds.var3.rolling(t=100) - getattr(roll, func)() - - -class DatasetRollingMemory(RollingMemory): - @parameterized("func", ["sum", "max", "mean"]) - def peakmem_ndrolling_reduce(self, func): - roll = self.ds.rolling(x=10, y=4) - getattr(roll, func)() - - @parameterized("func", ["sum", "max", "mean"]) - def peakmem_1drolling_reduce(self, func): - roll = self.ds.rolling(t=100) - getattr(roll, func)() diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py deleted file mode 100644 index 2c5b7ca7821..00000000000 --- a/asv_bench/benchmarks/unstacking.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np - -import xarray as xr - -from . 
import requires_dask - - -class Unstacking: - def setup(self): - data = np.random.RandomState(0).randn(250, 500) - self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) - self.da_missing = self.da_full[:-1] - self.df_missing = self.da_missing.to_pandas() - - def time_unstack_fast(self): - self.da_full.unstack("flat_dim") - - def time_unstack_slow(self): - self.da_missing.unstack("flat_dim") - - def time_unstack_pandas_slow(self): - self.df_missing.unstack() - - -class UnstackingDask(Unstacking): - def setup(self, *args, **kwargs): - requires_dask() - super().setup(**kwargs) - self.da_full = self.da_full.chunk({"flat_dim": 25}) From c4c40230b23b75507bc55ec2f8fcd3d03f8f20d2 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:41:43 +0100 Subject: [PATCH 04/11] Update groupby.py --- asv_bench/benchmarks/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index a63e8fcaf5a..510e33d5e55 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -10,8 +10,8 @@ class GroupBy: def setup(self, *args, **kwargs): self.ds1d = xr.Dataset( { - "a": xr.DataArray(np.r_[np.arange(500.0), np.arange(500.0)]), - "b": xr.DataArray(np.arange(1000.0)), + "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]), + "b": xr.DataArray(np.arange(500)), } ) self.ds2d = self.ds1d.expand_dims(z=10) @@ -74,7 +74,7 @@ def time_agg_small_num_groups(self, method, ndim): @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="6H"), method)() + getattr(ds.resample(time="12H"), method)() class ResampleDask(Resample): From 445312d37599370c5d48a9583c3a05b6116d1e13 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:56:37 +0100 
Subject: [PATCH 05/11] Update groupby.py --- asv_bench/benchmarks/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 510e33d5e55..8f51583fce6 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -11,7 +11,7 @@ def setup(self, *args, **kwargs): self.ds1d = xr.Dataset( { "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]), - "b": xr.DataArray(np.arange(500)), + "b": xr.DataArray(np.arange(600)), } ) self.ds2d = self.ds1d.expand_dims(z=10) @@ -35,8 +35,8 @@ class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds1d = self.ds1d.sel(dim_0=slice(250)).chunk({"dim_0": 50}) - self.ds2d = self.ds2d.sel(dim_0=slice(250)).chunk({"dim_0": 50, "z": 4}) + self.ds1d = self.ds1d.sel(dim_0=slice(150)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(150)).chunk({"dim_0": 50, "z": 4}) class GroupByDataFrame(GroupBy): From ff6419522bd30c71fd5778d909405186eafb8559 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:29:20 +0100 Subject: [PATCH 06/11] Update groupby.py --- asv_bench/benchmarks/groupby.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 8f51583fce6..5aa57587cf2 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -8,10 +8,11 @@ class GroupBy: def setup(self, *args, **kwargs): + self.n = 100 self.ds1d = xr.Dataset( { - "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]), - "b": xr.DataArray(np.arange(600)), + "a": xr.DataArray(np.r_[np.arange(self.n), np.arange(self.n)]), + "b": xr.DataArray(np.arange(2 * self.n)), } ) self.ds2d = self.ds1d.expand_dims(z=10) @@ -35,8 +36,10 @@ class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() 
super().setup(**kwargs) - self.ds1d = self.ds1d.sel(dim_0=slice(150)).chunk({"dim_0": 50}) - self.ds2d = self.ds2d.sel(dim_0=slice(150)).chunk({"dim_0": 50, "z": 4}) + self.ds1d = self.ds1d.sel(dim_0=slice(self.n * 0.5)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(self.n * 0.5)).chunk( + {"dim_0": 50, "z": 4} + ) class GroupByDataFrame(GroupBy): @@ -74,7 +77,7 @@ def time_agg_small_num_groups(self, method, ndim): @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="12H"), method)() + getattr(ds.resample(time="24H"), method)() class ResampleDask(Resample): From c56dd9411fc8777ac70090d0bd938eb0399b653d Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:41:43 +0100 Subject: [PATCH 07/11] Update groupby.py --- asv_bench/benchmarks/groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5aa57587cf2..1ffb3c6579a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -36,10 +36,8 @@ class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds1d = self.ds1d.sel(dim_0=slice(self.n * 0.5)).chunk({"dim_0": 50}) - self.ds2d = self.ds2d.sel(dim_0=slice(self.n * 0.5)).chunk( - {"dim_0": 50, "z": 4} - ) + self.ds1d = self.ds1d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50, "z": 4}) class GroupByDataFrame(GroupBy): From a89f62be5a875a27426ee274ae81f4344982ab74 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:55:11 +0100 Subject: [PATCH 08/11] Update groupby.py --- asv_bench/benchmarks/groupby.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff 
--git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1ffb3c6579a..eb43a95d2b9 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -3,7 +3,7 @@ import xarray as xr -from . import parameterized, requires_dask +from . import _skip_slow, parameterized, requires_dask class GroupBy: @@ -41,13 +41,23 @@ def setup(self, *args, **kwargs): class GroupByDataFrame(GroupBy): + """Run groupby tests using pandas DataFrame.""" + def setup(self, *args, **kwargs): + # Skip testing in CI as it won't ever change in a commit: + _skip_slow() + super().setup(**kwargs) self.ds1d = self.ds1d.to_dataframe() class GroupByDaskDataFrame(GroupBy): + """Run groupby tests using dask DataFrame.""" + def setup(self, *args, **kwargs): + # Skip testing in CI as it won't ever change in a commit: + _skip_slow() + requires_dask() super().setup(**kwargs) self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() @@ -75,7 +85,7 @@ def time_agg_small_num_groups(self, method, ndim): @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="24H"), method)() + getattr(ds.resample(time="48H"), method)() class ResampleDask(Resample): From 88a4a3d6f3cba0e63c93ae3984add25e697d6c58 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 Nov 2021 19:28:38 +0100 Subject: [PATCH 09/11] add back the other asv tests --- asv_bench/benchmarks/combine.py | 38 ++ asv_bench/benchmarks/dataarray_missing.py | 80 ++++ asv_bench/benchmarks/dataset_io.py | 478 ++++++++++++++++++++++ asv_bench/benchmarks/import_xarray.py | 9 + asv_bench/benchmarks/indexing.py | 149 +++++++ asv_bench/benchmarks/interp.py | 51 +++ asv_bench/benchmarks/pandas.py | 26 ++ asv_bench/benchmarks/reindexing.py | 52 +++ asv_bench/benchmarks/repr.py | 40 ++ asv_bench/benchmarks/rolling.py | 110 +++++ asv_bench/benchmarks/unstacking.py | 29 
++ 11 files changed, 1062 insertions(+) create mode 100644 asv_bench/benchmarks/combine.py create mode 100644 asv_bench/benchmarks/dataarray_missing.py create mode 100644 asv_bench/benchmarks/dataset_io.py create mode 100644 asv_bench/benchmarks/import_xarray.py create mode 100644 asv_bench/benchmarks/indexing.py create mode 100644 asv_bench/benchmarks/interp.py create mode 100644 asv_bench/benchmarks/pandas.py create mode 100644 asv_bench/benchmarks/reindexing.py create mode 100644 asv_bench/benchmarks/repr.py create mode 100644 asv_bench/benchmarks/rolling.py create mode 100644 asv_bench/benchmarks/unstacking.py diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py new file mode 100644 index 00000000000..a4f8db2786b --- /dev/null +++ b/asv_bench/benchmarks/combine.py @@ -0,0 +1,38 @@ +import numpy as np + +import xarray as xr + + +class Combine: + """Benchmark concatenating and merging large datasets""" + + def setup(self): + """Create 4 datasets with two different variables""" + + t_size, x_size, y_size = 50, 450, 400 + t = np.arange(t_size) + data = np.random.randn(t_size, x_size, y_size) + + self.dsA0 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} + ) + self.dsA1 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} + ) + self.dsB0 = xr.Dataset( + {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} + ) + self.dsB1 = xr.Dataset( + {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} + ) + + def time_combine_nested(self): + datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] + + xr.combine_nested(datasets, concat_dim=[None, "T"]) + + def time_combine_by_coords(self): + """Also has to load and arrange t coordinate""" + datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] + + xr.combine_by_coords(datasets) diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py new file mode 100644 
index 00000000000..f89fe7f8eb9 --- /dev/null +++ b/asv_bench/benchmarks/dataarray_missing.py @@ -0,0 +1,80 @@ +import pandas as pd + +import xarray as xr + +from . import parameterized, randn, requires_dask + + +def make_bench_data(shape, frac_nan, chunks): + vals = randn(shape, frac_nan) + coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])} + da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords) + + if chunks is not None: + da = da.chunk(chunks) + + return da + + +def requires_bottleneck(): + try: + import bottleneck # noqa: F401 + except ImportError: + raise NotImplementedError() + + +class DataArrayMissingInterpolateNA: + def setup(self, shape, chunks, limit): + if chunks is not None: + requires_dask() + self.da = make_bench_data(shape, 0.1, chunks) + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_interpolate_na(self, shape, chunks, limit): + actual = self.da.interpolate_na(dim="time", method="linear", limit=limit) + + if chunks is not None: + actual = actual.compute() + + +class DataArrayMissingBottleneck: + def setup(self, shape, chunks, limit): + requires_bottleneck() + if chunks is not None: + requires_dask() + self.da = make_bench_data(shape, 0.1, chunks) + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_ffill(self, shape, chunks, limit): + actual = self.da.ffill(dim="time", limit=limit) + + if chunks is not None: + actual = actual.compute() + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_bfill(self, shape, chunks, limit): + actual = self.da.bfill(dim="time", limit=limit) + + if chunks is not None: + actual = actual.compute() diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py new file mode 100644 index 00000000000..6c2e15c54e9
--- /dev/null +++ b/asv_bench/benchmarks/dataset_io.py @@ -0,0 +1,478 @@ +import os + +import numpy as np +import pandas as pd + +import xarray as xr + +from . import _skip_slow, randint, randn, requires_dask + +try: + import dask + import dask.multiprocessing +except ImportError: + pass + + +os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + + +class IOSingleNetCDF: + """ + A few examples that benchmark reading/writing a single netCDF file with + xarray + """ + + timeout = 300.0 + repeat = 1 + number = 5 + + def make_ds(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + # single Dataset + self.ds = xr.Dataset() + self.nt = 1000 + self.nx = 90 + self.ny = 45 + + self.block_chunks = { + "time": self.nt / 4, + "lon": self.nx / 3, + "lat": self.ny / 3, + } + + self.time_chunks = {"time": int(self.nt / 36)} + + times = pd.date_range("1970-01-01", periods=self.nt, freq="D") + lons = xr.DataArray( + np.linspace(0, 360, self.nx), + dims=("lon",), + attrs={"units": "degrees east", "long_name": "longitude"}, + ) + lats = xr.DataArray( + np.linspace(-90, 90, self.ny), + dims=("lat",), + attrs={"units": "degrees north", "long_name": "latitude"}, + ) + self.ds["foo"] = xr.DataArray( + randn((self.nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="foo", + attrs={"units": "foo units", "description": "a description"}, + ) + self.ds["bar"] = xr.DataArray( + randn((self.nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="bar", + attrs={"units": "bar units", "description": "a description"}, + ) + self.ds["baz"] = xr.DataArray( + randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), + coords={"lon": lons, "lat": lats}, + dims=("lon", "lat"), + name="baz", + attrs={"units": "baz units", "description": "a description"}, + ) + + self.ds.attrs = {"history": 
"created for xarray benchmarking"} + + self.oinds = { + "time": randint(0, self.nt, 120), + "lon": randint(0, self.nx, 20), + "lat": randint(0, self.ny, 10), + } + self.vinds = { + "time": xr.DataArray(randint(0, self.nt, 120), dims="x"), + "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"), + "lat": slice(3, 20), + } + + +class IOWriteSingleNetCDF3(IOSingleNetCDF): + def setup(self): + self.format = "NETCDF3_64BIT" + self.make_ds() + + def time_write_dataset_netcdf4(self): + self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format) + + def time_write_dataset_scipy(self): + self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format) + + +class IOReadSingleNetCDF4(IOSingleNetCDF): + def setup(self): + + self.make_ds() + + self.filepath = "test_single_file.nc4.nc" + self.format = "NETCDF4" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_netcdf4(self): + xr.open_dataset(self.filepath, engine="netcdf4").load() + + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4") + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4") + ds = ds.isel(**self.vinds).load() + + +class IOReadSingleNetCDF3(IOReadSingleNetCDF4): + def setup(self): + + self.make_ds() + + self.filepath = "test_single_file.nc3.nc" + self.format = "NETCDF3_64BIT" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_scipy(self): + xr.open_dataset(self.filepath, engine="scipy").load() + + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy") + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy") + ds = ds.isel(**self.vinds).load() + + +class IOReadSingleNetCDF4Dask(IOSingleNetCDF): + def setup(self): + + requires_dask() + + self.make_ds() + + self.filepath = "test_single_file.nc4.nc" + 
self.format = "NETCDF4" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_netcdf4_with_block_chunks(self): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_time_chunks(self): + xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load() + + def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.time_chunks + ).load() + + +class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask): + def setup(self): + + requires_dask() + + self.make_ds() + + self.filepath = "test_single_file.nc3.nc" + self.format = "NETCDF3_64BIT" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="scipy", chunks=self.block_chunks + ).load() + + def time_load_dataset_scipy_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_scipy_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + + 
def time_load_dataset_scipy_with_time_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="scipy", chunks=self.time_chunks + ).load() + + +class IOMultipleNetCDF: + """ + A few examples that benchmark reading/writing multiple netCDF files with + xarray + """ + + timeout = 300.0 + repeat = 1 + number = 5 + + def make_ds(self, nfiles=10): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + # multiple Dataset + self.ds = xr.Dataset() + self.nt = 1000 + self.nx = 90 + self.ny = 45 + self.nfiles = nfiles + + self.block_chunks = { + "time": self.nt / 4, + "lon": self.nx / 3, + "lat": self.ny / 3, + } + + self.time_chunks = {"time": int(self.nt / 36)} + + self.time_vars = np.split( + pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles + ) + + self.ds_list = [] + self.filenames_list = [] + for i, times in enumerate(self.time_vars): + ds = xr.Dataset() + nt = len(times) + lons = xr.DataArray( + np.linspace(0, 360, self.nx), + dims=("lon",), + attrs={"units": "degrees east", "long_name": "longitude"}, + ) + lats = xr.DataArray( + np.linspace(-90, 90, self.ny), + dims=("lat",), + attrs={"units": "degrees north", "long_name": "latitude"}, + ) + ds["foo"] = xr.DataArray( + randn((nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="foo", + attrs={"units": "foo units", "description": "a description"}, + ) + ds["bar"] = xr.DataArray( + randn((nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="bar", + attrs={"units": "bar units", "description": "a description"}, + ) + ds["baz"] = xr.DataArray( + randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), + coords={"lon": lons, "lat": lats}, + dims=("lon", "lat"), + name="baz", + attrs={"units": "baz units", "description": "a description"}, 
+ ) + + ds.attrs = {"history": "created for xarray benchmarking"} + + self.ds_list.append(ds) + self.filenames_list.append("test_netcdf_%i.nc" % i) + + +class IOWriteMultipleNetCDF3(IOMultipleNetCDF): + def setup(self): + self.make_ds() + self.format = "NETCDF3_64BIT" + + def time_write_dataset_netcdf4(self): + xr.save_mfdataset( + self.ds_list, self.filenames_list, engine="netcdf4", format=self.format + ) + + def time_write_dataset_scipy(self): + xr.save_mfdataset( + self.ds_list, self.filenames_list, engine="scipy", format=self.format + ) + + +class IOReadMultipleNetCDF4(IOMultipleNetCDF): + def setup(self): + + requires_dask() + + self.make_ds() + self.format = "NETCDF4" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_netcdf4(self): + xr.open_mfdataset(self.filenames_list, engine="netcdf4").load() + + def time_open_dataset_netcdf4(self): + xr.open_mfdataset(self.filenames_list, engine="netcdf4") + + +class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4): + def setup(self): + + requires_dask() + + self.make_ds() + self.format = "NETCDF3_64BIT" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_scipy(self): + xr.open_mfdataset(self.filenames_list, engine="scipy").load() + + def time_open_dataset_scipy(self): + xr.open_mfdataset(self.filenames_list, engine="scipy") + + +class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF): + def setup(self): + + requires_dask() + + self.make_ds() + self.format = "NETCDF4" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_netcdf4_with_block_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ).load() + + def 
time_load_dataset_netcdf4_with_time_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ).load() + + def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ).load() + + def time_open_dataset_netcdf4_with_block_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ) + + def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ) + + def time_open_dataset_netcdf4_with_time_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ) + + def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ) + + +class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask): + def setup(self): + + requires_dask() + + self.make_ds() + self.format = "NETCDF3_64BIT" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.block_chunks + ).load() + + def time_load_dataset_scipy_with_time_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.time_chunks + ).load() + + def time_open_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.block_chunks + ) + + def time_open_dataset_scipy_with_time_chunks(self): 
+ with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.time_chunks + ) + + +def create_delayed_write(): + import dask.array as da + + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + vals = da.random.random(300, chunks=(1,)) + ds = xr.Dataset({"vals": (["a"], vals)}) + return ds.to_netcdf("file.nc", engine="netcdf4", compute=False) + + +class IOWriteNetCDFDask: + timeout = 60 + repeat = 1 + number = 5 + + def setup(self): + requires_dask() + self.write = create_delayed_write() + + def time_write(self): + self.write.compute() + + +class IOWriteNetCDFDaskDistributed: + def setup(self): + try: + import distributed + except ImportError: + raise NotImplementedError() + + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + self.client = distributed.Client() + self.write = create_delayed_write() + + def cleanup(self): + self.client.shutdown() + + def time_write(self): + self.write.compute() diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py new file mode 100644 index 00000000000..94652e3b82a --- /dev/null +++ b/asv_bench/benchmarks/import_xarray.py @@ -0,0 +1,9 @@ +class ImportXarray: + def setup(self, *args, **kwargs): + def import_xr(): + import xarray # noqa: F401 + + self._import_xr = import_xr + + def time_import_xarray(self): + self._import_xr() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py new file mode 100644 index 00000000000..15212ec0c61 --- /dev/null +++ b/asv_bench/benchmarks/indexing.py @@ -0,0 +1,149 @@ +import os + +import numpy as np +import pandas as pd + +import xarray as xr + +from . 
import parameterized, randint, randn, requires_dask + +nx = 2000 +ny = 1000 +nt = 500 + +basic_indexes = { + "1slice": {"x": slice(0, 3)}, + "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)}, + "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)}, +} + +basic_assignment_values = { + "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]), + "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]), + "2slicess-1scalar": xr.DataArray( + randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"] + ), +} + +outer_indexes = { + "1d": {"x": randint(0, nx, 400)}, + "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)}, + "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)}, +} + +outer_assignment_values = { + "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]), + "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]), + "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]), +} + +vectorized_indexes = { + "1-1d": {"x": xr.DataArray(randint(0, nx, 400), dims="a")}, + "2-1d": { + "x": xr.DataArray(randint(0, nx, 400), dims="a"), + "y": xr.DataArray(randint(0, ny, 400), dims="a"), + }, + "3-2d": { + "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]), + "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]), + "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]), + }, +} + +vectorized_assignment_values = { + "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}), + "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}), + "3-2d": xr.DataArray( + randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)} + ), +} + + +class Base: + def setup(self, key): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)), + "var2": (("x", "t"), randn((nx, nt))), + "var3": (("t",), randn(nt)), + }, + coords={ + 
"x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + + +class Indexing(Base): + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic(self, key): + self.ds.isel(**basic_indexes[key]).load() + + @parameterized(["key"], [list(outer_indexes.keys())]) + def time_indexing_outer(self, key): + self.ds.isel(**outer_indexes[key]).load() + + @parameterized(["key"], [list(vectorized_indexes.keys())]) + def time_indexing_vectorized(self, key): + self.ds.isel(**vectorized_indexes[key]).load() + + +class Assignment(Base): + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_assignment_basic(self, key): + ind = basic_indexes[key] + val = basic_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + @parameterized(["key"], [list(outer_indexes.keys())]) + def time_assignment_outer(self, key): + ind = outer_indexes[key] + val = outer_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + @parameterized(["key"], [list(vectorized_indexes.keys())]) + def time_assignment_vectorized(self, key): + ind = vectorized_indexes[key] + val = vectorized_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + +class IndexingDask(Indexing): + def setup(self, key): + requires_dask() + super().setup(key) + self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + + +class BooleanIndexing: + # https://github.com/pydata/xarray/issues/2227 + def setup(self): + self.ds = xr.Dataset( + {"a": ("time", np.arange(10_000_000))}, + coords={"time": np.arange(10_000_000)}, + ) + self.time_filter = self.ds.time > 50_000 + + def time_indexing(self): + self.ds.isel(time=self.time_filter) + + +class HugeAxisSmallSliceIndexing: + # https://github.com/pydata/xarray/pull/4560 + def setup(self): + self.filepath = 
"test_indexing_huge_axis_small_slice.nc" + if not os.path.isfile(self.filepath): + xr.Dataset( + {"a": ("x", np.arange(10_000_000))}, + coords={"x": np.arange(10_000_000)}, + ).to_netcdf(self.filepath, format="NETCDF4") + + self.ds = xr.open_dataset(self.filepath) + + def time_indexing(self): + self.ds.isel(x=slice(100)) + + def cleanup(self): + self.ds.close() diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py new file mode 100644 index 00000000000..4b6691bcc0a --- /dev/null +++ b/asv_bench/benchmarks/interp.py @@ -0,0 +1,51 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . import parameterized, randn, requires_dask + +nx = 1500 +ny = 1000 +nt = 500 + +randn_xy = randn((nx, ny), frac_nan=0.1) +randn_xt = randn((nx, nt)) +randn_t = randn((nt,)) + +new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100) +new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500) +new_y_long = np.linspace(0.1, 0.9, 500) + + +class Interpolation: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + + @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False])) + def time_interpolation(self, method, is_short): + new_x = new_x_short if is_short else new_x_long + self.ds.interp(x=new_x, method=method).load() + + @parameterized(["method"], (["linear", "nearest"])) + def time_interpolation_2d(self, method): + self.ds.interp(x=new_x_long, y=new_y_long, method=method).load() + + +class InterpolationDask(Interpolation): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.ds = self.ds.chunk({"t": 50}) diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py new file mode 100644 index 
00000000000..8aaa515d417 --- /dev/null +++ b/asv_bench/benchmarks/pandas.py @@ -0,0 +1,26 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . import parameterized + + +class MultiIndexSeries: + def setup(self, dtype, subset): + data = np.random.rand(100000).astype(dtype) + index = pd.MultiIndex.from_product( + [ + list("abcdefhijk"), + list("abcdefhijk"), + pd.date_range(start="2000-01-01", periods=1000, freq="B"), + ] + ) + series = pd.Series(data, index) + if subset: + series = series[::3] + self.series = series + + @parameterized(["dtype", "subset"], ([int, float], [True, False])) + def time_from_series(self, dtype, subset): + xr.DataArray.from_series(self.series) diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py new file mode 100644 index 00000000000..9d0767fc3b3 --- /dev/null +++ b/asv_bench/benchmarks/reindexing.py @@ -0,0 +1,52 @@ +import numpy as np + +import xarray as xr + +from . import requires_dask + +ntime = 500 +nx = 50 +ny = 50 + + +class Reindex: + def setup(self): + data = np.random.RandomState(0).randn(ntime, nx, ny) + self.ds = xr.Dataset( + {"temperature": (("time", "x", "y"), data)}, + coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)}, + ) + + def time_1d_coarse(self): + self.ds.reindex(time=np.arange(0, ntime, 5)).load() + + def time_1d_fine_all_found(self): + self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load() + + def time_1d_fine_some_missing(self): + self.ds.reindex( + time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1 + ).load() + + def time_2d_coarse(self): + self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load() + + def time_2d_fine_all_found(self): + self.ds.reindex( + x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest" + ).load() + + def time_2d_fine_some_missing(self): + self.ds.reindex( + x=np.arange(0, nx, 0.5), + y=np.arange(0, ny, 0.5), + method="nearest", + tolerance=0.1, + ).load() + + 
+class ReindexDask(Reindex): + def setup(self): + requires_dask() + super().setup() + self.ds = self.ds.chunk({"time": 100}) diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py new file mode 100644 index 00000000000..4bf2ace352d --- /dev/null +++ b/asv_bench/benchmarks/repr.py @@ -0,0 +1,40 @@ +import numpy as np +import pandas as pd + +import xarray as xr + + +class Repr: + def setup(self): + a = np.arange(0, 100) + data_vars = dict() + for i in a: + data_vars[f"long_variable_name_{i}"] = xr.DataArray( + name=f"long_variable_name_{i}", + data=np.arange(0, 20), + dims=[f"long_coord_name_{i}_x"], + coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2}, + ) + self.ds = xr.Dataset(data_vars) + self.ds.attrs = {f"attr_{k}": 2 for k in a} + + def time_repr(self): + repr(self.ds) + + def time_repr_html(self): + self.ds._repr_html_() + + +class ReprMultiIndex: + def setup(self): + index = pd.MultiIndex.from_product( + [range(1000), range(1000)], names=("level_0", "level_1") + ) + series = pd.Series(range(1000 * 1000), index=index) + self.da = xr.DataArray(series) + + def time_repr(self): + repr(self.da) + + def time_repr_html(self): + self.da._repr_html_() diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 00000000000..f0e18bf2153 --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,110 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . 
import parameterized, randn, requires_dask + +nx = 300 +long_nx = 30000 +ny = 200 +nt = 100 +window = 20 + +randn_xy = randn((nx, ny), frac_nan=0.1) +randn_xt = randn((nx, nt)) +randn_t = randn((nt,)) +randn_long = randn((long_nx,), frac_nan=0.1) + + +class Rolling: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + self.da_long = xr.DataArray( + randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1} + ) + + @parameterized(["func", "center"], (["mean", "count"], [True, False])) + def time_rolling(self, func, center): + getattr(self.ds.rolling(x=window, center=center), func)().load() + + @parameterized(["func", "pandas"], (["mean", "count"], [True, False])) + def time_rolling_long(self, func, pandas): + if pandas: + se = self.da_long.to_series() + getattr(se.rolling(window=window, min_periods=window), func)() + else: + getattr(self.da_long.rolling(x=window, min_periods=window), func)().load() + + @parameterized(["window_", "min_periods"], ([20, 40], [5, 5])) + def time_rolling_np(self, window_, min_periods): + self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce( + getattr(np, "nansum") + ).load() + + @parameterized(["center", "stride"], ([True, False], [1, 1])) + def time_rolling_construct(self, center, stride): + self.ds.rolling(x=window, center=center).construct( + "window_dim", stride=stride + ).sum(dim="window_dim").load() + + +class RollingDask(Rolling): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + self.da_long = self.da_long.chunk({"x": 10000}) + + +class RollingMemory: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", 
"y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + + +class DataArrayRollingMemory(RollingMemory): + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_ndrolling_reduce(self, func): + roll = self.ds.var1.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_1drolling_reduce(self, func): + roll = self.ds.var3.rolling(t=100) + getattr(roll, func)() + + +class DatasetRollingMemory(RollingMemory): + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_ndrolling_reduce(self, func): + roll = self.ds.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_1drolling_reduce(self, func): + roll = self.ds.rolling(t=100) + getattr(roll, func)() diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py new file mode 100644 index 00000000000..2c5b7ca7821 --- /dev/null +++ b/asv_bench/benchmarks/unstacking.py @@ -0,0 +1,29 @@ +import numpy as np + +import xarray as xr + +from . 
import requires_dask + + +class Unstacking: + def setup(self): + data = np.random.RandomState(0).randn(250, 500) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + self.df_missing = self.da_missing.to_pandas() + + def time_unstack_fast(self): + self.da_full.unstack("flat_dim") + + def time_unstack_slow(self): + self.da_missing.unstack("flat_dim") + + def time_unstack_pandas_slow(self): + self.df_missing.unstack() + + +class UnstackingDask(Unstacking): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.da_full = self.da_full.chunk({"flat_dim": 25}) From 25409811d6f9961382c3119b405c02fe0a307255 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 8 Nov 2021 12:13:41 -0700 Subject: [PATCH 10/11] Fix small_num_groups benchmarks Make sure dask uses multiple chunks. --- asv_bench/benchmarks/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index eb43a95d2b9..c2ddf155ba7 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -11,7 +11,7 @@ def setup(self, *args, **kwargs): self.n = 100 self.ds1d = xr.Dataset( { - "a": xr.DataArray(np.r_[np.arange(self.n), np.arange(self.n)]), + "a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]), "b": xr.DataArray(np.arange(2 * self.n)), } ) @@ -36,8 +36,8 @@ class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds1d = self.ds1d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50}) - self.ds2d = self.ds2d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50, "z": 4}) + self.ds1d = self.ds1d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50, "z": 4}) class GroupByDataFrame(GroupBy): From 37abc5457d657ddb416bf9de4d5cc81ca1f3d389 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 8 Nov 2021 
12:19:21 -0700 Subject: [PATCH 11/11] [skip-ci] more dask improvements. --- asv_bench/benchmarks/groupby.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c2ddf155ba7..46d6293cc98 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -36,11 +36,13 @@ class GroupByDask(GroupBy): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds1d = self.ds1d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50}) - self.ds2d = self.ds2d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50, "z": 4}) + self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)).chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk( + {"dim_0": 50, "z": 5} + ) -class GroupByDataFrame(GroupBy): +class GroupByPandasDataFrame(GroupBy): """Run groupby tests using pandas DataFrame.""" def setup(self, *args, **kwargs):