From 113427e53fce40cc076ca843aebcab2d0c7f7e83 Mon Sep 17 00:00:00 2001
From: dcherian <deepak@cherian.net>
Date: Sat, 30 Oct 2021 06:32:18 -0600
Subject: [PATCH 01/11] Add groupby & resample benchmarks

---
 asv_bench/benchmarks/groupby.py | 60 +++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index fa8deaf572f..a63e8fcaf5a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 import xarray as xr
 
@@ -7,33 +8,78 @@
 
 class GroupBy:
     def setup(self, *args, **kwargs):
-        self.ds = xr.Dataset(
+        self.ds1d = xr.Dataset(
             {
                 "a": xr.DataArray(np.r_[np.arange(500.0), np.arange(500.0)]),
                 "b": xr.DataArray(np.arange(1000.0)),
             }
         )
+        self.ds2d = self.ds1d.expand_dims(z=10)
 
-    @parameterized(["method"], [("sum", "mean")])
-    def time_agg(self, method):
-        return getattr(self.ds.groupby("a"), method)()
+    @parameterized(["ndim"], [(1, 2)])
+    def time_init(self, ndim):
+        getattr(self, f"ds{ndim}d").groupby("b")
+
+    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
+    def time_agg_small_num_groups(self, method, ndim):
+        ds = getattr(self, f"ds{ndim}d")
+        getattr(ds.groupby("a"), method)()
+
+    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
+    def time_agg_large_num_groups(self, method, ndim):
+        ds = getattr(self, f"ds{ndim}d")
+        getattr(ds.groupby("b"), method)()
 
 
 class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds = self.ds.chunk({"dim_0": 50})
+        self.ds1d = self.ds1d.sel(dim_0=slice(250)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(250)).chunk({"dim_0": 50, "z": 4})
 
 
 class GroupByDataFrame(GroupBy):
     def setup(self, *args, **kwargs):
         super().setup(**kwargs)
-        self.ds = self.ds.to_dataframe()
+        self.ds1d = self.ds1d.to_dataframe()
 
 
 class GroupByDaskDataFrame(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds = self.ds.chunk({"dim_0": 50}).to_dataframe()
+        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
+
+
+class Resample:
+    def setup(self, *args, **kwargs):
+        self.ds1d = xr.Dataset(
+            {
+                "b": ("time", np.arange(365.0 * 24)),
+            },
+            coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)},
+        )
+        self.ds2d = self.ds1d.expand_dims(z=10)
+
+    @parameterized(["ndim"], [(1, 2)])
+    def time_init(self, ndim):
+        getattr(self, f"ds{ndim}d").resample(time="D")
+
+    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
+    def time_agg_small_num_groups(self, method, ndim):
+        ds = getattr(self, f"ds{ndim}d")
+        getattr(ds.resample(time="3M"), method)()
+
+    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
+    def time_agg_large_num_groups(self, method, ndim):
+        ds = getattr(self, f"ds{ndim}d")
+        getattr(ds.resample(time="6H"), method)()
+
+
+class ResampleDask(Resample):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.ds1d = self.ds1d.chunk({"time": 50})
+        self.ds2d = self.ds2d.chunk({"time": 50, "z": 4})

From 3b3ca773ddf1f1adaaf915ca6d778f29167202a6 Mon Sep 17 00:00:00 2001
From: dcherian <deepak@cherian.net>
Date: Sat, 30 Oct 2021 08:12:04 -0600
Subject: [PATCH 02/11] [skip-ci]


From 3df1015b2bbb6c4cd038b5d14dde3f62a782b70c Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:04:04 +0100
Subject: [PATCH 03/11] Temporarily remove other benchmarks for CI testing

---
 asv_bench/benchmarks/combine.py           |  38 --
 asv_bench/benchmarks/dataarray_missing.py |  80 ----
 asv_bench/benchmarks/dataset_io.py        | 478 ----------------------
 asv_bench/benchmarks/import_xarray.py     |   9 -
 asv_bench/benchmarks/indexing.py          | 149 -------
 asv_bench/benchmarks/interp.py            |  51 ---
 asv_bench/benchmarks/pandas.py            |  26 --
 asv_bench/benchmarks/reindexing.py        |  52 ---
 asv_bench/benchmarks/repr.py              |  40 --
 asv_bench/benchmarks/rolling.py           | 110 -----
 asv_bench/benchmarks/unstacking.py        |  29 --
 11 files changed, 1062 deletions(-)
 delete mode 100644 asv_bench/benchmarks/combine.py
 delete mode 100644 asv_bench/benchmarks/dataarray_missing.py
 delete mode 100644 asv_bench/benchmarks/dataset_io.py
 delete mode 100644 asv_bench/benchmarks/import_xarray.py
 delete mode 100644 asv_bench/benchmarks/indexing.py
 delete mode 100644 asv_bench/benchmarks/interp.py
 delete mode 100644 asv_bench/benchmarks/pandas.py
 delete mode 100644 asv_bench/benchmarks/reindexing.py
 delete mode 100644 asv_bench/benchmarks/repr.py
 delete mode 100644 asv_bench/benchmarks/rolling.py
 delete mode 100644 asv_bench/benchmarks/unstacking.py

diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py
deleted file mode 100644
index a4f8db2786b..00000000000
--- a/asv_bench/benchmarks/combine.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import numpy as np
-
-import xarray as xr
-
-
-class Combine:
-    """Benchmark concatenating and merging large datasets"""
-
-    def setup(self):
-        """Create 4 datasets with two different variables"""
-
-        t_size, x_size, y_size = 50, 450, 400
-        t = np.arange(t_size)
-        data = np.random.randn(t_size, x_size, y_size)
-
-        self.dsA0 = xr.Dataset(
-            {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
-        )
-        self.dsA1 = xr.Dataset(
-            {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
-        )
-        self.dsB0 = xr.Dataset(
-            {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
-        )
-        self.dsB1 = xr.Dataset(
-            {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
-        )
-
-    def time_combine_nested(self):
-        datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]]
-
-        xr.combine_nested(datasets, concat_dim=[None, "T"])
-
-    def time_combine_by_coords(self):
-        """Also has to load and arrange t coordinate"""
-        datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1]
-
-        xr.combine_by_coords(datasets)
diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py
deleted file mode 100644
index f89fe7f8eb9..00000000000
--- a/asv_bench/benchmarks/dataarray_missing.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pandas as pd
-
-import xarray as xr
-
-from . import parameterized, randn, requires_dask
-
-
-def make_bench_data(shape, frac_nan, chunks):
-    vals = randn(shape, frac_nan)
-    coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])}
-    da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords)
-
-    if chunks is not None:
-        da = da.chunk(chunks)
-
-    return da
-
-
-def requires_bottleneck():
-    try:
-        import bottleneck  # noqa: F401
-    except ImportError:
-        raise NotImplementedError()
-
-
-class DataArrayMissingInterpolateNA:
-    def setup(self, shape, chunks, limit):
-        if chunks is not None:
-            requires_dask()
-        self.da = make_bench_data(shape, 0.1, chunks)
-
-    @parameterized(
-        ["shape", "chunks", "limit"],
-        (
-            [(365, 75, 75)],
-            [None, {"x": 25, "y": 25}],
-            [None, 3],
-        ),
-    )
-    def time_interpolate_na(self, shape, chunks, limit):
-        actual = self.da.interpolate_na(dim="time", method="linear", limit=limit)
-
-        if chunks is not None:
-            actual = actual.compute()
-
-
-class DataArrayMissingBottleneck:
-    def setup(self, shape, chunks, limit):
-        requires_bottleneck()
-        if chunks is not None:
-            requires_dask()
-        self.da = make_bench_data(shape, 0.1, chunks)
-
-    @parameterized(
-        ["shape", "chunks", "limit"],
-        (
-            [(365, 75, 75)],
-            [None, {"x": 25, "y": 25}],
-            [None, 3],
-        ),
-    )
-    def time_ffill(self, shape, chunks, limit):
-        actual = self.da.ffill(dim="time", limit=limit)
-
-        if chunks is not None:
-            actual = actual.compute()
-
-    @parameterized(
-        ["shape", "chunks", "limit"],
-        (
-            [(365, 75, 75)],
-            [None, {"x": 25, "y": 25}],
-            [None, 3],
-        ),
-    )
-    def time_bfill(self, shape, chunks, limit):
-        actual = self.da.ffill(dim="time", limit=limit)
-
-        if chunks is not None:
-            actual = actual.compute()
diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py
deleted file mode 100644
index 6c2e15c54e9..00000000000
--- a/asv_bench/benchmarks/dataset_io.py
+++ /dev/null
@@ -1,478 +0,0 @@
-import os
-
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-from . import _skip_slow, randint, randn, requires_dask
-
-try:
-    import dask
-    import dask.multiprocessing
-except ImportError:
-    pass
-
-
-os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
-
-
-class IOSingleNetCDF:
-    """
-    A few examples that benchmark reading/writing a single netCDF file with
-    xarray
-    """
-
-    timeout = 300.0
-    repeat = 1
-    number = 5
-
-    def make_ds(self):
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
-        # single Dataset
-        self.ds = xr.Dataset()
-        self.nt = 1000
-        self.nx = 90
-        self.ny = 45
-
-        self.block_chunks = {
-            "time": self.nt / 4,
-            "lon": self.nx / 3,
-            "lat": self.ny / 3,
-        }
-
-        self.time_chunks = {"time": int(self.nt / 36)}
-
-        times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
-        lons = xr.DataArray(
-            np.linspace(0, 360, self.nx),
-            dims=("lon",),
-            attrs={"units": "degrees east", "long_name": "longitude"},
-        )
-        lats = xr.DataArray(
-            np.linspace(-90, 90, self.ny),
-            dims=("lat",),
-            attrs={"units": "degrees north", "long_name": "latitude"},
-        )
-        self.ds["foo"] = xr.DataArray(
-            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
-            coords={"lon": lons, "lat": lats, "time": times},
-            dims=("time", "lon", "lat"),
-            name="foo",
-            attrs={"units": "foo units", "description": "a description"},
-        )
-        self.ds["bar"] = xr.DataArray(
-            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
-            coords={"lon": lons, "lat": lats, "time": times},
-            dims=("time", "lon", "lat"),
-            name="bar",
-            attrs={"units": "bar units", "description": "a description"},
-        )
-        self.ds["baz"] = xr.DataArray(
-            randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
-            coords={"lon": lons, "lat": lats},
-            dims=("lon", "lat"),
-            name="baz",
-            attrs={"units": "baz units", "description": "a description"},
-        )
-
-        self.ds.attrs = {"history": "created for xarray benchmarking"}
-
-        self.oinds = {
-            "time": randint(0, self.nt, 120),
-            "lon": randint(0, self.nx, 20),
-            "lat": randint(0, self.ny, 10),
-        }
-        self.vinds = {
-            "time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
-            "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
-            "lat": slice(3, 20),
-        }
-
-
-class IOWriteSingleNetCDF3(IOSingleNetCDF):
-    def setup(self):
-        self.format = "NETCDF3_64BIT"
-        self.make_ds()
-
-    def time_write_dataset_netcdf4(self):
-        self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format)
-
-    def time_write_dataset_scipy(self):
-        self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format)
-
-
-class IOReadSingleNetCDF4(IOSingleNetCDF):
-    def setup(self):
-
-        self.make_ds()
-
-        self.filepath = "test_single_file.nc4.nc"
-        self.format = "NETCDF4"
-        self.ds.to_netcdf(self.filepath, format=self.format)
-
-    def time_load_dataset_netcdf4(self):
-        xr.open_dataset(self.filepath, engine="netcdf4").load()
-
-    def time_orthogonal_indexing(self):
-        ds = xr.open_dataset(self.filepath, engine="netcdf4")
-        ds = ds.isel(**self.oinds).load()
-
-    def time_vectorized_indexing(self):
-        ds = xr.open_dataset(self.filepath, engine="netcdf4")
-        ds = ds.isel(**self.vinds).load()
-
-
-class IOReadSingleNetCDF3(IOReadSingleNetCDF4):
-    def setup(self):
-
-        self.make_ds()
-
-        self.filepath = "test_single_file.nc3.nc"
-        self.format = "NETCDF3_64BIT"
-        self.ds.to_netcdf(self.filepath, format=self.format)
-
-    def time_load_dataset_scipy(self):
-        xr.open_dataset(self.filepath, engine="scipy").load()
-
-    def time_orthogonal_indexing(self):
-        ds = xr.open_dataset(self.filepath, engine="scipy")
-        ds = ds.isel(**self.oinds).load()
-
-    def time_vectorized_indexing(self):
-        ds = xr.open_dataset(self.filepath, engine="scipy")
-        ds = ds.isel(**self.vinds).load()
-
-
-class IOReadSingleNetCDF4Dask(IOSingleNetCDF):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-
-        self.filepath = "test_single_file.nc4.nc"
-        self.format = "NETCDF4"
-        self.ds.to_netcdf(self.filepath, format=self.format)
-
-    def time_load_dataset_netcdf4_with_block_chunks(self):
-        xr.open_dataset(
-            self.filepath, engine="netcdf4", chunks=self.block_chunks
-        ).load()
-
-    def time_load_dataset_netcdf4_with_block_chunks_oindexing(self):
-        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
-        ds = ds.isel(**self.oinds).load()
-
-    def time_load_dataset_netcdf4_with_block_chunks_vindexing(self):
-        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
-        ds = ds.isel(**self.vinds).load()
-
-    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_dataset(
-                self.filepath, engine="netcdf4", chunks=self.block_chunks
-            ).load()
-
-    def time_load_dataset_netcdf4_with_time_chunks(self):
-        xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load()
-
-    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_dataset(
-                self.filepath, engine="netcdf4", chunks=self.time_chunks
-            ).load()
-
-
-class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-
-        self.filepath = "test_single_file.nc3.nc"
-        self.format = "NETCDF3_64BIT"
-        self.ds.to_netcdf(self.filepath, format=self.format)
-
-    def time_load_dataset_scipy_with_block_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_dataset(
-                self.filepath, engine="scipy", chunks=self.block_chunks
-            ).load()
-
-    def time_load_dataset_scipy_with_block_chunks_oindexing(self):
-        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
-        ds = ds.isel(**self.oinds).load()
-
-    def time_load_dataset_scipy_with_block_chunks_vindexing(self):
-        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
-        ds = ds.isel(**self.vinds).load()
-
-    def time_load_dataset_scipy_with_time_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_dataset(
-                self.filepath, engine="scipy", chunks=self.time_chunks
-            ).load()
-
-
-class IOMultipleNetCDF:
-    """
-    A few examples that benchmark reading/writing multiple netCDF files with
-    xarray
-    """
-
-    timeout = 300.0
-    repeat = 1
-    number = 5
-
-    def make_ds(self, nfiles=10):
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
-        # multiple Dataset
-        self.ds = xr.Dataset()
-        self.nt = 1000
-        self.nx = 90
-        self.ny = 45
-        self.nfiles = nfiles
-
-        self.block_chunks = {
-            "time": self.nt / 4,
-            "lon": self.nx / 3,
-            "lat": self.ny / 3,
-        }
-
-        self.time_chunks = {"time": int(self.nt / 36)}
-
-        self.time_vars = np.split(
-            pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles
-        )
-
-        self.ds_list = []
-        self.filenames_list = []
-        for i, times in enumerate(self.time_vars):
-            ds = xr.Dataset()
-            nt = len(times)
-            lons = xr.DataArray(
-                np.linspace(0, 360, self.nx),
-                dims=("lon",),
-                attrs={"units": "degrees east", "long_name": "longitude"},
-            )
-            lats = xr.DataArray(
-                np.linspace(-90, 90, self.ny),
-                dims=("lat",),
-                attrs={"units": "degrees north", "long_name": "latitude"},
-            )
-            ds["foo"] = xr.DataArray(
-                randn((nt, self.nx, self.ny), frac_nan=0.2),
-                coords={"lon": lons, "lat": lats, "time": times},
-                dims=("time", "lon", "lat"),
-                name="foo",
-                attrs={"units": "foo units", "description": "a description"},
-            )
-            ds["bar"] = xr.DataArray(
-                randn((nt, self.nx, self.ny), frac_nan=0.2),
-                coords={"lon": lons, "lat": lats, "time": times},
-                dims=("time", "lon", "lat"),
-                name="bar",
-                attrs={"units": "bar units", "description": "a description"},
-            )
-            ds["baz"] = xr.DataArray(
-                randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
-                coords={"lon": lons, "lat": lats},
-                dims=("lon", "lat"),
-                name="baz",
-                attrs={"units": "baz units", "description": "a description"},
-            )
-
-            ds.attrs = {"history": "created for xarray benchmarking"}
-
-            self.ds_list.append(ds)
-            self.filenames_list.append("test_netcdf_%i.nc" % i)
-
-
-class IOWriteMultipleNetCDF3(IOMultipleNetCDF):
-    def setup(self):
-        self.make_ds()
-        self.format = "NETCDF3_64BIT"
-
-    def time_write_dataset_netcdf4(self):
-        xr.save_mfdataset(
-            self.ds_list, self.filenames_list, engine="netcdf4", format=self.format
-        )
-
-    def time_write_dataset_scipy(self):
-        xr.save_mfdataset(
-            self.ds_list, self.filenames_list, engine="scipy", format=self.format
-        )
-
-
-class IOReadMultipleNetCDF4(IOMultipleNetCDF):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-        self.format = "NETCDF4"
-        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
-
-    def time_load_dataset_netcdf4(self):
-        xr.open_mfdataset(self.filenames_list, engine="netcdf4").load()
-
-    def time_open_dataset_netcdf4(self):
-        xr.open_mfdataset(self.filenames_list, engine="netcdf4")
-
-
-class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-        self.format = "NETCDF3_64BIT"
-        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
-
-    def time_load_dataset_scipy(self):
-        xr.open_mfdataset(self.filenames_list, engine="scipy").load()
-
-    def time_open_dataset_scipy(self):
-        xr.open_mfdataset(self.filenames_list, engine="scipy")
-
-
-class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-        self.format = "NETCDF4"
-        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
-
-    def time_load_dataset_netcdf4_with_block_chunks(self):
-        xr.open_mfdataset(
-            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
-        ).load()
-
-    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
-            ).load()
-
-    def time_load_dataset_netcdf4_with_time_chunks(self):
-        xr.open_mfdataset(
-            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
-        ).load()
-
-    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
-            ).load()
-
-    def time_open_dataset_netcdf4_with_block_chunks(self):
-        xr.open_mfdataset(
-            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
-        )
-
-    def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
-            )
-
-    def time_open_dataset_netcdf4_with_time_chunks(self):
-        xr.open_mfdataset(
-            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
-        )
-
-    def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
-            )
-
-
-class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask):
-    def setup(self):
-
-        requires_dask()
-
-        self.make_ds()
-        self.format = "NETCDF3_64BIT"
-        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
-
-    def time_load_dataset_scipy_with_block_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="scipy", chunks=self.block_chunks
-            ).load()
-
-    def time_load_dataset_scipy_with_time_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="scipy", chunks=self.time_chunks
-            ).load()
-
-    def time_open_dataset_scipy_with_block_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="scipy", chunks=self.block_chunks
-            )
-
-    def time_open_dataset_scipy_with_time_chunks(self):
-        with dask.config.set(scheduler="multiprocessing"):
-            xr.open_mfdataset(
-                self.filenames_list, engine="scipy", chunks=self.time_chunks
-            )
-
-
-def create_delayed_write():
-    import dask.array as da
-
-    # TODO: Lazily skipped in CI as it is very demanding and slow.
-    # Improve times and remove errors.
-    _skip_slow()
-
-    vals = da.random.random(300, chunks=(1,))
-    ds = xr.Dataset({"vals": (["a"], vals)})
-    return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
-
-
-class IOWriteNetCDFDask:
-    timeout = 60
-    repeat = 1
-    number = 5
-
-    def setup(self):
-        requires_dask()
-        self.write = create_delayed_write()
-
-    def time_write(self):
-        self.write.compute()
-
-
-class IOWriteNetCDFDaskDistributed:
-    def setup(self):
-        try:
-            import distributed
-        except ImportError:
-            raise NotImplementedError()
-
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
-        self.client = distributed.Client()
-        self.write = create_delayed_write()
-
-    def cleanup(self):
-        self.client.shutdown()
-
-    def time_write(self):
-        self.write.compute()
diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py
deleted file mode 100644
index 94652e3b82a..00000000000
--- a/asv_bench/benchmarks/import_xarray.py
+++ /dev/null
@@ -1,9 +0,0 @@
-class ImportXarray:
-    def setup(self, *args, **kwargs):
-        def import_xr():
-            import xarray  # noqa: F401
-
-        self._import_xr = import_xr
-
-    def time_import_xarray(self):
-        self._import_xr()
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
deleted file mode 100644
index 15212ec0c61..00000000000
--- a/asv_bench/benchmarks/indexing.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-from . import parameterized, randint, randn, requires_dask
-
-nx = 2000
-ny = 1000
-nt = 500
-
-basic_indexes = {
-    "1slice": {"x": slice(0, 3)},
-    "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)},
-    "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)},
-}
-
-basic_assignment_values = {
-    "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]),
-    "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]),
-    "2slicess-1scalar": xr.DataArray(
-        randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"]
-    ),
-}
-
-outer_indexes = {
-    "1d": {"x": randint(0, nx, 400)},
-    "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)},
-    "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)},
-}
-
-outer_assignment_values = {
-    "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]),
-    "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]),
-    "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]),
-}
-
-vectorized_indexes = {
-    "1-1d": {"x": xr.DataArray(randint(0, nx, 400), dims="a")},
-    "2-1d": {
-        "x": xr.DataArray(randint(0, nx, 400), dims="a"),
-        "y": xr.DataArray(randint(0, ny, 400), dims="a"),
-    },
-    "3-2d": {
-        "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]),
-        "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]),
-        "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]),
-    },
-}
-
-vectorized_assignment_values = {
-    "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}),
-    "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}),
-    "3-2d": xr.DataArray(
-        randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)}
-    ),
-}
-
-
-class Base:
-    def setup(self, key):
-        self.ds = xr.Dataset(
-            {
-                "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)),
-                "var2": (("x", "t"), randn((nx, nt))),
-                "var3": (("t",), randn(nt)),
-            },
-            coords={
-                "x": np.arange(nx),
-                "y": np.linspace(0, 1, ny),
-                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
-                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
-            },
-        )
-
-
-class Indexing(Base):
-    @parameterized(["key"], [list(basic_indexes.keys())])
-    def time_indexing_basic(self, key):
-        self.ds.isel(**basic_indexes[key]).load()
-
-    @parameterized(["key"], [list(outer_indexes.keys())])
-    def time_indexing_outer(self, key):
-        self.ds.isel(**outer_indexes[key]).load()
-
-    @parameterized(["key"], [list(vectorized_indexes.keys())])
-    def time_indexing_vectorized(self, key):
-        self.ds.isel(**vectorized_indexes[key]).load()
-
-
-class Assignment(Base):
-    @parameterized(["key"], [list(basic_indexes.keys())])
-    def time_assignment_basic(self, key):
-        ind = basic_indexes[key]
-        val = basic_assignment_values[key]
-        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
-
-    @parameterized(["key"], [list(outer_indexes.keys())])
-    def time_assignment_outer(self, key):
-        ind = outer_indexes[key]
-        val = outer_assignment_values[key]
-        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
-
-    @parameterized(["key"], [list(vectorized_indexes.keys())])
-    def time_assignment_vectorized(self, key):
-        ind = vectorized_indexes[key]
-        val = vectorized_assignment_values[key]
-        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
-
-
-class IndexingDask(Indexing):
-    def setup(self, key):
-        requires_dask()
-        super().setup(key)
-        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
-
-
-class BooleanIndexing:
-    # https://github.com/pydata/xarray/issues/2227
-    def setup(self):
-        self.ds = xr.Dataset(
-            {"a": ("time", np.arange(10_000_000))},
-            coords={"time": np.arange(10_000_000)},
-        )
-        self.time_filter = self.ds.time > 50_000
-
-    def time_indexing(self):
-        self.ds.isel(time=self.time_filter)
-
-
-class HugeAxisSmallSliceIndexing:
-    # https://github.com/pydata/xarray/pull/4560
-    def setup(self):
-        self.filepath = "test_indexing_huge_axis_small_slice.nc"
-        if not os.path.isfile(self.filepath):
-            xr.Dataset(
-                {"a": ("x", np.arange(10_000_000))},
-                coords={"x": np.arange(10_000_000)},
-            ).to_netcdf(self.filepath, format="NETCDF4")
-
-        self.ds = xr.open_dataset(self.filepath)
-
-    def time_indexing(self):
-        self.ds.isel(x=slice(100))
-
-    def cleanup(self):
-        self.ds.close()
diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py
deleted file mode 100644
index 4b6691bcc0a..00000000000
--- a/asv_bench/benchmarks/interp.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-from . import parameterized, randn, requires_dask
-
-nx = 1500
-ny = 1000
-nt = 500
-
-randn_xy = randn((nx, ny), frac_nan=0.1)
-randn_xt = randn((nx, nt))
-randn_t = randn((nt,))
-
-new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100)
-new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500)
-new_y_long = np.linspace(0.1, 0.9, 500)
-
-
-class Interpolation:
-    def setup(self, *args, **kwargs):
-        self.ds = xr.Dataset(
-            {
-                "var1": (("x", "y"), randn_xy),
-                "var2": (("x", "t"), randn_xt),
-                "var3": (("t",), randn_t),
-            },
-            coords={
-                "x": np.arange(nx),
-                "y": np.linspace(0, 1, ny),
-                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
-                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
-            },
-        )
-
-    @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False]))
-    def time_interpolation(self, method, is_short):
-        new_x = new_x_short if is_short else new_x_long
-        self.ds.interp(x=new_x, method=method).load()
-
-    @parameterized(["method"], (["linear", "nearest"]))
-    def time_interpolation_2d(self, method):
-        self.ds.interp(x=new_x_long, y=new_y_long, method=method).load()
-
-
-class InterpolationDask(Interpolation):
-    def setup(self, *args, **kwargs):
-        requires_dask()
-        super().setup(**kwargs)
-        self.ds = self.ds.chunk({"t": 50})
diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
deleted file mode 100644
index 8aaa515d417..00000000000
--- a/asv_bench/benchmarks/pandas.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-from . import parameterized
-
-
-class MultiIndexSeries:
-    def setup(self, dtype, subset):
-        data = np.random.rand(100000).astype(dtype)
-        index = pd.MultiIndex.from_product(
-            [
-                list("abcdefhijk"),
-                list("abcdefhijk"),
-                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
-            ]
-        )
-        series = pd.Series(data, index)
-        if subset:
-            series = series[::3]
-        self.series = series
-
-    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
-    def time_from_series(self, dtype, subset):
-        xr.DataArray.from_series(self.series)
diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py
deleted file mode 100644
index 9d0767fc3b3..00000000000
--- a/asv_bench/benchmarks/reindexing.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import numpy as np
-
-import xarray as xr
-
-from . import requires_dask
-
-ntime = 500
-nx = 50
-ny = 50
-
-
-class Reindex:
-    def setup(self):
-        data = np.random.RandomState(0).randn(ntime, nx, ny)
-        self.ds = xr.Dataset(
-            {"temperature": (("time", "x", "y"), data)},
-            coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)},
-        )
-
-    def time_1d_coarse(self):
-        self.ds.reindex(time=np.arange(0, ntime, 5)).load()
-
-    def time_1d_fine_all_found(self):
-        self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load()
-
-    def time_1d_fine_some_missing(self):
-        self.ds.reindex(
-            time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1
-        ).load()
-
-    def time_2d_coarse(self):
-        self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load()
-
-    def time_2d_fine_all_found(self):
-        self.ds.reindex(
-            x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest"
-        ).load()
-
-    def time_2d_fine_some_missing(self):
-        self.ds.reindex(
-            x=np.arange(0, nx, 0.5),
-            y=np.arange(0, ny, 0.5),
-            method="nearest",
-            tolerance=0.1,
-        ).load()
-
-
-class ReindexDask(Reindex):
-    def setup(self):
-        requires_dask()
-        super().setup()
-        self.ds = self.ds.chunk({"time": 100})
diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py
deleted file mode 100644
index 4bf2ace352d..00000000000
--- a/asv_bench/benchmarks/repr.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-
-class Repr:
-    def setup(self):
-        a = np.arange(0, 100)
-        data_vars = dict()
-        for i in a:
-            data_vars[f"long_variable_name_{i}"] = xr.DataArray(
-                name=f"long_variable_name_{i}",
-                data=np.arange(0, 20),
-                dims=[f"long_coord_name_{i}_x"],
-                coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2},
-            )
-        self.ds = xr.Dataset(data_vars)
-        self.ds.attrs = {f"attr_{k}": 2 for k in a}
-
-    def time_repr(self):
-        repr(self.ds)
-
-    def time_repr_html(self):
-        self.ds._repr_html_()
-
-
-class ReprMultiIndex:
-    def setup(self):
-        index = pd.MultiIndex.from_product(
-            [range(1000), range(1000)], names=("level_0", "level_1")
-        )
-        series = pd.Series(range(1000 * 1000), index=index)
-        self.da = xr.DataArray(series)
-
-    def time_repr(self):
-        repr(self.da)
-
-    def time_repr_html(self):
-        self.da._repr_html_()
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
deleted file mode 100644
index f0e18bf2153..00000000000
--- a/asv_bench/benchmarks/rolling.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import numpy as np
-import pandas as pd
-
-import xarray as xr
-
-from . import parameterized, randn, requires_dask
-
-nx = 300
-long_nx = 30000
-ny = 200
-nt = 100
-window = 20
-
-randn_xy = randn((nx, ny), frac_nan=0.1)
-randn_xt = randn((nx, nt))
-randn_t = randn((nt,))
-randn_long = randn((long_nx,), frac_nan=0.1)
-
-
-class Rolling:
-    def setup(self, *args, **kwargs):
-        self.ds = xr.Dataset(
-            {
-                "var1": (("x", "y"), randn_xy),
-                "var2": (("x", "t"), randn_xt),
-                "var3": (("t",), randn_t),
-            },
-            coords={
-                "x": np.arange(nx),
-                "y": np.linspace(0, 1, ny),
-                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
-                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
-            },
-        )
-        self.da_long = xr.DataArray(
-            randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1}
-        )
-
-    @parameterized(["func", "center"], (["mean", "count"], [True, False]))
-    def time_rolling(self, func, center):
-        getattr(self.ds.rolling(x=window, center=center), func)().load()
-
-    @parameterized(["func", "pandas"], (["mean", "count"], [True, False]))
-    def time_rolling_long(self, func, pandas):
-        if pandas:
-            se = self.da_long.to_series()
-            getattr(se.rolling(window=window, min_periods=window), func)()
-        else:
-            getattr(self.da_long.rolling(x=window, min_periods=window), func)().load()
-
-    @parameterized(["window_", "min_periods"], ([20, 40], [5, 5]))
-    def time_rolling_np(self, window_, min_periods):
-        self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce(
-            getattr(np, "nansum")
-        ).load()
-
-    @parameterized(["center", "stride"], ([True, False], [1, 1]))
-    def time_rolling_construct(self, center, stride):
-        self.ds.rolling(x=window, center=center).construct(
-            "window_dim", stride=stride
-        ).sum(dim="window_dim").load()
-
-
-class RollingDask(Rolling):
-    def setup(self, *args, **kwargs):
-        requires_dask()
-        super().setup(**kwargs)
-        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
-        self.da_long = self.da_long.chunk({"x": 10000})
-
-
-class RollingMemory:
-    def setup(self, *args, **kwargs):
-        self.ds = xr.Dataset(
-            {
-                "var1": (("x", "y"), randn_xy),
-                "var2": (("x", "t"), randn_xt),
-                "var3": (("t",), randn_t),
-            },
-            coords={
-                "x": np.arange(nx),
-                "y": np.linspace(0, 1, ny),
-                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
-                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
-            },
-        )
-
-
-class DataArrayRollingMemory(RollingMemory):
-    @parameterized("func", ["sum", "max", "mean"])
-    def peakmem_ndrolling_reduce(self, func):
-        roll = self.ds.var1.rolling(x=10, y=4)
-        getattr(roll, func)()
-
-    @parameterized("func", ["sum", "max", "mean"])
-    def peakmem_1drolling_reduce(self, func):
-        roll = self.ds.var3.rolling(t=100)
-        getattr(roll, func)()
-
-
-class DatasetRollingMemory(RollingMemory):
-    @parameterized("func", ["sum", "max", "mean"])
-    def peakmem_ndrolling_reduce(self, func):
-        roll = self.ds.rolling(x=10, y=4)
-        getattr(roll, func)()
-
-    @parameterized("func", ["sum", "max", "mean"])
-    def peakmem_1drolling_reduce(self, func):
-        roll = self.ds.rolling(t=100)
-        getattr(roll, func)()
diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
deleted file mode 100644
index 2c5b7ca7821..00000000000
--- a/asv_bench/benchmarks/unstacking.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import numpy as np
-
-import xarray as xr
-
-from . import requires_dask
-
-
-class Unstacking:
-    def setup(self):
-        data = np.random.RandomState(0).randn(250, 500)
-        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
-        self.da_missing = self.da_full[:-1]
-        self.df_missing = self.da_missing.to_pandas()
-
-    def time_unstack_fast(self):
-        self.da_full.unstack("flat_dim")
-
-    def time_unstack_slow(self):
-        self.da_missing.unstack("flat_dim")
-
-    def time_unstack_pandas_slow(self):
-        self.df_missing.unstack()
-
-
-class UnstackingDask(Unstacking):
-    def setup(self, *args, **kwargs):
-        requires_dask()
-        super().setup(**kwargs)
-        self.da_full = self.da_full.chunk({"flat_dim": 25})

From c4c40230b23b75507bc55ec2f8fcd3d03f8f20d2 Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:41:43 +0100
Subject: [PATCH 04/11] Update groupby.py

---
 asv_bench/benchmarks/groupby.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index a63e8fcaf5a..510e33d5e55 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -10,8 +10,8 @@ class GroupBy:
     def setup(self, *args, **kwargs):
         self.ds1d = xr.Dataset(
             {
-                "a": xr.DataArray(np.r_[np.arange(500.0), np.arange(500.0)]),
-                "b": xr.DataArray(np.arange(1000.0)),
+                "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]),
+                "b": xr.DataArray(np.arange(500)),
             }
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
@@ -74,7 +74,7 @@ def time_agg_small_num_groups(self, method, ndim):
     @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
     def time_agg_large_num_groups(self, method, ndim):
         ds = getattr(self, f"ds{ndim}d")
-        getattr(ds.resample(time="6H"), method)()
+        getattr(ds.resample(time="12H"), method)()
 
 
 class ResampleDask(Resample):

From 445312d37599370c5d48a9583c3a05b6116d1e13 Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:56:37 +0100
Subject: [PATCH 05/11] Update groupby.py

---
 asv_bench/benchmarks/groupby.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 510e33d5e55..8f51583fce6 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -11,7 +11,7 @@ def setup(self, *args, **kwargs):
         self.ds1d = xr.Dataset(
             {
                 "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]),
-                "b": xr.DataArray(np.arange(500)),
+                "b": xr.DataArray(np.arange(600)),
             }
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
@@ -35,8 +35,8 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(250)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(250)).chunk({"dim_0": 50, "z": 4})
+        self.ds1d = self.ds1d.sel(dim_0=slice(150)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(150)).chunk({"dim_0": 50, "z": 4})
 
 
 class GroupByDataFrame(GroupBy):

From ff6419522bd30c71fd5778d909405186eafb8559 Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:29:20 +0100
Subject: [PATCH 06/11] Update groupby.py

---
 asv_bench/benchmarks/groupby.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 8f51583fce6..5aa57587cf2 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -8,10 +8,11 @@
 
 class GroupBy:
     def setup(self, *args, **kwargs):
+        self.n = 100
         self.ds1d = xr.Dataset(
             {
-                "a": xr.DataArray(np.r_[np.arange(300), np.arange(300)]),
-                "b": xr.DataArray(np.arange(600)),
+                "a": xr.DataArray(np.r_[np.arange(self.n), np.arange(self.n)]),
+                "b": xr.DataArray(np.arange(2 * self.n)),
             }
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
@@ -35,8 +36,10 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(150)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(150)).chunk({"dim_0": 50, "z": 4})
+        self.ds1d = self.ds1d.sel(dim_0=slice(self.n * 0.5)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(self.n * 0.5)).chunk(
+            {"dim_0": 50, "z": 4}
+        )
 
 
 class GroupByDataFrame(GroupBy):
@@ -74,7 +77,7 @@ def time_agg_small_num_groups(self, method, ndim):
     @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
     def time_agg_large_num_groups(self, method, ndim):
         ds = getattr(self, f"ds{ndim}d")
-        getattr(ds.resample(time="12H"), method)()
+        getattr(ds.resample(time="24H"), method)()
 
 
 class ResampleDask(Resample):

From c56dd9411fc8777ac70090d0bd938eb0399b653d Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:41:43 +0100
Subject: [PATCH 07/11] Update groupby.py

---
 asv_bench/benchmarks/groupby.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 5aa57587cf2..1ffb3c6579a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -36,10 +36,8 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(self.n * 0.5)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(self.n * 0.5)).chunk(
-            {"dim_0": 50, "z": 4}
-        )
+        self.ds1d = self.ds1d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50, "z": 4})
 
 
 class GroupByDataFrame(GroupBy):

From a89f62be5a875a27426ee274ae81f4344982ab74 Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:55:11 +0100
Subject: [PATCH 08/11] Update groupby.py

---
 asv_bench/benchmarks/groupby.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 1ffb3c6579a..eb43a95d2b9 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -3,7 +3,7 @@
 
 import xarray as xr
 
-from . import parameterized, requires_dask
+from . import _skip_slow, parameterized, requires_dask
 
 
 class GroupBy:
@@ -41,13 +41,23 @@ def setup(self, *args, **kwargs):
 
 
 class GroupByDataFrame(GroupBy):
+    """Run groupby tests using pandas DataFrame."""
+
     def setup(self, *args, **kwargs):
+        # Skip testing in CI as it won't ever change in a commit:
+        _skip_slow()
+
         super().setup(**kwargs)
         self.ds1d = self.ds1d.to_dataframe()
 
 
 class GroupByDaskDataFrame(GroupBy):
+    """Run groupby tests using dask DataFrame."""
+
     def setup(self, *args, **kwargs):
+        # Skip testing in CI as it won't ever change in a commit:
+        _skip_slow()
+
         requires_dask()
         super().setup(**kwargs)
         self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
@@ -75,7 +85,7 @@ def time_agg_small_num_groups(self, method, ndim):
     @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
     def time_agg_large_num_groups(self, method, ndim):
         ds = getattr(self, f"ds{ndim}d")
-        getattr(ds.resample(time="24H"), method)()
+        getattr(ds.resample(time="48H"), method)()
 
 
 class ResampleDask(Resample):

From 88a4a3d6f3cba0e63c93ae3984add25e697d6c58 Mon Sep 17 00:00:00 2001
From: Illviljan <14371165+Illviljan@users.noreply.github.com>
Date: Sat, 6 Nov 2021 19:28:38 +0100
Subject: [PATCH 09/11] add back the other asv tests

---
 asv_bench/benchmarks/combine.py           |  38 ++
 asv_bench/benchmarks/dataarray_missing.py |  80 ++++
 asv_bench/benchmarks/dataset_io.py        | 478 ++++++++++++++++++++++
 asv_bench/benchmarks/import_xarray.py     |   9 +
 asv_bench/benchmarks/indexing.py          | 149 +++++++
 asv_bench/benchmarks/interp.py            |  51 +++
 asv_bench/benchmarks/pandas.py            |  26 ++
 asv_bench/benchmarks/reindexing.py        |  52 +++
 asv_bench/benchmarks/repr.py              |  40 ++
 asv_bench/benchmarks/rolling.py           | 110 +++++
 asv_bench/benchmarks/unstacking.py        |  29 ++
 11 files changed, 1062 insertions(+)
 create mode 100644 asv_bench/benchmarks/combine.py
 create mode 100644 asv_bench/benchmarks/dataarray_missing.py
 create mode 100644 asv_bench/benchmarks/dataset_io.py
 create mode 100644 asv_bench/benchmarks/import_xarray.py
 create mode 100644 asv_bench/benchmarks/indexing.py
 create mode 100644 asv_bench/benchmarks/interp.py
 create mode 100644 asv_bench/benchmarks/pandas.py
 create mode 100644 asv_bench/benchmarks/reindexing.py
 create mode 100644 asv_bench/benchmarks/repr.py
 create mode 100644 asv_bench/benchmarks/rolling.py
 create mode 100644 asv_bench/benchmarks/unstacking.py

diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py
new file mode 100644
index 00000000000..a4f8db2786b
--- /dev/null
+++ b/asv_bench/benchmarks/combine.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+import xarray as xr
+
+
+class Combine:
+    """Benchmark concatenating and merging large datasets"""
+
+    def setup(self):
+        """Create 4 datasets with two different variables"""
+
+        t_size, x_size, y_size = 50, 450, 400
+        t = np.arange(t_size)
+        data = np.random.randn(t_size, x_size, y_size)
+
+        self.dsA0 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsA1 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+        self.dsB0 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsB1 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+
+    def time_combine_nested(self):
+        datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]]
+
+        xr.combine_nested(datasets, concat_dim=[None, "T"])
+
+    def time_combine_by_coords(self):
+        """Also has to load and arrange t coordinate"""
+        datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1]
+
+        xr.combine_by_coords(datasets)
diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py
new file mode 100644
index 00000000000..f89fe7f8eb9
--- /dev/null
+++ b/asv_bench/benchmarks/dataarray_missing.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+
+def make_bench_data(shape, frac_nan, chunks):
+    vals = randn(shape, frac_nan)
+    coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])}
+    da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords)
+
+    if chunks is not None:
+        da = da.chunk(chunks)
+
+    return da
+
+
+def requires_bottleneck():
+    try:
+        import bottleneck  # noqa: F401
+    except ImportError:
+        raise NotImplementedError()
+
+
+class DataArrayMissingInterpolateNA:
+    def setup(self, shape, chunks, limit):
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_interpolate_na(self, shape, chunks, limit):
+        actual = self.da.interpolate_na(dim="time", method="linear", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+
+class DataArrayMissingBottleneck:
+    def setup(self, shape, chunks, limit):
+        requires_bottleneck()
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_ffill(self, shape, chunks, limit):
+        actual = self.da.ffill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_bfill(self, shape, chunks, limit):
+        actual = self.da.bfill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py
new file mode 100644
index 00000000000..6c2e15c54e9
--- /dev/null
+++ b/asv_bench/benchmarks/dataset_io.py
@@ -0,0 +1,478 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import _skip_slow, randint, randn, requires_dask
+
+try:
+    import dask
+    import dask.multiprocessing
+except ImportError:
+    pass
+
+
+os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
+
+
+class IOSingleNetCDF:
+    """
+    A few examples that benchmark reading/writing a single netCDF file with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # single Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
+        lons = xr.DataArray(
+            np.linspace(0, 360, self.nx),
+            dims=("lon",),
+            attrs={"units": "degrees east", "long_name": "longitude"},
+        )
+        lats = xr.DataArray(
+            np.linspace(-90, 90, self.ny),
+            dims=("lat",),
+            attrs={"units": "degrees north", "long_name": "latitude"},
+        )
+        self.ds["foo"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="foo",
+            attrs={"units": "foo units", "description": "a description"},
+        )
+        self.ds["bar"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="bar",
+            attrs={"units": "bar units", "description": "a description"},
+        )
+        self.ds["baz"] = xr.DataArray(
+            randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+            coords={"lon": lons, "lat": lats},
+            dims=("lon", "lat"),
+            name="baz",
+            attrs={"units": "baz units", "description": "a description"},
+        )
+
+        self.ds.attrs = {"history": "created for xarray benchmarking"}
+
+        self.oinds = {
+            "time": randint(0, self.nt, 120),
+            "lon": randint(0, self.nx, 20),
+            "lat": randint(0, self.ny, 10),
+        }
+        self.vinds = {
+            "time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
+            "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
+            "lat": slice(3, 20),
+        }
+
+
+class IOWriteSingleNetCDF3(IOSingleNetCDF):
+    def setup(self):
+        self.format = "NETCDF3_64BIT"
+        self.make_ds()
+
+    def time_write_dataset_netcdf4(self):
+        self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format)
+
+    def time_write_dataset_scipy(self):
+        self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format)
+
+
+class IOReadSingleNetCDF4(IOSingleNetCDF):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_dataset(self.filepath, engine="netcdf4").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.vinds).load()
+
+
+class IOReadSingleNetCDF3(IOReadSingleNetCDF4):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_dataset(self.filepath, engine="scipy").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.vinds).load()
+
+
+class IOReadSingleNetCDF4Dask(IOSingleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_dataset(
+            self.filepath, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+
+class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_scipy_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+
+class IOMultipleNetCDF:
+    """
+    A few examples that benchmark reading/writing multiple netCDF files with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self, nfiles=10):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # multiple Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+        self.nfiles = nfiles
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        self.time_vars = np.split(
+            pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles
+        )
+
+        self.ds_list = []
+        self.filenames_list = []
+        for i, times in enumerate(self.time_vars):
+            ds = xr.Dataset()
+            nt = len(times)
+            lons = xr.DataArray(
+                np.linspace(0, 360, self.nx),
+                dims=("lon",),
+                attrs={"units": "degrees east", "long_name": "longitude"},
+            )
+            lats = xr.DataArray(
+                np.linspace(-90, 90, self.ny),
+                dims=("lat",),
+                attrs={"units": "degrees north", "long_name": "latitude"},
+            )
+            ds["foo"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="foo",
+                attrs={"units": "foo units", "description": "a description"},
+            )
+            ds["bar"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="bar",
+                attrs={"units": "bar units", "description": "a description"},
+            )
+            ds["baz"] = xr.DataArray(
+                randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+                coords={"lon": lons, "lat": lats},
+                dims=("lon", "lat"),
+                name="baz",
+                attrs={"units": "baz units", "description": "a description"},
+            )
+
+            ds.attrs = {"history": "created for xarray benchmarking"}
+
+            self.ds_list.append(ds)
+            self.filenames_list.append("test_netcdf_%i.nc" % i)
+
+
+class IOWriteMultipleNetCDF3(IOMultipleNetCDF):
+    def setup(self):
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+
+    def time_write_dataset_netcdf4(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="netcdf4", format=self.format
+        )
+
+    def time_write_dataset_scipy(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="scipy", format=self.format
+        )
+
+
+class IOReadMultipleNetCDF4(IOMultipleNetCDF):
+    """Time ``xr.open_mfdataset`` on NETCDF4 files written during setup."""
+
+    def setup(self):
+        requires_dask()
+        self.make_ds()
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4").load()
+
+    def time_open_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4")
+
+
+class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4):
+    """Read benchmarks on NETCDF3_64BIT files; adds scipy-engine variants."""
+
+    def setup(self):
+        requires_dask()
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy").load()
+
+    def time_open_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy")
+
+
+class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF):
+    """Time chunked ``xr.open_mfdataset`` reads of NETCDF4 files."""
+
+    def setup(self):
+        requires_dask()
+        self.make_ds()  # presumably also sets block_chunks/time_chunks -- TODO confirm
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            )
+
+
+class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask):
+    """Adds scipy-engine chunked reads, all under the multiprocessing scheduler."""
+
+    def setup(self):
+        requires_dask()
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            )
+
+
+def create_delayed_write():
+    """Return a delayed write of 300 one-element dask chunks to ``file.nc``."""
+    import dask.array as da
+
+    # TODO: Lazily skipped in CI as it is very demanding and slow.
+    # Improve times and remove errors.
+    _skip_slow()
+
+    dataset = xr.Dataset({"vals": (["a"], da.random.random(300, chunks=(1,)))})
+    return dataset.to_netcdf("file.nc", engine="netcdf4", compute=False)
+
+
+class IOWriteNetCDFDask:
+    timeout = 60  # asv: seconds allowed before the benchmark is aborted
+    repeat = 1  # asv: number of samples collected
+    number = 5  # asv: calls of the timed function per sample
+
+    def setup(self):
+        requires_dask()
+        self.write = create_delayed_write()  # built here so only compute() is timed
+
+    def time_write(self):
+        self.write.compute()
+
+
+class IOWriteNetCDFDaskDistributed:
+    def setup(self):
+        try:
+            import distributed
+        except ImportError:
+            raise NotImplementedError()
+        # TODO: Skipped in CI (very demanding and slow); improve times, remove errors.
+        _skip_slow()
+        self.client = distributed.Client()
+        self.write = create_delayed_write()
+
+    def teardown(self):  # asv runs "teardown", never "cleanup"
+        if getattr(self, "client", None) is not None:  # unset if setup skipped
+            self.client.shutdown()
+
+    cleanup = teardown  # previous name, kept as an alias
+
+    def time_write(self):
+        self.write.compute()
diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py
new file mode 100644
index 00000000000..94652e3b82a
--- /dev/null
+++ b/asv_bench/benchmarks/import_xarray.py
@@ -0,0 +1,9 @@
+class ImportXarray:
+    def setup(self, *args, **kwargs):
+        def import_xr():
+            import xarray  # noqa: F401
+
+        self._import_xr = import_xr  # deferred so the import runs inside the timer
+
+    def time_import_xarray(self):
+        self._import_xr()
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
new file mode 100644
index 00000000000..15212ec0c61
--- /dev/null
+++ b/asv_bench/benchmarks/indexing.py
@@ -0,0 +1,149 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randint, randn, requires_dask
+
+nx = 2000  # length of dimension "x"
+ny = 1000  # length of dimension "y"
+nt = 500  # length of dimension "t"
+
+basic_indexes = {  # slice/scalar indexers, keyed by benchmark parameter
+    "1slice": {"x": slice(0, 3)},
+    "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)},
+    "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)},
+}
+
+basic_assignment_values = {  # sized to match var1[x, y] for the same key above
+    "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]),
+    "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]),
+    "2slicess-1scalar": xr.DataArray(
+        randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"]
+    ),
+}
+
+outer_indexes = {  # random integer indexers, one per dimension
+    "1d": {"x": randint(0, nx, 400)},
+    "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)},
+    "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)},
+}
+
+outer_assignment_values = {  # sized to match var1[x, y] for the same key above
+    "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]),
+    "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]),
+    "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]),
+}
+
+vectorized_indexes = {  # DataArray indexers that share the new dims "a"/"b"
+    "1-1d": {"x": xr.DataArray(randint(0, nx, 400), dims="a")},
+    "2-1d": {
+        "x": xr.DataArray(randint(0, nx, 400), dims="a"),
+        "y": xr.DataArray(randint(0, ny, 400), dims="a"),
+    },
+    "3-2d": {
+        "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]),
+        "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]),
+        "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]),
+    },
+}
+
+vectorized_assignment_values = {  # values carried on the indexers' "a"/"b" dims
+    "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}),
+    "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}),
+    "3-2d": xr.DataArray(
+        randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)}
+    ),
+}
+
+
+class Base:
+    def setup(self, key):
+        # ``key`` is the asv parameter; the dataset built here does not use it.
+        data_vars = {
+            "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)),
+            "var2": (("x", "t"), randn((nx, nt))),
+            "var3": (("t",), randn(nt)),
+        }
+        coords = {
+            "x": np.arange(nx),
+            "y": np.linspace(0, 1, ny),
+            "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+            "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+        }
+        self.ds = xr.Dataset(data_vars, coords=coords)
+
+
+class Indexing(Base):  # each benchmark times ``isel`` followed by ``load``
+    @parameterized(["key"], [list(basic_indexes)])
+    def time_indexing_basic(self, key):
+        self.ds.isel(**basic_indexes[key]).load()
+
+    @parameterized(["key"], [list(outer_indexes)])
+    def time_indexing_outer(self, key):
+        self.ds.isel(**outer_indexes[key]).load()
+
+    @parameterized(["key"], [list(vectorized_indexes)])
+    def time_indexing_vectorized(self, key):
+        self.ds.isel(**vectorized_indexes[key]).load()
+
+
+class Assignment(Base):  # only "x"/"y" indexers are used; var1 has no "t"
+    @parameterized(["key"], [list(basic_indexes)])
+    def time_assignment_basic(self, key):
+        indexer, value = basic_indexes[key], basic_assignment_values[key]
+        x_sel, y_sel = (indexer.get(dim, slice(None)) for dim in "xy")
+        self.ds["var1"][x_sel, y_sel] = value
+
+    @parameterized(["key"], [list(outer_indexes)])
+    def time_assignment_outer(self, key):
+        indexer, value = outer_indexes[key], outer_assignment_values[key]
+        x_sel, y_sel = (indexer.get(dim, slice(None)) for dim in "xy")
+        self.ds["var1"][x_sel, y_sel] = value
+
+    @parameterized(["key"], [list(vectorized_indexes)])
+    def time_assignment_vectorized(self, key):
+        indexer, value = vectorized_indexes[key], vectorized_assignment_values[key]
+        x_sel, y_sel = (indexer.get(dim, slice(None)) for dim in "xy")
+        self.ds["var1"][x_sel, y_sel] = value
+
+
+class IndexingDask(Indexing):  # the Indexing benchmarks on a chunked dataset
+    def setup(self, key):
+        requires_dask()
+        super().setup(key)
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+
+
+class BooleanIndexing:
+    # Boolean-mask ``isel``; see https://github.com/pydata/xarray/issues/2227
+    def setup(self):
+        size = 10_000_000
+        self.ds = xr.Dataset(
+            {"a": ("time", np.arange(size))}, coords={"time": np.arange(size)}
+        )
+        self.time_filter = self.ds.time > 50_000
+
+    def time_indexing(self):
+        self.ds.isel(time=self.time_filter)
+
+
+class HugeAxisSmallSliceIndexing:
+    # https://github.com/pydata/xarray/pull/4560
+    def setup(self):
+        self.filepath = "test_indexing_huge_axis_small_slice.nc"
+        if not os.path.isfile(self.filepath):  # the file is reused across runs
+            n = 10_000_000
+            ds = xr.Dataset({"a": ("x", np.arange(n))}, coords={"x": np.arange(n)})
+            ds.to_netcdf(self.filepath, format="NETCDF4")
+        self.ds = xr.open_dataset(self.filepath)
+
+    def time_indexing(self):
+        self.ds.isel(x=slice(100))
+
+    def teardown(self):  # asv runs "teardown", never "cleanup"
+        self.ds.close()
+
+    cleanup = teardown  # previous name, kept as an alias
diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py
new file mode 100644
index 00000000000..4b6691bcc0a
--- /dev/null
+++ b/asv_bench/benchmarks/interp.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 1500  # length of dimension "x"
+ny = 1000  # length of dimension "y"
+nt = 500  # length of dimension "t"
+
+randn_xy = randn((nx, ny), frac_nan=0.1)  # built once at import, reused by setup
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+
+new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100)  # 100 targets inside "x"
+new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500)  # 500 targets, same span
+new_y_long = np.linspace(0.1, 0.9, 500)  # inside the "y" range of [0, 1]
+
+
+class Interpolation:
+    def setup(self, *args, **kwargs):
+        # Wraps the module-level random arrays; asv parameters are ignored here.
+        data_vars = {
+            "var1": (("x", "y"), randn_xy),
+            "var2": (("x", "t"), randn_xt),
+            "var3": (("t",), randn_t),
+        }
+        coords = {
+            "x": np.arange(nx),
+            "y": np.linspace(0, 1, ny),
+            "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+            "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+        }
+        self.ds = xr.Dataset(data_vars, coords=coords)
+
+    @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False]))
+    def time_interpolation(self, method, is_short):
+        target_x = new_x_short if is_short else new_x_long
+        self.ds.interp(x=target_x, method=method).load()
+
+    @parameterized(["method"], (["linear", "nearest"]))
+    def time_interpolation_2d(self, method):
+        self.ds.interp(x=new_x_long, y=new_y_long, method=method).load()
+
+
+class InterpolationDask(Interpolation):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)  # positional asv params are not forwarded
+        self.ds = self.ds.chunk({"t": 50})  # only "t" gets an explicit chunk size
diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
new file mode 100644
index 00000000000..8aaa515d417
--- /dev/null
+++ b/asv_bench/benchmarks/pandas.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized
+
+
+class MultiIndexSeries:
+    """Time ``DataArray.from_series`` on a three-level MultiIndex Series."""
+
+    def setup(self, dtype, subset):
+        # 10 x 10 x 1000 index levels give 100000 rows, one per value below.
+        letters = list("abcdefhijk")
+        days = pd.date_range(start="2000-01-01", periods=1000, freq="B")
+        index = pd.MultiIndex.from_product([letters, letters, days])
+
+        # NOTE: rand() is in [0, 1), so dtype=int turns every value into 0.
+        values = np.random.rand(100000).astype(dtype)
+        full = pd.Series(values, index)
+        # ``subset`` keeps every third row, leaving gaps in the product index.
+        self.series = full[::3] if subset else full
+
+    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
+    def time_from_series(self, dtype, subset):
+        xr.DataArray.from_series(self.series)
diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py
new file mode 100644
index 00000000000..9d0767fc3b3
--- /dev/null
+++ b/asv_bench/benchmarks/reindexing.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+ntime = 500  # length of dimension "time"
+nx = 50  # length of dimension "x"
+ny = 50  # length of dimension "y"
+
+
+class Reindex:
+    """Time ``Dataset.reindex`` onto coarser and finer coordinate grids."""
+
+    def setup(self):
+        data = np.random.RandomState(0).randn(ntime, nx, ny)
+        self.ds = xr.Dataset(
+            {"temperature": (("time", "x", "y"), data)},
+            coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)},
+        )
+
+    def time_1d_coarse(self):
+        self.ds.reindex(time=np.arange(0, ntime, 5)).load()
+
+    def time_1d_fine_all_found(self):
+        # No tolerance: every half-step target takes its nearest existing label.
+        self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load()
+
+    def time_1d_fine_some_missing(self):
+        # tolerance=0.1 leaves the x.5 targets unmatched (0.5 away from any label).
+        fine_time = np.arange(0, ntime, 0.5)
+        self.ds.reindex(time=fine_time, method="nearest", tolerance=0.1).load()
+
+    def time_2d_coarse(self):
+        self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load()
+
+    def time_2d_fine_all_found(self):
+        # Half-step grid on both "x" and "y", nearest label always accepted.
+        fine_x, fine_y = np.arange(0, nx, 0.5), np.arange(0, ny, 0.5)
+        self.ds.reindex(x=fine_x, y=fine_y, method="nearest").load()
+
+    def time_2d_fine_some_missing(self):
+        # Same grid, but the tolerance rejects the half-way targets.
+        fine_x, fine_y = np.arange(0, nx, 0.5), np.arange(0, ny, 0.5)
+        self.ds.reindex(x=fine_x, y=fine_y, method="nearest", tolerance=0.1).load()
+
+
+class ReindexDask(Reindex):  # the Reindex benchmarks on a chunked dataset
+    def setup(self):
+        requires_dask()
+        super().setup()
+        self.ds = self.ds.chunk({"time": 100})  # 500 time steps -> 5 chunks
diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py
new file mode 100644
index 00000000000..4bf2ace352d
--- /dev/null
+++ b/asv_bench/benchmarks/repr.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+
+class Repr:
+    def setup(self):
+        # 100 variables, each on its own 20-element dimension, and 100 attrs.
+        indices = np.arange(0, 100)
+        data_vars = {}
+        for i in indices:
+            var, dim = f"long_variable_name_{i}", f"long_coord_name_{i}_x"
+            coords = {dim: np.arange(0, 20) * 2}
+            data_vars[var] = xr.DataArray(
+                name=var, data=np.arange(0, 20), dims=[dim], coords=coords
+            )
+        self.ds = xr.Dataset(data_vars)
+        self.ds.attrs = {f"attr_{k}": 2 for k in indices}
+
+    def time_repr(self):
+        repr(self.ds)
+
+    def time_repr_html(self):
+        self.ds._repr_html_()
+
+
+class ReprMultiIndex:
+    def setup(self):
+        # One million rows indexed by a 1000 x 1000 two-level MultiIndex.
+        levels = [range(1000), range(1000)]
+        index = pd.MultiIndex.from_product(levels, names=("level_0", "level_1"))
+        series = pd.Series(range(1000 * 1000), index=index)
+        self.da = xr.DataArray(series)
+
+    def time_repr(self):
+        repr(self.da)
+
+    def time_repr_html(self):
+        self.da._repr_html_()
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
new file mode 100644
index 00000000000..f0e18bf2153
--- /dev/null
+++ b/asv_bench/benchmarks/rolling.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 300  # length of dimension "x"
+long_nx = 30000  # length of the 1-D array behind ``da_long``
+ny = 200  # length of dimension "y"
+nt = 100  # length of dimension "t"
+window = 20  # default rolling window along "x"
+
+randn_xy = randn((nx, ny), frac_nan=0.1)  # built once at import, reused by setup
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+randn_long = randn((long_nx,), frac_nan=0.1)
+
+
+class Rolling:
+    def setup(self, *args, **kwargs):
+        # Wraps the module-level random arrays; asv parameters are ignored here.
+        data_vars = {
+            "var1": (("x", "y"), randn_xy),
+            "var2": (("x", "t"), randn_xt),
+            "var3": (("t",), randn_t),
+        }
+        coords = {
+            "x": np.arange(nx),
+            "y": np.linspace(0, 1, ny),
+            "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+            "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+        }
+        self.ds = xr.Dataset(data_vars, coords=coords)
+        long_coords = {"x": np.arange(long_nx) * 0.1}
+        self.da_long = xr.DataArray(randn_long, dims="x", coords=long_coords)
+
+    @parameterized(["func", "center"], (["mean", "count"], [True, False]))
+    def time_rolling(self, func, center):
+        rolled = self.ds.rolling(x=window, center=center)
+        getattr(rolled, func)().load()
+
+    @parameterized(["func", "pandas"], (["mean", "count"], [True, False]))
+    def time_rolling_long(self, func, pandas):
+        # With ``pandas`` the Series conversion is timed along with the rolling op.
+        if pandas:
+            series = self.da_long.to_series()
+            getattr(series.rolling(window=window, min_periods=window), func)()
+        else:
+            rolled = self.da_long.rolling(x=window, min_periods=window)
+            getattr(rolled, func)().load()
+
+    @parameterized(["window_", "min_periods"], ([20, 40], [5, 5]))
+    def time_rolling_np(self, window_, min_periods):
+        rolled = self.ds.rolling(x=window_, center=False, min_periods=min_periods)
+        rolled.reduce(np.nansum).load()
+
+    @parameterized(["center", "stride"], ([True, False], [1, 1]))
+    def time_rolling_construct(self, center, stride):
+        rolled = self.ds.rolling(x=window, center=center)
+        rolled.construct("window_dim", stride=stride).sum(dim="window_dim").load()
+
+
+class RollingDask(Rolling):  # the Rolling benchmarks on chunked inputs
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)  # positional asv params are not forwarded
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+        self.da_long = self.da_long.chunk({"x": 10000})  # 30000 points -> 3 chunks
+
+
+class RollingMemory:
+    def setup(self, *args, **kwargs):
+        # Shared fixture for the peak-memory subclasses; parameters are ignored.
+        data_vars = {
+            "var1": (("x", "y"), randn_xy),
+            "var2": (("x", "t"), randn_xt),
+            "var3": (("t",), randn_t),
+        }
+        coords = {
+            "x": np.arange(nx),
+            "y": np.linspace(0, 1, ny),
+            "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+            "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+        }
+        self.ds = xr.Dataset(data_vars, coords=coords)
+
+
+class DataArrayRollingMemory(RollingMemory):  # peak memory, DataArray rolling
+    @parameterized(["func"], ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.var1.rolling(x=10, y=4)  # 2-D window over "x" and "y"
+        getattr(roll, func)()
+
+    @parameterized(["func"], ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.var3.rolling(t=100)  # window spans all of "t" (nt == 100)
+        getattr(roll, func)()
+
+
+class DatasetRollingMemory(RollingMemory):  # peak memory, Dataset rolling
+    @parameterized(["func"], ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.rolling(x=10, y=4)  # 2-D window over "x" and "y"
+        getattr(roll, func)()
+
+    @parameterized(["func"], ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.rolling(t=100)  # window spans all of "t" (nt == 100)
+        getattr(roll, func)()
diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
new file mode 100644
index 00000000000..2c5b7ca7821
--- /dev/null
+++ b/asv_bench/benchmarks/unstacking.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+
+class Unstacking:
+    def setup(self):
+        data = np.random.RandomState(0).randn(250, 500)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]  # last stacked element dropped
+        self.df_missing = self.da_missing.to_pandas()  # pandas copy for comparison
+
+    def time_unstack_fast(self):  # every (a, b) combination is present
+        self.da_full.unstack("flat_dim")
+
+    def time_unstack_slow(self):  # one (a, b) combination is missing
+        self.da_missing.unstack("flat_dim")
+
+    def time_unstack_pandas_slow(self):  # pandas unstack of the same data
+        self.df_missing.unstack()
+
+
+class UnstackingDask(Unstacking):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.da_full = self.da_full.chunk({"flat_dim": 25})  # only da_full is chunked

From 25409811d6f9961382c3119b405c02fe0a307255 Mon Sep 17 00:00:00 2001
From: dcherian <deepak@cherian.net>
Date: Mon, 8 Nov 2021 12:13:41 -0700
Subject: [PATCH 10/11] Fix small_num_groups benchmarks

Make sure dask uses multiple chunks.
---
 asv_bench/benchmarks/groupby.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index eb43a95d2b9..c2ddf155ba7 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -11,7 +11,7 @@ def setup(self, *args, **kwargs):
         self.n = 100
         self.ds1d = xr.Dataset(
             {
-                "a": xr.DataArray(np.r_[np.arange(self.n), np.arange(self.n)]),
+                "a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]),
                 "b": xr.DataArray(np.arange(2 * self.n)),
             }
         )
@@ -36,8 +36,8 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(self.n // 2)).chunk({"dim_0": 50, "z": 4})
+        self.ds1d = self.ds1d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50, "z": 4})
 
 
 class GroupByDataFrame(GroupBy):

From 37abc5457d657ddb416bf9de4d5cc81ca1f3d389 Mon Sep 17 00:00:00 2001
From: dcherian <deepak@cherian.net>
Date: Mon, 8 Nov 2021 12:19:21 -0700
Subject: [PATCH 11/11] [skip-ci] more dask improvements.

---
 asv_bench/benchmarks/groupby.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index c2ddf155ba7..46d6293cc98 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -36,11 +36,13 @@ class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds1d = self.ds1d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50})
-        self.ds2d = self.ds2d.sel(dim_0=slice(self.n)).chunk({"dim_0": 50, "z": 4})
+        self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)).chunk({"dim_0": 50})
+        self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk(
+            {"dim_0": 50, "z": 5}
+        )
 
 
-class GroupByDataFrame(GroupBy):
+class GroupByPandasDataFrame(GroupBy):
     """Run groupby tests using pandas DataFrame."""
 
     def setup(self, *args, **kwargs):