From fad7b3ac025de25c78038ccdfa8ab0aab9012998 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:05:07 -0500
Subject: [PATCH 01/10] wip - compat

---
 dask_ml/_partial.py                |  31 +++--
 dask_ml/base.py                    |  98 ++++++++++++++
 dask_ml/preprocessing/_encoders.py |  16 +--
 dask_ml/preprocessing/data.py      |  50 ++++++-
 dask_ml/utils.py                   | 205 +++++++++++++++++++++++++++++
 5 files changed, 375 insertions(+), 25 deletions(-)

diff --git a/dask_ml/_partial.py b/dask_ml/_partial.py
index cfd0b7869..102b9f757 100644
--- a/dask_ml/_partial.py
+++ b/dask_ml/_partial.py
@@ -160,7 +160,7 @@ def _blocks_and_name(obj):
 
 
 def _predict(model, x):
-    return model.predict(x)[:, None]
+    return model.predict(x)
 
 
 def predict(model, x):
@@ -173,15 +173,28 @@ def predict(model, x):
 
     See docstring for ``da.learn.fit``
     """
-    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
-        x = x.to_dask_array()
-    assert x.ndim == 2
-    if len(x.chunks[1]) > 1:
-        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))
     func = partial(_predict, model)
-    xx = np.zeros((1, x.shape[1]), dtype=x.dtype)
-    dt = model.predict(xx).dtype
-    return x.map_blocks(func, chunks=(x.chunks[0], (1,)), dtype=dt).squeeze()
+
+    if getattr(model, "feature_names_in_", None) is not None:
+        # The model recorded column names at fit time, so ``x`` is treated as a
+        # dask DataFrame and predicted per partition to keep the names attached.
+
+        meta = model.predict(x._meta_nonempty)
+        return x.map_partitions(func, meta=meta)
+    else:
+        if len(x.chunks[1]) > 1:
+            x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))
+
+        xx = np.zeros((1, x.shape[1]), dtype=x.dtype)
+        meta = model.predict(xx)
+
+        if meta.ndim > 1:
+            chunks = (x.chunks[0], (meta.shape[1],))
+            drop_axis = None
+        else:
+            chunks = (x.chunks[0],)
+            drop_axis = 1
+        return x.map_blocks(func, chunks=chunks, meta=meta, drop_axis=drop_axis)
 
 
 def _copy_partial_doc(cls):
diff --git a/dask_ml/base.py b/dask_ml/base.py
index 9043e061e..60bbe5b1e 100644
--- a/dask_ml/base.py
+++ b/dask_ml/base.py
@@ -1,5 +1,103 @@
 import sklearn.base
 
+from .utils import _check_y, check_array, check_X_y
+
+
+class DaskMLBaseMixin:
+    def _validate_data(
+        self,
+        X="no_validation",
+        y="no_validation",
+        reset=True,
+        validate_separately=False,
+        **check_params,
+    ):
+        """Validate input data and set or check the `n_features_in_` attribute.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features), default='no_validation'
+            The input samples.
+            If `'no_validation'`, no validation is performed on `X`. This is
+            useful for meta-estimators which can delegate input validation to
+            their underlying estimator(s). In that case `y` must be passed and
+            the only accepted `check_params` are `multi_output` and
+            `y_numeric`.
+
+        y : array-like of shape (n_samples,), default='no_validation'
+            The targets.
+
+            - If `None`, `check_array` is called on `X`. If the estimator's
+              requires_y tag is True, then an error will be raised.
+            - If `'no_validation'`, `check_array` is called on `X` and the
+              estimator's requires_y tag is ignored. This is a default
+              placeholder and is never meant to be explicitly set. In that case
+              `X` must be passed.
+            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
+              checked with either `check_array` or `check_X_y` depending on
+              `validate_separately`.
+
+        reset : bool, default=True
+            Whether to reset the `n_features_in_` attribute. If False, the
+            input is checked for consistency with data seen when reset was True.
+
+            .. note::
+               It is recommended to call reset=True in `fit` and in the first
+               call to `partial_fit`. All other methods that validate `X`
+               should set `reset=False`.
+        validate_separately : False or tuple of dicts, default=False
+            Only used if y is not None.
+            If False, call check_X_y(). Else, it must be a tuple of kwargs
+            to be used for calling check_array() on X and y respectively.
+        **check_params : kwargs
+            Parameters passed to :func:`dask_ml.utils.check_array` or
+            :func:`dask_ml.utils.check_X_y`. Ignored if validate_separately
+            is not False.
+
+        Returns
+        -------
+        out : {ndarray, sparse matrix} or tuple of these
+            The validated input. A tuple is returned if both `X` and `y` are
+            validated.
+        """
+        self._check_feature_names(X, reset=reset)
+
+        if y is None and self._get_tags()["requires_y"]:
+            raise ValueError(
+                f"This {self.__class__.__name__} estimator "
+                "requires y to be passed, but the target y is None."
+            )
+
+        no_val_X = isinstance(X, str) and X == "no_validation"
+        no_val_y = y is None or isinstance(y, str) and y == "no_validation"
+
+        if no_val_X and no_val_y:
+            raise ValueError("Validation should be done on X, y or both.")
+        elif not no_val_X and no_val_y:
+            X = check_array(X, **check_params)
+            out = X
+        elif no_val_X and not no_val_y:
+            y = _check_y(y, **check_params)
+            out = y
+        else:
+            if validate_separately:
+                # We need this because some estimators validate X and y
+                # separately, and in general, separately calling check_array()
+                # on X and y isn't equivalent to just calling check_X_y()
+                # :(
+                check_X_params, check_y_params = validate_separately
+                X = check_array(X, **check_X_params)
+                y = check_array(y, **check_y_params)
+            else:
+                X, y = check_X_y(X, y, **check_params)
+            out = X, y
+
+        if not no_val_X and check_params.get("ensure_2d", True):
+            self._check_n_features(X, reset=reset)
+
+        return out
+
 
 class ClassifierMixin(sklearn.base.ClassifierMixin):
     """Mixin class for all classifiers in scikit-learn."""
diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index 78c0e371b..35e59fcd9 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -8,11 +8,12 @@
 
 from .._compat import SK_024
 from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType
+from ..base import DaskMLBaseMixin
 from ..utils import check_array
 from .label import _encode, _encode_dask_array
 
 
-class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
+class OneHotEncoder(DaskMLBaseMixin, sklearn.preprocessing.OneHotEncoder):
     """Encode categorical integer features as a one-hot numeric array.
 
     .. versionadded:: 0.8.0
@@ -161,16 +162,13 @@ def _fit(
         handle_unknown: str = "error",
         force_all_finite: bool = True,
     ):
-        X = check_array(
-            X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
-        )
-        if SK_024:
-            kwargs = dict(force_all_finite=force_all_finite)
-        else:
-            kwargs = {}
+        X = self._validate_data(X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True)
+        self._check_n_features(X, reset=True)
+        self._check_feature_names(X, reset=True)
+
         if isinstance(X, np.ndarray):
             return super(OneHotEncoder, self)._fit(
-                X, handle_unknown=handle_unknown, **kwargs
+                X, handle_unknown=handle_unknown, force_all_finite=force_all_finite
             )
 
         is_array = isinstance(X, da.Array)
diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
index c26c6387d..b2edbfedd 100644
--- a/dask_ml/preprocessing/data.py
+++ b/dask_ml/preprocessing/data.py
@@ -1,5 +1,6 @@
 from __future__ import division
 
+import numbers
 import multiprocessing
 from collections import OrderedDict
 from distutils.version import LooseVersion
@@ -21,6 +22,7 @@
 from dask_ml._utils import copy_learned_attributes
 from dask_ml.utils import check_array, handle_zeros_in_scale
 
+from ..base import DaskMLBaseMixin
 from .._typing import ArrayLike, DataFrameType, NDArrayOrScalar, SeriesType
 
 _PANDAS_VERSION = LooseVersion(pd.__version__)
@@ -46,7 +48,7 @@ def _handle_zeros_in_scale(scale: NDArrayOrScalar, copy=True):
         return scale
 
 
-class StandardScaler(sklearn.preprocessing.StandardScaler):
+class StandardScaler(DaskMLBaseMixin, sklearn.preprocessing.StandardScaler):
 
     __doc__ = sklearn.preprocessing.StandardScaler.__doc__
 
@@ -56,6 +58,8 @@ def fit(
         y: Optional[Union[ArrayLike, SeriesType]] = None,
     ) -> "StandardScaler":
         self._reset()
+        X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True)
+
         attributes = OrderedDict()
         if isinstance(X, (pd.DataFrame, dd.DataFrame)):
             X = X.values
@@ -71,7 +75,7 @@ def fit(
             attributes["scale_"] = scale_
             attributes["var_"] = var_
 
-        attributes["n_samples_seen_"] = np.nan
+        attributes["n_samples_seen_"] = X.shape[0]
         values = compute(*attributes.values())
         for k, v in zip(attributes, values):
             setattr(self, k, v)
@@ -137,7 +141,7 @@ def fit(
         attributes["data_range_"] = data_range
         attributes["scale_"] = scale
         attributes["min_"] = feature_range[0] - data_min * scale
-        attributes["n_samples_seen_"] = np.nan
+        attributes["n_samples_seen_"] = X.shape[0]
 
         values = compute(*attributes.values())
         for k, v in zip(attributes, values):
@@ -1036,7 +1040,7 @@ def inverse_transform(
         return X
 
 
-class PolynomialFeatures(sklearn.preprocessing.PolynomialFeatures):
+class PolynomialFeatures(DaskMLBaseMixin, sklearn.preprocessing.PolynomialFeatures):
     """preserve_dataframe : boolean
         If True, preserve pandas and dask dataframes after transforming.
         Using False (default) returns numpy or dask arrays and mimics
@@ -1072,6 +1076,38 @@ def fit(
             interaction_only=self.interaction_only,
             include_bias=self.include_bias,
         )
+        X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True)
+
+        if isinstance(self.degree, numbers.Integral):
+            if self.degree < 0:
+                raise ValueError(
+                    f"degree must be a non-negative integer, got {self.degree}."
+                )
+            self._min_degree = 0
+            self._max_degree = self.degree
+        elif (
+            np.iterable(self.degree) and len(self.degree) == 2
+        ):
+            self._min_degree, self._max_degree = self.degree
+            if not (
+                isinstance(self._min_degree, numbers.Integral)
+                and isinstance(self._max_degree, numbers.Integral)
+                and self._min_degree >= 0
+                and self._min_degree <= self._max_degree
+            ):
+                raise ValueError(
+                    "degree=(min_degree, max_degree) must "
+                    "be non-negative integers that fulfil "
+                    "min_degree <= max_degree, got "
+                    f"{self.degree}."
+                )
+        else:
+            raise ValueError(
+                "degree must be a non-negative int or tuple "
+                "(min_degree, max_degree), got "
+                f"{self.degree}."
+            )
+
         X_sample = X
         if isinstance(X, dd.DataFrame):
             X_sample = X._meta_nonempty
@@ -1089,19 +1125,19 @@ def transform(
         y: Optional[Union[ArrayLike, SeriesType]] = None,
     ) -> Union[ArrayLike, DataFrameType]:
         if isinstance(X, da.Array):
-            n_cols = len(self._transformer.get_feature_names())
+            n_cols = len(self._transformer.get_feature_names_out())
             X = check_array(X, accept_multiple_blocks=False, accept_unknown_chunks=True)
             chunks = (X.chunks[0], n_cols)
             XP = X.map_blocks(self._transformer.transform, dtype=X.dtype, chunks=chunks)
         elif isinstance(X, pd.DataFrame):
             XP = X.pipe(self._transformer.transform)
             if self.preserve_dataframe:
-                columns = self._transformer.get_feature_names(X.columns)
+                columns = self._transformer.get_feature_names_out(X.columns)
                 XP = pd.DataFrame(data=XP, columns=columns, index=X.index)
         elif isinstance(X, dd.DataFrame):
             XP = X.map_partitions(self._transformer.transform)
             if self.preserve_dataframe:
-                columns = self._transformer.get_feature_names(X.columns)
+                columns = self._transformer.get_feature_names_out(X.columns)
                 XP = dd.from_dask_array(XP, columns, X.index)
         else:
             # typically X is instance of np.ndarray
diff --git a/dask_ml/utils.py b/dask_ml/utils.py
index fb6bec4ba..d07e7fd45 100644
--- a/dask_ml/utils.py
+++ b/dask_ml/utils.py
@@ -221,6 +221,8 @@ def _assert_eq(l, r, name=None, **kwargs):
     ):
         for a, b in zip(l, r):
             _assert_eq(a, b, **kwargs)
+    elif np.isscalar(r) and r != r:  # NaN is the only scalar unequal to itself
+        assert np.isscalar(l) and l != l, (name, l, r)
     else:
         assert l == r, (name, l, r)
 
@@ -272,6 +274,164 @@ def check_matching_blocks(*arrays):
         raise ValueError("Unexpected types {}.".format({type(x) for x in arrays}))
 
 
+
+def check_X_y(
+    X,
+    y,
+    accept_sparse=False,
+    *,
+    accept_large_sparse=True,
+    dtype="numeric",
+    order=None,
+    copy=False,
+    force_all_finite=True,
+    ensure_2d=True,
+    allow_nd=False,
+    multi_output=False,
+    ensure_min_samples=1,
+    ensure_min_features=1,
+    y_numeric=False,
+    estimator=None,
+):
+    """Input validation for standard estimators.
+
+    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
+    default, X is checked to be non-empty and containing only finite values.
+    Standard input checks are also applied to y, such as checking that y
+    does not have np.nan or np.inf targets. For multi-label y, set
+    multi_output=True to allow 2D and sparse y. If the dtype of X is
+    object, attempt converting to float, raising on failure.
+
+    Parameters
+    ----------
+    X : {ndarray, list, sparse matrix}
+        Input data.
+
+    y : {ndarray, list, sparse matrix}
+        Labels.
+
+    accept_sparse : str, bool or list of str, default=False
+        String[s] representing allowed sparse matrix formats, such as 'csc',
+        'csr', etc. If the input is sparse but not in the allowed format,
+        it will be converted to the first listed format. True allows the input
+        to be any format. False means that a sparse matrix input will
+        raise an error.
+
+    accept_large_sparse : bool, default=True
+        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
+        accept_sparse, accept_large_sparse=False will cause it to be accepted
+        only if its indices are stored with a 32-bit dtype.
+
+        .. versionadded:: 0.20
+
+    dtype : 'numeric', type, list of type or None, default='numeric'
+        Data type of result. If None, the dtype of the input is preserved.
+        If "numeric", dtype is preserved unless array.dtype is object.
+        If dtype is a list of types, conversion on the first type is only
+        performed if the dtype of the input is not in the list.
+
+    order : {'F', 'C'}, default=None
+        Whether an array will be forced to be Fortran or C-style.
+
+    copy : bool, default=False
+        Whether a forced copy will be triggered. If copy=False, a copy might
+        be triggered by a conversion.
+
+    force_all_finite : bool or 'allow-nan', default=True
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
+        does not influence whether y can have np.inf, np.nan, pd.NA values.
+        The possibilities are:
+
+        - True: Force all values of X to be finite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
+          be infinite.
+
+        .. versionadded:: 0.20
+           ``force_all_finite`` accepts the string ``'allow-nan'``.
+
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
+    ensure_2d : bool, default=True
+        Whether to raise a value error if X is not 2D.
+
+    allow_nd : bool, default=False
+        Whether to allow X.ndim > 2.
+
+    multi_output : bool, default=False
+        Whether to allow 2D y (array or sparse matrix). If false, y will be
+        validated as a vector. y cannot have np.nan or np.inf values if
+        multi_output=True.
+
+    ensure_min_samples : int, default=1
+        Make sure that X has a minimum number of samples in its first
+        axis (rows for a 2D array).
+
+    ensure_min_features : int, default=1
+        Make sure that the 2D array has some minimum number of features
+        (columns). The default value of 1 rejects empty datasets.
+        This check is only enforced when X has effectively 2 dimensions or
+        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
+        this check.
+
+    y_numeric : bool, default=False
+        Whether to ensure that y has a numeric type. If dtype of y is object,
+        it is converted to float64. Should only be used for regression
+        algorithms.
+
+    estimator : str or estimator instance, default=None
+        If passed, include the name of the estimator in warning messages.
+
+    Returns
+    -------
+    X_converted : object
+        The converted and validated X.
+
+    y_converted : object
+        The converted and validated y.
+    """
+    if y is None:
+        raise ValueError("y cannot be None")
+
+    X = check_array(
+        X,
+        accept_sparse=accept_sparse,
+        accept_large_sparse=accept_large_sparse,
+        dtype=dtype,
+        order=order,
+        copy=copy,
+        force_all_finite=force_all_finite,
+        ensure_2d=ensure_2d,
+        allow_nd=allow_nd,
+        ensure_min_samples=ensure_min_samples,
+        ensure_min_features=ensure_min_features,
+        estimator=estimator,
+    )
+
+    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
+
+    check_consistent_length(X, y)
+
+    return X, y
+
+
+def _check_y(y, multi_output=False, y_numeric=False):
+    """Isolated part of check_X_y dedicated to y validation"""
+    from sklearn.utils import validation as skv  # scikit-learn owns the 1-D helpers
+    if multi_output:
+        y = check_array(
+            y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
+        )
+    else:
+        y = skv.column_or_1d(y, warn=True)
+        skv._assert_all_finite(y)
+        skv._ensure_no_complex_data(y)
+    if y_numeric and y.dtype.kind == "O":
+        y = y.astype(np.float64)
+    return y
+
+
 def check_chunks(n_samples, n_features, chunks=None):
     """Validate and normalize the chunks argument for a dask.array
 
@@ -398,6 +558,51 @@ def _num_samples(X):
     return result
 
 
+
+def _get_feature_names(X):
+    """Get feature names from X.
+    Support for other array containers should place its implementation here.
+    Parameters
+    ----------
+    X : {ndarray, dataframe} of shape (n_samples, n_features)
+        Array container to extract feature names.
+        - pandas dataframe : The columns will be considered to be feature
+          names. If the dataframe contains non-string feature names, `None` is
+          returned.
+        - All other array containers will return `None`.
+    Returns
+    -------
+    names : ndarray or None
+        Feature names of `X`. Unrecognized array containers will return `None`.
+    """
+    feature_names = None
+
+    # extract feature names for supported array containers
+    if hasattr(X, "columns"):
+        feature_names = np.asarray(X.columns, dtype=object)
+
+    if feature_names is None or len(feature_names) == 0:
+        return
+
+    types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))
+
+    # Warn when types are mixed or are neither int nor str.
+    # All-int names and all-str names do not warn.
+    if len(types) > 1 or not (types[0].startswith("int") or types[0] == "str"):
+        # TODO: Convert to an error in 1.2
+        warnings.warn(
+            "Feature names only support names that are all strings. "
+            f"Got feature names with dtypes: {types}. An error will be raised "
+            "in 1.2.",
+            FutureWarning,
+        )
+        return
+
+    # Only all-string names are returned; all-int names fall through to None
+    if types[0] == "str":
+        return feature_names
+
+
 __all__ = [
     "assert_estimator_equal",
     "check_array",

From 23df9dd0190ae7e4dd51c8f801ab1b3bd76a6ef4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:10:39 -0500
Subject: [PATCH 02/10] wip - compat

---
 .github/workflows/lint.yaml                   | 10 +++++
 .github/workflows/tests.yaml                  | 44 +++++++++++++++++++
 ci/code_checks.sh                             | 18 --------
 ci/environment-3.6.yaml                       | 32 --------------
 ci/environment-3.8.yaml                       | 23 +++-------
 ci/environment-3.9.yaml                       | 24 ++++++++++
 ci/install.sh                                 |  4 ++
 .../dask_searchcv/test_model_selection.py     |  4 +-
 tests/preprocessing/test_data.py              | 28 +++++-------
 9 files changed, 102 insertions(+), 85 deletions(-)
 create mode 100644 .github/workflows/lint.yaml
 create mode 100644 .github/workflows/tests.yaml
 delete mode 100755 ci/code_checks.sh
 delete mode 100644 ci/environment-3.6.yaml
 create mode 100644 ci/environment-3.9.yaml
 create mode 100644 ci/install.sh

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
new file mode 100644
index 000000000..b69c01737
--- /dev/null
+++ b/.github/workflows/lint.yaml
@@ -0,0 +1,10 @@
+name: Linting
+on: [pull_request, push]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+      - uses: pre-commit/action@v2.0.0
\ No newline at end of file
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
new file mode 100644
index 000000000..0302957d5
--- /dev/null
+++ b/.github/workflows/tests.yaml
@@ -0,0 +1,44 @@
+name: Tests
+
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # os: ["windows-latest", "ubuntu-latest", "macos-latest"]
+        os: ["ubuntu-latest"]
+        python-version: ["3.8"]
+
+    env:
+      PYTHON_VERSION: ${{ matrix.python-version }}
+      PARALLEL: "true"
+      COVERAGE: "true"
+
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # Needed by codecov.io
+
+      - name: Setup Conda Environment
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          channel-priority: strict
+          python-version: ${{ matrix.python-version }}
+          environment-file: ci/environment-${{ matrix.python-version }}.yaml
+          activate-environment: test-environment
+          auto-activate-base: false
+
+      - name: Install
+        shell: bash -l {0}
+        run: source ci/install.sh
+
+      - name: Run tests
+        shell: bash -l {0}
+        run: pytest -v
\ No newline at end of file
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
deleted file mode 100755
index dbef47de7..000000000
--- a/ci/code_checks.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-source activate dask-ml-test
-MSG='Checking flake8... ' ; echo $MSG
-flake8
-RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-MSG='Checking black... ' ; echo $MSG
-black --version
-black --check .
-RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-MSG='Checking isort... ' ; echo $MSG
-isort --version-number
-isort --recursive --check-only .
-RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-exit $RET
diff --git a/ci/environment-3.6.yaml b/ci/environment-3.6.yaml
deleted file mode 100644
index 57d46f934..000000000
--- a/ci/environment-3.6.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: dask-ml-test
-channels:
-  - conda-forge
-  - defaults
-dependencies:
-  - black==19.10b0
-  - coverage
-  - dask ==2.4.0
-  - dask-glm >=0.2.0
-  - distributed ==2.4.0
-  - flake8
-  - isort==4.3.21
-  - msgpack-python ==0.6.2
-  - multipledispatch
-  - mypy
-  - numba
-  - numpy ==1.17.3
-  - numpydoc
-  - packaging
-  - pandas =0.24.2
-  - psutil
-  - pytest
-  - pytest-cov
-  - pytest-mock
-  - python=3.6.*
-  - scikit-learn =0.23.*
-  - scipy
-  - sparse
-  - toolz
-  - pip
-  - pip:
-    - pytest-azurepipelines
diff --git a/ci/environment-3.8.yaml b/ci/environment-3.8.yaml
index c48399c7d..09cb79296 100644
--- a/ci/environment-3.8.yaml
+++ b/ci/environment-3.8.yaml
@@ -1,23 +1,15 @@
-name: dask-ml-test
+name: dask-ml-3.8
 channels:
   - conda-forge
   - defaults
 dependencies:
-  - black==19.10b0
-  - coverage
-  - codecov
-  # dask 2021.3.0 introduced a regression which causes tests to fail.
-  # The issue has been resolved upstream in dask and will be included
-  # in the next release. We temporarily apply a dask version contraint
-  # to allow CI to pass
-  - dask !=2021.3.0
-  - dask-glm >=0.2.0
-  - flake8
+  - dask
+  - dask-glm
   - isort==4.3.21
   - multipledispatch >=0.4.9
   - mypy
   - numba
-  - numpy >=1.16.3
+  - numpy
   - numpydoc
   - packaging
   - pandas
@@ -26,10 +18,7 @@ dependencies:
   - pytest-cov
   - pytest-mock
   - python=3.8.*
-  - scikit-learn>=0.23.0
+  - scikit-learn>=1.0.0
   - scipy
   - sparse
-  - toolz
-  - pip
-  - pip:
-    - pytest-azurepipelines
+  - toolz
\ No newline at end of file
diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml
new file mode 100644
index 000000000..f47074472
--- /dev/null
+++ b/ci/environment-3.9.yaml
@@ -0,0 +1,24 @@
+name: dask-ml-test
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - dask
+  - dask-glm
+  - isort==4.3.21
+  - multipledispatch >=0.4.9
+  - mypy
+  - numba
+  - numpy
+  - numpydoc
+  - packaging
+  - pandas
+  - psutil
+  - pytest
+  - pytest-cov
+  - pytest-mock
+  - python=3.9.*
+  - scikit-learn>=1.0.0
+  - scipy
+  - sparse
+  - toolz
\ No newline at end of file
diff --git a/ci/install.sh b/ci/install.sh
new file mode 100644
index 000000000..8ffd7a7ac
--- /dev/null
+++ b/ci/install.sh
@@ -0,0 +1,4 @@
+python -m pip install --quiet --no-deps -e .
+
+echo mamba list
+mamba list
\ No newline at end of file
diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py
index e0a203932..53fc49299 100644
--- a/tests/model_selection/dask_searchcv/test_model_selection.py
+++ b/tests/model_selection/dask_searchcv/test_model_selection.py
@@ -43,7 +43,7 @@
 from sklearn.svm import SVC
 
 import dask_ml.model_selection as dcv
-from dask_ml._compat import DISTRIBUTED_2_11_0, SK_0_23_2
+from dask_ml._compat import DISTRIBUTED_2_11_0
 from dask_ml.model_selection import check_cv, compute_n_splits
 from dask_ml.model_selection._search import _normalize_n_jobs
 from dask_ml.model_selection.methods import CVCache
@@ -488,7 +488,7 @@ def check_scores_all_nan(gs, bad_param, score_key="score"):
     )
 
 
-@pytest.mark.xfail(SK_0_23_2, reason="https://github.com/dask/dask-ml/issues/672")
+@pytest.mark.xfail(reason="https://github.com/dask/dask-ml/issues/672")
 @pytest.mark.parametrize(
     "weights", [None, (None, {"tr0": 2, "tr2": 3}, {"tr0": 2, "tr2": 4})]
 )
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index beb4767fd..f1bbb7169 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -66,7 +66,7 @@ def test_input_types(self, dask_df, pandas_df):
         b = spp.StandardScaler()
 
         assert_estimator_equal(
-            a.fit(dask_df.values), a.fit(dask_df), exclude="n_samples_seen_"
+            a.fit(dask_df.values), a.fit(dask_df),
         )
 
         assert_estimator_equal(
@@ -108,7 +108,7 @@ def test_basic(self):
 
         a.fit(X)
         b.fit(X.compute())
-        assert_estimator_equal(a, b, exclude="n_samples_seen_")
+        assert_estimator_equal(a, b)
 
     def test_inverse_transform(self):
         a = dpp.MinMaxScaler()
@@ -511,17 +511,19 @@ def test_basic(self):
 
         a.fit(X)
         b.fit(X.compute())
-        assert_estimator_equal(a._transformer, b)
+        assert_estimator_equal(a._transformer, b, exclude={"n_input_features_"})
 
     def test_input_types(self):
         a = dpp.PolynomialFeatures()
         b = spp.PolynomialFeatures()
 
-        assert_estimator_equal(a.fit(df), a.fit(df.compute()))
-        assert_estimator_equal(a.fit(df), a.fit(df.compute().values))
-        assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values))
-        assert_estimator_equal(a.fit(df), b.fit(df.compute()))
-        assert_estimator_equal(a.fit(df), b.fit(df.compute().values))
+        exclude = {"n_input_features_"}
+
+        assert_estimator_equal(a.fit(df), a.fit(df.compute()), exclude=exclude)
+        assert_estimator_equal(a.fit(df), a.fit(df.compute().values), exclude=exclude)
+        assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values), exclude=exclude)
+        assert_estimator_equal(a.fit(df), b.fit(df.compute()), exclude=exclude)
+        assert_estimator_equal(a.fit(df), b.fit(df.compute().values), exclude=exclude)
 
     def test_array_transform(self):
         a = dpp.PolynomialFeatures()
@@ -529,7 +531,7 @@ def test_array_transform(self):
 
         res_a = a.fit_transform(X)
         res_b = b.fit_transform(X.compute())
-        assert_estimator_equal(a, b)
+        assert_estimator_equal(a, b, exclude={"n_input_features_"})
         assert dask.is_dask_collection(res_a)
         assert_eq_ar(res_a, res_b)
 
@@ -549,21 +551,15 @@ def test_transformed_shape(self):
         # checks if the transformed objects have the correct columns
         a = dpp.PolynomialFeatures()
         a.fit(X)
-        n_cols = len(a.get_feature_names())
+        n_cols = len(a.get_feature_names_out())
         # dask array
         assert a.transform(X).shape[1] == n_cols
         # numpy array
         assert a.transform(X.compute()).shape[1] == n_cols
-        # dask dataframe
-        assert a.transform(df).shape[1] == n_cols
-        # pandas dataframe
-        assert a.transform(df.compute()).shape[1] == n_cols
         X_nan_rows = df.values
         df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
         # dask array with nan rows
         assert a.transform(X_nan_rows).shape[1] == n_cols
-        # dask data frame with nan rows
-        assert a.transform(df_none_divisions).shape[1] == n_cols
 
     @pytest.mark.parametrize("daskify", [False, True])
     def test_df_transform(self, daskify):

From f4f4d26c3fb05c4eacbf0f4c3ab820f3f6f251c6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:16:18 -0500
Subject: [PATCH 03/10] rm azure-pipelines

---
 azure-pipelines.yml | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 azure-pipelines.yml

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index d04fb7eb8..000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-trigger:
-- main
-
-jobs:
-- template: ci/posix.yaml
-  parameters:
-    name: 'linux'
-    vmImage: 'ubuntu-16.04'
-- template: ci/windows.yaml
-  parameters:
-    name: 'win64'
-    vmImage: 'vs2017-win2016'

From 440f3ad445b39507f33a6db8675d7ee07c3586bc Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:16:31 -0500
Subject: [PATCH 04/10] remove compat

---
 dask_ml/_compat.py                 | 12 ++++--------
 dask_ml/cluster/_compat.py         | 12 ++----------
 dask_ml/cluster/k_means.py         | 11 +++--------
 dask_ml/model_selection/_search.py | 11 +++--------
 dask_ml/preprocessing/_encoders.py |  1 -
 tests/metrics/test_regression.py   |  4 +---
 tests/test_kmeans.py               | 11 ++---------
 7 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py
index c99d625be..4539a2051 100644
--- a/dask_ml/_compat.py
+++ b/dask_ml/_compat.py
@@ -16,8 +16,6 @@
 PANDAS_VERSION = packaging.version.parse(pandas.__version__)
 DISTRIBUTED_VERSION = packaging.version.parse(distributed.__version__)
 
-SK_0_23_2 = SK_VERSION >= packaging.version.parse("0.23.2")
-SK_024 = SK_VERSION >= packaging.version.parse("0.24.0.dev0")
 DASK_240 = DASK_VERSION >= packaging.version.parse("2.4.0")
 DASK_2130 = DASK_VERSION >= packaging.version.parse("2.13.0")
 DASK_2_20_0 = DASK_VERSION >= packaging.version.parse("2.20.0")
@@ -49,9 +47,7 @@ def _check_multimetric_scoring(estimator, scoring=None):
     from sklearn.metrics._scorer import _check_multimetric_scoring
     from sklearn.metrics import check_scoring
 
-    if SK_024:
-        if callable(scoring) or isinstance(scoring, (type(None), str)):
-            scorers = {"score": check_scoring(estimator, scoring=scoring)}
-            return scorers, False
-        return _check_multimetric_scoring(estimator, scoring), True
-    return _check_multimetric_scoring(estimator, scoring)
+    if callable(scoring) or isinstance(scoring, (type(None), str)):
+        scorers = {"score": check_scoring(estimator, scoring=scoring)}
+        return scorers, False
+    return _check_multimetric_scoring(estimator, scoring), True
\ No newline at end of file
diff --git a/dask_ml/cluster/_compat.py b/dask_ml/cluster/_compat.py
index 4bbf684d7..7fee9f96a 100644
--- a/dask_ml/cluster/_compat.py
+++ b/dask_ml/cluster/_compat.py
@@ -1,11 +1,3 @@
-from .._compat import SK_024
+from sklearn.cluster._kmeans import _kmeans_plusplus  # noqa
 
-if SK_024:
-    from sklearn.cluster._kmeans import _kmeans_plusplus  # noqa
-
-    __all__ = ["_kmeans_plusplus"]
-
-else:
-    from sklearn.cluster._kmeans import _k_init
-
-    __all__ = ["_k_init"]
+__all__ = ["_kmeans_plusplus"]
\ No newline at end of file
diff --git a/dask_ml/cluster/k_means.py b/dask_ml/cluster/k_means.py
index 255516437..99399df31 100644
--- a/dask_ml/cluster/k_means.py
+++ b/dask_ml/cluster/k_means.py
@@ -12,7 +12,7 @@
 from sklearn.utils.extmath import squared_norm
 from sklearn.utils.validation import check_is_fitted
 
-from .._compat import SK_024, blockwise
+from .._compat import blockwise
 from .._utils import draw_seed
 from ..metrics import (
     euclidean_distances,
@@ -21,11 +21,7 @@
 )
 from ..utils import _timed, _timer, check_array, row_norms
 
-if SK_024:
-    from ._compat import _kmeans_plusplus
-else:
-    from ._compat import _k_init as _kmeans_plusplus
-
+from ._compat import _kmeans_plusplus
 import numba  # isort:skip (see https://github.com/dask/dask-ml/pull/577)
 
 
@@ -394,8 +390,7 @@ def init_pp(X, n_clusters, random_state):
             random_state=random_state,
             x_squared_norms=x_squared_norms,
         )
-        if SK_024:
-            centers, _ = centers
+        centers, _ = centers
 
     return centers
 
diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py
index afa0261d0..d6325afac 100644
--- a/dask_ml/model_selection/_search.py
+++ b/dask_ml/model_selection/_search.py
@@ -35,7 +35,7 @@
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import _num_samples, check_is_fitted
 
-from .._compat import SK_024, SK_VERSION
+from .._compat import SK_VERSION
 from ._normalize import normalize_estimator
 from .methods import (
     MISSING,
@@ -54,9 +54,7 @@
     score,
 )
 from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip
-
-if SK_024:
-    from sklearn.base import _is_pairwise
+from sklearn.base import _is_pairwise
 
 logger = logging.getLogger(__name__)
 
@@ -200,10 +198,7 @@ def build_cv_graph(
     X, y, groups = to_indexable(X, y, groups)
     cv = check_cv(cv, y, is_classifier(estimator))
     # "pairwise" estimators require a different graph for CV splitting
-    if SK_024:
-        is_pairwise = _is_pairwise(estimator)
-    else:
-        is_pairwise = getattr(estimator, "_pairwise", False)
+    is_pairwise = _is_pairwise(estimator)
 
     dsk = {}
     X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index 35e59fcd9..29602f283 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import sklearn.preprocessing
 
-from .._compat import SK_024
 from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType
 from ..base import DaskMLBaseMixin
 from ..utils import check_array
diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py
index 17f21ee8d..98dca26a8 100644
--- a/tests/metrics/test_regression.py
+++ b/tests/metrics/test_regression.py
@@ -7,7 +7,6 @@
 from dask.array.utils import assert_eq
 
 import dask_ml.metrics
-from dask_ml._compat import SK_024
 
 _METRICS_TO_TEST = [
     "mean_squared_error",
@@ -17,8 +16,7 @@
 ]
 
 # mean_absolute_percentage_error() was added in scikit-learn 0.24.0
-if SK_024:
-    _METRICS_TO_TEST.append("mean_absolute_percentage_error")
+_METRICS_TO_TEST.append("mean_absolute_percentage_error")
 
 
 @pytest.fixture(params=_METRICS_TO_TEST)
diff --git a/tests/test_kmeans.py b/tests/test_kmeans.py
index 78fec594d..706b50ea0 100644
--- a/tests/test_kmeans.py
+++ b/tests/test_kmeans.py
@@ -16,14 +16,10 @@
 from sklearn.utils.estimator_checks import check_estimator
 
 import dask_ml.cluster
-from dask_ml._compat import SK_024
 from dask_ml.cluster import KMeans as DKKMeans, k_means
 from dask_ml.utils import assert_estimator_equal, row_norms
 
-if SK_024:
-    from dask_ml.cluster._compat import _kmeans_plusplus
-else:
-    from dask_ml.cluster._compat import _k_init as _kmeans_plusplus
+from dask_ml.cluster._compat import _kmeans_plusplus
 
 
 def test_check_estimator():
@@ -98,10 +94,7 @@ def test_fit_given_init(self):
         X_ = X.compute()
         x_squared_norms = sklearn.utils.extmath.row_norms(X_, squared=True)
         rs = np.random.RandomState(0)
-        if SK_024:
-            init, _ = _kmeans_plusplus(X_, 3, x_squared_norms, rs)
-        else:
-            init = _kmeans_plusplus(X_, 3, x_squared_norms, rs)
+        init, _ = _kmeans_plusplus(X_, 3, x_squared_norms, rs)
         dkkm = DKKMeans(3, init=init, random_state=0)
         skkm = SKKMeans(3, init=init, random_state=0, n_init=1)
         dkkm.fit(X)

From 637779ccb5e863afe075f6d8550febd6c04412fc Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:20:16 -0500
Subject: [PATCH 05/10] lint

---
 dask_ml/_compat.py                 |  2 +-
 dask_ml/_partial.py                |  3 ---
 dask_ml/cluster/_compat.py         |  2 +-
 dask_ml/cluster/k_means.py         |  2 +-
 dask_ml/model_selection/_search.py |  9 +++++++--
 dask_ml/preprocessing/_encoders.py |  4 +++-
 dask_ml/preprocessing/data.py      | 23 +++++++++++++++++----
 dask_ml/utils.py                   | 32 +++++++++++++++++-------------
 tests/preprocessing/test_data.py   |  5 +++--
 tests/test_kmeans.py               |  3 +--
 10 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py
index 4539a2051..8e65d9d91 100644
--- a/dask_ml/_compat.py
+++ b/dask_ml/_compat.py
@@ -50,4 +50,4 @@ def _check_multimetric_scoring(estimator, scoring=None):
     if callable(scoring) or isinstance(scoring, (type(None), str)):
         scorers = {"score": check_scoring(estimator, scoring=scoring)}
         return scorers, False
-    return _check_multimetric_scoring(estimator, scoring), True
\ No newline at end of file
+    return _check_multimetric_scoring(estimator, scoring), True
diff --git a/dask_ml/_partial.py b/dask_ml/_partial.py
index 102b9f757..0bff82b53 100644
--- a/dask_ml/_partial.py
+++ b/dask_ml/_partial.py
@@ -176,9 +176,6 @@ def predict(model, x):
     func = partial(_predict, model)
 
     if getattr(model, "feature_names_in_", None) is not None:
-        import pandas as pd
-        import dask.dataframe as dd
-
         meta = model.predict(x._meta_nonempty)
         return x.map_partitions(func, meta=meta)
     else:
diff --git a/dask_ml/cluster/_compat.py b/dask_ml/cluster/_compat.py
index 7fee9f96a..bce639c62 100644
--- a/dask_ml/cluster/_compat.py
+++ b/dask_ml/cluster/_compat.py
@@ -1,3 +1,3 @@
 from sklearn.cluster._kmeans import _kmeans_plusplus  # noqa
 
-__all__ = ["_kmeans_plusplus"]
\ No newline at end of file
+__all__ = ["_kmeans_plusplus"]
diff --git a/dask_ml/cluster/k_means.py b/dask_ml/cluster/k_means.py
index 99399df31..4b02c2fa4 100644
--- a/dask_ml/cluster/k_means.py
+++ b/dask_ml/cluster/k_means.py
@@ -20,8 +20,8 @@
     pairwise_distances_argmin_min,
 )
 from ..utils import _timed, _timer, check_array, row_norms
-
 from ._compat import _kmeans_plusplus
+
 import numba  # isort:skip (see https://github.com/dask/dask-ml/pull/577)
 
 
diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py
index d6325afac..1113defa1 100644
--- a/dask_ml/model_selection/_search.py
+++ b/dask_ml/model_selection/_search.py
@@ -15,7 +15,13 @@
 from dask.distributed import as_completed
 from dask.utils import derived_from
 from sklearn import model_selection
-from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier
+from sklearn.base import (
+    BaseEstimator,
+    MetaEstimatorMixin,
+    _is_pairwise,
+    clone,
+    is_classifier,
+)
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection._search import BaseSearchCV, _check_param_grid
 from sklearn.model_selection._split import (
@@ -54,7 +60,6 @@
     score,
 )
 from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip
-from sklearn.base import _is_pairwise
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index 29602f283..17e5c0954 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -161,7 +161,9 @@ def _fit(
         handle_unknown: str = "error",
         force_all_finite: bool = True,
     ):
-        X = self._validate_data(X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True)
+        X = self._validate_data(
+            X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
+        )
         self._check_n_features(X, reset=True)
         self._check_feature_names(X, reset=True)
 
diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
index b2edbfedd..bf609a0fe 100644
--- a/dask_ml/preprocessing/data.py
+++ b/dask_ml/preprocessing/data.py
@@ -1,7 +1,8 @@
 from __future__ import division
 
-import numbers
+import collections
 import multiprocessing
+import numbers
 from collections import OrderedDict
 from distutils.version import LooseVersion
 from typing import Any, List, Optional, Sequence, Union
@@ -22,8 +23,8 @@
 from dask_ml._utils import copy_learned_attributes
 from dask_ml.utils import check_array, handle_zeros_in_scale
 
-from ..base import DaskMLBaseMixin
 from .._typing import ArrayLike, DataFrameType, NDArrayOrScalar, SeriesType
+from ..base import DaskMLBaseMixin
 
 _PANDAS_VERSION = LooseVersion(pd.__version__)
 _HAS_CTD = _PANDAS_VERSION >= "0.21.0"
@@ -58,7 +59,14 @@ def fit(
         y: Optional[Union[ArrayLike, SeriesType]] = None,
     ) -> "StandardScaler":
         self._reset()
-        X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True)
+        X = self._validate_data(
+            X,
+            estimator=self,
+            accept_dask_array=True,
+            accept_dask_dataframe=True,
+            accept_unknown_chunks=True,
+            preserve_pandas_dataframe=True,
+        )
 
         attributes = OrderedDict()
         if isinstance(X, (pd.DataFrame, dd.DataFrame)):
@@ -1076,7 +1084,14 @@ def fit(
             interaction_only=self.interaction_only,
             include_bias=self.include_bias,
         )
-        X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True)
+        X = self._validate_data(
+            X,
+            estimator=self,
+            accept_dask_array=True,
+            accept_dask_dataframe=True,
+            accept_unknown_chunks=True,
+            preserve_pandas_dataframe=True,
+        )
 
         if isinstance(self.degree, numbers.Integral):
             if self.degree < 0:
diff --git a/dask_ml/utils.py b/dask_ml/utils.py
index d07e7fd45..abeaa58a0 100644
--- a/dask_ml/utils.py
+++ b/dask_ml/utils.py
@@ -2,6 +2,7 @@
 import datetime
 import functools
 import logging
+import warnings
 from collections.abc import Sequence
 from multiprocessing import cpu_count
 from numbers import Integral
@@ -124,7 +125,7 @@ def check_array(
     accept_multiple_blocks=False,
     preserve_pandas_dataframe=False,
     remove_zero_chunks=True,
-    **kwargs
+    **kwargs,
 ):
     """Validate inputs
 
@@ -274,7 +275,6 @@ def check_matching_blocks(*arrays):
         raise ValueError("Unexpected types {}.".format({type(x) for x in arrays}))
 
 
-
 def check_X_y(
     X,
     y,
@@ -418,20 +418,25 @@ def check_X_y(
 
 def _check_y(y, multi_output=False, y_numeric=False):
     """Isolated part of check_X_y dedicated to y validation"""
-    if multi_output:
-        y = check_array(
-            y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
-        )
-    else:
-        y = column_or_1d(y, warn=True)
-        _assert_all_finite(y)
-        _ensure_no_complex_data(y)
-    if y_numeric and y.dtype.kind == "O":
-        y = y.astype(np.float64)
-
+    # TODO: implement; checks below are disabled, so y is returned unvalidated.
+    # if multi_output:
+    #     y = check_array(
+    #         y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
+    #     )
+    # else:
+    #     y = column_or_1d(y, warn=True)
+    #     _assert_all_finite(y)
+    #     _ensure_no_complex_data(y)
+    # if y_numeric and y.dtype.kind == "O":
+    #     y = y.astype(np.float64)
     return y
 
 
+def check_consistent_length(*arrays):
+    # TODO: no-op for now; should check divisions, chunks, etc.
+    pass
+
+
 def check_chunks(n_samples, n_features, chunks=None):
     """Validate and normalize the chunks argument for a dask.array
 
@@ -558,7 +563,6 @@ def _num_samples(X):
     return result
 
 
-
 def _get_feature_names(X):
     """Get feature names from X.
     Support for other array containers should place its implementation here.
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index f1bbb7169..7851f920c 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -521,7 +521,9 @@ def test_input_types(self):
 
         assert_estimator_equal(a.fit(df), a.fit(df.compute()), exclude=exclude)
         assert_estimator_equal(a.fit(df), a.fit(df.compute().values), exclude=exclude)
-        assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values), exclude=exclude)
+        assert_estimator_equal(
+            a.fit(df.values), a.fit(df.compute().values), exclude=exclude
+        )
         assert_estimator_equal(a.fit(df), b.fit(df.compute()), exclude=exclude)
         assert_estimator_equal(a.fit(df), b.fit(df.compute().values), exclude=exclude)
 
@@ -557,7 +559,6 @@ def test_transformed_shape(self):
         # numpy array
         assert a.transform(X.compute()).shape[1] == n_cols
         X_nan_rows = df.values
-        df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
         # dask array with nan rows
         assert a.transform(X_nan_rows).shape[1] == n_cols
 
diff --git a/tests/test_kmeans.py b/tests/test_kmeans.py
index 706b50ea0..94a147150 100644
--- a/tests/test_kmeans.py
+++ b/tests/test_kmeans.py
@@ -17,9 +17,8 @@
 
 import dask_ml.cluster
 from dask_ml.cluster import KMeans as DKKMeans, k_means
-from dask_ml.utils import assert_estimator_equal, row_norms
-
 from dask_ml.cluster._compat import _kmeans_plusplus
+from dask_ml.utils import assert_estimator_equal, row_norms
 
 
 def test_check_estimator():

From f9c224b63d11612a36ed6ea16b1e397419dad8ff Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:23:20 -0500
Subject: [PATCH 06/10] lint

---
 dask_ml/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_ml/base.py b/dask_ml/base.py
index 60bbe5b1e..2997c39d1 100644
--- a/dask_ml/base.py
+++ b/dask_ml/base.py
@@ -1,6 +1,6 @@
 import sklearn.base
 
-from .utils import check_array
+from .utils import _check_y, check_array, check_X_y
 
 
 class DaskMLBaseMixin:

From 792311b9f1667b962dfc47d1416184c05dd36649 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:27:24 -0500
Subject: [PATCH 07/10] matrix

---
 .github/workflows/tests.yaml |  2 +-
 ci/environment-3.7.yaml      | 23 ++++++-----------------
 ci/environment-3.9.yaml      |  4 ++--
 setup.py                     |  5 +++--
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 0302957d5..8b12ca131 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -10,7 +10,7 @@ jobs:
       matrix:
         # os: ["windows-latest", "ubuntu-latest", "macos-latest"]
         os: ["ubuntu-latest"]
-        python-version: ["3.8"]
+        python-version: ["3.7", "3.8", "3.9"]
 
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
diff --git a/ci/environment-3.7.yaml b/ci/environment-3.7.yaml
index 2ab6922ac..89d615345 100644
--- a/ci/environment-3.7.yaml
+++ b/ci/environment-3.7.yaml
@@ -1,23 +1,15 @@
-name: dask-ml-test
+name: dask-ml-3.7
 channels:
   - conda-forge
   - defaults
 dependencies:
-  - black==19.10b0
-  - coverage
-  - codecov
-  # dask 2021.3.0 introduced a regression which causes tests to fail.
-  # The issue has been resolved upstream in dask and will be included
-  # in the next release. We temporarily apply a dask version contraint
-  # to allow CI to pass
-  - dask !=2021.3.0
-  - dask-glm >=0.2.0
-  - flake8
+  - dask
+  - dask-glm
   - isort==4.3.21
   - multipledispatch >=0.4.9
   - mypy
   - numba
-  - numpy >=1.16.3
+  - numpy
   - numpydoc
   - packaging
   - pandas
@@ -26,10 +18,7 @@ dependencies:
   - pytest-cov
   - pytest-mock
   - python=3.7.*
-  - scikit-learn>=0.23.0
+  - scikit-learn>=1.0.0
   - scipy
   - sparse
-  - toolz
-  - pip
-  - pip:
-    - pytest-azurepipelines
+  - toolz
\ No newline at end of file
diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml
index f47074472..1be11d42e 100644
--- a/ci/environment-3.9.yaml
+++ b/ci/environment-3.9.yaml
@@ -1,4 +1,4 @@
-name: dask-ml-test
+name: dask-ml-3.9
 channels:
   - conda-forge
   - defaults
@@ -17,7 +17,7 @@ dependencies:
   - pytest
   - pytest-cov
   - pytest-mock
-  - python=3.8.*
+  - python=3.9.*
   - scikit-learn>=1.0.0
   - scipy
   - sparse
diff --git a/setup.py b/setup.py
index 8f1163561..036f76698 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     "numba>=0.51.0",
     "numpy>=1.17.3",
     "pandas>=0.24.2",
-    "scikit-learn>=0.23",
+    "scikit-learn>=1.0.0",
     "scipy",
     "dask-glm>=0.2.0",
     "multipledispatch>=0.4.9",
@@ -60,9 +60,10 @@
         "Topic :: Database",
         "Topic :: Scientific/Engineering",
         "License :: OSI Approved :: BSD License",
-        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
     ],
     packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]),
     use_scm_version=True,

From 1589d0efd09f0b01715f4accdf60fc7815c46803 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:35:56 -0500
Subject: [PATCH 08/10] Update doc env

---
 ci/environment-docs.yaml | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/ci/environment-docs.yaml b/ci/environment-docs.yaml
index 4efabafa6..cc912ba30 100644
--- a/ci/environment-docs.yaml
+++ b/ci/environment-docs.yaml
@@ -5,12 +5,10 @@ channels:
 dependencies:
   - black
   - coverage
-  - flake8
   - graphviz
   - heapdict
   - ipykernel
   - ipython
-  - isort==4.3.21
   - multipledispatch
   - mypy
   - nbsphinx
@@ -21,18 +19,14 @@ dependencies:
   - numpydoc
   - pandas
   - psutil
-  - pytest
-  - pytest-cov
-  - pytest-mock
-  - python=3.7
+  - python=3.8
   - sortedcontainers
-  - scikit-learn>=0.23.1
+  - scikit-learn>=1.0.0
   - scipy
   - sparse
-  - sphinx==1.8.5
+  - sphinx
   - sphinx_rtd_theme
   - sphinx-gallery
-  - testpath<0.4
   - tornado
   - toolz
   - xgboost

From 2d9a4ac844694a87f3cfff954bca6eeb775b6716 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:39:14 -0500
Subject: [PATCH 09/10] Fixed doc build

---
 docs/source/conf.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index e0bc15acf..736933cb8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -19,7 +19,6 @@
 #
 import packaging.version
 
-import dask_sphinx_theme
 from dask_ml import __version__ as version
 
 # import sys
@@ -106,8 +105,8 @@
 
 
 html_theme = "dask_sphinx_theme"
-
-html_theme_path = [dask_sphinx_theme.get_html_theme_path()]
+html_theme_options = {"logo_only": True}
+# html_theme_path = [dask_sphinx_theme.get_html_theme_path()]
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the

From 11353ab272ccaece3be46092275c755796905825 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 16 Oct 2021 11:42:22 -0500
Subject: [PATCH 10/10] removed isort from env files

---
 ci/environment-3.7.yaml | 1 -
 ci/environment-3.8.yaml | 1 -
 ci/environment-3.9.yaml | 1 -
 3 files changed, 3 deletions(-)

diff --git a/ci/environment-3.7.yaml b/ci/environment-3.7.yaml
index 89d615345..1575b115a 100644
--- a/ci/environment-3.7.yaml
+++ b/ci/environment-3.7.yaml
@@ -5,7 +5,6 @@ channels:
 dependencies:
   - dask
   - dask-glm
-  - isort==4.3.21
   - multipledispatch >=0.4.9
   - mypy
   - numba
diff --git a/ci/environment-3.8.yaml b/ci/environment-3.8.yaml
index 09cb79296..334c5e227 100644
--- a/ci/environment-3.8.yaml
+++ b/ci/environment-3.8.yaml
@@ -5,7 +5,6 @@ channels:
 dependencies:
   - dask
   - dask-glm
-  - isort==4.3.21
   - multipledispatch >=0.4.9
   - mypy
   - numba
diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml
index 1be11d42e..799bb3348 100644
--- a/ci/environment-3.9.yaml
+++ b/ci/environment-3.9.yaml
@@ -5,7 +5,6 @@ channels:
 dependencies:
   - dask
   - dask-glm
-  - isort==4.3.21
   - multipledispatch >=0.4.9
   - mypy
   - numba