From fad7b3ac025de25c78038ccdfa8ab0aab9012998 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:05:07 -0500 Subject: [PATCH 01/10] wip - compat --- dask_ml/_partial.py | 31 +++-- dask_ml/base.py | 98 ++++++++++++++ dask_ml/preprocessing/_encoders.py | 16 +-- dask_ml/preprocessing/data.py | 50 ++++++- dask_ml/utils.py | 205 +++++++++++++++++++++++++++++ 5 files changed, 375 insertions(+), 25 deletions(-) diff --git a/dask_ml/_partial.py b/dask_ml/_partial.py index cfd0b7869..102b9f757 100644 --- a/dask_ml/_partial.py +++ b/dask_ml/_partial.py @@ -160,7 +160,7 @@ def _blocks_and_name(obj): def _predict(model, x): - return model.predict(x)[:, None] + return model.predict(x) def predict(model, x): @@ -173,15 +173,28 @@ def predict(model, x): See docstring for ``da.learn.fit`` """ - if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"): - x = x.to_dask_array() - assert x.ndim == 2 - if len(x.chunks[1]) > 1: - x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1]))) func = partial(_predict, model) - xx = np.zeros((1, x.shape[1]), dtype=x.dtype) - dt = model.predict(xx).dtype - return x.map_blocks(func, chunks=(x.chunks[0], (1,)), dtype=dt).squeeze() + + if getattr(model, "feature_names_in_", None) is not None: + import pandas as pd + import dask.dataframe as dd + + meta = model.predict(x._meta_nonempty) + return x.map_partitions(func, meta=meta) + else: + if len(x.chunks[1]) > 1: + x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1]))) + + xx = np.zeros((1, x.shape[1]), dtype=x.dtype) + meta = model.predict(xx) + + if meta.ndim > 1: + chunks = (x.chunks[0], (1,)) + drop_axis = None + else: + chunks = (x.chunks[0],) + drop_axis = 1 + return x.map_blocks(func, chunks=chunks, meta=meta, drop_axis=drop_axis) def _copy_partial_doc(cls): diff --git a/dask_ml/base.py b/dask_ml/base.py index 9043e061e..60bbe5b1e 100644 --- a/dask_ml/base.py +++ b/dask_ml/base.py @@ -1,5 +1,103 @@ import sklearn.base +from .utils import check_array + + +class DaskMLBaseMixin: + def _validate_data( + self, + X="no_validation", + y="no_validation", + reset=True, + validate_separately=False, + **check_params, + ): + """Validate input data and set or check the `n_features_in_` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features), default='no validation' + The input samples. + If `'no_validation'`, no validation is performed on `X`. This is + useful for meta-estimator which can delegate input validation to + their underlying estimator(s). In that case `y` must be passed and + the only accepted `check_params` are `multi_output` and + `y_numeric`. + + y : array-like of shape (n_samples,), default='no_validation' + The targets. + + - If `None`, `check_array` is called on `X`. If the estimator's + requires_y tag is True, then an error will be raised. + - If `'no_validation'`, `check_array` is called on `X` and the + estimator's requires_y tag is ignored. This is a default + placeholder and is never meant to be explicitly set. In that case + `X` must be passed. + - Otherwise, only `y` with `_check_y` or both `X` and `y` are + checked with either `check_array` or `check_X_y` depending on + `validate_separately`. + + reset : bool, default=True + Whether to reset the `n_features_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. + .. note:: + It is recommended to call reset=True in `fit` and in the first + call to `partial_fit`. All other methods that validate `X` + should set `reset=False`. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call validate_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. + **check_params : kwargs + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. + + Returns + ------- + out : {ndarray, sparse matrix} or tuple of these + The validated input. A tuple is returned if both `X` and `y` are + validated. + """ + self._check_feature_names(X, reset=reset) + + if y is None and self._get_tags()["requires_y"]: + raise ValueError( + f"This {self.__class__.__name__} estimator " + "requires y to be passed, but the target y is None." + ) + + no_val_X = isinstance(X, str) and X == "no_validation" + no_val_y = y is None or isinstance(y, str) and y == "no_validation" + + if no_val_X and no_val_y: + raise ValueError("Validation should be done on X, y or both.") + elif not no_val_X and no_val_y: + X = check_array(X, **check_params) + out = X + elif no_val_X and not no_val_y: + y = _check_y(y, **check_params) + out = y + else: + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) + out = X, y + + if not no_val_X and check_params.get("ensure_2d", True): + self._check_n_features(X, reset=reset) + + return out + class ClassifierMixin(sklearn.base.ClassifierMixin): """Mixin class for all classifiers in scikit-learn.""" diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index 78c0e371b..35e59fcd9 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -8,11 +8,12 @@ from .._compat import SK_024 from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType +from ..base import DaskMLBaseMixin from ..utils import check_array from .label import _encode, _encode_dask_array -class OneHotEncoder(sklearn.preprocessing.OneHotEncoder): +class OneHotEncoder(DaskMLBaseMixin, sklearn.preprocessing.OneHotEncoder): """Encode categorical integer features as a one-hot numeric array. .. versionadded:: 0.8.0 @@ -161,16 +162,13 @@ def _fit( handle_unknown: str = "error", force_all_finite: bool = True, ): - X = check_array( - X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True - ) - if SK_024: - kwargs = dict(force_all_finite=force_all_finite) - else: - kwargs = {} + X = self._validate_data(X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True) + self._check_n_features(X, reset=True) + self._check_feature_names(X, reset=True) + if isinstance(X, np.ndarray): return super(OneHotEncoder, self)._fit( - X, handle_unknown=handle_unknown, **kwargs + X, handle_unknown=handle_unknown, force_all_finite=force_all_finite ) is_array = isinstance(X, da.Array) diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py index c26c6387d..b2edbfedd 100644 --- a/dask_ml/preprocessing/data.py +++ b/dask_ml/preprocessing/data.py @@ -1,5 +1,6 @@ from __future__ import division +import numbers import multiprocessing from collections import OrderedDict from distutils.version import LooseVersion @@ -21,6 +22,7 @@ from dask_ml._utils import copy_learned_attributes from dask_ml.utils import check_array, handle_zeros_in_scale +from ..base import DaskMLBaseMixin from .._typing import ArrayLike, DataFrameType, NDArrayOrScalar, SeriesType _PANDAS_VERSION = LooseVersion(pd.__version__) @@ -46,7 +48,7 @@ def _handle_zeros_in_scale(scale: NDArrayOrScalar, copy=True): return scale -class StandardScaler(sklearn.preprocessing.StandardScaler): +class StandardScaler(DaskMLBaseMixin, sklearn.preprocessing.StandardScaler): __doc__ = sklearn.preprocessing.StandardScaler.__doc__ @@ -56,6 +58,8 @@ def fit( y: Optional[Union[ArrayLike, SeriesType]] = None, ) -> "StandardScaler": self._reset() + X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True) + attributes = OrderedDict() if isinstance(X, (pd.DataFrame, dd.DataFrame)): X = X.values @@ -71,7 +75,7 @@ def fit( attributes["scale_"] = scale_ attributes["var_"] = var_ - attributes["n_samples_seen_"] = np.nan + attributes["n_samples_seen_"] = X.shape[0] values = compute(*attributes.values()) for k, v in zip(attributes, values): setattr(self, k, v) @@ -137,7 +141,7 @@ def fit( attributes["data_range_"] = data_range attributes["scale_"] = scale attributes["min_"] = feature_range[0] - data_min * scale - attributes["n_samples_seen_"] = np.nan + attributes["n_samples_seen_"] = X.shape[0] values = compute(*attributes.values()) for k, v in zip(attributes, values): @@ -1036,7 +1040,7 @@ def inverse_transform( return X -class PolynomialFeatures(sklearn.preprocessing.PolynomialFeatures): +class PolynomialFeatures(DaskMLBaseMixin, sklearn.preprocessing.PolynomialFeatures): """preserve_dataframe : boolean If True, preserve pandas and dask dataframes after transforming. Using False (default) returns numpy or dask arrays and mimics @@ -1072,6 +1076,38 @@ def fit( interaction_only=self.interaction_only, include_bias=self.include_bias, ) + X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True) + + if isinstance(self.degree, numbers.Integral): + if self.degree < 0: + raise ValueError( + f"degree must be a non-negative integer, got {self.degree}." + ) + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, numbers.Integral) + and isinstance(self._max_degree, numbers.Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + X_sample = X if isinstance(X, dd.DataFrame): X_sample = X._meta_nonempty @@ -1089,19 +1125,19 @@ def transform( y: Optional[Union[ArrayLike, SeriesType]] = None, ) -> Union[ArrayLike, DataFrameType]: if isinstance(X, da.Array): - n_cols = len(self._transformer.get_feature_names()) + n_cols = len(self._transformer.get_feature_names_out()) X = check_array(X, accept_multiple_blocks=False, accept_unknown_chunks=True) chunks = (X.chunks[0], n_cols) XP = X.map_blocks(self._transformer.transform, dtype=X.dtype, chunks=chunks) elif isinstance(X, pd.DataFrame): XP = X.pipe(self._transformer.transform) if self.preserve_dataframe: - columns = self._transformer.get_feature_names(X.columns) + columns = self._transformer.get_feature_names_out(X.columns) XP = pd.DataFrame(data=XP, columns=columns, index=X.index) elif isinstance(X, dd.DataFrame): XP = X.map_partitions(self._transformer.transform) if self.preserve_dataframe: - columns = self._transformer.get_feature_names(X.columns) + columns = self._transformer.get_feature_names_out(X.columns) XP = dd.from_dask_array(XP, columns, X.index) else: # typically X is instance of np.ndarray diff --git a/dask_ml/utils.py b/dask_ml/utils.py index fb6bec4ba..d07e7fd45 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -221,6 +221,8 @@ def _assert_eq(l, r, name=None, **kwargs): ): for a, b in zip(l, r): _assert_eq(a, b, **kwargs) + elif np.isscalar(r) and np.isnan(r): + assert np.isnan(l), (name, l, r) else: assert l == r, (name, l, r) @@ -272,6 +274,164 @@ def check_matching_blocks(*arrays): raise ValueError("Unexpected types {}.".format({type(x) for x in arrays})) + +def check_X_y( + X, + y, + accept_sparse=False, + *, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + multi_output=False, + ensure_min_samples=1, + ensure_min_features=1, + y_numeric=False, + estimator=None, +): + """Input validation for standard estimators. + + Checks X and y for consistent length, enforces X to be 2D and y 1D. By + default, X is checked to be non-empty and containing only finite values. + Standard input checks are also applied to y, such as checking that y + does not have np.nan or np.inf targets. For multi-label y, set + multi_output=True to allow 2D and sparse y. If the dtype of X is + object, attempt converting to float, raising on failure. + + Parameters + ---------- + X : {ndarray, list, sparse matrix} + Input data. + + y : {ndarray, list, sparse matrix} + Labels. + + accept_sparse : str, bool or list of str, default=False + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + accept_large_sparse : bool, default=True + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse will cause it to be accepted only + if its indices are stored with a 32-bit dtype. + + .. versionadded:: 0.20 + + dtype : 'numeric', type, list of type or None, default='numeric' + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : {'F', 'C'}, default=None + Whether an array will be forced to be fortran or c-style. + + copy : bool, default=False + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. + + .. versionadded:: 0.20 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + + ensure_2d : bool, default=True + Whether to raise a value error if X is not 2D. + + allow_nd : bool, default=False + Whether to allow X.ndim > 2. + + multi_output : bool, default=False + Whether to allow 2D y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int, default=1 + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int, default=1 + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + y_numeric : bool, default=False + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. Should only be used for regression + algorithms. + + estimator : str or estimator instance, default=None + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X_converted : object + The converted and validated X. + + y_converted : object + The converted and validated y. + """ + if y is None: + raise ValueError("y cannot be None") + + X = check_array( + X, + accept_sparse=accept_sparse, + accept_large_sparse=accept_large_sparse, + dtype=dtype, + order=order, + copy=copy, + force_all_finite=force_all_finite, + ensure_2d=ensure_2d, + allow_nd=allow_nd, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator, + ) + + y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric) + + check_consistent_length(X, y) + + return X, y + + +def _check_y(y, multi_output=False, y_numeric=False): + """Isolated part of check_X_y dedicated to y validation""" + if multi_output: + y = check_array( + y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None + ) + else: + y = column_or_1d(y, warn=True) + _assert_all_finite(y) + _ensure_no_complex_data(y) + if y_numeric and y.dtype.kind == "O": + y = y.astype(np.float64) + + return y + + def check_chunks(n_samples, n_features, chunks=None): """Validate and normalize the chunks argument for a dask.array @@ -398,6 +558,51 @@ def _num_samples(X): return result + +def _get_feature_names(X): + """Get feature names from X. + Support for other array containers should place its implementation here. + Parameters + ---------- + X : {ndarray, dataframe} of shape (n_samples, n_features) + Array container to extract feature names. + - pandas dataframe : The columns will be considered to be feature + names. If the dataframe contains non-string feature names, `None` is + returned. + - All other array containers will return `None`. + Returns + ------- + names: ndarray or None + Feature names of `X`. Unrecognized array containers will return `None`. + """ + feature_names = None + + # extract feature names for support array containers + if hasattr(X, "columns"): + feature_names = np.asarray(X.columns, dtype=object) + + if feature_names is None or len(feature_names) == 0: + return + + types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names)) + + # Warn when types are mixed. + # ints and strings do not warn + if len(types) > 1 or not (types[0].startswith("int") or types[0] == "str"): + # TODO: Convert to an error in 1.2 + warnings.warn( + "Feature names only support names that are all strings. " + f"Got feature names with dtypes: {types}. An error will be raised " + "in 1.2.", + FutureWarning, + ) + return + + # Only feature names of all strings are supported + if types[0] == "str": + return feature_names + + __all__ = [ "assert_estimator_equal", "check_array", From 23df9dd0190ae7e4dd51c8f801ab1b3bd76a6ef4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:10:39 -0500 Subject: [PATCH 02/10] wip - compat --- .github/workflows/lint.yaml | 10 +++++ .github/workflows/tests.yaml | 44 +++++++++++++++++++ ci/code_checks.sh | 18 -------- ci/environment-3.6.yaml | 32 -------------- ci/environment-3.8.yaml | 23 +++------- ci/environment-3.9.yaml | 24 ++++++++++ ci/install.sh | 4 ++ .../dask_searchcv/test_model_selection.py | 4 +- tests/preprocessing/test_data.py | 28 +++++------- 9 files changed, 102 insertions(+), 85 deletions(-) create mode 100644 .github/workflows/lint.yaml create mode 100644 .github/workflows/tests.yaml delete mode 100755 ci/code_checks.sh delete mode 100644 ci/environment-3.6.yaml create mode 100644 ci/environment-3.9.yaml create mode 100644 ci/install.sh diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..b69c01737 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,10 @@ +name: Linting +on: [pull_request, push] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: pre-commit/action@v2.0.0 \ No newline at end of file diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 000000000..0302957d5 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,44 @@ +name: Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # os: ["windows-latest", "ubuntu-latest", "macos-latest"] + os: ["ubuntu-latest"] + python-version: ["3.8"] + + env: + PYTHON_VERSION: ${{ matrix.python-version }} + PARALLEL: "true" + COVERAGE: "true" + + steps: + - name: Checkout source + uses: actions/checkout@v2 + with: + fetch-depth: 0 # Needed by codecov.io + + - name: Setup Conda Environment + uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + channel-priority: strict + python-version: ${{ matrix.python-version }} + environment-file: ci/environment-${{ matrix.python-version }}.yaml + activate-environment: test-environment + auto-activate-base: false + + - name: Install + shell: bash -l {0} + run: source ci/install.sh + + - name: Run tests + shell: bash -l {0} + run: pytest -v \ No newline at end of file diff --git a/ci/code_checks.sh b/ci/code_checks.sh deleted file mode 100755 index dbef47de7..000000000 --- a/ci/code_checks.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -source activate dask-ml-test -MSG='Checking flake8... ' ; echo $MSG -flake8 -RET=$(($RET + $?)) ; echo $MSG "DONE" - -MSG='Checking black... ' ; echo $MSG -black --version -black --check . -RET=$(($RET + $?)) ; echo $MSG "DONE" - -MSG='Checking isort... ' ; echo $MSG -isort --version-number -isort --recursive --check-only . -RET=$(($RET + $?)) ; echo $MSG "DONE" - -exit $RET diff --git a/ci/environment-3.6.yaml b/ci/environment-3.6.yaml deleted file mode 100644 index 57d46f934..000000000 --- a/ci/environment-3.6.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: dask-ml-test -channels: - - conda-forge - - defaults -dependencies: - - black==19.10b0 - - coverage - - dask ==2.4.0 - - dask-glm >=0.2.0 - - distributed ==2.4.0 - - flake8 - - isort==4.3.21 - - msgpack-python ==0.6.2 - - multipledispatch - - mypy - - numba - - numpy ==1.17.3 - - numpydoc - - packaging - - pandas =0.24.2 - - psutil - - pytest - - pytest-cov - - pytest-mock - - python=3.6.* - - scikit-learn =0.23.* - - scipy - - sparse - - toolz - - pip - - pip: - - pytest-azurepipelines diff --git a/ci/environment-3.8.yaml b/ci/environment-3.8.yaml index c48399c7d..09cb79296 100644 --- a/ci/environment-3.8.yaml +++ b/ci/environment-3.8.yaml @@ -1,23 +1,15 @@ -name: dask-ml-test +name: dask-ml-3.8 channels: - conda-forge - defaults dependencies: - - black==19.10b0 - - coverage - - codecov - # dask 2021.3.0 introduced a regression which causes tests to fail. - # The issue has been resolved upstream in dask and will be included - # in the next release. We temporarily apply a dask version contraint - # to allow CI to pass - - dask !=2021.3.0 - - dask-glm >=0.2.0 - - flake8 + - dask + - dask-glm - isort==4.3.21 - multipledispatch >=0.4.9 - mypy - numba - - numpy >=1.16.3 + - numpy - numpydoc - packaging - pandas @@ -26,10 +18,7 @@ dependencies: - pytest-cov - pytest-mock - python=3.8.* - - scikit-learn>=0.23.0 + - scikit-learn>=1.0.0 - scipy - sparse - - toolz - - pip - - pip: - - pytest-azurepipelines + - toolz \ No newline at end of file diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml new file mode 100644 index 000000000..f47074472 --- /dev/null +++ b/ci/environment-3.9.yaml @@ -0,0 +1,24 @@ +name: dask-ml-test +channels: + - conda-forge + - defaults +dependencies: + - dask + - dask-glm + - isort==4.3.21 + - multipledispatch >=0.4.9 + - mypy + - numba + - numpy + - numpydoc + - packaging + - pandas + - psutil + - pytest + - pytest-cov + - pytest-mock + - python=3.8.* + - scikit-learn>=1.0.0 + - scipy + - sparse + - toolz \ No newline at end of file diff --git a/ci/install.sh b/ci/install.sh new file mode 100644 index 000000000..8ffd7a7ac --- /dev/null +++ b/ci/install.sh @@ -0,0 +1,4 @@ +python -m pip install --quiet --no-deps -e . + +echo mamba list +mamba list \ No newline at end of file diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index e0a203932..53fc49299 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -43,7 +43,7 @@ from sklearn.svm import SVC import dask_ml.model_selection as dcv -from dask_ml._compat import DISTRIBUTED_2_11_0, SK_0_23_2 +from dask_ml._compat import DISTRIBUTED_2_11_0 from dask_ml.model_selection import check_cv, compute_n_splits from dask_ml.model_selection._search import _normalize_n_jobs from dask_ml.model_selection.methods import CVCache @@ -488,7 +488,7 @@ def check_scores_all_nan(gs, bad_param, score_key="score"): ) -@pytest.mark.xfail(SK_0_23_2, reason="https://github.com/dask/dask-ml/issues/672") +@pytest.mark.xfail(reason="https://github.com/dask/dask-ml/issues/672") @pytest.mark.parametrize( "weights", [None, (None, {"tr0": 2, "tr2": 3}, {"tr0": 2, "tr2": 4})] ) diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index beb4767fd..f1bbb7169 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -66,7 +66,7 @@ def test_input_types(self, dask_df, pandas_df): b = spp.StandardScaler() assert_estimator_equal( - a.fit(dask_df.values), a.fit(dask_df), exclude="n_samples_seen_" + a.fit(dask_df.values), a.fit(dask_df), ) assert_estimator_equal( @@ -108,7 +108,7 @@ def test_basic(self): a.fit(X) b.fit(X.compute()) - assert_estimator_equal(a, b, exclude="n_samples_seen_") + assert_estimator_equal(a, b) def test_inverse_transform(self): a = dpp.MinMaxScaler() @@ -511,17 +511,19 @@ def test_basic(self): a.fit(X) b.fit(X.compute()) - assert_estimator_equal(a._transformer, b) + assert_estimator_equal(a._transformer, b, exclude={"n_input_features_"}) def test_input_types(self): a = dpp.PolynomialFeatures() b = spp.PolynomialFeatures() - assert_estimator_equal(a.fit(df), a.fit(df.compute())) - assert_estimator_equal(a.fit(df), a.fit(df.compute().values)) - assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values)) - assert_estimator_equal(a.fit(df), b.fit(df.compute())) - assert_estimator_equal(a.fit(df), b.fit(df.compute().values)) + exclude = {"n_input_features_"} + + assert_estimator_equal(a.fit(df), a.fit(df.compute()), exclude=exclude) + assert_estimator_equal(a.fit(df), a.fit(df.compute().values), exclude=exclude) + assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values), exclude=exclude) + assert_estimator_equal(a.fit(df), b.fit(df.compute()), exclude=exclude) + assert_estimator_equal(a.fit(df), b.fit(df.compute().values), exclude=exclude) def test_array_transform(self): a = dpp.PolynomialFeatures() @@ -529,7 +531,7 @@ def test_array_transform(self): res_a = a.fit_transform(X) res_b = b.fit_transform(X.compute()) - assert_estimator_equal(a, b) + assert_estimator_equal(a, b, exclude={"n_input_features_"}) assert dask.is_dask_collection(res_a) assert_eq_ar(res_a, res_b) @@ -549,21 +551,15 @@ def test_transformed_shape(self): # checks if the transformed objects have the correct columns a = dpp.PolynomialFeatures() a.fit(X) - n_cols = len(a.get_feature_names()) + n_cols = len(a.get_feature_names_out()) # dask array assert a.transform(X).shape[1] == n_cols # numpy array assert a.transform(X.compute()).shape[1] == n_cols - # dask dataframe - assert a.transform(df).shape[1] == n_cols - # pandas dataframe - assert a.transform(df.compute()).shape[1] == n_cols X_nan_rows = df.values df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns) # dask array with nan rows assert a.transform(X_nan_rows).shape[1] == n_cols - # dask data frame with nan rows - assert a.transform(df_none_divisions).shape[1] == n_cols @pytest.mark.parametrize("daskify", [False, True]) def test_df_transform(self, daskify): From f4f4d26c3fb05c4eacbf0f4c3ab820f3f6f251c6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:16:18 -0500 Subject: [PATCH 03/10] rm azure-pipelines --- azure-pipelines.yml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index d04fb7eb8..000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,12 +0,0 @@ -trigger: -- main - -jobs: -- template: ci/posix.yaml - parameters: - name: 'linux' - vmImage: 'ubuntu-16.04' -- template: ci/windows.yaml - parameters: - name: 'win64' - vmImage: 'vs2017-win2016' From 440f3ad445b39507f33a6db8675d7ee07c3586bc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:16:31 -0500 Subject: [PATCH 04/10] remove compat --- dask_ml/_compat.py | 12 ++++-------- dask_ml/cluster/_compat.py | 12 ++---------- dask_ml/cluster/k_means.py | 11 +++-------- dask_ml/model_selection/_search.py | 11 +++-------- dask_ml/preprocessing/_encoders.py | 1 - tests/metrics/test_regression.py | 4 +--- tests/test_kmeans.py | 11 ++--------- 7 files changed, 15 insertions(+), 47 deletions(-) diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py index c99d625be..4539a2051 100644 --- a/dask_ml/_compat.py +++ b/dask_ml/_compat.py @@ -16,8 +16,6 @@ PANDAS_VERSION = packaging.version.parse(pandas.__version__) DISTRIBUTED_VERSION = packaging.version.parse(distributed.__version__) -SK_0_23_2 = SK_VERSION >= packaging.version.parse("0.23.2") -SK_024 = SK_VERSION >= packaging.version.parse("0.24.0.dev0") DASK_240 = DASK_VERSION >= packaging.version.parse("2.4.0") DASK_2130 = DASK_VERSION >= packaging.version.parse("2.13.0") DASK_2_20_0 = DASK_VERSION >= packaging.version.parse("2.20.0") @@ -49,9 +47,7 @@ def _check_multimetric_scoring(estimator, scoring=None): from sklearn.metrics._scorer import _check_multimetric_scoring from sklearn.metrics import check_scoring - if SK_024: - if callable(scoring) or isinstance(scoring, (type(None), str)): - scorers = {"score": check_scoring(estimator, scoring=scoring)} - return scorers, False - return _check_multimetric_scoring(estimator, scoring), True - return _check_multimetric_scoring(estimator, scoring) + if callable(scoring) or isinstance(scoring, (type(None), str)): + scorers = {"score": check_scoring(estimator, scoring=scoring)} + return scorers, False + return _check_multimetric_scoring(estimator, scoring), True \ No newline at end of file diff --git a/dask_ml/cluster/_compat.py b/dask_ml/cluster/_compat.py index 4bbf684d7..7fee9f96a 100644 --- a/dask_ml/cluster/_compat.py +++ b/dask_ml/cluster/_compat.py @@ -1,11 +1,3 @@ -from .._compat import SK_024 +from sklearn.cluster._kmeans import _kmeans_plusplus # noqa -if SK_024: - from sklearn.cluster._kmeans import _kmeans_plusplus # noqa - - __all__ = ["_kmeans_plusplus"] - -else: - from sklearn.cluster._kmeans import _k_init - - __all__ = ["_k_init"] +__all__ = ["_kmeans_plusplus"] \ No newline at end of file diff --git a/dask_ml/cluster/k_means.py b/dask_ml/cluster/k_means.py index 255516437..99399df31 100644 --- a/dask_ml/cluster/k_means.py +++ b/dask_ml/cluster/k_means.py @@ -12,7 +12,7 @@ from sklearn.utils.extmath import squared_norm from sklearn.utils.validation import check_is_fitted -from .._compat import SK_024, blockwise +from .._compat import blockwise from .._utils import draw_seed from ..metrics import ( euclidean_distances, @@ -21,11 +21,7 @@ ) from ..utils import _timed, _timer, check_array, row_norms -if SK_024: - from ._compat import _kmeans_plusplus -else: - from ._compat import _k_init as _kmeans_plusplus - +from ._compat import _kmeans_plusplus import numba # isort:skip (see https://github.com/dask/dask-ml/pull/577) @@ -394,8 +390,7 @@ def init_pp(X, n_clusters, random_state): random_state=random_state, x_squared_norms=x_squared_norms, ) - if SK_024: - centers, _ = centers + centers, _ = centers return centers diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index afa0261d0..d6325afac 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -35,7 +35,7 @@ from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples, check_is_fitted -from .._compat import SK_024, SK_VERSION +from .._compat import SK_VERSION from ._normalize import normalize_estimator from .methods import ( MISSING, @@ -54,9 +54,7 @@ score, ) from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip - -if SK_024: - from sklearn.base import _is_pairwise +from sklearn.base import _is_pairwise logger = logging.getLogger(__name__) @@ -200,10 +198,7 @@ def build_cv_graph( X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) # "pairwise" estimators require a different graph for CV splitting - if SK_024: - is_pairwise = _is_pairwise(estimator) - else: - is_pairwise = getattr(estimator, "_pairwise", False) + is_pairwise = _is_pairwise(estimator) dsk = {} X_name, y_name, groups_name = to_keys(dsk, X, y, groups) diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index 35e59fcd9..29602f283 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -6,7 +6,6 @@ import pandas as pd import sklearn.preprocessing -from .._compat import SK_024 from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType from ..base import DaskMLBaseMixin from ..utils import check_array diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index 17f21ee8d..98dca26a8 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -7,7 +7,6 @@ from dask.array.utils import assert_eq import dask_ml.metrics -from dask_ml._compat import SK_024 _METRICS_TO_TEST = [ "mean_squared_error", @@ -17,8 +16,7 @@ ] # mean_absolute_percentage_error() was added in scikit-learn 0.24.0 -if SK_024: - _METRICS_TO_TEST.append("mean_absolute_percentage_error") +_METRICS_TO_TEST.append("mean_absolute_percentage_error") @pytest.fixture(params=_METRICS_TO_TEST) diff --git a/tests/test_kmeans.py b/tests/test_kmeans.py index 78fec594d..706b50ea0 100644 --- a/tests/test_kmeans.py +++ b/tests/test_kmeans.py @@ -16,14 +16,10 @@ from sklearn.utils.estimator_checks import check_estimator import dask_ml.cluster -from dask_ml._compat import SK_024 from dask_ml.cluster import KMeans as DKKMeans, k_means from dask_ml.utils import assert_estimator_equal, row_norms -if SK_024: - from dask_ml.cluster._compat import _kmeans_plusplus -else: - from dask_ml.cluster._compat import _k_init as _kmeans_plusplus +from dask_ml.cluster._compat import _kmeans_plusplus def test_check_estimator(): @@ -98,10 +94,7 @@ def test_fit_given_init(self): X_ = X.compute() x_squared_norms = sklearn.utils.extmath.row_norms(X_, squared=True) rs = np.random.RandomState(0) - if SK_024: - init, _ = _kmeans_plusplus(X_, 3, x_squared_norms, rs) - else: - init = _kmeans_plusplus(X_, 3, x_squared_norms, rs) + init, _ = _kmeans_plusplus(X_, 3, x_squared_norms, rs) dkkm = DKKMeans(3, init=init, random_state=0) skkm = SKKMeans(3, init=init, random_state=0, n_init=1) dkkm.fit(X) From 637779ccb5e863afe075f6d8550febd6c04412fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:20:16 -0500 Subject: [PATCH 05/10] lint --- dask_ml/_compat.py | 2 +- dask_ml/_partial.py | 3 --- dask_ml/cluster/_compat.py | 2 +- dask_ml/cluster/k_means.py | 2 +- dask_ml/model_selection/_search.py | 9 +++++++-- dask_ml/preprocessing/_encoders.py | 4 +++- dask_ml/preprocessing/data.py | 23 +++++++++++++++++---- dask_ml/utils.py | 32 +++++++++++++++++------------- tests/preprocessing/test_data.py | 5 +++-- tests/test_kmeans.py | 3 +-- 10 files changed, 54 insertions(+), 31 deletions(-) diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py index 4539a2051..8e65d9d91 100644 --- a/dask_ml/_compat.py +++ b/dask_ml/_compat.py @@ -50,4 +50,4 @@ def _check_multimetric_scoring(estimator, scoring=None): if callable(scoring) or isinstance(scoring, (type(None), str)): scorers = {"score": check_scoring(estimator, scoring=scoring)} return scorers, False - return _check_multimetric_scoring(estimator, scoring), True \ No newline at end of file + return _check_multimetric_scoring(estimator, scoring), True diff --git a/dask_ml/_partial.py b/dask_ml/_partial.py index 102b9f757..0bff82b53 100644 --- a/dask_ml/_partial.py +++ b/dask_ml/_partial.py @@ -176,9 +176,6 @@ def predict(model, x): func = partial(_predict, model) if getattr(model, "feature_names_in_", None) is not None: - import pandas as pd - import dask.dataframe as dd - meta = model.predict(x._meta_nonempty) return x.map_partitions(func, meta=meta) else: diff --git a/dask_ml/cluster/_compat.py b/dask_ml/cluster/_compat.py index 7fee9f96a..bce639c62 100644 --- a/dask_ml/cluster/_compat.py +++ b/dask_ml/cluster/_compat.py @@ -1,3 +1,3 @@ from sklearn.cluster._kmeans import _kmeans_plusplus # noqa -__all__ = ["_kmeans_plusplus"] \ No newline at end of file +__all__ = ["_kmeans_plusplus"] diff --git a/dask_ml/cluster/k_means.py b/dask_ml/cluster/k_means.py index 99399df31..4b02c2fa4 100644 --- a/dask_ml/cluster/k_means.py +++ b/dask_ml/cluster/k_means.py @@ -20,8 +20,8 @@ pairwise_distances_argmin_min, ) from ..utils import _timed, _timer, check_array, row_norms - from ._compat import _kmeans_plusplus + import numba # isort:skip (see https://github.com/dask/dask-ml/pull/577) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index d6325afac..1113defa1 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -15,7 +15,13 @@ from dask.distributed import as_completed from dask.utils import derived_from from sklearn import model_selection -from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from sklearn.base import ( + BaseEstimator, + MetaEstimatorMixin, + _is_pairwise, + clone, + is_classifier, +) from sklearn.exceptions import NotFittedError from sklearn.model_selection._search import BaseSearchCV, _check_param_grid from sklearn.model_selection._split import ( @@ -54,7 +60,6 @@ score, ) from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip -from sklearn.base import _is_pairwise logger = logging.getLogger(__name__) diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index 29602f283..17e5c0954 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -161,7 +161,9 @@ def _fit( handle_unknown: str = "error", force_all_finite: bool = True, ): - X = self._validate_data(X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True) + X = self._validate_data( + X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True + ) self._check_n_features(X, reset=True) self._check_feature_names(X, reset=True) diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py index b2edbfedd..bf609a0fe 100644 --- a/dask_ml/preprocessing/data.py +++ b/dask_ml/preprocessing/data.py @@ -1,7 +1,8 @@ from __future__ import division -import numbers +import collections import multiprocessing +import numbers from collections import OrderedDict from distutils.version import LooseVersion from typing import Any, List, Optional, Sequence, Union @@ -22,8 +23,8 @@ from dask_ml._utils import copy_learned_attributes from dask_ml.utils import check_array, handle_zeros_in_scale -from ..base import DaskMLBaseMixin from .._typing import ArrayLike, DataFrameType, NDArrayOrScalar, SeriesType +from ..base import DaskMLBaseMixin _PANDAS_VERSION = LooseVersion(pd.__version__) _HAS_CTD = _PANDAS_VERSION >= "0.21.0" @@ -58,7 +59,14 @@ def fit( y: Optional[Union[ArrayLike, SeriesType]] = None, ) -> "StandardScaler": self._reset() - X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True) + X = self._validate_data( + X, + estimator=self, + accept_dask_array=True, + accept_dask_dataframe=True, + accept_unknown_chunks=True, + preserve_pandas_dataframe=True, + ) attributes = OrderedDict() if isinstance(X, (pd.DataFrame, dd.DataFrame)): @@ -1076,7 +1084,14 @@ def fit( interaction_only=self.interaction_only, include_bias=self.include_bias, ) - X = self._validate_data(X, estimator=self, accept_dask_array=True, accept_dask_dataframe=True, accept_unknown_chunks=True, preserve_pandas_dataframe=True) + X = self._validate_data( + X, + estimator=self, + accept_dask_array=True, + accept_dask_dataframe=True, + accept_unknown_chunks=True, + preserve_pandas_dataframe=True, + ) if isinstance(self.degree, numbers.Integral): if self.degree < 0: diff --git a/dask_ml/utils.py b/dask_ml/utils.py index d07e7fd45..abeaa58a0 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -2,6 +2,7 @@ import datetime import functools import logging +import warnings from collections.abc import Sequence from multiprocessing import cpu_count from numbers import Integral @@ -124,7 +125,7 @@ def check_array( accept_multiple_blocks=False, preserve_pandas_dataframe=False, remove_zero_chunks=True, - **kwargs + **kwargs, ): """Validate inputs @@ -274,7 +275,6 @@ def check_matching_blocks(*arrays): raise ValueError("Unexpected types {}.".format({type(x) for x in arrays})) - def check_X_y( X, y, @@ -418,20 +418,25 @@ def check_X_y( def _check_y(y, multi_output=False, y_numeric=False): """Isolated part of check_X_y dedicated to y validation""" - if multi_output: - y = check_array( - y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None - ) - else: - y = column_or_1d(y, warn=True) - _assert_all_finite(y) - _ensure_no_complex_data(y) - if y_numeric and y.dtype.kind == "O": - y = y.astype(np.float64) - + # TODO: implement + # if multi_output: + # y = check_array( + # y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None + # ) + # else: + # y = column_or_1d(y, warn=True) + # _assert_all_finite(y) + # _ensure_no_complex_data(y) + # if y_numeric and y.dtype.kind == "O": + # y = y.astype(np.float64) return y +def check_consistent_length(*arrays): + # TODO: check divisions, chunks, etc. + pass + + def check_chunks(n_samples, n_features, chunks=None): """Validate and normalize the chunks argument for a dask.array @@ -558,7 +563,6 @@ def _num_samples(X): return result - def _get_feature_names(X): """Get feature names from X. Support for other array containers should place its implementation here. diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index f1bbb7169..7851f920c 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -521,7 +521,9 @@ def test_input_types(self): assert_estimator_equal(a.fit(df), a.fit(df.compute()), exclude=exclude) assert_estimator_equal(a.fit(df), a.fit(df.compute().values), exclude=exclude) - assert_estimator_equal(a.fit(df.values), a.fit(df.compute().values), exclude=exclude) + assert_estimator_equal( + a.fit(df.values), a.fit(df.compute().values), exclude=exclude + ) assert_estimator_equal(a.fit(df), b.fit(df.compute()), exclude=exclude) assert_estimator_equal(a.fit(df), b.fit(df.compute().values), exclude=exclude) @@ -557,7 +559,6 @@ def test_transformed_shape(self): # numpy array assert a.transform(X.compute()).shape[1] == n_cols X_nan_rows = df.values - df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns) # dask array with nan rows assert a.transform(X_nan_rows).shape[1] == n_cols diff --git a/tests/test_kmeans.py b/tests/test_kmeans.py index 706b50ea0..94a147150 100644 --- a/tests/test_kmeans.py +++ b/tests/test_kmeans.py @@ -17,9 +17,8 @@ import dask_ml.cluster from dask_ml.cluster import KMeans as DKKMeans, k_means -from dask_ml.utils import assert_estimator_equal, row_norms - from dask_ml.cluster._compat import _kmeans_plusplus +from dask_ml.utils import assert_estimator_equal, row_norms def test_check_estimator(): From f9c224b63d11612a36ed6ea16b1e397419dad8ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:23:20 -0500 Subject: [PATCH 06/10] lint --- dask_ml/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/base.py b/dask_ml/base.py index 60bbe5b1e..2997c39d1 100644 --- a/dask_ml/base.py +++ b/dask_ml/base.py @@ -1,6 +1,6 @@ import sklearn.base -from .utils import check_array +from .utils import _check_y, check_array, check_X_y class DaskMLBaseMixin: From 792311b9f1667b962dfc47d1416184c05dd36649 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:27:24 -0500 Subject: [PATCH 07/10] matrix --- .github/workflows/tests.yaml | 2 +- ci/environment-3.7.yaml | 23 ++++++----------------- ci/environment-3.9.yaml | 4 ++-- setup.py | 5 +++-- 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 0302957d5..8b12ca131 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -10,7 +10,7 @@ jobs: matrix: # os: ["windows-latest", "ubuntu-latest", "macos-latest"] os: ["ubuntu-latest"] - python-version: ["3.8"] + python-version: ["3.7", "3.8", "3.9"] env: PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/ci/environment-3.7.yaml b/ci/environment-3.7.yaml index 2ab6922ac..89d615345 100644 --- a/ci/environment-3.7.yaml +++ b/ci/environment-3.7.yaml @@ -1,23 +1,15 @@ -name: dask-ml-test +name: dask-ml-3.7 channels: - conda-forge - defaults dependencies: - - black==19.10b0 - - coverage - - codecov - # dask 2021.3.0 introduced a regression which causes tests to fail. - # The issue has been resolved upstream in dask and will be included - # in the next release. We temporarily apply a dask version contraint - # to allow CI to pass - - dask !=2021.3.0 - - dask-glm >=0.2.0 - - flake8 + - dask + - dask-glm - isort==4.3.21 - multipledispatch >=0.4.9 - mypy - numba - - numpy >=1.16.3 + - numpy - numpydoc - packaging - pandas @@ -26,10 +18,7 @@ dependencies: - pytest-cov - pytest-mock - python=3.7.* - - scikit-learn>=0.23.0 + - scikit-learn>=1.0.0 - scipy - sparse - - toolz - - pip - - pip: - - pytest-azurepipelines + - toolz \ No newline at end of file diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml index f47074472..1be11d42e 100644 --- a/ci/environment-3.9.yaml +++ b/ci/environment-3.9.yaml @@ -1,4 +1,4 @@ -name: dask-ml-test +name: dask-ml-3.9 channels: - conda-forge - defaults @@ -17,7 +17,7 @@ dependencies: - pytest - pytest-cov - pytest-mock - - python=3.8.* + - python=3.9.* - scikit-learn>=1.0.0 - scipy - sparse diff --git a/setup.py b/setup.py index 8f1163561..036f76698 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ "numba>=0.51.0", "numpy>=1.17.3", "pandas>=0.24.2", - "scikit-learn>=0.23", + "scikit-learn>=1.0.0", "scipy", "dask-glm>=0.2.0", "multipledispatch>=0.4.9", @@ -60,9 +60,10 @@ "Topic :: Database", "Topic :: Scientific/Engineering", "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]), use_scm_version=True, From 1589d0efd09f0b01715f4accdf60fc7815c46803 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:35:56 -0500 Subject: [PATCH 08/10] Update doc env --- ci/environment-docs.yaml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ci/environment-docs.yaml b/ci/environment-docs.yaml index 4efabafa6..cc912ba30 100644 --- a/ci/environment-docs.yaml +++ b/ci/environment-docs.yaml @@ -5,12 +5,10 @@ channels: dependencies: - black - coverage - - flake8 - graphviz - heapdict - ipykernel - ipython - - isort==4.3.21 - multipledispatch - mypy - nbsphinx @@ -21,18 +19,14 @@ dependencies: - numpydoc - pandas - psutil - - pytest - - pytest-cov - - pytest-mock - - python=3.7 + - python=3.8 - sortedcontainers - - scikit-learn>=0.23.1 + - scikit-learn>=1.0.0 - scipy - sparse - - sphinx==1.8.5 + - sphinx - sphinx_rtd_theme - sphinx-gallery - - testpath<0.4 - tornado - toolz - xgboost From 2d9a4ac844694a87f3cfff954bca6eeb775b6716 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:39:14 -0500 Subject: [PATCH 09/10] Fixed doc build --- docs/source/conf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e0bc15acf..736933cb8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,6 @@ # import packaging.version -import dask_sphinx_theme from dask_ml import __version__ as version # import sys @@ -106,8 +105,8 @@ html_theme = "dask_sphinx_theme" - -html_theme_path = [dask_sphinx_theme.get_html_theme_path()] +html_theme_options = {"logo_only": True} +# html_theme_path = [dask_sphinx_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the From 11353ab272ccaece3be46092275c755796905825 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Oct 2021 11:42:22 -0500 Subject: [PATCH 10/10] removed isort from env files --- ci/environment-3.7.yaml | 1 - ci/environment-3.8.yaml | 1 - ci/environment-3.9.yaml | 1 - 3 files changed, 3 deletions(-) diff --git a/ci/environment-3.7.yaml b/ci/environment-3.7.yaml index 89d615345..1575b115a 100644 --- a/ci/environment-3.7.yaml +++ b/ci/environment-3.7.yaml @@ -5,7 +5,6 @@ channels: dependencies: - dask - dask-glm - - isort==4.3.21 - multipledispatch >=0.4.9 - mypy - numba diff --git a/ci/environment-3.8.yaml b/ci/environment-3.8.yaml index 09cb79296..334c5e227 100644 --- a/ci/environment-3.8.yaml +++ b/ci/environment-3.8.yaml @@ -5,7 +5,6 @@ channels: dependencies: - dask - dask-glm - - isort==4.3.21 - multipledispatch >=0.4.9 - mypy - numba diff --git a/ci/environment-3.9.yaml b/ci/environment-3.9.yaml index 1be11d42e..799bb3348 100644 --- a/ci/environment-3.9.yaml +++ b/ci/environment-3.9.yaml @@ -5,7 +5,6 @@ channels: dependencies: - dask - dask-glm - - isort==4.3.21 - multipledispatch >=0.4.9 - mypy - numba