From efaa4b09575bcc70ff27c38a792dbc9cc440880e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Feb 2021 21:55:47 +0100 Subject: [PATCH 01/22] FEA add ValueDifferenceMetric to compute distance between nominal samples --- imblearn/metrics/pairwise.py | 138 +++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 imblearn/metrics/pairwise.py diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py new file mode 100644 index 000000000..88b9c38a9 --- /dev/null +++ b/imblearn/metrics/pairwise.py @@ -0,0 +1,138 @@ +import numpy as np +from scipy.spatial import distance_matrix + + +class ValueDifferenceMetric: + """Class implementing the Value Difference Metric. + + This metric computes the distance between samples containing only nominal + categorical features. The distance between feature values of two samples + is defined as: + + .. math:: + \delta(x, y) = \sum_{c=1}^{C} |p(c|x_{f}) - p(c|y_{f})|^{k} \ , + + where :math:`x` and :math:`y` are two samples and :math:`f` a given + feature, :math:`C` is the number of classes, :math:`p(c|x_{f})` is the + conditional probability that the output class is :math:`c` given that + the feature value :math:`f` has the value :math:`x` and :math:`k` an + exponent usually defined to 1 or 2. + + The distance for the feature vectors :math:`X` and :math:`Y` is + subsequently defined as: + + .. math:: + \Delta(X, Y) = \sum_{f=1}^{F} \delta(X_{f}, Y_{f})^{r} \ , + + where :math:`F` is the number of feature and :math:`r` an exponent usually + defined equal to 1 or 2. + + Parameters + ---------- + classes : ndarray of shape (n_classes,) + The unique labels in `y`. + + categories : list of arrays + List of arrays containing the categories for each feature. You can pass + the fitted attribute `categories_` of the + :class:`~sklearn.preprocesssing.OrdinalEncoder` used to encode the + data. + + k : int, default=1 + Exponent used to compute the distance between feature value. 
+ + r : int, default=2 + Exponent used to compute the distance between the feature vector. + + Attributes + ---------- + proba_per_class_ : list of ndarray of shape (n_categories, n_classes) + List of length `n_features` containing the conditional probabilities + for each category given a class. + """ + + def __init__(self, classes, categories, k=1, r=2): + self.classes = classes + self.categories = categories + self.k = k + self.r = r + + def fit(self, X, y): + """Compute the necessary statistics from the training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The input data. The data are expected to be encoded with an + :class:`~sklearn.preprocessing.OrdinalEncoder`. + + y : ndarray of shape (n_features,) + The target. + + Returns + ------- + self + """ + n_features = X.shape[1] + + # list of length n_features of ndarray (n_categories, n_classes) + counts_per_class = [ + np.transpose( + [ + np.bincount( + X[y == klass, feature_idx], + minlength=len(self.categories[feature_idx]), + ) + for klass in self.classes + ] + ) + for feature_idx in range(n_features) + ] + + # list of length n_features of ndarray (n_categories,) + proba_per_class = [ + ( + counts_per_class[feature_idx] + / counts_per_class[feature_idx].sum(axis=1)[:, np.newaxis] + ) + for feature_idx in range(n_features) + ] + + self.proba_per_class_ = proba_per_class + + return self + + def pairwise(self, X1, X2=None): + """Compute the VDM distance pairwise. + + Parameters + ---------- + X1 : ndarray of shape (n_samples, n_features) + The input data. The data are expected to be encoded with an + :class:`~sklearn.preprocessing.OrdinalEncoder`. + + X2 : ndarray of shape (n_samples, n_features) + The input data. The data are expected to be encoded with an + :class:`~sklearn.preprocessing.OrdinalEncoder`. + + Returns + ------- + distance_matrix : ndarray of shape (n_samples, n_samples) + The VDM pairwise distance. 
+ """ + n_samples_X1, n_features = X1.shape + n_samples_X2 = X2.shape[0] if X2 is not None else n_samples_X1 + + distance = np.zeros(shape=(n_samples_X1, n_samples_X2), dtype=np.float64) + for feature_idx in range(n_features): + proba_feature_X1 = self.proba_per_class_[feature_idx][X1[:, feature_idx]] + if X2 is not None: + proba_feature_X2 = self.proba_per_class_[feature_idx][ + X2[:, feature_idx] + ] + else: + proba_feature_X2 = proba_feature_X1 + distance += ( + distance_matrix(proba_feature_X1, proba_feature_X2, p=self.k) ** self.r + ) + return distance From d0e5d2f955ef8410fef365e42fc026b5022663ff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Feb 2021 22:16:39 +0100 Subject: [PATCH 02/22] linting --- imblearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 88b9c38a9..86137e34b 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -3,7 +3,7 @@ class ValueDifferenceMetric: - """Class implementing the Value Difference Metric. + r"""Class implementing the Value Difference Metric. This metric computes the distance between samples containing only nominal categorical features. The distance between feature values of two samples From e59585a9556bb77a0fcc3f9517680916691c757b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Feb 2021 23:04:00 +0100 Subject: [PATCH 03/22] style --- imblearn/metrics/pairwise.py | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 86137e34b..3fb934181 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -62,7 +62,7 @@ def fit(self, X, y): Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} The input data. 
The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. @@ -76,29 +76,25 @@ def fit(self, X, y): n_features = X.shape[1] # list of length n_features of ndarray (n_categories, n_classes) - counts_per_class = [ - np.transpose( + # compute the counts + self.proba_per_class_ = [ + np.array( [ np.bincount( X[y == klass, feature_idx], minlength=len(self.categories[feature_idx]), ) for klass in self.classes - ] - ) + ], + dtype=np.float64, + ).T for feature_idx in range(n_features) ] - - # list of length n_features of ndarray (n_categories,) - proba_per_class = [ - ( - counts_per_class[feature_idx] - / counts_per_class[feature_idx].sum(axis=1)[:, np.newaxis] + # normalize by the summing over the classes + for feature_idx in range(n_features): + self.proba_per_class_[feature_idx] /= ( + self.proba_per_class_[feature_idx].sum(axis=1).reshape(-1, 1) ) - for feature_idx in range(n_features) - ] - - self.proba_per_class_ = proba_per_class return self @@ -107,11 +103,11 @@ def pairwise(self, X1, X2=None): Parameters ---------- - X1 : ndarray of shape (n_samples, n_features) + X1 : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. - X2 : ndarray of shape (n_samples, n_features) + X2 : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. @@ -120,8 +116,16 @@ def pairwise(self, X1, X2=None): distance_matrix : ndarray of shape (n_samples, n_samples) The VDM pairwise distance. 
""" + if X1.dtype.kind != "i": + X1 = X1.astype(np.int64) n_samples_X1, n_features = X1.shape - n_samples_X2 = X2.shape[0] if X2 is not None else n_samples_X1 + + if X2 is not None: + if X2.dtype.kind != "i": + X2 = X2.astype(np.int64) + n_samples_X2 = X2.shape[0] + else: + n_samples_X2 = n_samples_X1 distance = np.zeros(shape=(n_samples_X1, n_samples_X2), dtype=np.float64) for feature_idx in range(n_features): From 92d89cafab0a135dd384ecfc9f76b972a62222ef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Feb 2021 23:20:35 +0100 Subject: [PATCH 04/22] TST basic tests --- imblearn/metrics/pairwise.py | 9 +++++- imblearn/metrics/tests/test_pairwise.py | 43 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 imblearn/metrics/tests/test_pairwise.py diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 3fb934181..5156c665c 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -1,3 +1,8 @@ +"""Metrics to perform pairwise computation.""" + +# Authors: Guillaume Lemaitre +# License: MIT + import numpy as np from scipy.spatial import distance_matrix @@ -51,7 +56,7 @@ class ValueDifferenceMetric: for each category given a class. 
""" - def __init__(self, classes, categories, k=1, r=2): + def __init__(self, classes, categories, *, k=1, r=2): self.classes = classes self.categories = categories self.k = k @@ -73,6 +78,8 @@ def fit(self, X, y): ------- self """ + if X.dtype.kind != "i": + X = X.astype(np.int64) n_features = X.shape[1] # list of length n_features of ndarray (n_categories, n_classes) diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py new file mode 100644 index 000000000..1a5d316b2 --- /dev/null +++ b/imblearn/metrics/tests/test_pairwise.py @@ -0,0 +1,43 @@ +"""Test for the metrics that perform pairwise distance computation.""" + +# Authors: Guillaume Lemaitre +# License: MIT + +import numpy as np +import pytest + +from sklearn.preprocessing import OrdinalEncoder + +from imblearn.metrics.pairwise import ValueDifferenceMetric + + +@pytest.fixture +def data(): + rng = np.random.RandomState(0) + + feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 + feature_2 = ["A"] * 40 + ["B"] * 20 + feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 + X = np.array([feature_1, feature_2, feature_3], dtype=object).T + rng.shuffle(X) + y = rng.randint(low=0, high=3, size=X.shape[0]) + return X, y + + +def test_value_difference_metric(data): + X, y = data + + encoder = OrdinalEncoder(dtype=np.int32) + classes = np.unique(y) + + X_encoded = encoder.fit_transform(X) + + vdm = ValueDifferenceMetric(classes, encoder.categories_) + vdm.fit(X_encoded, y) + + dist_1 = vdm.pairwise(X_encoded) + dist_2 = vdm.pairwise(X_encoded, X_encoded) + + np.testing.assert_allclose(dist_1, dist_2) + assert dist_1.shape == (X.shape[0], X.shape[0]) + assert dist_2.shape == (X.shape[0], X.shape[0]) From 6cc89693895a41e51d93688cf4fde1461124979f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 Feb 2021 23:55:24 +0100 Subject: [PATCH 05/22] TST check that still true wiht differen r and k --- imblearn/metrics/tests/test_pairwise.py | 59 +++++++++++++++++++++++-- 
1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index 1a5d316b2..532ae5462 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -24,15 +24,21 @@ def data(): return X, y -def test_value_difference_metric(data): +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) +def test_value_difference_metric(data, dtype, k, r): + # Check basic feature of the metric: + # * the shape of the distance matrix is (n_samples, n_samples) + # * computing pairwise distance of X is the same than explicitely between + # X and X. X, y = data - encoder = OrdinalEncoder(dtype=np.int32) + encoder = OrdinalEncoder(dtype=dtype) classes = np.unique(y) X_encoded = encoder.fit_transform(X) - vdm = ValueDifferenceMetric(classes, encoder.categories_) + vdm = ValueDifferenceMetric(classes, encoder.categories_, k=k, r=r) vdm.fit(X_encoded, y) dist_1 = vdm.pairwise(X_encoded) @@ -41,3 +47,50 @@ def test_value_difference_metric(data): np.testing.assert_allclose(dist_1, dist_2) assert dist_1.shape == (X.shape[0], X.shape[0]) assert dist_2.shape == (X.shape[0], X.shape[0]) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) +def test_value_difference_metric_property(dtype, k, r): + # Check the property of the vdm distance. Let's check the property + # described in "Improved Heterogeneous Distance Functions", D.R. Wilson and + # T.R. 
Martinez, Journal of Artificial Intelligence Research 6 (1997) 1-34 + # https://arxiv.org/pdf/cs/9701101.pdf + # + # "if an attribute color has three values red, green and blue, and the + # application is to identify whether or not an object is an apple, red and + # green would be considered closer thanred and blue because the former two + # both have similar correlations with the output class apple." + + # defined our feature + X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) + # 0 - not an apple / 1 - an apple + y = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1], dtype=np.int32) + + encoder = OrdinalEncoder(dtype=dtype) + classes = np.unique(y) + + X_encoded = encoder.fit_transform(X) + vdm = ValueDifferenceMetric(classes, encoder.categories_, k=k, r=r) + vdm.fit(X_encoded, y) + + sample_green = encoder.transform([["green"]]) + sample_red = encoder.transform([["red"]]) + sample_blue = encoder.transform([["blue"]]) + + for sample in (sample_green, sample_red, sample_blue): + # computing the distance between a sample of the same category should + # give a null distance + dist = vdm.pairwise(sample).squeeze() + assert dist == pytest.approx(0) + + # check the property explained in the introduction example + dist_1 = vdm.pairwise(sample_green, sample_red).squeeze() + dist_2 = vdm.pairwise(sample_blue, sample_red).squeeze() + dist_3 = vdm.pairwise(sample_blue, sample_green).squeeze() + + # green and red are very close + # blue is closer to red than green + assert dist_1 < dist_2 + assert dist_1 < dist_3 + assert dist_2 < dist_3 From adb2433eacb59ee2cd3c3818e5a2fdee500f5ff9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 00:09:32 +0100 Subject: [PATCH 06/22] DEBUG --- imblearn/metrics/pairwise.py | 44 +++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 5156c665c..dc7c31326 100644 --- 
a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -32,6 +32,8 @@ class ValueDifferenceMetric: where :math:`F` is the number of feature and :math:`r` an exponent usually defined equal to 1 or 2. + The definition of this distance was propoed in [1]_. + Parameters ---------- classes : ndarray of shape (n_classes,) @@ -54,6 +56,11 @@ class ValueDifferenceMetric: proba_per_class_ : list of ndarray of shape (n_categories, n_classes) List of length `n_features` containing the conditional probabilities for each category given a class. + + References + ---------- + .. [1] Stanfill, Craig, and David Waltz. "Toward memory-based reasoning." + Communications of the ACM 29.12 (1986): 1213-1228. """ def __init__(self, classes, categories, *, k=1, r=2): @@ -84,19 +91,30 @@ def fit(self, X, y): # list of length n_features of ndarray (n_categories, n_classes) # compute the counts - self.proba_per_class_ = [ - np.array( - [ - np.bincount( - X[y == klass, feature_idx], - minlength=len(self.categories[feature_idx]), - ) - for klass in self.classes - ], - dtype=np.float64, - ).T - for feature_idx in range(n_features) - ] + # self.proba_per_class_ = [ + # np.array( + # [ + # np.bincount( + # X[y == klass, feature_idx], + # minlength=len(self.categories[feature_idx]), + # ) + # for klass in self.classes + # ], + # dtype=np.float64, + # ).T + # for feature_idx in range(n_features) + # ] + self.proba_per_class_ = [] + for feature_idx in range(n_features): + arr = [] + for klass in self.classes: + tmp = np.bincount( + X[y == klass, feature_idx], + minlength=len(self.categories[feature_idx]), + ) + tmp = np.array(tmp, dtype=np.float64).T + arr.append(tmp) + self.proba_per_class_.append(arr) # normalize by the summing over the classes for feature_idx in range(n_features): self.proba_per_class_[feature_idx] /= ( From 1635acafacbbfe9f4591c8c3e240061b120b7035 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 00:19:28 +0100 Subject: [PATCH 07/22] iter --- 
imblearn/metrics/pairwise.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index dc7c31326..40fb6edc3 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -85,8 +85,7 @@ def fit(self, X, y): ------- self """ - if X.dtype.kind != "i": - X = X.astype(np.int64) + X = np.array(X, dtype=np.int64, copy=False) n_features = X.shape[1] # list of length n_features of ndarray (n_categories, n_classes) From 4e15dfbee8aedd1be19536fe30e3629e37b3f05e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 00:26:06 +0100 Subject: [PATCH 08/22] iter --- imblearn/metrics/pairwise.py | 37 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 40fb6edc3..1c256972c 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -90,30 +90,19 @@ def fit(self, X, y): # list of length n_features of ndarray (n_categories, n_classes) # compute the counts - # self.proba_per_class_ = [ - # np.array( - # [ - # np.bincount( - # X[y == klass, feature_idx], - # minlength=len(self.categories[feature_idx]), - # ) - # for klass in self.classes - # ], - # dtype=np.float64, - # ).T - # for feature_idx in range(n_features) - # ] - self.proba_per_class_ = [] - for feature_idx in range(n_features): - arr = [] - for klass in self.classes: - tmp = np.bincount( - X[y == klass, feature_idx], - minlength=len(self.categories[feature_idx]), - ) - tmp = np.array(tmp, dtype=np.float64).T - arr.append(tmp) - self.proba_per_class_.append(arr) + self.proba_per_class_ = [ + np.array( + [ + np.bincount( + X[y == klass, feature_idx], + minlength=len(self.categories[feature_idx]), + ) + for klass in self.classes + ], + dtype=np.float64, + ).T + for feature_idx in range(n_features) + ] # normalize by the summing over the classes for feature_idx in range(n_features): 
self.proba_per_class_[feature_idx] /= ( From 1d521d0231bac0c3a828907030baf3ce21a74c9a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 00:32:16 +0100 Subject: [PATCH 09/22] iter --- imblearn/metrics/pairwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 1c256972c..6590c2660 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -85,7 +85,7 @@ def fit(self, X, y): ------- self """ - X = np.array(X, dtype=np.int64, copy=False) + X = np.array(X, dtype=np.int32, copy=False) n_features = X.shape[1] # list of length n_features of ndarray (n_categories, n_classes) @@ -130,12 +130,12 @@ def pairwise(self, X1, X2=None): The VDM pairwise distance. """ if X1.dtype.kind != "i": - X1 = X1.astype(np.int64) + X1 = X1.astype(np.int32) n_samples_X1, n_features = X1.shape if X2 is not None: if X2.dtype.kind != "i": - X2 = X2.astype(np.int64) + X2 = X2.astype(np.int32) n_samples_X2 = X2.shape[0] else: n_samples_X2 = n_samples_X1 From dc1de98e826da5d4254bfe62d9ba2dc05d2f9013 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 12:34:21 +0100 Subject: [PATCH 10/22] iter --- .flake8 | 8 ++++++++ doc/api.rst | 20 ++++++++++++++++++++ doc/metrics.rst | 18 ++++++++++++++---- imblearn/metrics/_classification.py | 4 +++- imblearn/metrics/pairwise.py | 13 ++++++++++--- 5 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..67be91051 --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +[flake8] +max-line-length = 88 +# Default flake8 3.5 ignored flags +ignore=E121,E123,E126,E226,E24,E704,W503,W504,E203 +# It's fine not to put the import at the top of the file in the examples +# folder. 
+per-file-ignores = + examples/*: E402 diff --git a/doc/api.rst b/doc/api.rst index 04203bc3f..927261c4b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -205,6 +205,10 @@ Imbalance-learn provides some fast-prototyping tools. .. currentmodule:: imblearn +Classification metrics +---------------------- +See the :ref:`metrics` section of the user guide for further details. + .. autosummary:: :toctree: generated/ :template: function.rst @@ -217,6 +221,22 @@ Imbalance-learn provides some fast-prototyping tools. metrics.macro_averaged_mean_absolute_error metrics.make_index_balanced_accuracy +Pairwise metrics +---------------- +See the :ref:`pairwise_metrics` section of the user guide for further details. + +.. automodule:: imblearn.metrics.pairwise + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metrics.pairwise.ValueDifferenceMetric + .. _datasets_ref: :mod:`imblearn.datasets`: Datasets diff --git a/doc/metrics.rst b/doc/metrics.rst index 98368a650..78b2b98d0 100644 --- a/doc/metrics.rst +++ b/doc/metrics.rst @@ -6,6 +6,9 @@ Metrics .. currentmodule:: imblearn.metrics +Classification metrics +---------------------- + Currently, scikit-learn only offers the ``sklearn.metrics.balanced_accuracy_score`` (in 0.20) as metric to deal with imbalanced datasets. The module :mod:`imblearn.metrics` offers a couple of @@ -15,7 +18,7 @@ classifiers. .. _sensitivity_specificity: Sensitivity and specificity metrics ------------------------------------ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sensitivity and specificity are metrics which are well known in medical imaging. Sensitivity (also called true positive rate or recall) is the @@ -34,7 +37,7 @@ use those metrics. .. 
_imbalanced_metrics: Additional metrics specific to imbalanced datasets --------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The :func:`geometric_mean_score` :cite:`barandela2003strategies,kubat1997addressing` is the root of the product @@ -48,7 +51,7 @@ parameter ``alpha``. .. _macro_averaged_mean_absolute_error: Macro-Averaged Mean Absolute Error (MA-MAE) -------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Ordinal classification is used when there is a rank among classes, for example levels of functionality or movie ratings. @@ -60,9 +63,16 @@ each class and averaged over classes, giving an equal weight to each class. .. _classification_report: Summary of important metrics ----------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The :func:`classification_report_imbalanced` will compute a set of metrics per class and summarize it in a table. The parameter `output_dict` allows to get a string or a Python dictionary. This dictionary can be reused to create a Pandas dataframe for instance. + +.. _pairwise_metrics: + +Pairwise metrics +---------------- + +TODO: add documentation regarding ValueDifferenceMetric diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 2af92c38c..b85fe9495 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -1,5 +1,7 @@ # coding: utf-8 -"""Metrics to assess performance on classification task given class prediction +"""Metrics to assess performance on classification task given class +predictions. The available metrics are complementary from the metrics available +in scikit-learn. 
Functions named as ``*_score`` return a scalar value to maximize: the higher the better diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 6590c2660..0051dc741 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -57,6 +57,13 @@ class ValueDifferenceMetric: List of length `n_features` containing the conditional probabilities for each category given a class. + Notes + ----- + The input data `X` are expected to be encoded by an + :class:`~sklearn.preprocessing.OrdinalEncoder` and the data type is used + should be `np.int32`. If other data types are given, `X` will be converted + to `np.int32`. + References ---------- .. [1] Stanfill, Craig, and David Waltz. "Toward memory-based reasoning." @@ -74,7 +81,7 @@ def fit(self, X, y): Parameters ---------- - X : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} + X : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. @@ -116,11 +123,11 @@ def pairwise(self, X1, X2=None): Parameters ---------- - X1 : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} + X1 : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. - X2 : ndarray of shape (n_samples, n_features), dtype={np.int32, np.int64} + X2 : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. 
From 7eaa69656a604a19a85ee187cf852ef22b3d4ad6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 13:18:16 +0100 Subject: [PATCH 11/22] improve support for str labels --- imblearn/metrics/pairwise.py | 34 ++++++++++++++++++++----- imblearn/metrics/tests/test_pairwise.py | 34 +++++++++++++++++-------- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 0051dc741..f7490e16c 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -5,6 +5,8 @@ import numpy as np from scipy.spatial import distance_matrix +from sklearn.utils import check_consistent_length, column_or_1d +from sklearn.utils.multiclass import unique_labels class ValueDifferenceMetric: @@ -36,9 +38,6 @@ class ValueDifferenceMetric: Parameters ---------- - classes : ndarray of shape (n_classes,) - The unique labels in `y`. - categories : list of arrays List of arrays containing the categories for each feature. You can pass the fitted attribute `categories_` of the @@ -68,10 +67,29 @@ class ValueDifferenceMetric: ---------- .. [1] Stanfill, Craig, and David Waltz. "Toward memory-based reasoning." Communications of the ACM 29.12 (1986): 1213-1228. + + Examples + -------- + >>> import numpy as np + >>> X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) + >>> y = [1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1] + >>> from sklearn.preprocessing import OrdinalEncoder + >>> encoder = OrdinalEncoder(dtype=np.int32) + >>> X_encoded = encoder.fit_transform(X) + >>> from imblearn.metrics.pairwise import ValueDifferenceMetric + >>> vdm = ValueDifferenceMetric(categories=encoder.categories_).fit(X_encoded, y) + >>> pairwise_distance = vdm.pairwise(X_encoded) + >>> pairwise_distance.shape + (30, 30) + >>> X_test = np.array(["green", "red", "blue"]).reshape(-1, 1) + >>> X_test_encoded = encoder.transform(X_test) + >>> vdm.pairwise(X_test_encoded) + array([[ 0. 
, 0.04, 1.96], + [ 0.04, 0. , 1.44], + [ 1.96, 1.44, 0. ]]) """ - def __init__(self, classes, categories, *, k=1, r=2): - self.classes = classes + def __init__(self, categories, *, k=1, r=2): self.categories = categories self.k = k self.r = r @@ -92,8 +110,12 @@ def fit(self, X, y): ------- self """ + check_consistent_length(X, y) X = np.array(X, dtype=np.int32, copy=False) + y = column_or_1d(y) + n_features = X.shape[1] + classes = unique_labels(y) # list of length n_features of ndarray (n_categories, n_classes) # compute the counts @@ -104,7 +126,7 @@ def fit(self, X, y): X[y == klass, feature_idx], minlength=len(self.categories[feature_idx]), ) - for klass in self.classes + for klass in classes ], dtype=np.float64, ).T diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index 532ae5462..f8495f6ad 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -6,7 +6,8 @@ import numpy as np import pytest -from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import LabelEncoder, OrdinalEncoder +from sklearn.utils._testing import _convert_container from imblearn.metrics.pairwise import ValueDifferenceMetric @@ -20,25 +21,30 @@ def data(): feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 X = np.array([feature_1, feature_2, feature_3], dtype=object).T rng.shuffle(X) - y = rng.randint(low=0, high=3, size=X.shape[0]) + y = rng.randint(low=0, high=2, size=X.shape[0]) + y_labels = np.array(["not apple", "apple"], dtype=object) + y = y_labels[y] return X, y @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) -def test_value_difference_metric(data, dtype, k, r): +@pytest.mark.parametrize("y_type", ["list", "array"]) +@pytest.mark.parametrize("encode_label", [True, False]) +def test_value_difference_metric(data, dtype, k, r, y_type, encode_label): # Check basic 
feature of the metric: # * the shape of the distance matrix is (n_samples, n_samples) # * computing pairwise distance of X is the same than explicitely between # X and X. X, y = data + y = _convert_container(y, y_type) + if encode_label: + y = LabelEncoder().fit_transform(y) encoder = OrdinalEncoder(dtype=dtype) - classes = np.unique(y) - X_encoded = encoder.fit_transform(X) - vdm = ValueDifferenceMetric(classes, encoder.categories_, k=k, r=r) + vdm = ValueDifferenceMetric(encoder.categories_, k=k, r=r) vdm.fit(X_encoded, y) dist_1 = vdm.pairwise(X_encoded) @@ -51,7 +57,9 @@ def test_value_difference_metric(data, dtype, k, r): @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)]) -def test_value_difference_metric_property(dtype, k, r): +@pytest.mark.parametrize("y_type", ["list", "array"]) +@pytest.mark.parametrize("encode_label", [True, False]) +def test_value_difference_metric_property(dtype, k, r, y_type, encode_label): # Check the property of the vdm distance. Let's check the property # described in "Improved Heterogeneous Distance Functions", D.R. Wilson and # T.R. 
Martinez, Journal of Artificial Intelligence Research 6 (1997) 1-34 @@ -65,13 +73,17 @@ def test_value_difference_metric_property(dtype, k, r): # defined our feature X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) # 0 - not an apple / 1 - an apple - y = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1], dtype=np.int32) + y = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1]) + y_labels = np.array(["not apple", "apple"], dtype=object) + y = y_labels[y] + y = _convert_container(y, y_type) + if encode_label: + y = LabelEncoder().fit_transform(y) encoder = OrdinalEncoder(dtype=dtype) - classes = np.unique(y) - X_encoded = encoder.fit_transform(X) - vdm = ValueDifferenceMetric(classes, encoder.categories_, k=k, r=r) + + vdm = ValueDifferenceMetric(encoder.categories_, k=k, r=r) vdm.fit(X_encoded, y) sample_green = encoder.transform([["green"]]) From 24232e03b39de9fd0121cc014765342a34718ed2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 13:33:42 +0100 Subject: [PATCH 12/22] Apply suggestions from code review Co-authored-by: Christos Aridas --- imblearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index f7490e16c..62c13a6aa 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -13,7 +13,7 @@ class ValueDifferenceMetric: r"""Class implementing the Value Difference Metric. This metric computes the distance between samples containing only nominal - categorical features. The distance between feature values of two samples + features. The distance between feature values of two samples is defined as: .. 
math:: From 2df76aacca5dc9ba9a25b27df343ef84158f6055 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 13:34:03 +0100 Subject: [PATCH 13/22] Apply suggestions from code review Co-authored-by: Christos Aridas --- imblearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index f8495f6ad..e20771c23 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -67,7 +67,7 @@ def test_value_difference_metric_property(dtype, k, r, y_type, encode_label): # # "if an attribute color has three values red, green and blue, and the # application is to identify whether or not an object is an apple, red and - # green would be considered closer thanred and blue because the former two + # green would be considered closer than red and blue because the former two # both have similar correlations with the output class apple." 
# defined our feature From e449a3c7d0f9844669b0c73c6d17e2fa974194f9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 14:36:32 +0100 Subject: [PATCH 14/22] TST improve test and add auto mode for n_categories --- imblearn/metrics/pairwise.py | 88 ++++++++++++++----------- imblearn/metrics/tests/test_pairwise.py | 36 +++++++++- 2 files changed, 85 insertions(+), 39 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 62c13a6aa..3925c89f1 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -5,11 +5,12 @@ import numpy as np from scipy.spatial import distance_matrix -from sklearn.utils import check_consistent_length, column_or_1d +from sklearn.base import BaseEstimator +from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import unique_labels -class ValueDifferenceMetric: +class ValueDifferenceMetric(BaseEstimator): r"""Class implementing the Value Difference Metric. This metric computes the distance between samples containing only nominal @@ -38,11 +39,12 @@ class ValueDifferenceMetric: Parameters ---------- - categories : list of arrays - List of arrays containing the categories for each feature. You can pass + n_categories : "auto" or array-like of shape (n_features,), default="auto" + The number of unique categories per feature. If `"auto"`, the number + of categories will be computed from `X` at `fit`. Otherwise, you can + provide an array-like of such counts to avoid computation. You can use the fitted attribute `categories_` of the - :class:`~sklearn.preprocesssing.OrdinalEncoder` used to encode the - data. + :class:`~sklearn.preprocessing.OrdinalEncoder` to deduce these counts. k : int, default=1 Exponent used to compute the distance between feature value. @@ -52,6 +54,9 @@ class ValueDifferenceMetric: Attributes ---------- + n_categories_ : ndarray of shape (n_features,) + The number of categories per feature. 
+ proba_per_class_ : list of ndarray of shape (n_categories, n_classes) List of length `n_features` containing the conditional probabilities for each category given a class. @@ -77,7 +82,7 @@ class ValueDifferenceMetric: >>> encoder = OrdinalEncoder(dtype=np.int32) >>> X_encoded = encoder.fit_transform(X) >>> from imblearn.metrics.pairwise import ValueDifferenceMetric - >>> vdm = ValueDifferenceMetric(categories=encoder.categories_).fit(X_encoded, y) + >>> vdm = ValueDifferenceMetric().fit(X_encoded, y) >>> pairwise_distance = vdm.pairwise(X_encoded) >>> pairwise_distance.shape (30, 30) @@ -89,8 +94,8 @@ class ValueDifferenceMetric: [ 1.96, 1.44, 0. ]]) """ - def __init__(self, categories, *, k=1, r=2): - self.categories = categories + def __init__(self, *, n_categories="auto", k=1, r=2): + self.n_categories = n_categories self.k = k self.r = r @@ -111,10 +116,23 @@ def fit(self, X, y): self """ check_consistent_length(X, y) - X = np.array(X, dtype=np.int32, copy=False) - y = column_or_1d(y) + X, y = self._validate_data(X, y, reset=True, dtype=np.int32) + + if isinstance(self.n_categories, str) and self.n_categories == "auto": + self.n_categories_ = [ + len(np.unique(X[:, feature_idx])) + for feature_idx in range(self.n_features_in_) + ] + else: + if len(self.n_categories) != self.n_features_in_: + raise ValueError( + f"The length of n_categories is not consistent with the " + f"number of feature in X. Got {len(self.n_categories)} " + f"elements in n_categories and {self.n_features_in_} in " + f"X." 
+ ) + self.n_categories_ = np.array(self.n_categories, copy=False) - n_features = X.shape[1] classes = unique_labels(y) # list of length n_features of ndarray (n_categories, n_classes) @@ -124,32 +142,32 @@ def fit(self, X, y): [ np.bincount( X[y == klass, feature_idx], - minlength=len(self.categories[feature_idx]), + minlength=self.n_categories_[feature_idx], ) for klass in classes ], dtype=np.float64, ).T - for feature_idx in range(n_features) + for feature_idx in range(self.n_features_in_) ] # normalize by the summing over the classes - for feature_idx in range(n_features): + for feature_idx in range(self.n_features_in_): self.proba_per_class_[feature_idx] /= ( self.proba_per_class_[feature_idx].sum(axis=1).reshape(-1, 1) ) return self - def pairwise(self, X1, X2=None): + def pairwise(self, X, Y=None): """Compute the VDM distance pairwise. Parameters ---------- - X1 : ndarray of shape (n_samples, n_features), dtype=np.int32 + X : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. - X2 : ndarray of shape (n_samples, n_features), dtype=np.int32 + Y : ndarray of shape (n_samples, n_features), dtype=np.int32 The input data. The data are expected to be encoded with an :class:`~sklearn.preprocessing.OrdinalEncoder`. @@ -158,27 +176,23 @@ def pairwise(self, X1, X2=None): distance_matrix : ndarray of shape (n_samples, n_samples) The VDM pairwise distance. 
""" - if X1.dtype.kind != "i": - X1 = X1.astype(np.int32) - n_samples_X1, n_features = X1.shape - - if X2 is not None: - if X2.dtype.kind != "i": - X2 = X2.astype(np.int32) - n_samples_X2 = X2.shape[0] + X = self._validate_data(X, reset=False, dtype=np.int32) + n_samples_X = X.shape[0] + + if Y is not None: + Y = self._validate_data(Y, reset=False, dtype=np.int32) + n_samples_Y = Y.shape[0] else: - n_samples_X2 = n_samples_X1 - - distance = np.zeros(shape=(n_samples_X1, n_samples_X2), dtype=np.float64) - for feature_idx in range(n_features): - proba_feature_X1 = self.proba_per_class_[feature_idx][X1[:, feature_idx]] - if X2 is not None: - proba_feature_X2 = self.proba_per_class_[feature_idx][ - X2[:, feature_idx] - ] + n_samples_Y = n_samples_X + + distance = np.zeros(shape=(n_samples_X, n_samples_Y), dtype=np.float64) + for feature_idx in range(self.n_features_in_): + proba_feature_X = self.proba_per_class_[feature_idx][X[:, feature_idx]] + if Y is not None: + proba_feature_Y = self.proba_per_class_[feature_idx][Y[:, feature_idx]] else: - proba_feature_X2 = proba_feature_X1 + proba_feature_Y = proba_feature_X distance += ( - distance_matrix(proba_feature_X1, proba_feature_X2, p=self.k) ** self.r + distance_matrix(proba_feature_X, proba_feature_Y, p=self.k) ** self.r ) return distance diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index e20771c23..67e0978d5 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -44,7 +44,7 @@ def test_value_difference_metric(data, dtype, k, r, y_type, encode_label): encoder = OrdinalEncoder(dtype=dtype) X_encoded = encoder.fit_transform(X) - vdm = ValueDifferenceMetric(encoder.categories_, k=k, r=r) + vdm = ValueDifferenceMetric(k=k, r=r) vdm.fit(X_encoded, y) dist_1 = vdm.pairwise(X_encoded) @@ -83,7 +83,7 @@ def test_value_difference_metric_property(dtype, k, r, y_type, encode_label): encoder = OrdinalEncoder(dtype=dtype) X_encoded = 
encoder.fit_transform(X) - vdm = ValueDifferenceMetric(encoder.categories_, k=k, r=r) + vdm = ValueDifferenceMetric(k=k, r=r) vdm.fit(X_encoded, y) sample_green = encoder.transform([["green"]]) @@ -106,3 +106,35 @@ def test_value_difference_metric_property(dtype, k, r, y_type, encode_label): assert dist_1 < dist_2 assert dist_1 < dist_3 assert dist_2 < dist_3 + + +def test_value_difference_metric_categories(data): + # Check that "auto" is equivalent to provide the number categories + # beforehand + X, y = data + + encoder = OrdinalEncoder(dtype=np.int32) + X_encoded = encoder.fit_transform(X) + n_categories = np.array([len(cat) for cat in encoder.categories_]) + + vdm_auto = ValueDifferenceMetric().fit(X_encoded, y) + vdm_categories = ValueDifferenceMetric(n_categories=n_categories) + vdm_categories.fit(X_encoded, y) + + np.testing.assert_array_equal(vdm_auto.n_categories_, n_categories) + np.testing.assert_array_equal(vdm_auto.n_categories_, vdm_categories.n_categories_) + + +def test_value_difference_metric_categorie_error(data): + # Check that we raise an error if n_categories is inconsistent with the + # number of features in X + X, y = data + + encoder = OrdinalEncoder(dtype=np.int32) + X_encoded = encoder.fit_transform(X) + n_categories = [1, 2] + + vdm = ValueDifferenceMetric(n_categories=n_categories) + err_msg = "The length of n_categories is not consistent with the number" + with pytest.raises(ValueError, match=err_msg): + vdm.fit(X_encoded, y) From a4a60267324381bf5fc6e4312b84c5ef79dbcaba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 14:56:32 +0100 Subject: [PATCH 15/22] add tags to mention that we expect categorical as X input --- imblearn/metrics/pairwise.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 3925c89f1..10486f9a9 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -196,3 +196,6 @@ def pairwise(self, X, Y=None): 
distance_matrix(proba_feature_X, proba_feature_Y, p=self.k) ** self.r ) return distance + + def _more_tags(self): + return {"X_types": ["categorical"]} From 741b76f713cef72b41060dfdfbdb6d198d57ce5b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 15:23:57 +0100 Subject: [PATCH 16/22] speed-up --- imblearn/metrics/pairwise.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 10486f9a9..ecfe6d541 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -138,18 +138,15 @@ def fit(self, X, y): # list of length n_features of ndarray (n_categories, n_classes) # compute the counts self.proba_per_class_ = [ - np.array( - [ - np.bincount( - X[y == klass, feature_idx], - minlength=self.n_categories_[feature_idx], - ) - for klass in classes - ], - dtype=np.float64, - ).T - for feature_idx in range(self.n_features_in_) + np.empty(shape=(n_cat, len(classes)), dtype=np.float64) + for n_cat in self.n_categories_ ] + for feature_idx in range(self.n_features_in_): + for klass_idx, klass in enumerate(classes): + self.proba_per_class_[feature_idx][:, klass_idx] = np.bincount( + X[y == klass, feature_idx], + minlength=self.n_categories_[feature_idx], + ) # normalize by the summing over the classes for feature_idx in range(self.n_features_in_): self.proba_per_class_[feature_idx] /= ( @@ -197,5 +194,5 @@ def pairwise(self, X, Y=None): ) return distance - def _more_tags(self): - return {"X_types": ["categorical"]} + # def _more_tags(self): + # return {"X_types": ["categorical"]} From b52961eac255a2149b38104ae959db532285c711 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 15:36:33 +0100 Subject: [PATCH 17/22] fix when missing categories --- imblearn/metrics/pairwise.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 
ecfe6d541..c99115991 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -119,10 +119,8 @@ def fit(self, X, y): X, y = self._validate_data(X, y, reset=True, dtype=np.int32) if isinstance(self.n_categories, str) and self.n_categories == "auto": - self.n_categories_ = [ - len(np.unique(X[:, feature_idx])) - for feature_idx in range(self.n_features_in_) - ] + # categories are expected to be encoded from 0 to n_categories - 1 + self.n_categories_ = X.max(axis=0) + 1 else: if len(self.n_categories) != self.n_features_in_: raise ValueError( @@ -132,7 +130,6 @@ def fit(self, X, y): f"X." ) self.n_categories_ = np.array(self.n_categories, copy=False) - classes = unique_labels(y) # list of length n_features of ndarray (n_categories, n_classes) @@ -193,6 +190,3 @@ def pairwise(self, X, Y=None): distance_matrix(proba_feature_X, proba_feature_Y, p=self.k) ** self.r ) return distance - - # def _more_tags(self): - # return {"X_types": ["categorical"]} From 80d68aaeb0c0fca78d1523a22120e04c98117ab7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 15:53:24 +0100 Subject: [PATCH 18/22] TST add test for corner case --- imblearn/metrics/pairwise.py | 18 +++++++++++------- imblearn/metrics/tests/test_pairwise.py | 22 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index c99115991..ce320c448 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -105,7 +105,7 @@ def fit(self, X, y): Parameters ---------- X : ndarray of shape (n_samples, n_features), dtype=np.int32 - The input data. The data are expected to be encoded with an + The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. 
y : ndarray of shape (n_features,) @@ -144,11 +144,15 @@ def fit(self, X, y): X[y == klass, feature_idx], minlength=self.n_categories_[feature_idx], ) + # normalize by the summing over the classes - for feature_idx in range(self.n_features_in_): - self.proba_per_class_[feature_idx] /= ( - self.proba_per_class_[feature_idx].sum(axis=1).reshape(-1, 1) - ) + with np.errstate(invalid="ignore"): + # silence potential warning due to in-place division by zero + for feature_idx in range(self.n_features_in_): + self.proba_per_class_[feature_idx] /= ( + self.proba_per_class_[feature_idx].sum(axis=1).reshape(-1, 1) + ) + np.nan_to_num(self.proba_per_class_[feature_idx], copy=False) return self @@ -158,11 +162,11 @@ def pairwise(self, X, Y=None): Parameters ---------- X : ndarray of shape (n_samples, n_features), dtype=np.int32 - The input data. The data are expected to be encoded with an + The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. Y : ndarray of shape (n_samples, n_features), dtype=np.int32 - The input data. The data are expected to be encoded with an + The input data. The data are expected to be encoded with a :class:`~sklearn.preprocessing.OrdinalEncoder`. 
Returns diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index 67e0978d5..bb8e7435b 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -125,7 +125,7 @@ def test_value_difference_metric_categories(data): np.testing.assert_array_equal(vdm_auto.n_categories_, vdm_categories.n_categories_) -def test_value_difference_metric_categorie_error(data): +def test_value_difference_metric_categories_error(data): # Check that we raise an error if n_categories is inconsistent with the # number of features in X X, y = data @@ -138,3 +138,23 @@ def test_value_difference_metric_categorie_error(data): err_msg = "The length of n_categories is not consistent with the number" with pytest.raises(ValueError, match=err_msg): vdm.fit(X_encoded, y) + + +def test_value_difference_metric_missing_categories(data): + # Check that we don't get an issue when a category is missing between 0 + # and n_categories - 1 + X, y = data + + encoder = OrdinalEncoder(dtype=np.int32) + X_encoded = encoder.fit_transform(X) + n_categories = np.array([len(cat) for cat in encoder.categories_]) + + # remove a category that could be between 0 and n_categories + X_encoded[X_encoded[:, -1] == 1] = 0 + np.testing.assert_array_equal(np.unique(X_encoded[:, -1]), [0, 2, 3]) + + vdm = ValueDifferenceMetric(n_categories=n_categories) + vdm.fit(X_encoded, y) + + for n_cats, proba in zip(n_categories, vdm.proba_per_class_): + assert proba.shape == (n_cats, len(np.unique(y))) From cc56be36f0f651536ec221c3f9ff6caf94a29140 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 18:16:32 +0100 Subject: [PATCH 19/22] DOC update user guide --- doc/conf.py | 12 +++-- doc/metrics.rst | 70 ++++++++++++++++++++++++- imblearn/metrics/pairwise.py | 4 ++ imblearn/metrics/tests/test_pairwise.py | 13 +++++ references.bib | 22 +++++++- 5 files changed, 114 insertions(+), 7 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 
fedbf7249..22b3fe4db 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -21,7 +21,6 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath("sphinxext")) from github_link import make_linkcode_resolve -import sphinx_gallery # -- General configuration ------------------------------------------------ @@ -44,7 +43,7 @@ ] # bibtex file -bibtex_bibfiles = ['bibtex/refs.bib'] +bibtex_bibfiles = ["bibtex/refs.bib"] # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 @@ -77,8 +76,8 @@ master_doc = "index" # General information about the project. -project = 'imbalanced-learn' -copyright = '2014-2020, The imbalanced-learn developers' +project = "imbalanced-learn" +copyright = "2014-2020, The imbalanced-learn developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -260,7 +259,10 @@ # intersphinx configuration intersphinx_mapping = { - "python": ("https://docs.python.org/{.major}".format(sys.version_info), None,), + "python": ( + "https://docs.python.org/{.major}".format(sys.version_info), + None, + ), "numpy": ("https://docs.scipy.org/doc/numpy/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "matplotlib": ("https://matplotlib.org/", None), diff --git a/doc/metrics.rst b/doc/metrics.rst index 78b2b98d0..4d588efb4 100644 --- a/doc/metrics.rst +++ b/doc/metrics.rst @@ -75,4 +75,72 @@ a Pandas dataframe for instance. Pairwise metrics ---------------- -TODO: add documentation regarding ValueDifferenceMetric +The :mod:`imblearn.metrics.pairwise` submodule implements pairwise distances +that are not available in scikit-learn while used in some of the methods in +imbalanced-learn. + +.. 
_vdm: + +Value Difference Metric +~~~~~~~~~~~~~~~~~~~~~~~ + +The class :class:`~imblearn.metrics.pairwise.ValueDifferenceMetric` is +implementing the Value Difference Metric proposed in +:cite:`stanfill1986toward`. This measure is used to compute the proximity +of two samples composed of only nominal categorical features. + +Given a single feature, categories with similar correlation with the target +vector will be considered closer. Let's give an example to illustrate this +behaviour as given in :cite:`wilson1997improved`. `X` will be represented by a +single feature which will be some color and the target will be whether a +sample is an apple or not:: + + >>> import numpy as np + >>> X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1) + >>> y = ["apple"] * 8 + ["not apple"] * 5 + ["apple"] * 7 + ["not apple"] * 9 + ["apple"] + +In this dataset, the categories "red" and "green" are more correlated to the +target `y` and should have a smaller distance than with the category "blue". +We should see this behaviour. Be aware that we need to encode the `X` to work with +numerical values:: + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> encoder = OrdinalEncoder(dtype=np.int32) + >>> X_encoded = encoder.fit_transform(X) + +Now, we can compute the distance between three different samples representing +the different categories:: + + >>> from imblearn.metrics.pairwise import ValueDifferenceMetric + >>> vdm = ValueDifferenceMetric().fit(X_encoded, y) + >>> X_test = np.array(["green", "red", "blue"]).reshape(-1, 1) + >>> X_test_encoded = encoder.transform(X_test) + >>> vdm.pairwise(X_test_encoded) + array([[ 0. , 0.04, 1.96], + [ 0.04, 0. , 1.44], + [ 1.96, 1.44, 0. ]]) + +We see that the minimum distance happens when the categories "red" and "green" +are compared. Whenever comparing with "blue", the distance is much larger. + +**Mathematical formulation** + +The distance between feature values of two samples is defined as: + +.. 
math:: + \delta(x, y) = \sum_{c=1}^{C} |p(c|x_{f}) - p(c|y_{f})|^{k} \ , + +where :math:`x` and :math:`y` are two samples and :math:`f` a given +feature, :math:`C` is the number of classes, :math:`p(c|x_{f})` is the +conditional probability that the output class is :math:`c` given that +the feature value :math:`f` has the value :math:`x` and :math:`k` an +exponent usually set to 1 or 2. + +The distance for the feature vectors :math:`X` and :math:`Y` is +subsequently defined as: + +.. math:: + \Delta(X, Y) = \sum_{f=1}^{F} \delta(X_{f}, Y_{f})^{r} \ , + +where :math:`F` is the number of features and :math:`r` an exponent usually +defined equal to 1 or 2. diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index ce320c448..c1ba7abee 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import unique_labels +from sklearn.utils.validation import check_is_fitted class ValueDifferenceMetric(BaseEstimator): @@ -37,6 +38,8 @@ class ValueDifferenceMetric(BaseEstimator): The definition of this distance was propoed in [1]_. + Read more in the :ref:`User Guide <vdm>`. + Parameters ---------- n_categories : "auto" or array-like of shape (n_features,), default="auto" @@ -174,6 +177,7 @@ def pairwise(self, X, Y=None): distance_matrix : ndarray of shape (n_samples, n_samples) The VDM pairwise distance. 
""" + check_is_fitted(self) X = self._validate_data(X, reset=False, dtype=np.int32) n_samples_X = X.shape[0] diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py index bb8e7435b..7c8b30e9e 100644 --- a/imblearn/metrics/tests/test_pairwise.py +++ b/imblearn/metrics/tests/test_pairwise.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from sklearn.exceptions import NotFittedError from sklearn.preprocessing import LabelEncoder, OrdinalEncoder from sklearn.utils._testing import _convert_container @@ -158,3 +159,15 @@ def test_value_difference_metric_missing_categories(data): for n_cats, proba in zip(n_categories, vdm.proba_per_class_): assert proba.shape == (n_cats, len(np.unique(y))) + + +def test_value_difference_value_unfitted(data): + # Check that we raise a NotFittedError when `fit` is not called before + # pairwise. + X, y = data + + encoder = OrdinalEncoder(dtype=np.int32) + X_encoded = encoder.fit_transform(X) + + with pytest.raises(NotFittedError): + ValueDifferenceMetric().pairwise(X_encoded) diff --git a/references.bib b/references.bib index c4432827a..398f9e4c3 100644 --- a/references.bib +++ b/references.bib @@ -198,4 +198,24 @@ @article{torelli2014rose issn = {1573-756X}, url = {https://doi.org/10.1007/s10618-012-0295-5}, doi = {10.1007/s10618-012-0295-5} -} \ No newline at end of file +} + +@article{stanfill1986toward, + title={Toward memory-based reasoning}, + author={Stanfill, Craig and Waltz, David}, + journal={Communications of the ACM}, + volume={29}, + number={12}, + pages={1213--1228}, + year={1986}, + publisher={ACM New York, NY, USA} +} + +@article{wilson1997improved, + title={Improved heterogeneous distance functions}, + author={Wilson, D Randall and Martinez, Tony R}, + journal={Journal of artificial intelligence research}, + volume={6}, + pages={1--34}, + year={1997} +} From 7c897f6299330075dd4f68139e861f4962ad4ef7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 
18:46:53 +0100 Subject: [PATCH 20/22] Apply suggestions from code review Co-authored-by: Christos Aridas --- doc/metrics.rst | 2 +- imblearn/metrics/_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/metrics.rst b/doc/metrics.rst index 4d588efb4..2ac4e3759 100644 --- a/doc/metrics.rst +++ b/doc/metrics.rst @@ -87,7 +87,7 @@ Value Difference Metric The class :class:`~imblearn.metrics.pairwise.ValueDifferenceMetric` is implementing the Value Difference Metric proposed in :cite:`stanfill1986toward`. This measure is used to compute the proximity -of two samples composed of only nominal categorical features. +of two samples composed of only nominal values. Given a single feature, categories with similar correlation with the target vector will be considered closer. Let's give an example to illustrate this diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index b85fe9495..47418b9a0 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -1,5 +1,5 @@ # coding: utf-8 -"""Metrics to assess performance on classification task given class +"""Metrics to assess performance on a classification task given class predictions. The available metrics are complementary from the metrics available in scikit-learn. 
From 5526c8a4454741cf14414e4aedbd61a867f4a867 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 18:49:38 +0100 Subject: [PATCH 21/22] iter --- doc/bibtex/refs.bib | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/bibtex/refs.bib b/doc/bibtex/refs.bib index 63f8bc0d1..469d4abe8 100644 --- a/doc/bibtex/refs.bib +++ b/doc/bibtex/refs.bib @@ -223,4 +223,24 @@ @article{esuli2009ordinal publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA}, month = {dec} -} \ No newline at end of file +} + +@article{stanfill1986toward, + title={Toward memory-based reasoning}, + author={Stanfill, Craig and Waltz, David}, + journal={Communications of the ACM}, + volume={29}, + number={12}, + pages={1213--1228}, + year={1986}, + publisher={ACM New York, NY, USA} +} + +@article{wilson1997improved, + title={Improved heterogeneous distance functions}, + author={Wilson, D Randall and Martinez, Tony R}, + journal={Journal of artificial intelligence research}, + volume={6}, + pages={1--34}, + year={1997} +} From f16bfdd92520924d4981177f8ef46278277723be Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 Feb 2021 19:15:47 +0100 Subject: [PATCH 22/22] add entry whats new --- doc/whats_new/v0.7.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 06d725a56..95a3294d4 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -90,6 +90,10 @@ Enhancements classification. :pr:`780` by :user:`Aurélien Massiot `. +- Add the class :class:`imblearn.metrics.pairwise.ValueDifferenceMetric` to + compute pairwise distances between samples containing only nominal values. + :pr:`796` by :user:`Guillaume Lemaitre <glemaitre>`. + Deprecation ...........