|
8 | 8 | methods are used in conjunction with a 3NN classifier in order |
9 | 9 | to examine the improvement of the classifier's output quality |
10 | 10 | by using an over-sampler. |
11 | | -
|
12 | 11 | """ |
13 | 12 |
|
14 | 13 | # Authors: Christos Aridas |
15 | 14 | # Guillaume Lemaitre <[email protected]> |
16 | 15 | # License: MIT |
17 | 16 |
|
18 | | -import matplotlib.pyplot as plt |
19 | | -import numpy as np |
20 | | -from scipy import interp |
21 | | -from sklearn import datasets, neighbors |
22 | | -from sklearn.metrics import auc, roc_curve |
23 | | -from sklearn.model_selection import StratifiedKFold |
24 | | - |
25 | | -from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler |
26 | | -from imblearn.pipeline import make_pipeline |
27 | | - |
| 17 | +# %% |
28 | 18 | print(__doc__) |
29 | 19 |
|
30 | | -LW = 2 |
31 | | -RANDOM_STATE = 42 |
| 20 | +import seaborn as sns |
32 | 21 |
|
| 22 | +sns.set_context("poster") |
33 | 23 |
|
34 | | -class DummySampler: |
35 | | - def sample(self, X, y): |
36 | | - return X, y |
| 24 | +# %% [markdown] |
| 25 | +# Load the dataset |
| 26 | +# ---------------- |
| 27 | +# |
| 28 | +# We will use a dataset containing images of known people, on which we will |
| 29 | +# build a model to recognize the person in each image. We make this problem |
| 30 | +# binary by keeping only the pictures of George W. Bush and Bill Clinton. |
37 | 31 |
|
38 | | - def fit(self, X, y): |
39 | | - return self |
| 32 | +# %% |
| 33 | +import numpy as np |
| 34 | +from sklearn.datasets import fetch_lfw_people |
| 35 | + |
| 36 | +data = fetch_lfw_people() |
| 37 | +george_bush_id = 1871 # Photos of George W. Bush |
| 38 | +bill_clinton_id = 531 # Photos of Bill Clinton |
| 39 | +classes = [george_bush_id, bill_clinton_id] |
| 40 | +classes_name = np.array(["B. Clinton", "G.W. Bush"], dtype=object) |
| 41 | + |
| 42 | +# %% |
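| | +# Keep only the photos of the two selected people, then encode the target as |
| | +# the person's name rather than the original integer label. |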
| 43 | +mask_photos = np.isin(data.target, classes) |
| 44 | +X, y = data.data[mask_photos], data.target[mask_photos] |
| 45 | +y = (y == george_bush_id).astype(np.int8) |
| 46 | +y = classes_name[y] |
| 47 | + |
| 48 | +# %% [markdown] |
| 49 | +# We can check the ratio between the two classes. |
| 50 | + |
| 51 | +# %% |
| 52 | +import pandas as pd |
| 53 | + |
| 54 | +class_distribution = pd.Series(y).value_counts(normalize=True) |
| 55 | +ax = class_distribution.plot.barh() |
| 56 | +ax.set_title("Class distribution") |
| 57 | +pos_label = class_distribution.idxmin() |
| 58 | +print(f"The positive label considered as the minority class is {pos_label}") |
| 59 | + |
| 60 | +# %% [markdown] |
| 61 | +# We see that we have an imbalanced classification problem with ~95% of the |
| 62 | +# data belonging to the class G.W. Bush. |
| 63 | +# |
| 64 | +# Compare over-sampling approaches |
| 65 | +# -------------------------------- |
| 66 | +# |
| 67 | +# We will apply different over-sampling approaches with a kNN classifier |
| 68 | +# to check whether we can recognize the two presidents. The evaluation is |
| 69 | +# performed through cross-validation, and we will plot the mean ROC curve. |
| 70 | +# |
| 71 | +# We will create different pipelines and evaluate them. |
| 72 | + |
| 73 | +# %% |
| 74 | +from imblearn import FunctionSampler |
| 75 | +from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE |
| 76 | +from imblearn.pipeline import make_pipeline |
| 77 | +from sklearn.neighbors import KNeighborsClassifier |
40 | 78 |
|
41 | | - def fit_resample(self, X, y): |
42 | | - return self.sample(X, y) |
| 79 | +classifier = KNeighborsClassifier(n_neighbors=3) |
43 | 80 |
|
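| | +# ``FunctionSampler()`` with no arguments leaves the data untouched; it acts |
| | +# as the "no over-sampling" baseline in the comparison below. |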
| 81 | +pipelines = [ |
| 82 | + make_pipeline(FunctionSampler(), classifier), |
| 83 | + make_pipeline(RandomOverSampler(random_state=42), classifier), |
| 84 | + make_pipeline(ADASYN(random_state=42), classifier), |
| 85 | + make_pipeline(SMOTE(random_state=42), classifier), |
| 86 | +] |
44 | 87 |
|
45 | | -cv = StratifiedKFold(n_splits=3) |
| 88 | +# %% |
| 89 | +from sklearn.model_selection import StratifiedKFold |
46 | 90 |
|
47 | | -# Load the dataset |
48 | | -data = datasets.fetch_lfw_people() |
49 | | -majority_person = 1871 # 530 photos of George W Bush |
50 | | -minority_person = 531 # 29 photos of Bill Clinton |
51 | | -majority_idxs = np.flatnonzero(data.target == majority_person) |
52 | | -minority_idxs = np.flatnonzero(data.target == minority_person) |
53 | | -idxs = np.hstack((majority_idxs, minority_idxs)) |
54 | | - |
55 | | -X = data.data[idxs] |
56 | | -y = data.target[idxs] |
57 | | -y[y == majority_person] = 0 |
58 | | -y[y == minority_person] = 1 |
59 | | - |
60 | | -classifier = ["3NN", neighbors.KNeighborsClassifier(3)] |
61 | | - |
62 | | -samplers = [ |
63 | | - ["Standard", DummySampler()], |
64 | | - ["ADASYN", ADASYN(random_state=RANDOM_STATE)], |
65 | | - ["ROS", RandomOverSampler(random_state=RANDOM_STATE)], |
66 | | - ["SMOTE", SMOTE(random_state=RANDOM_STATE)], |
67 | | -] |
| 91 | +cv = StratifiedKFold(n_splits=3) |
68 | 92 |
|
69 | | -pipelines = [ |
70 | | - [ |
71 | | - f"{sampler[0]}-{classifier[0]}", |
72 | | - make_pipeline(sampler[1], classifier[1]), |
73 | | - ] |
74 | | - for sampler in samplers |
75 | | -] |
| 93 | +# %% [markdown] |
| 94 | +# We will compute the mean ROC curve for each pipeline using the different |
| 95 | +# splits provided by the :class:`~sklearn.model_selection.StratifiedKFold` |
| 96 | +# cross-validation, which preserves the class ratio in each fold. |
76 | 97 |
|
77 | | -fig = plt.figure() |
78 | | -ax = fig.add_subplot(1, 1, 1) |
| 98 | +# %% |
| 99 | +import matplotlib.pyplot as plt |
| 100 | +from sklearn.metrics import RocCurveDisplay, roc_curve, auc |
79 | 101 |
|
80 | | -for name, pipeline in pipelines: |
81 | | - mean_tpr = 0.0 |
82 | | - mean_fpr = np.linspace(0, 1, 100) |
| 102 | +disp = [] |
| 103 | +for model in pipelines: |
| 104 | + # compute the mean fpr/tpr to get the mean ROC curve |
| 105 | + mean_tpr, mean_fpr = 0.0, np.linspace(0, 1, 100) |
83 | 106 | for train, test in cv.split(X, y): |
84 | | - probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test]) |
85 | | - fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) |
86 | | - mean_tpr += interp(mean_fpr, fpr, tpr) |
| 107 | + model.fit(X[train], y[train]) |
| 108 | + y_proba = model.predict_proba(X[test]) |
| 109 | + |
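| | +    # ``predict_proba`` returns columns in the order of ``model.classes_``, |
| | +    # so look up the column of the positive (minority) class. |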
| 110 | + pos_label_idx = np.flatnonzero(model.classes_ == pos_label)[0] |
| 111 | + fpr, tpr, thresholds = roc_curve( |
| 112 | + y[test], y_proba[:, pos_label_idx], pos_label=pos_label |
| 113 | + ) |
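| | +    # Interpolate this fold's curve onto the common FPR grid so that the |
| | +    # TPRs can be averaged across folds. |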
| 114 | + mean_tpr += np.interp(mean_fpr, fpr, tpr) |
87 | 115 | mean_tpr[0] = 0.0 |
88 | | - roc_auc = auc(fpr, tpr) |
89 | 116 |
|
90 | 117 | mean_tpr /= cv.get_n_splits(X, y) |
91 | 118 | mean_tpr[-1] = 1.0 |
92 | 119 | mean_auc = auc(mean_fpr, mean_tpr) |
93 | | - plt.plot( |
94 | | - mean_fpr, |
95 | | - mean_tpr, |
96 | | - linestyle="--", |
97 | | - label=f"{name} (area = {mean_auc:.2f})", |
98 | | - lw=LW, |
99 | | - ) |
100 | | - |
101 | | -plt.plot([0, 1], [0, 1], linestyle="--", lw=LW, color="k", label="Luck") |
102 | | - |
103 | | -# make nice plotting |
104 | | -ax.spines["top"].set_visible(False) |
105 | | -ax.spines["right"].set_visible(False) |
106 | | -ax.get_xaxis().tick_bottom() |
107 | | -ax.get_yaxis().tick_left() |
108 | | -ax.spines["left"].set_position(("outward", 10)) |
109 | | -ax.spines["bottom"].set_position(("outward", 10)) |
110 | | -plt.xlim([0, 1]) |
111 | | -plt.ylim([0, 1]) |
112 | | -plt.xlabel("False Positive Rate") |
113 | | -plt.ylabel("True Positive Rate") |
114 | | -plt.title("Receiver operating characteristic example") |
115 | 120 |
|
116 | | -plt.legend(loc="lower right") |
| 121 | + # Create a display that we will reuse to make the aggregated plots for |
| 122 | + # all methods |
| 123 | + disp.append( |
| 124 | + RocCurveDisplay( |
| 125 | + fpr=mean_fpr, |
| 126 | + tpr=mean_tpr, |
| 127 | + roc_auc=mean_auc, |
| 128 | +            estimator_name=model[0].__class__.__name__, |
| 129 | + ) |
| 130 | + ) |
117 | 131 |
|
| 132 | +# %% [markdown] |
| 133 | +# In the previous cell, we created the different mean ROC curves, and we can |
| 134 | +# now plot them on the same axes. |
| 135 | + |
| 136 | +# %% |
| 137 | +fig, ax = plt.subplots(figsize=(9, 9)) |
| 138 | +for d in disp: |
| 139 | + d.plot(ax=ax, linestyle="--") |
| 140 | +ax.plot([0, 1], [0, 1], linestyle="--", color="k") |
| 141 | +ax.axis("square") |
| 142 | +fig.suptitle("Comparison of over-sampling methods with a 3NN classifier") |
| 143 | +ax.set_xlim([0, 1]) |
| 144 | +ax.set_ylim([0, 1]) |
| 145 | +sns.despine(offset=10, ax=ax) |
118 | 146 | plt.show() |
| 147 | + |
| 148 | +# %% [markdown] |
| 149 | +# We see that for this task, methods that generate new samples by |
| 150 | +# interpolation (i.e. ADASYN and SMOTE) perform better than random |
| 151 | +# over-sampling or no resampling. |
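| | + |
| | +# %% [markdown] |
| | +# As a quick numerical check, a minimal sketch reusing the ``disp`` list |
| | +# built above prints the aggregated AUC of each method: |
| | + |
| | +# %% |
| | +for d in disp: |
| | +    print(f"{d.estimator_name}: mean ROC AUC = {d.roc_auc:.2f}") |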