# Authors: Guillaume Lemaitre <[email protected]>
# License: MIT

# %%
print(__doc__)
import seaborn as sns

sns.set_context("poster")

# %% [markdown]
# Create an imbalanced dataset
# ----------------------------
#
# First, we will create an imbalanced data set from the iris data set.

# %%
from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

iris = load_iris(as_frame=True)

sampling_strategy = {0: 10, 1: 20, 2: 47}
X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy)

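# %% [markdown]
# As a quick check (an aside, not part of the original example), we can also
# print the class counts before and after `make_imbalance`. Because the data
# were loaded with `as_frame=True`, both `iris.target` and `y` are pandas
# Series and expose `value_counts`.

# %%
print(f"Original class counts:\n{iris.target.value_counts()}\n")
print(f"Imbalanced class counts:\n{y.value_counts()}")
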
# %%
import matplotlib.pyplot as plt

fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
autopct = "%.2f"
iris.target.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Original")
y.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Imbalanced")
fig.tight_layout()

# %% [markdown]
# Using ``sampling_strategy`` in resampling algorithms
# ====================================================
#
# `sampling_strategy` as a `float`
# --------------------------------
#
# `sampling_strategy` can be given a `float`. For **under-sampling
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
# :math:`\\alpha_{us} = N_{m} / N_{rM}` where :math:`N_{rM}` and
# :math:`N_{m}` are the number of samples in the majority class after
# resampling and the number of samples in the minority class, respectively.

# %%
import numpy as np

# select only 2 classes since the ratio makes sense in this case
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

# %%
from imblearn.under_sampling import RandomUnderSampler

sampling_strategy = 0.8
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Under-sampling")

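# %% [markdown]
# As an illustrative check (an aside, not in the original example): with
# `sampling_strategy=0.8`, the minority class keeps its 10 samples and the
# majority class is under-sampled to roughly
# :math:`N_{m} / \\alpha_{us} = 10 / 0.8 \\approx 12` samples.

# %%
print(y_res.value_counts())
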
# %% [markdown]
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

# %%
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Over-sampling")

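# %% [markdown]
# Similarly (an illustrative aside, not in the original example): with the same
# `sampling_strategy=0.8`, the majority class keeps its 47 samples while the
# minority class is over-sampled to roughly
# :math:`\\alpha_{os} \\times N_{M} = 0.8 \\times 47 \\approx 37` samples.

# %%
print(y_res.value_counts())
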
# %% [markdown]
# `sampling_strategy` as a `str`
# ------------------------------
#
# `sampling_strategy` can be given as a string that specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
# Note that we are using multiple classes from now on.

# %%
sampling_strategy = "not minority"

fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Under-sampling")

sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Over-sampling")

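# %% [markdown]
# Other strings are accepted as well. As a brief aside (not part of the
# original example), `"minority"` asks an over-sampler to resample only the
# minority class; refer to the imbalanced-learn documentation for the full
# list of accepted strings such as `"majority"`, `"all"`, and `"auto"`.

# %%
ros = RandomOverSampler(sampling_strategy="minority")
X_res, y_res = ros.fit_resample(X, y)
print(y_res.value_counts())
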
# %% [markdown]
# With a **cleaning method**, the number of samples in each class will not be
# equalized, even if targeted.

# %%
from imblearn.under_sampling import TomekLinks

sampling_strategy = "not minority"
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Cleaning")

# %% [markdown]
# `sampling_strategy` as a `dict`
# -------------------------------
#
# When `sampling_strategy` is a `dict`, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This works for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a `list` instead.

# %%
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))

sampling_strategy = {0: 10, 1: 15, 2: 20}
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Under-sampling")

sampling_strategy = {0: 25, 1: 35, 2: 47}
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Over-sampling")

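# %% [markdown]
# To illustrate the note above (an aside, not in the original example),
# passing a `dict` to a cleaning method such as `TomekLinks` raises an error.

# %%
try:
    TomekLinks(sampling_strategy={0: 10}).fit_resample(X, y)
except ValueError as exc:
    print(f"ValueError: {exc}")
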
# %% [markdown]
# `sampling_strategy` as a `list`
# -------------------------------
#
# When `sampling_strategy` is a `list`, the list contains the targeted
# classes. It is used only for **cleaning methods** and raises an error
# otherwise.

# %%
sampling_strategy = [0, 1, 2]
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Cleaning")

# %% [markdown]
# `sampling_strategy` as a callable
# ---------------------------------
#
# When callable, the function takes `y` and returns a `dict`. The keys
# correspond to the targeted classes. The values correspond to the desired
# number of samples for each class.


# %%
def ratio_multiplier(y):
    from collections import Counter

    multiplier = {1: 0.7, 2: 0.95}
    target_stats = Counter(y)
    for key, value in target_stats.items():
        if key in multiplier:
            target_stats[key] = int(value * multiplier[key])
    return target_stats


X_res, y_res = RandomUnderSampler(sampling_strategy=ratio_multiplier).fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
ax.set_title("Under-sampling")
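# %% [markdown]
# As a last illustrative aside (not in the original example), calling the
# function directly shows the target counts it requests: class 1 is reduced to
# `int(0.7 * 20) = 14` samples and class 2 to `int(0.95 * 47) = 44`, while
# class 0 is left untouched.

# %%
print(ratio_multiplier(y))
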
plt.show()