287 changes: 22 additions & 265 deletions imblearn/pipeline.py
@@ -12,10 +12,9 @@
# Christos Aridas
# Guillaume Lemaitre <[email protected]>
# License: BSD

from sklearn import pipeline
from sklearn.base import clone
from sklearn.utils import Bunch, _print_elapsed_time
from sklearn.utils import _print_elapsed_time
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import check_memory

@@ -145,7 +144,8 @@ def _validate_steps(self):
):
raise TypeError(
"All intermediate steps of the chain should "
"be estimators that implement fit and transform or sample."
"be estimators that implement fit and transform or "
"fit_resample."
" '%s' implements both)" % (t)
)

@@ -167,6 +167,21 @@ def _validate_steps(self):
% (estimator, type(estimator))
)
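
For context, a hedged illustration of the kind of step this check rejects; the AmbiguousStep class and step names are hypothetical, only the raised TypeError comes from the code above:

# Hypothetical illustration: a step exposing both ``transform`` and
# ``fit_resample`` is ambiguous, so ``_validate_steps`` rejects it at fit time.
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline


class AmbiguousStep(BaseEstimator):
    """Toy step that is both a transformer and a sampler (not allowed)."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def fit_resample(self, X, y):
        return X, y


pipe = Pipeline([("bad", AmbiguousStep()), ("clf", LogisticRegression())])
# pipe.fit(X, y)  # raises TypeError: "... implements both"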

Reviewer comment: At first I thought that we might need tests for this, but since the tests of the pipeline are passing it might just be a sanity check. So it's OK.

def _iter(
    self, with_final=True, filter_passthrough=True, filter_resample=True
):
"""Generate (idx, (name, trans)) tuples from self.steps.

When `filter_passthrough` is `True`, 'passthrough' and None
transformers are filtered out. When `filter_resample` is `True`,
estimators with a `fit_resample` method are filtered out.
"""
it = super()._iter(with_final, filter_passthrough)
if filter_resample:
return filter(lambda x: not hasattr(x[-1], "fit_resample"), it)
else:
return it
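
A hedged usage sketch of the new filter_resample flag; the pipeline instance and step names below are hypothetical, and `_iter` is a private helper called here only for illustration:

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ("scale", StandardScaler()),
    ("under", RandomUnderSampler()),
    ("clf", LogisticRegression()),
])

# Default: samplers are filtered out, as at predict/transform time.
print([name for _, name, _ in pipe._iter(with_final=False)])
# ['scale']

# Inside ``_fit`` the sampler is kept by passing filter_resample=False.
print([name for _, name, _ in pipe._iter(with_final=False, filter_resample=False)])
# ['scale', 'under']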

# Estimator interface

def _fit(self, X, y=None, **fit_params):
@@ -175,7 +190,7 @@ def _fit(self, X, y=None, **fit_params):
# Setup the memory
memory = check_memory(self.memory)

fit_transform_one_cached = memory.cache(_fit_transform_one)
fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
fit_resample_one_cached = memory.cache(_fit_resample_one)

fit_params_steps = {
@@ -194,7 +209,8 @@ def _fit(self, X, y=None, **fit_params):
for (step_idx,
name,
transformer) in self._iter(with_final=False,
filter_passthrough=False):
filter_passthrough=False,
filter_resample=False):
if (transformer is None or transformer == 'passthrough'):
with _print_elapsed_time('Pipeline',
self._log_message(step_idx)):
@@ -208,7 +224,7 @@ def _fit(self, X, y=None, **fit_params):
else:
cloned_transformer = clone(transformer)
elif hasattr(memory, "cachedir"):
# joblib < 0.11
# joblib <= 0.11
if memory.cachedir is None:
# we do not clone when caching is disabled to
# preserve backward compatibility
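
The cachedir branch above only covers old joblib (<= 0.11); when a memory argument is given, the `pipeline._fit_transform_one` and `_fit_resample_one` calls are cached. A hedged usage sketch, assuming joblib >= 0.12 where Memory takes a location argument; step names are made up:

import tempfile
from joblib import Memory
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

cache = Memory(location=tempfile.mkdtemp(), verbose=0)
pipe = Pipeline(
    [
        ("scale", StandardScaler()),
        ("over", RandomOverSampler()),
        ("clf", LogisticRegression()),
    ],
    memory=cache,
)
# Refitting on identical inputs reuses the cached transform/resample results
# instead of refitting the intermediate steps.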
@@ -354,38 +370,6 @@ def fit_resample(self, X, y=None, **fit_params):
elif hasattr(last_step, "fit_resample"):
return last_step.fit_resample(Xt, yt, **fit_params)
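
A hedged sketch of the sampler-final branch above; the dataset and step names are made up for illustration:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("under", RandomUnderSampler(random_state=0)),
])

# The last step is a sampler, so fit_resample returns the resampled data.
X_res, y_res = pipe.fit_resample(X, y)
print(Counter(y), Counter(y_res))  # imbalanced vs. roughly balanced counts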

@if_delegate_has_method(delegate="_final_estimator")
def predict(self, X, **predict_params):
"""Apply transformers/samplers to the data, and predict with the final
estimator

Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.

**predict_params : dict of string -> object
Parameters to the ``predict`` called at the end of all
transformations in the pipeline. Note that while this may be
used to return uncertainties from some models with return_std
or return_cov, uncertainties that are generated by the
transformations in the pipeline are not propagated to the
final estimator.

Returns
-------
y_pred : array-like

"""
Xt = X
for _, _, transform in self._iter(with_final=False):
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
return self.steps[-1][-1].predict(Xt, **predict_params)
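
Because the `_iter` override above already skips samplers by default, the inherited sklearn `predict` (like the other prediction methods removed below) behaves the same as this deleted code. A hedged usage sketch with hypothetical step names:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
pipe = Pipeline([
    ("over", RandomOverSampler(random_state=0)),  # applied during fit only
    ("clf", LogisticRegression(max_iter=1000)),
])
pipe.fit(X, y)
y_pred = pipe.predict(X)  # the sampler step is skipped at predict time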

@if_delegate_has_method(delegate="_final_estimator")
def fit_predict(self, X, y=None, **fit_params):
"""Applies fit_predict of last step in pipeline after transforms.
@@ -419,233 +403,6 @@ def fit_predict(self, X, y=None, **fit_params):
y_pred = self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
return y_pred

@if_delegate_has_method(delegate="_final_estimator")
def predict_proba(self, X):
"""Apply transformers/samplers, and predict_proba of the final
estimator

Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.

Returns
-------
y_proba : array-like, shape = [n_samples, n_classes]

"""
Xt = X
for _, _, transform in self._iter(with_final=False):
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
return self.steps[-1][-1].predict_proba(Xt)

@if_delegate_has_method(delegate="_final_estimator")
def score_samples(self, X):
"""Apply transforms, and score_samples of the final estimator.
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
y_score : ndarray, shape (n_samples,)
"""
Xt = X
for _, _, transformer in self._iter(with_final=False):
if hasattr(transformer, "fit_resample"):
pass
else:
Xt = transformer.transform(Xt)
return self.steps[-1][-1].score_samples(Xt)

@if_delegate_has_method(delegate="_final_estimator")
def decision_function(self, X):
"""Apply transformers/samplers, and decision_function of the final
estimator

Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.

Returns
-------
y_score : array-like, shape = [n_samples, n_classes]

"""
Xt = X
for _, _, transform in self._iter(with_final=False):
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
return self.steps[-1][-1].decision_function(Xt)

@if_delegate_has_method(delegate="_final_estimator")
def predict_log_proba(self, X):
"""Apply transformers/samplers, and predict_log_proba of the final
estimator

Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.

Returns
-------
y_score : array-like, shape = [n_samples, n_classes]

"""
Xt = X
for _, _, transform in self._iter(with_final=False):
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
return self.steps[-1][-1].predict_log_proba(Xt)

@property
def transform(self):
"""Apply transformers/samplers, and transform with the final estimator

This also works where final estimator is ``None``: all prior
transformations are applied.

Parameters
----------
X : iterable
Data to transform. Must fulfill input requirements of first step
of the pipeline.

Returns
-------
Xt : array-like, shape = [n_samples, n_transformed_features]
"""
# _final_estimator is None or has transform, otherwise attribute error
if self._final_estimator != "passthrough":
self._final_estimator.transform
return self._transform

def _transform(self, X):
Xt = X
for _, _, transform in self._iter():
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
return Xt

@property
def inverse_transform(self):
"""Apply inverse transformations in reverse order

All estimators in the pipeline must support ``inverse_transform``.

Parameters
----------
Xt : array-like, shape = [n_samples, n_transformed_features]
Data samples, where ``n_samples`` is the number of samples and
``n_features`` is the number of features. Must fulfill
input requirements of last step of pipeline's
``inverse_transform`` method.

Returns
-------
Xt : array-like, shape = [n_samples, n_features]
"""
# raise AttributeError if necessary for hasattr behaviour
for _, _, transform in self._iter():
transform.inverse_transform
return self._inverse_transform

def _inverse_transform(self, X):
Xt = X
reverse_iter = reversed(list(self._iter()))
for _, _, transform in reverse_iter:
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.inverse_transform(Xt)
return Xt

@if_delegate_has_method(delegate="_final_estimator")
def score(self, X, y=None, sample_weight=None):
"""Apply transformers/samplers, and score with the final estimator

Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.

y : iterable, default=None
Targets used for scoring. Must fulfill label requirements for all
steps of the pipeline.

sample_weight : array-like, default=None
If not None, this argument is passed as ``sample_weight`` keyword
argument to the ``score`` method of the final estimator.

Returns
-------
score : float
"""
Xt = X
for _, _, transform in self._iter(with_final=False):
if hasattr(transform, "fit_resample"):
pass
else:
Xt = transform.transform(Xt)
score_params = {}
if sample_weight is not None:
score_params["sample_weight"] = sample_weight
return self.steps[-1][-1].score(Xt, y, **score_params)

@if_delegate_has_method(delegate='_final_estimator')
def score_samples(self, X):
"""Apply transforms, and score_samples of the final estimator.
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
y_score : ndarray, shape (n_samples,)
"""
Xt = X
for _, _, transformer in self._iter(with_final=False):
if hasattr(transformer, "fit_resample"):
pass
else:
Xt = transformer.transform(Xt)
return self.steps[-1][-1].score_samples(Xt)


def _fit_transform_one(transformer,
X,
y,
weight,
message_clsname='',
message=None,
**fit_params):
with _print_elapsed_time(message_clsname, message):
if hasattr(transformer, "fit_transform"):
res = transformer.fit_transform(X, y, **fit_params)
else:
res = transformer.fit(X, y, **fit_params).transform(X)
# if we have a weight for this transformer, multiply output
if weight is None:
return res, transformer
return res * weight, transformer
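
This local duplicate of `_fit_transform_one` is dropped in favour of sklearn's `pipeline._fit_transform_one` (cached in `_fit` above), while the sampler counterpart `_fit_resample_one` stays local. A minimal sketch of the idea behind that helper, not imblearn's exact implementation:

def fit_resample_one_sketch(sampler, X, y, **fit_params):
    # Fit the sampler and return the resampled data together with the fitted
    # sampler, mirroring the (result, fitted_step) convention used for
    # transformers so that joblib can cache the whole tuple.
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
    return (X_res, y_res), sampler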


def _fit_resample_one(sampler,
X,