[MRG+1] Pipeline Refactor - Reduce Code Footprint #654
Merged

Changes from all commits (5 commits):
- 30dde87 refactored _iter to allow use inheritance to remove derived methods (MattEding)
- d4e7aea change _iter semantics; remove unused import (MattEding)
- ee7773b Update pipeline.py (glemaitre)
- 9fa8b57 Update pipeline.py (glemaitre)
- a9d7909 PEP8 (glemaitre)
pipeline.py

@@ -12,10 +12,9 @@
 # Christos Aridas
 # Guillaume Lemaitre <[email protected]>
 # License: BSD

 from sklearn import pipeline
 from sklearn.base import clone
-from sklearn.utils import Bunch, _print_elapsed_time
+from sklearn.utils import _print_elapsed_time
 from sklearn.utils.metaestimators import if_delegate_has_method
 from sklearn.utils.validation import check_memory
@@ -145,7 +144,8 @@ def _validate_steps(self):
             ):
                 raise TypeError(
                     "All intermediate steps of the chain should "
-                    "be estimators that implement fit and transform or sample."
+                    "be estimators that implement fit and transform or "
+                    "fit_resample."
                     " '%s' implements both)" % (t)
                 )
@@ -167,6 +167,21 @@ def _validate_steps(self):
                 % (estimator, type(estimator))
             )

+    def _iter(
+        self, with_final=True, filter_passthrough=True, filter_resample=True
+    ):
+        """Generate (idx, (name, trans)) tuples from self.steps.
+
+        When `filter_passthrough` is `True`, 'passthrough' and None
+        transformers are filtered out. When `filter_resample` is `True`,
+        estimators with a `fit_resample` method are filtered out.
+        """
+        it = super()._iter(with_final, filter_passthrough)
+        if filter_resample:
+            return filter(lambda x: not hasattr(x[-1], "fit_resample"), it)
+        else:
+            return it
+
     # Estimator interface

     def _fit(self, X, y=None, **fit_params):
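This override is the heart of the refactor: every derived method in the parent class walks the steps through self._iter, so filtering resamplers out of that one iterator makes all the hand-written predict/score/transform overrides removed below redundant. A minimal, self-contained sketch of the mechanism (toy classes, not the real sklearn/imblearn API):

class BasePipeline:
    # Stands in for sklearn.pipeline.Pipeline: every public method
    # walks the steps through self._iter.
    def __init__(self, steps):
        self.steps = steps

    def _iter(self, with_final=True):
        steps = self.steps if with_final else self.steps[:-1]
        for idx, (name, trans) in enumerate(steps):
            yield idx, name, trans

    def predict(self, X):
        for _, _, trans in self._iter(with_final=False):
            X = trans.transform(X)
        return self.steps[-1][-1].predict(X)

class ResamplePipeline(BasePipeline):
    # Overriding _iter to drop samplers changes the behaviour of every
    # inherited method at once; no per-method override is needed.
    def _iter(self, with_final=True, filter_resample=True):
        it = super()._iter(with_final)
        if filter_resample:
            return filter(lambda x: not hasattr(x[-1], "fit_resample"), it)
        return it

class Doubler:
    def transform(self, X):
        return [2 * x for x in X]

class Sampler:
    def fit_resample(self, X, y):  # only relevant at fit time
        return X, y

class Clf:
    def predict(self, X):
        return [int(x > 0) for x in X]

pipe = ResamplePipeline([("scale", Doubler()), ("under", Sampler()), ("clf", Clf())])
print(pipe.predict([-1, 3]))  # [0, 1] -- the inherited predict skipped Sampler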
@@ -175,7 +190,7 @@ def _fit(self, X, y=None, **fit_params):
         # Setup the memory
         memory = check_memory(self.memory)

-        fit_transform_one_cached = memory.cache(_fit_transform_one)
+        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)
         fit_resample_one_cached = memory.cache(_fit_resample_one)

         fit_params_steps = {
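With the local copy of _fit_transform_one deleted (see the final hunk below), the cached callable now points at sklearn's own helper. For reference, joblib's public Memory.cache wraps a function in a memoizing proxy; a hedged sketch using a simplified stand-in for imblearn's _fit_resample_one helper:

from joblib import Memory

memory = Memory(location=None)  # location=None disables caching, as with memory=None pipelines

def fit_resample_one(sampler, X, y):
    # Simplified stand-in for imblearn's _fit_resample_one.
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res, sampler

# The wrapper recomputes only when its inputs change; with a real
# location (e.g. Memory("/tmp/pipe_cache")) results persist across runs.
fit_resample_one_cached = memory.cache(fit_resample_one)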
@@ -194,7 +209,8 @@ def _fit(self, X, y=None, **fit_params):
         for (step_idx,
              name,
              transformer) in self._iter(with_final=False,
-                                        filter_passthrough=False):
+                                        filter_passthrough=False,
+                                        filter_resample=False):
             if (transformer is None or transformer == 'passthrough'):
                 with _print_elapsed_time('Pipeline',
                                          self._log_message(step_idx)):
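Passing filter_resample=False here is the counterpart of the new default: at fit time the samplers must be visited so that fit_resample can rewrite both X and y. A hedged, standalone sketch of that dispatch (illustrative names; no cloning, caching, or logging):

def fit_steps(steps, X, y):
    # Walk every step, including samplers and passthroughs.
    for name, step in steps:
        if step is None or step == "passthrough":
            continue  # nothing to fit
        if hasattr(step, "fit_resample"):
            X, y = step.fit_resample(X, y)   # samplers rewrite X and y
        else:
            X = step.fit_transform(X, y)     # transformers rewrite X only
    return X, y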
@@ -208,7 +224,7 @@ def _fit(self, X, y=None, **fit_params):
                 else:
                     cloned_transformer = clone(transformer)
             elif hasattr(memory, "cachedir"):
-                # joblib < 0.11
+                # joblib <= 0.11
                 if memory.cachedir is None:
                     # we do not clone when caching is disabled to
                     # preserve backward compatibility
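The corrected comment reflects that Memory.cachedir existed up to and including joblib 0.11; joblib 0.12 renamed it to location. A hedged sketch of the full guard (the branch for newer joblib is assumed from the surrounding elif, which this diff only partially shows):

def caching_disabled(memory):
    # joblib >= 0.12 exposes Memory.location; joblib <= 0.11 used cachedir.
    if hasattr(memory, "location"):
        return memory.location is None
    elif hasattr(memory, "cachedir"):  # joblib <= 0.11
        return memory.cachedir is None
    return True  # not a joblib Memory; assume no caching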
@@ -354,38 +370,6 @@ def fit_resample(self, X, y=None, **fit_params):
         elif hasattr(last_step, "fit_resample"):
             return last_step.fit_resample(Xt, yt, **fit_params)

-    @if_delegate_has_method(delegate="_final_estimator")
-    def predict(self, X, **predict_params):
-        """Apply transformers/samplers to the data, and predict with the final
-        estimator
-
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-
-        **predict_params : dict of string -> object
-            Parameters to the ``predict`` called at the end of all
-            transformations in the pipeline. Note that while this may be
-            used to return uncertainties from some models with return_std
-            or return_cov, uncertainties that are generated by the
-            transformations in the pipeline are not propagated to the
-            final estimator.
-
-        Returns
-        -------
-        y_pred : array-like
-
-        """
-        Xt = X
-        for _, _, transform in self._iter(with_final=False):
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        return self.steps[-1][-1].predict(Xt, **predict_params)
-
     @if_delegate_has_method(delegate="_final_estimator")
     def fit_predict(self, X, y=None, **fit_params):
         """Applies fit_predict of last step in pipeline after transforms.
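Because of the _iter override, dropping these hand-written methods does not change behaviour: the versions inherited from sklearn.pipeline.Pipeline still skip samplers outside of fit. A quick end-to-end check using imblearn's public API (the particular estimators are just examples):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
pipe = Pipeline([
    ("under", RandomUnderSampler(random_state=0)),
    ("clf", LogisticRegression()),
])
pipe.fit(X, y)            # the sampler runs here, via fit_resample
y_pred = pipe.predict(X)  # the inherited predict skips the sampler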
@@ -419,233 +403,6 @@ def fit_predict(self, X, y=None, **fit_params):
         y_pred = self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
         return y_pred

-    @if_delegate_has_method(delegate="_final_estimator")
-    def predict_proba(self, X):
-        """Apply transformers/samplers, and predict_proba of the final
-        estimator
-
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-
-        Returns
-        -------
-        y_proba : array-like, shape = [n_samples, n_classes]
-
-        """
-        Xt = X
-        for _, _, transform in self._iter(with_final=False):
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        return self.steps[-1][-1].predict_proba(Xt)
-
-    @if_delegate_has_method(delegate="_final_estimator")
-    def score_samples(self, X):
-        """Apply transforms, and score_samples of the final estimator.
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-        Returns
-        -------
-        y_score : ndarray, shape (n_samples,)
-        """
-        Xt = X
-        for _, _, transformer in self._iter(with_final=False):
-            if hasattr(transformer, "fit_resample"):
-                pass
-            else:
-                Xt = transformer.transform(Xt)
-        return self.steps[-1][-1].score_samples(Xt)
-
-    @if_delegate_has_method(delegate="_final_estimator")
-    def decision_function(self, X):
-        """Apply transformers/samplers, and decision_function of the final
-        estimator
-
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-
-        Returns
-        -------
-        y_score : array-like, shape = [n_samples, n_classes]
-
-        """
-        Xt = X
-        for _, _, transform in self._iter(with_final=False):
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        return self.steps[-1][-1].decision_function(Xt)
-
-    @if_delegate_has_method(delegate="_final_estimator")
-    def predict_log_proba(self, X):
-        """Apply transformers/samplers, and predict_log_proba of the final
-        estimator
-
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-
-        Returns
-        -------
-        y_score : array-like, shape = [n_samples, n_classes]
-
-        """
-        Xt = X
-        for _, _, transform in self._iter(with_final=False):
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        return self.steps[-1][-1].predict_log_proba(Xt)
-
-    @property
-    def transform(self):
-        """Apply transformers/samplers, and transform with the final estimator
-
-        This also works where final estimator is ``None``: all prior
-        transformations are applied.
-
-        Parameters
-        ----------
-        X : iterable
-            Data to transform. Must fulfill input requirements of first step
-            of the pipeline.
-
-        Returns
-        -------
-        Xt : array-like, shape = [n_samples, n_transformed_features]
-        """
-        # _final_estimator is None or has transform, otherwise attribute error
-        if self._final_estimator != "passthrough":
-            self._final_estimator.transform
-        return self._transform
-
-    def _transform(self, X):
-        Xt = X
-        for _, _, transform in self._iter():
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        return Xt
-
-    @property
-    def inverse_transform(self):
-        """Apply inverse transformations in reverse order
-
-        All estimators in the pipeline must support ``inverse_transform``.
-
-        Parameters
-        ----------
-        Xt : array-like, shape = [n_samples, n_transformed_features]
-            Data samples, where ``n_samples`` is the number of samples and
-            ``n_features`` is the number of features. Must fulfill
-            input requirements of last step of pipeline's
-            ``inverse_transform`` method.
-
-        Returns
-        -------
-        Xt : array-like, shape = [n_samples, n_features]
-        """
-        # raise AttributeError if necessary for hasattr behaviour
-        for _, _, transform in self._iter():
-            transform.inverse_transform
-        return self._inverse_transform
-
-    def _inverse_transform(self, X):
-        Xt = X
-        reverse_iter = reversed(list(self._iter()))
-        for _, _, transform in reverse_iter:
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.inverse_transform(Xt)
-        return Xt
-
-    @if_delegate_has_method(delegate="_final_estimator")
-    def score(self, X, y=None, sample_weight=None):
-        """Apply transformers/samplers, and score with the final estimator
-
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-
-        y : iterable, default=None
-            Targets used for scoring. Must fulfill label requirements for all
-            steps of the pipeline.
-
-        sample_weight : array-like, default=None
-            If not None, this argument is passed as ``sample_weight`` keyword
-            argument to the ``score`` method of the final estimator.
-
-        Returns
-        -------
-        score : float
-        """
-        Xt = X
-        for _, _, transform in self._iter(with_final=False):
-            if hasattr(transform, "fit_resample"):
-                pass
-            else:
-                Xt = transform.transform(Xt)
-        score_params = {}
-        if sample_weight is not None:
-            score_params["sample_weight"] = sample_weight
-        return self.steps[-1][-1].score(Xt, y, **score_params)
-
-    @if_delegate_has_method(delegate='_final_estimator')
-    def score_samples(self, X):
-        """Apply transforms, and score_samples of the final estimator.
-        Parameters
-        ----------
-        X : iterable
-            Data to predict on. Must fulfill input requirements of first step
-            of the pipeline.
-        Returns
-        -------
-        y_score : ndarray, shape (n_samples,)
-        """
-        Xt = X
-        for _, _, transformer in self._iter(with_final=False):
-            if hasattr(transformer, "fit_resample"):
-                pass
-            else:
-                Xt = transformer.transform(Xt)
-        return self.steps[-1][-1].score_samples(Xt)
-
-
-def _fit_transform_one(transformer,
-                       X,
-                       y,
-                       weight,
-                       message_clsname='',
-                       message=None,
-                       **fit_params):
-    with _print_elapsed_time(message_clsname, message):
-        if hasattr(transformer, "fit_transform"):
-            res = transformer.fit_transform(X, y, **fit_params)
-        else:
-            res = transformer.fit(X, y, **fit_params).transform(X)
-    # if we have a weight for this transformer, multiply output
-    if weight is None:
-        return res, transformer
-    return res * weight, transformer


 def _fit_resample_one(sampler,
                       X,
Reviewer comment:
At first I thought that we might need tests for this, but since the existing pipeline tests are passing, this might just be a sanity check. So it's OK.