Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions src/python/nimbusml/base_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import os

from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import unique_labels

from . import Pipeline
from .internal.core.base_pipeline_item import BasePipelineItem
Expand Down Expand Up @@ -39,16 +38,6 @@ def fit(self, X, y=None, **params):
:param y: array-like with shape=[n_samples]
:return: self
"""
if y is not None and not isinstance(y, (
str, tuple)) and self.type in set(
['classifier', 'anomaly']):
unique_classes = unique_labels(y)
if len(unique_classes) < 2:
raise ValueError(
"Classifier can't train when only one class is "
"present.")
self.classes_ = unique_classes

# Clear cached summary since it should not
# retain its value after a new call to fit
if hasattr(self, 'model_summary_'):
Expand Down
29 changes: 23 additions & 6 deletions src/python/nimbusml/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas import DataFrame, Series
from scipy.sparse import csr_matrix
from sklearn.utils.validation import check_X_y, check_array
from sklearn.utils.multiclass import unique_labels

from .internal.core.base_pipeline_item import BasePipelineItem
from .internal.entrypoints.data_customtextloader import \
Expand Down Expand Up @@ -1111,6 +1112,8 @@ def fit(self, X, y=None, verbose=1, **params):
i, n.__class__.__name__), TrainedWarning)
break

self._extract_classes(y)

graph, X, y, weights, start_time, schema, telemetry_info, \
learner_features, _, max_slots = self._fit_graph(
X, y, verbose, **params)
Expand Down Expand Up @@ -1923,6 +1926,24 @@ def _predict(self, X, y=None,
self._write_csv_time = graph._write_csv_time
return out_data, out_metrics

def _extract_classes(self, y):
    """
    Record the distinct training labels as the ``classes_`` attribute.

    Only applies when the pipeline has at least one step and its last
    node is of type ``'classifier'`` or ``'anomaly'``. Nothing is
    recorded when ``y`` is ``None`` or is a ``str``/``tuple``
    (presumably a reference to label column(s) rather than the label
    values themselves -- confirm against ``fit``).

    :param y: label values passed to ``fit``, or ``None``.
    :raises ValueError: if fewer than two distinct labels are present.
    """
    if len(self.steps) == 0:
        return
    if self.last_node.type not in ('classifier', 'anomaly'):
        return
    if y is None or isinstance(y, (str, tuple)):
        return

    labels = unique_labels(y)
    if len(labels) < 2:
        raise ValueError(
            "Classifier can't train when only one class is "
            "present.")
    self._add_classes(labels)

def _extract_classes_from_headers(self, headers):
    """
    Refresh ``classes_`` from ``Score.<label>`` column names.

    The labels parsed from the headers are strings, so they are cast
    to the dtype of the ``classes_`` already stored on the last node.
    This keeps, for example, integer labels as integers.

    If the last node has no ``classes_`` yet there is no dtype to cast
    to, so the method leaves the pipeline untouched instead of raising
    ``AttributeError``.

    :param headers: iterable of score column names, one per class.
    """
    known = getattr(self.last_node, 'classes_', None)
    if known is None:
        return

    # Strip only the leading prefix: a label that itself contains
    # 'Score.' must survive unchanged.
    prefix = 'Score.'
    labels = [
        name[len(prefix):] if name.startswith(prefix) else name
        for name in headers
    ]
    self._add_classes(np.array(labels).astype(np.asarray(known).dtype))

def _add_classes(self, classes):
# Create classes_ attribute similar to scikit
# Add both to pipeline and ending classifier
Expand Down Expand Up @@ -1974,11 +1995,7 @@ def predict_proba(self, X, verbose=0, **params):
# for multiclass, scores are probabilities
pcols = [i for i in scores.columns if i.startswith('Score.')]
if len(pcols) > 0:
# [todo]: this is a bug, predict_proba should not change
# internal state of pipeline.
# test check_dict_unchanged() detects that, commenting line
# for now
# self._add_classes([x.replace('Score.', '') for x in pcols])
self._extract_classes_from_headers(pcols)
return scores.loc[:, pcols].values

raise ValueError(
Expand Down Expand Up @@ -2019,7 +2036,7 @@ def decision_function(self, X, verbose=0, **params):

# for multiclass with n_classes > 2
if len(scols) > 2:
self._add_classes([x.replace('Score.', '') for x in scols])
self._extract_classes_from_headers(scols)
return scores.loc[:, scols].values

raise ValueError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
X_train, X_test, y_train, y_test = \
train_test_split(features, labels)

# 3 class dataset with integer labels
# Re-seed NumPy's global RNG before splitting; presumably this keeps the
# train/test partition reproducible, since train_test_split is called
# without an explicit random_state -- TODO confirm.
np.random.seed(0)
df = get_dataset("iris").as_df()
# Drop the 'Species' column in place. Presumably it is the string form of
# the label and must not leak into the features -- verify against the dataset.
df.drop(['Species'], inplace=True, axis=1)
# NOTE(review): looks like this separates the 'Label' column from the
# remaining columns -- confirm against split_features_and_label.
features_3class_int, labels_3class_int = split_features_and_label(df, 'Label')
X_train_3class_int, X_test_3class_int, y_train_3class_int, y_test_3class_int = \
train_test_split(features_3class_int, labels_3class_int)

# 3 class dataset with string labels
np.random.seed(0)
df = get_dataset("iris").as_df()
Expand Down Expand Up @@ -115,6 +123,36 @@ def test_pass_predict_proba_multiclass_3class(self):
err_msg=invalid_decision_function_output)
assert_equal(set(clf.classes_), {'Blue', 'Green', 'Red'})

def test_pass_predict_proba_multiclass_with_pipeline_adds_classes(self):
    # Fitting a Pipeline must expose classes_ on both the final
    # classifier and the pipeline, and calling predict_proba must
    # leave those classes unchanged.
    learner = FastLinearClassifier(number_of_threads=1)
    pipe = Pipeline([learner])
    pipe.fit(X_train_3class, y_train_3class)

    labels = {'Blue', 'Green', 'Red'}

    def check_classes():
        for owner in (learner, pipe):
            assert_equal(set(owner.classes_), labels)

    check_classes()

    # Presumably each row of probabilities sums to 1, so the grand
    # total equals the number of test rows -- TODO confirm.
    proba_total = pipe.predict_proba(X_test_3class).sum()
    assert_almost_equal(
        proba_total, 38.0, decimal=4,
        err_msg=invalid_decision_function_output)

    check_classes()

def test_pass_predict_proba_multiclass_3class_retains_classes_type(self):
    # Integer training labels must still be integers in classes_
    # after predict_proba has run.
    learner = FastLinearClassifier(number_of_threads=1)
    learner.fit(X_train_3class_int, y_train_3class_int)

    proba_total = learner.predict_proba(X_test_3class_int).sum()
    assert_almost_equal(
        proba_total, 38.0, decimal=4,
        err_msg=invalid_decision_function_output)

    found = set(learner.classes_)
    assert_equal(found, {0, 1, 2})

def test_fail_predict_proba_multiclass_with_pipeline(self):
check_unsupported_predict_proba(self, Pipeline(
[NaiveBayesClassifier()]), X_train, y_train, X_test)
Expand Down Expand Up @@ -174,6 +212,36 @@ def test_pass_decision_function_multiclass_3class(self):
err_msg=invalid_decision_function_output)
assert_equal(set(clf.classes_), {'Blue', 'Green', 'Red'})

def test_pass_decision_function_multiclass_with_pipeline_adds_classes(self):
    # Fitting a Pipeline must expose classes_ on both the final
    # classifier and the pipeline, and calling decision_function must
    # leave those classes unchanged.
    learner = FastLinearClassifier(number_of_threads=1)
    pipe = Pipeline([learner])
    pipe.fit(X_train_3class, y_train_3class)

    labels = {'Blue', 'Green', 'Red'}
    owners = (learner, pipe)

    for owner in owners:
        assert_equal(set(owner.classes_), labels)

    score_total = pipe.decision_function(X_test_3class).sum()
    assert_almost_equal(
        score_total, 38.0, decimal=4,
        err_msg=invalid_decision_function_output)

    for owner in owners:
        assert_equal(set(owner.classes_), labels)

def test_pass_decision_function_multiclass_3class_retains_classes_type(self):
    # Integer training labels must still be integers in classes_
    # after decision_function has run.
    learner = FastLinearClassifier(number_of_threads=1)
    learner.fit(X_train_3class_int, y_train_3class_int)

    score_total = learner.decision_function(X_test_3class_int).sum()
    assert_almost_equal(
        score_total, 38.0, decimal=4,
        err_msg=invalid_decision_function_output)

    found = set(learner.classes_)
    assert_equal(found, {0, 1, 2})

def test_fail_decision_function_multiclass(self):
check_unsupported_decision_function(
self, LogisticRegressionClassifier(), X_train, y_train, X_test)
Expand Down
8 changes: 8 additions & 0 deletions src/python/tests/test_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,14 @@ def load_json(file_path):
estimator = estimator << 'F0'

for check in _yield_all_checks(class_name, estimator):
# Skip check_dict_unchanged for estimators which
# update the classes_ attribute. For more details
# see https://github.com/microsoft/NimbusML/pull/200
if (check.__name__ == 'check_dict_unchanged') and \
(hasattr(estimator, 'predict_proba') or
hasattr(estimator, 'decision_function')):
continue

if check.__name__ in OMITTED_CHECKS_ALWAYS:
continue
if 'Binary' in class_name and check.__name__ in NOBINARY_CHECKS:
Expand Down