From 683a881f74dbd1e171f89be99976546c3bb6b5ac Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Fri, 4 Oct 2019 22:54:21 -0700 Subject: [PATCH 01/15] draft --- src/DotNetBridge/Entrypoints.cs | 55 +++++++++++++ .../entrypoints/transforms_datasetscorer.py | 2 +- src/python/tests/test_copyright.py | 81 +++++++++---------- src/python/tools/entrypoint_compiler.py | 5 ++ 4 files changed, 99 insertions(+), 44 deletions(-) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 039247fe..d2ca45b8 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -87,5 +87,60 @@ public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironmen var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data); return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf }; } + + public sealed class Input + { + [Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)] + public IDataView Data; + + [Argument(ArgumentType.Required, HelpText = "The predictor model to apply to data", SortOrder = 2)] + public PredictorModel PredictorModel; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Suffix to append to the score columns", SortOrder = 3)] + public string Suffix; + } + + public sealed class Output + { + [TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)] + public IDataView ScoredData; + + [TlcModule.Output(Desc = "The scoring transform", SortOrder = 2)] + public TransformModel ScoringTransform; + } + + [TlcModule.EntryPoint(Name = "Transforms.DatasetScorer1", Desc = "Score a dataset with a predictor model")] + public static Output Score(IHostEnvironment env, Input input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("ScoreModel"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + var inputData = input.Data; + // input.PredictorModel.PrepareData(host, inputData, out RoleMappedData data, out IPredictor predictor); + IPredictor predictor = input.PredictorModel.Predictor; + var data = new RoleMappedData(input.Data, null, input.Data.Schema[0].Name); + + IDataView scoredPipe; + using (var ch = host.Start("Creating scoring pipeline")) + { + ch.Trace("Creating pipeline"); + var bindable = ScoreUtils.GetSchemaBindableMapper(host, predictor); + ch.AssertValue(bindable); + + var mapper = bindable.Bind(host, data.Schema); + var scorer = ScoreUtils.GetScorerComponent(host, mapper, input.Suffix); + scoredPipe = scorer.CreateComponent(host, data.Data, mapper, input.PredictorModel.GetTrainingSchema(host)); + } + + return + new Output + { + ScoredData = scoredPipe, + ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData) + }; + + } } } diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py index 2aa14c74..983b3abd 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py @@ -27,7 +27,7 @@ def transforms_datasetscorer( :param scoring_transform: The scoring transform (outputs). 
""" - entrypoint_name = 'Transforms.DatasetScorer' + entrypoint_name = 'Transforms.DatasetScorer1' inputs = {} outputs = {} diff --git a/src/python/tests/test_copyright.py b/src/python/tests/test_copyright.py index 87dc0830..03984b61 100644 --- a/src/python/tests/test_copyright.py +++ b/src/python/tests/test_copyright.py @@ -1,48 +1,43 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# ------------------------------------------------------------------------- -import os -import unittest +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier +from nimbusml.preprocessing import DatasetTransformer +from nimbusml.preprocessing.schema import PrefixColumnConcatenator +from nimbusml.preprocessing.schema import ColumnDropper +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv(path) + +# train featurizer +featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) +featurization_pipeline.fit(data) +print(featurization_pipeline.get_schema()) -class TestCopyright(unittest.TestCase): - """ - Tests that the copyright is present in everyfile. - """ +# need to remove extra columns from csr matrix +# and get csr_matrix featurized data +csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']]) +sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True) +# Note: the relative order of all columns is still the same. +print(csr_featurization_pipeline.get_schema()) - def test_copyrigth_present(self): - root = os.path.join(os.path.abspath(os.path.dirname(__file__)), - '..') - root = os.path.normpath(root) - allfiles = [] - for r, dirs, files in os.walk(root): - for name in files: - if name.endswith('.py'): - allfiles.append(os.path.join(r, name)) +concat_pipeline = Pipeline([DatasetTransformer(csr_featurization_pipeline.model), PrefixColumnConcatenator({'education': 'education.'})]) +concat_pipeline.fit(data) +print(concat_pipeline.get_schema()) - nothere = [] - nb = 0 - for name in allfiles: - if 'examples' in name or 'docs' in name or \ - '__init__.py' in name or 'entrypoints' in name: - continue - with open(name, "r") as f: - content = f.read() - if 'Copyright (c) Microsoft Corporation. All rights ' \ - 'reserved.' not in content: - nothere.append(name) - else: - nb += 1 +# train a featurizer + learner pipeline +# Note! 
order of feature columns on input to learner should be the same as in csr_matrix above +#feature_cols = ['parity', 'education', 'age', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] # 9 features +feature_cols = csr_featurization_pipeline.get_schema() +training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')]) +training_pipeline.fit(data, output_predictor_model=True) + +# load just a learner model +predictor_pipeline = Pipeline() +predictor_pipeline.load_model(training_pipeline.predictor_model) +print(predictor_pipeline.get_schema()) - if len(nothere) > 0: - nothere = [' File "{0}", line 1'.format(_) for _ in nothere] - raise Exception( - "Copyright not found in\n{0}".format( - "\n".join(nothere))) - if nb == 0: - raise Exception("No file found in '{0}'".format(root)) - - -if __name__ == '__main__': - unittest.main() +# use just a learner model on csr_matrix featurized data +predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32)) \ No newline at end of file diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index b2765691..014264f6 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -2041,6 +2041,11 @@ def generate_code(pkg_path, generate_entrypoints, generate_api): script_args = arg_parser.parse_args() pkg_path = os.path.join(my_dir, r'..\nimbusml') + + script_args.generate_api = False + script_args.generate_entrypoints = False + script_args.check_manual_changes = False + if script_args.check_manual_changes: verbose = False if script_args.folder == 'temp': From e1e304b34761614a8ef4704a28df1bc604fabfc5 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Fri, 4 Oct 2019 23:36:06 -0700 Subject: [PATCH 02/15] draft --- src/python/nimbusml.pyproj | 1 + .../nimbusml/tests/pipeline/test_csr_input.py | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 src/python/nimbusml/tests/pipeline/test_csr_input.py diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 345e7ccf..7f4e4d6d 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -682,6 +682,7 @@ + diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py new file mode 100644 index 00000000..f54f1362 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -0,0 +1,59 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import os +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier +from nimbusml.preprocessing import DatasetTransformer +from nimbusml.preprocessing.schema import PrefixColumnConcatenator +from nimbusml.preprocessing.schema import ColumnDropper + +class TestCsrInput(unittest.TestCase): + + def test_predict_proba_on_csr(self): + path = get_dataset('infert').as_filepath() + data = FileDataStream.read_csv(path) + cols = list(data.head(1).columns.values) # ordered data column names. 
+ + # train featurizer + featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) + featurization_pipeline.fit(data) + # Note: the relative order of all columns is still the same as in raw data. + #print(featurization_pipeline.get_schema()) + + # need to remove extra columns before getting csr_matrix featurized data as it wont have column name information. + csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']]) + sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True) + # Note: the relative order of all columns is still the same. + #print(csr_featurization_pipeline.get_schema()) + + # train learner + # Note: order & number of feature columns for learner (parameter 'feature') should be the same as in csr_matrix above + cols.remove('row_num') + cols.remove('case') + feature_cols = cols + print(feature_cols) + #['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] + training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')]) + training_pipeline.fit(data, output_predictor_model=True) + + # load just a learner model + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(training_pipeline.predictor_model) + # see the order of Feature.* columns that get passed to learner algo + #print(predictor_pipeline.get_schema()) + + # use just a learner model on csr_matrix featurized data + print(predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32))) + # TBD assert + +if __name__ == '__main__': + unittest.main() + From 08d304972e19bdcfb7894330035e7670105e4582 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Fri, 4 Oct 2019 23:42:55 -0700 Subject: [PATCH 03/15] rollback --- src/python/tests/test_copyright.py | 81 ++++++++++++++++-------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/src/python/tests/test_copyright.py b/src/python/tests/test_copyright.py index 03984b61..87dc0830 100644 --- a/src/python/tests/test_copyright.py +++ b/src/python/tests/test_copyright.py @@ -1,43 +1,48 @@ -import numpy as np -import pandas as pd -from nimbusml import Pipeline, FileDataStream -from nimbusml.datasets import get_dataset -from nimbusml.feature_extraction.categorical import OneHotVectorizer -from nimbusml.linear_model import LogisticRegressionBinaryClassifier -from nimbusml.preprocessing import DatasetTransformer -from nimbusml.preprocessing.schema import PrefixColumnConcatenator -from nimbusml.preprocessing.schema import ColumnDropper +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# ------------------------------------------------------------------------- +import os +import unittest -path = get_dataset('infert').as_filepath() -data = FileDataStream.read_csv(path) - -# train featurizer -featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) -featurization_pipeline.fit(data) -print(featurization_pipeline.get_schema()) -# need to remove extra columns from csr matrix -# and get csr_matrix featurized data -csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']]) -sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True) -# Note: the relative order of all columns is still the same. -print(csr_featurization_pipeline.get_schema()) +class TestCopyright(unittest.TestCase): + """ + Tests that the copyright is present in everyfile. + """ -concat_pipeline = Pipeline([DatasetTransformer(csr_featurization_pipeline.model), PrefixColumnConcatenator({'education': 'education.'})]) -concat_pipeline.fit(data) -print(concat_pipeline.get_schema()) + def test_copyrigth_present(self): + root = os.path.join(os.path.abspath(os.path.dirname(__file__)), + '..') + root = os.path.normpath(root) + allfiles = [] + for r, dirs, files in os.walk(root): + for name in files: + if name.endswith('.py'): + allfiles.append(os.path.join(r, name)) -# train a featurizer + learner pipeline -# Note! order of feature columns on input to learner should be the same as in csr_matrix above -#feature_cols = ['parity', 'education', 'age', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] # 9 features -feature_cols = csr_featurization_pipeline.get_schema() -training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')]) -training_pipeline.fit(data, output_predictor_model=True) - -# load just a learner model -predictor_pipeline = Pipeline() -predictor_pipeline.load_model(training_pipeline.predictor_model) -print(predictor_pipeline.get_schema()) + nothere = [] + nb = 0 + for name in allfiles: + if 'examples' in name or 'docs' in name or \ + '__init__.py' in name or 'entrypoints' in name: + continue + with open(name, "r") as f: + content = f.read() + if 'Copyright (c) Microsoft Corporation. All rights ' \ + 'reserved.' 
not in content: + nothere.append(name) + else: + nb += 1 -# use just a learner model on csr_matrix featurized data -predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32)) \ No newline at end of file + if len(nothere) > 0: + nothere = [' File "{0}", line 1'.format(_) for _ in nothere] + raise Exception( + "Copyright not found in\n{0}".format( + "\n".join(nothere))) + if nb == 0: + raise Exception("No file found in '{0}'".format(root)) + + +if __name__ == '__main__': + unittest.main() From b90d1145ae9e8064019fb7955b3ccbdb301b891d Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 14:25:49 -0700 Subject: [PATCH 04/15] new entrypoint --- src/DotNetBridge/Entrypoints.cs | 3 +- src/python/nimbusml.pyproj | 1 + .../entrypoints/transforms_csrscorer.py | 68 +++++++++++++++++++ .../entrypoints/transforms_datasetscorer.py | 2 +- src/python/nimbusml/pipeline.py | 15 ++++ 5 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index d2ca45b8..7c53569f 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -109,7 +109,7 @@ public sealed class Output public TransformModel ScoringTransform; } - [TlcModule.EntryPoint(Name = "Transforms.DatasetScorer1", Desc = "Score a dataset with a predictor model")] + [TlcModule.EntryPoint(Name = "Transforms.CsrScorer", Desc = "Score a csr_matrix based dataset with a predictor model")] public static Output Score(IHostEnvironment env, Input input) { Contracts.CheckValue(env, nameof(env)); @@ -118,7 +118,6 @@ public static Output Score(IHostEnvironment env, Input input) EntryPointUtils.CheckInputArgs(host, input); var inputData = input.Data; - // input.PredictorModel.PrepareData(host, inputData, out RoleMappedData data, out IPredictor predictor); IPredictor predictor = input.PredictorModel.Predictor; var data = new RoleMappedData(input.Data, null, input.Data.Schema[0].Name); diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 7f4e4d6d..bafc90ef 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -394,6 +394,7 @@ + diff --git a/src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py new file mode 100644 index 00000000..50f21e7b --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py @@ -0,0 +1,68 @@ +""" +Transforms.CsrScorer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_csrscorer( + data, + predictor_model, + scored_data=None, + scoring_transform=None, + suffix=None, + **params): + """ + **Description** + Score a dataset with a predictor model + + :param data: The dataset to be scored (inputs). + :param predictor_model: The predictor model to apply to data + (inputs). + :param suffix: Suffix to append to the score columns (inputs). + :param scored_data: The scored dataset (outputs). + :param scoring_transform: The scoring transform (outputs). 
+ """ + + entrypoint_name = 'Transforms.CsrScorer' + inputs = {} + outputs = {} + + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if predictor_model is not None: + inputs['PredictorModel'] = try_set( + obj=predictor_model, + none_acceptable=False, + is_of_type=str) + if suffix is not None: + inputs['Suffix'] = try_set( + obj=suffix, + none_acceptable=True, + is_of_type=str) + if scored_data is not None: + outputs['ScoredData'] = try_set( + obj=scored_data, + none_acceptable=False, + is_of_type=str) + if scoring_transform is not None: + outputs['ScoringTransform'] = try_set( + obj=scoring_transform, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py index 983b3abd..2aa14c74 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorer.py @@ -27,7 +27,7 @@ def transforms_datasetscorer( :param scoring_transform: The scoring transform (outputs). """ - entrypoint_name = 'Transforms.DatasetScorer1' + entrypoint_name = 'Transforms.DatasetScorer' inputs = {} outputs = {} diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index b3be72f8..0d873c66 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -42,6 +42,8 @@ from .internal.entrypoints.models_schema import models_schema from .internal.entrypoints.transforms_datasetscorer import \ transforms_datasetscorer +from .internal.entrypoints.transforms_csrscorer import \ + transforms_csrscorer from .internal.entrypoints.transforms_datasettransformscorer import \ transforms_datasettransformscorer from .internal.entrypoints.transforms_featurecombiner import \ @@ -1771,6 +1773,12 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, data="$data", predictor_model="$predictor_model", scored_data="$scoredvectordata") + + if isinstance(X, csr_matrix): + score_node = transforms_csrscorer( + data="$data", + predictor_model="$predictor_model", + scored_data="$scoredvectordata") fcc_node = transforms_featurecontributioncalculationtransformer( data="$scoredvectordata", @@ -1926,6 +1934,13 @@ def _predict(self, X, y=None, data="$data", predictor_model="$predictor_model", scored_data="$scoredVectorData") + + if isinstance(X, csr_matrix): + score_node = transforms_csrscorer( + data="$data", + predictor_model="$predictor_model", + scored_data="$scoredVectorData") + all_nodes.extend([score_node]) if (evaltype in ['binary', 'multiclass']) or \ From 27e066c87576ca72b3ba8e7d0eecb77b04f9ade7 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 14:39:10 -0700 Subject: [PATCH 05/15] add assert --- src/python/nimbusml/tests/pipeline/test_csr_input.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py index f54f1362..51069087 100644 --- 
a/src/python/nimbusml/tests/pipeline/test_csr_input.py +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -14,6 +14,7 @@ from nimbusml.preprocessing import DatasetTransformer from nimbusml.preprocessing.schema import PrefixColumnConcatenator from nimbusml.preprocessing.schema import ColumnDropper +from numpy.testing import assert_equal class TestCsrInput(unittest.TestCase): @@ -51,8 +52,11 @@ def test_predict_proba_on_csr(self): #print(predictor_pipeline.get_schema()) # use just a learner model on csr_matrix featurized data - print(predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32))) - # TBD assert + predictions = predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32)) + assert_equal(len(predictions), 248) + assert_equal(len(predictions[0]), 2) + + if __name__ == '__main__': unittest.main() From 6c5d6f4559df70faef8ff69d187152db25f50dbd Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 14:40:20 -0700 Subject: [PATCH 06/15] rollback --- src/python/tools/entrypoint_compiler.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index 1d79151a..ed829533 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -2041,11 +2041,6 @@ def generate_code(pkg_path, generate_entrypoints, generate_api): script_args = arg_parser.parse_args() pkg_path = os.path.join(my_dir, r'..\nimbusml') - - script_args.generate_api = False - script_args.generate_entrypoints = False - script_args.check_manual_changes = False - if script_args.check_manual_changes: verbose = False if script_args.folder == 'temp': From f936e9f33afdbe1a3f7888ed5d650fb75ef1b7a4 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 14:42:15 -0700 Subject: [PATCH 07/15] no print in test --- src/python/nimbusml/tests/pipeline/test_csr_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py index 51069087..fd04a3c9 100644 --- a/src/python/nimbusml/tests/pipeline/test_csr_input.py +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -40,7 +40,7 @@ def test_predict_proba_on_csr(self): cols.remove('row_num') cols.remove('case') feature_cols = cols - print(feature_cols) + #print(feature_cols) #['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')]) training_pipeline.fit(data, output_predictor_model=True) From 5ae56a65cee655733f7b930343deb0aac70091fd Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 14:43:11 -0700 Subject: [PATCH 08/15] up version --- src/python/nimbusml/__init__.py | 2 +- src/python/setup.py | 2 +- version.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 4e18a65b..0b508fcf 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.4.2' +__version__ = '1.5.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. 
diff --git a/src/python/setup.py b/src/python/setup.py index 7b983db8..fc350275 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.4.2', + version='1.5.0', description='NimbusML', long_description=long_description, diff --git a/version.txt b/version.txt index c9929e36..3e1ad720 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.4.2 \ No newline at end of file +1.5.0 \ No newline at end of file From dd72cc49922e7d9f9ee0a3aecaf00fe8334eb167 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 7 Oct 2019 15:37:56 -0700 Subject: [PATCH 09/15] only Single type is allowed for Feature vector --- src/python/nimbusml/pipeline.py | 2 ++ src/python/nimbusml/tests/pipeline/test_csr_input.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 0d873c66..468b8c14 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -1775,6 +1775,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, scored_data="$scoredvectordata") if isinstance(X, csr_matrix): + X = X.astype(np.float32) score_node = transforms_csrscorer( data="$data", predictor_model="$predictor_model", @@ -1936,6 +1937,7 @@ def _predict(self, X, y=None, scored_data="$scoredVectorData") if isinstance(X, csr_matrix): + X = X.astype(np.float32) score_node = transforms_csrscorer( data="$data", predictor_model="$predictor_model", diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py index fd04a3c9..2a2f3bd2 100644 --- a/src/python/nimbusml/tests/pipeline/test_csr_input.py +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -52,7 +52,7 @@ def test_predict_proba_on_csr(self): #print(predictor_pipeline.get_schema()) # use just a learner model on csr_matrix featurized data - predictions = predictor_pipeline.predict_proba(sparse_featurized_data.astype(np.float32)) + predictions = predictor_pipeline.predict_proba(sparse_featurized_data) assert_equal(len(predictions), 248) assert_equal(len(predictions[0]), 2) From c3ffe3c7a78a73a3da936501725eba4d6bf1b2e8 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 8 Oct 2019 15:12:35 -0700 Subject: [PATCH 10/15] fix comments, rename entrypoint --- src/DotNetBridge/Entrypoints.cs | 25 +++++++++++++------ src/python/nimbusml.pyproj | 2 +- ...corer.py => transforms_datasetscorerex.py} | 6 ++--- src/python/nimbusml/pipeline.py | 25 +++---------------- 4 files changed, 25 insertions(+), 33 deletions(-) rename src/python/nimbusml/internal/entrypoints/{transforms_csrscorer.py => transforms_datasetscorerex.py} (94%) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 7c53569f..c57d1950 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -65,7 +65,7 @@ public sealed class ModelSchemaOutput public IDataView Schema; } - [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve input and output model schemas")] + [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve output model schema")] public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelInput input) { Contracts.CheckValue(env, nameof(env)); @@ -88,7 +88,7 @@ public static CommonOutputs.TransformOutput 
CreateVariableColumn(IHostEnvironmen return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf }; } - public sealed class Input + public sealed class ScoringTransformInput { [Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)] public IDataView Data; @@ -100,7 +100,7 @@ public sealed class Input public string Suffix; } - public sealed class Output + public sealed class ScoringTransformOutput { [TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)] public IDataView ScoredData; @@ -109,8 +109,8 @@ public sealed class Output public TransformModel ScoringTransform; } - [TlcModule.EntryPoint(Name = "Transforms.CsrScorer", Desc = "Score a csr_matrix based dataset with a predictor model")] - public static Output Score(IHostEnvironment env, Input input) + [TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")] + public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("ScoreModel"); @@ -118,8 +118,17 @@ public static Output Score(IHostEnvironment env, Input input) EntryPointUtils.CheckInputArgs(host, input); var inputData = input.Data; - IPredictor predictor = input.PredictorModel.Predictor; - var data = new RoleMappedData(input.Data, null, input.Data.Schema[0].Name); + RoleMappedData data; + IPredictor predictor; + try + { + input.PredictorModel.PrepareData(host, inputData, out data, out predictor); + } + catch (Exception) + { + predictor = input.PredictorModel.Predictor; + data = new RoleMappedData(input.Data, null, input.Data.Schema[0].Name); + } IDataView scoredPipe; using (var ch = host.Start("Creating scoring pipeline")) @@ -134,7 +143,7 @@ public static Output Score(IHostEnvironment env, Input input) } return - new Output + new ScoringTransformOutput { ScoredData = scoredPipe, ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData) diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index bafc90ef..7da1cc62 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -394,7 +394,7 @@ - + diff --git a/src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py similarity index 94% rename from src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py rename to src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py index 50f21e7b..7a5d8c71 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_csrscorer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py @@ -1,5 +1,5 @@ """ -Transforms.CsrScorer +Transforms.DatasetScorerEx """ @@ -7,7 +7,7 @@ from ..utils.utils import try_set, unlist -def transforms_csrscorer( +def transforms_datasetscorerex( data, predictor_model, scored_data=None, @@ -26,7 +26,7 @@ def transforms_csrscorer( :param scoring_transform: The scoring transform (outputs). 
""" - entrypoint_name = 'Transforms.CsrScorer' + entrypoint_name = 'Transforms.DatasetScorerEx' inputs = {} outputs = {} diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 468b8c14..da57265a 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -40,10 +40,8 @@ models_regressionevaluator from .internal.entrypoints.models_summarizer import models_summarizer from .internal.entrypoints.models_schema import models_schema -from .internal.entrypoints.transforms_datasetscorer import \ - transforms_datasetscorer -from .internal.entrypoints.transforms_csrscorer import \ - transforms_csrscorer +from .internal.entrypoints.transforms_datasetscorerex import \ + transforms_datasetscorerex from .internal.entrypoints.transforms_datasettransformscorer import \ transforms_datasettransformscorer from .internal.entrypoints.transforms_featurecombiner import \ @@ -1769,17 +1767,10 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, all_nodes = [importtext_node] inputs = dict([('file', ''), ('predictor_model', self.model)]) - score_node = transforms_datasetscorer( + score_node = transforms_datasetscorerex( data="$data", predictor_model="$predictor_model", scored_data="$scoredvectordata") - - if isinstance(X, csr_matrix): - X = X.astype(np.float32) - score_node = transforms_csrscorer( - data="$data", - predictor_model="$predictor_model", - scored_data="$scoredvectordata") fcc_node = transforms_featurecontributioncalculationtransformer( data="$scoredvectordata", @@ -1931,18 +1922,10 @@ def _predict(self, X, y=None, all_nodes = [importtext_node] inputs = dict([('file', ''), ('predictor_model', self.model)]) - score_node = transforms_datasetscorer( + score_node = transforms_datasetscorerex( data="$data", predictor_model="$predictor_model", scored_data="$scoredVectorData") - - if isinstance(X, csr_matrix): - X = X.astype(np.float32) - score_node = transforms_csrscorer( - data="$data", - predictor_model="$predictor_model", - scored_data="$scoredVectorData") - all_nodes.extend([score_node]) if (evaltype in ['binary', 'multiclass']) or \ From 840d97628965be6ebeaa773eebb700997d61c480 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 8 Oct 2019 15:33:39 -0700 Subject: [PATCH 11/15] convert to single --- src/DotNetBridge/Entrypoints.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index c57d1950..70a1ceef 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -126,8 +126,11 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor } catch (Exception) { + var inputColumnName = inputData.Schema[0].Name; predictor = input.PredictorModel.Predictor; - data = new RoleMappedData(input.Data, null, input.Data.Schema[0].Name); + var xf = new TypeConvertingTransformer(host, + new TypeConvertingEstimator.ColumnOptions(inputColumnName, DataKind.Single, inputColumnName)).Transform(inputData); + data = new RoleMappedData(xf, null, inputColumnName); } IDataView scoredPipe; From c34b653f148ce1abe4a2482d7685075a6c9a0d75 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 8 Oct 2019 17:09:36 -0700 Subject: [PATCH 12/15] fix type --- src/DotNetBridge/Entrypoints.cs | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 70a1ceef..51795579 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ 
b/src/DotNetBridge/Entrypoints.cs @@ -109,6 +109,24 @@ public sealed class ScoringTransformOutput public TransformModel ScoringTransform; } + private static bool AreSchemasCompatible(DataViewSchema schema1, DataViewSchema schema2) + { + if (schema1 == null) + return schema2 == null; + if (schema2 == null) + return schema1 == null; + if (schema1.Count != schema2.Count) + return false; + + for (int i = 0; i < schema1.Count; i++) + { + if(schema1[i].Type != schema2[i].Type) + return false; + } + + return true; + } + [TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")] public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input) { @@ -117,19 +135,24 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - var inputData = input.Data; RoleMappedData data; IPredictor predictor; - try + var inputData = input.Data; + if (AreSchemasCompatible(inputData.Schema, input.PredictorModel.TransformModel.InputSchema)) { input.PredictorModel.PrepareData(host, inputData, out data, out predictor); } - catch (Exception) + else // use only trainer model. { + // feature vector provided only. + host.Assert(inputData.Schema.Count == 1); var inputColumnName = inputData.Schema[0].Name; + var trainingSchema = input.PredictorModel.GetTrainingSchema(host); + // get feature vector item type. + var requiredType = ((DataViewSchema.Column)trainingSchema.Feature).Type.GetItemType().RawType; predictor = input.PredictorModel.Predictor; - var xf = new TypeConvertingTransformer(host, - new TypeConvertingEstimator.ColumnOptions(inputColumnName, DataKind.Single, inputColumnName)).Transform(inputData); + var xf = new TypeConvertingTransformer(host, + new TypeConvertingEstimator.ColumnOptions(inputColumnName, requiredType, inputColumnName)).Transform(inputData); data = new RoleMappedData(xf, null, inputColumnName); } From 38e8f7a98118a7d3a3f48970d6325d94b9f975fd Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 8 Oct 2019 23:15:11 -0700 Subject: [PATCH 13/15] add feature contribution test --- src/DotNetBridge/Entrypoints.cs | 8 +++++--- src/python/nimbusml/tests/pipeline/test_csr_input.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 51795579..c08cf096 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -149,11 +149,13 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor var inputColumnName = inputData.Schema[0].Name; var trainingSchema = input.PredictorModel.GetTrainingSchema(host); // get feature vector item type. 
-                var requiredType = ((DataViewSchema.Column)trainingSchema.Feature).Type.GetItemType().RawType;
+                var trainingFeatureColumn = (DataViewSchema.Column)trainingSchema.Feature;
+                var requiredType = trainingFeatureColumn.Type.GetItemType().RawType;
+                var featuresColumnName = trainingFeatureColumn.Name;
                 predictor = input.PredictorModel.Predictor;
                 var xf = new TypeConvertingTransformer(host,
-                    new TypeConvertingEstimator.ColumnOptions(inputColumnName, requiredType, inputColumnName)).Transform(inputData);
-                data = new RoleMappedData(xf, null, inputColumnName);
+                    new TypeConvertingEstimator.ColumnOptions(featuresColumnName, requiredType, inputColumnName)).Transform(inputData);
+                data = new RoleMappedData(xf, null, featuresColumnName);
             }
 
             IDataView scoredPipe;
diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py
index 2a2f3bd2..821107e0 100644
--- a/src/python/nimbusml/tests/pipeline/test_csr_input.py
+++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py
@@ -56,7 +56,9 @@ def test_predict_proba_on_csr(self):
         assert_equal(len(predictions), 248)
         assert_equal(len(predictions[0]), 2)
 
-
+        # get feature contributions
+        fcc = predictor_pipeline.get_feature_contributions(sparse_featurized_data)
+        assert_equal(fcc.shape, (248,30))
 
 if __name__ == '__main__':
     unittest.main()

From 29a954498ce53a4ecc79501677e04fde32b6594b Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Tue, 8 Oct 2019 23:20:26 -0700
Subject: [PATCH 14/15] rename pipeline.get_schema() to pipeline.get_output_columns()

---
 src/python/nimbusml/examples/Schema.py                  | 2 +-
 src/python/nimbusml/pipeline.py                         | 2 +-
 src/python/nimbusml/tests/pipeline/test_csr_input.py    | 6 +++---
 .../nimbusml/tests/pipeline/test_pipeline_get_schema.py | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py
index ddec3c0d..c0b8d493 100644
--- a/src/python/nimbusml/examples/Schema.py
+++ b/src/python/nimbusml/examples/Schema.py
@@ -27,7 +27,7 @@
 ])
 pipe.fit(data)
 
-schema = pipe.get_schema()
+schema = pipe.get_output_columns()
 print(schema[0:5])
 # ['Sentiment', 'SentimentText', 'features.Char.|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index 65240272..b60bc9fb 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -1812,7 +1812,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
 
         return out_data
 
-    def get_schema(self, verbose=0, **params):
+    def get_output_columns(self, verbose=0, **params):
         """
         Returns the output list of columns for the fitted model.
 
         :return: list .
diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py
index 821107e0..176a7651 100644
--- a/src/python/nimbusml/tests/pipeline/test_csr_input.py
+++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py
@@ -27,13 +27,13 @@ def test_predict_proba_on_csr(self):
         featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})])
         featurization_pipeline.fit(data)
         # Note: the relative order of all columns is still the same as in raw data.
-        #print(featurization_pipeline.get_schema())
+        #print(featurization_pipeline.get_output_columns())
 
         # need to remove extra columns before getting csr_matrix featurized data as it wont have column name information.
csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']]) sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True) # Note: the relative order of all columns is still the same. - #print(csr_featurization_pipeline.get_schema()) + #print(csr_featurization_pipeline.get_output_columns()) # train learner # Note: order & number of feature columns for learner (parameter 'feature') should be the same as in csr_matrix above @@ -49,7 +49,7 @@ def test_predict_proba_on_csr(self): predictor_pipeline = Pipeline() predictor_pipeline.load_model(training_pipeline.predictor_model) # see the order of Feature.* columns that get passed to learner algo - #print(predictor_pipeline.get_schema()) + #print(predictor_pipeline.get_output_columns()) # use just a learner model on csr_matrix featurized data predictions = predictor_pipeline.predict_proba(sparse_featurized_data) diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py index 5eccf4d7..63bb5310 100644 --- a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py @@ -28,7 +28,7 @@ def test_get_schema_returns_correct_value_for_single_valued_columns(self): pipeline.fit(df) df = pipeline.transform(df) - schema = pipeline.get_schema() + schema = pipeline.get_output_columns() self.assertTrue('c1' in schema) self.assertTrue('c2' in schema) @@ -39,7 +39,7 @@ def test_get_schema_returns_correct_value_for_vector_valued_columns(self): pipeline = Pipeline([OneHotVectorizer() << 'c0']) pipeline.fit(train_df) - schema = pipeline.get_schema() + schema = pipeline.get_output_columns() self.assertTrue('c0.a' in schema) self.assertTrue('c0.b' in schema) @@ -55,7 +55,7 @@ def test_get_schema_does_not_work_when_predictor_is_part_of_model(self): pipeline.fit(df) try: - schema = pipeline.get_schema() + schema = pipeline.get_output_columns() except Exception as e: pass else: From 8836bafe63e784291c919f59c215c6185f7446d9 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 8 Oct 2019 23:48:58 -0700 Subject: [PATCH 15/15] fix build --- src/DotNetBridge/Entrypoints.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index c08cf096..9be84e67 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -138,13 +138,13 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor RoleMappedData data; IPredictor predictor; var inputData = input.Data; - if (AreSchemasCompatible(inputData.Schema, input.PredictorModel.TransformModel.InputSchema)) + try { input.PredictorModel.PrepareData(host, inputData, out data, out predictor); } - else // use only trainer model. + catch (Exception) { - // feature vector provided only. + // this can happen in csr_matrix case, try to use only trainer model. host.Assert(inputData.Schema.Count == 1); var inputColumnName = inputData.Schema[0].Name; var trainingSchema = input.PredictorModel.GetTrainingSchema(host);
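
For reference, this is the end-to-end workflow the series enables, distilled
from test_predict_proba_on_csr above. It is a sketch of the final API using
the same dataset, column names, and expected shapes as that test, not an
additional patch.

import numpy as np
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.preprocessing import DatasetTransformer
from nimbusml.preprocessing.schema import ColumnDropper

data = FileDataStream.read_csv(get_dataset('infert').as_filepath())

# Featurize, then drop the label and row id so the csr_matrix holds features only.
featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})])
featurization_pipeline.fit(data)
csr_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model),
                         ColumnDropper() << ['case', 'row_num']])
sparse_data = csr_pipeline.fit_transform(data, as_csr=True)

# Train featurizer + learner; the feature list must match the csr_matrix
# column order, so derive it from the raw column order minus the dropped columns.
cols = list(data.head(1).columns.values)
cols.remove('row_num')
cols.remove('case')
training_pipeline = Pipeline([
    DatasetTransformer(featurization_pipeline.model),
    LogisticRegressionBinaryClassifier(feature=cols, label='case')])
training_pipeline.fit(data, output_predictor_model=True)

# Load the learner-only model and score raw csr_matrix input. The
# Transforms.DatasetScorerEx entrypoint (PATCH 10-12, 15) falls back to a
# single-feature-vector binding and converts it to the trainer's item type.
predictor_pipeline = Pipeline()
predictor_pipeline.load_model(training_pipeline.predictor_model)
print(predictor_pipeline.get_output_columns())  # renamed from get_schema() in PATCH 14

proba = predictor_pipeline.predict_proba(sparse_data)                  # shape (248, 2)
contribs = predictor_pipeline.get_feature_contributions(sparse_data)   # shape (248, 30)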