93 changes: 92 additions & 1 deletion src/DotNetBridge/Entrypoints.cs
@@ -65,7 +65,7 @@ public sealed class ModelSchemaOutput
public IDataView Schema;
}

[TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve input and output model schemas")]
[TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve output model schema")]
public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelInput input)
{
Contracts.CheckValue(env, nameof(env));
@@ -87,5 +87,96 @@ public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironment
var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data);
return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
}

public sealed class ScoringTransformInput
{
[Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)]
public IDataView Data;

[Argument(ArgumentType.Required, HelpText = "The predictor model to apply to data", SortOrder = 2)]
public PredictorModel PredictorModel;

[Argument(ArgumentType.AtMostOnce, HelpText = "Suffix to append to the score columns", SortOrder = 3)]
public string Suffix;
}

public sealed class ScoringTransformOutput
{
[TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)]
public IDataView ScoredData;

[TlcModule.Output(Desc = "The scoring transform", SortOrder = 2)]
public TransformModel ScoringTransform;
}

private static bool AreSchemasCompatible(DataViewSchema schema1, DataViewSchema schema2)
{
if (schema1 == null)
return schema2 == null;
if (schema2 == null)
return schema1 == null;
if (schema1.Count != schema2.Count)
return false;

for (int i = 0; i < schema1.Count; i++)
{
if (schema1[i].Type != schema2[i].Type)
return false;
}

return true;
}

[TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")]
public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input)
{
Contracts.CheckValue(env, nameof(env));
var host = env.Register("ScoreModel");
host.CheckValue(input, nameof(input));
EntryPointUtils.CheckInputArgs(host, input);

RoleMappedData data;
IPredictor predictor;
var inputData = input.Data;
try
{
input.PredictorModel.PrepareData(host, inputData, out data, out predictor);
}
catch (Exception)
{
// This can happen in the csr_matrix case; fall back to using only the trainer model.
host.Assert(inputData.Schema.Count == 1);
var inputColumnName = inputData.Schema[0].Name;
var trainingSchema = input.PredictorModel.GetTrainingSchema(host);
// Get the feature vector's item type.
var trainingFeatureColumn = (DataViewSchema.Column)trainingSchema.Feature;
var requiredType = trainingFeatureColumn.Type.GetItemType().RawType;
var featuresColumnName = trainingFeatureColumn.Name;
predictor = input.PredictorModel.Predictor;
var xf = new TypeConvertingTransformer(host,
new TypeConvertingEstimator.ColumnOptions(featuresColumnName, requiredType, inputColumnName)).Transform(inputData);
data = new RoleMappedData(xf, null, featuresColumnName);
}

IDataView scoredPipe;
using (var ch = host.Start("Creating scoring pipeline"))
{
ch.Trace("Creating pipeline");
var bindable = ScoreUtils.GetSchemaBindableMapper(host, predictor);
ch.AssertValue(bindable);

var mapper = bindable.Bind(host, data.Schema);
var scorer = ScoreUtils.GetScorerComponent(host, mapper, input.Suffix);
scoredPipe = scorer.CreateComponent(host, data.Data, mapper, input.PredictorModel.GetTrainingSchema(host));
}

return new ScoringTransformOutput
{
    ScoredData = scoredPipe,
    ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData)
};
}
}
}
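For context, here is a minimal sketch of the entrypoint-graph node that the new Transforms.DatasetScorerEx consumes. The field names come from the ScoringTransformInput and ScoringTransformOutput contracts above; the "$..." graph-variable names are illustrative only (they mirror the ones pipeline.py uses below).

# Hypothetical sketch of a Transforms.DatasetScorerEx graph node.
# 'Data' and 'PredictorModel' are required inputs, 'Suffix' is optional;
# 'ScoredData' and 'ScoringTransform' are the outputs.
node = {
    "Name": "Transforms.DatasetScorerEx",
    "Inputs": {
        "Data": "$data",
        "PredictorModel": "$predictor_model",
    },
    "Outputs": {
        "ScoredData": "$scoredvectordata",
    },
}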
2 changes: 2 additions & 0 deletions src/python/nimbusml.pyproj
@@ -395,6 +395,7 @@
<Compile Include="nimbusml\internal\entrypoints\transforms_categoricalhashonehotvectorizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_categoricalonehotvectorizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_charactertokenizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_datasetscorerex.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_columnconcatenator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_columncopier.py" />
@@ -685,6 +686,7 @@
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
<Compile Include="nimbusml\tests\idv\__init__.py" />
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\tests\pipeline\test_csr_input.py" />
<Compile Include="nimbusml\tests\pipeline\test_permutation_feature_importance.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_get_schema.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
2 changes: 1 addition & 1 deletion src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
Microsoft Machine Learning for Python
"""

-__version__ = '1.4.2'
+__version__ = '1.5.0'

# CoreCLR version of MicrosoftML is built on Windows.
# But file permissions are not preserved when it's copied to Linux.
2 changes: 1 addition & 1 deletion src/python/nimbusml/examples/Schema.py
@@ -27,7 +27,7 @@
])

pipe.fit(data)
-schema = pipe.get_schema()
+schema = pipe.get_output_columns()

print(schema[0:5])
# ['Sentiment', 'SentimentText', 'features.Char.<?>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
68 changes: 68 additions & 0 deletions src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
@@ -0,0 +1,68 @@
"""
Transforms.DatasetScorerEx
"""


from ..utils.entrypoints import EntryPoint
from ..utils.utils import try_set, unlist


def transforms_datasetscorerex(
data,
predictor_model,
scored_data=None,
scoring_transform=None,
suffix=None,
**params):
"""
**Description**
Score a dataset with a predictor model

:param data: The dataset to be scored (inputs).
:param predictor_model: The predictor model to apply to data
(inputs).
:param suffix: Suffix to append to the score columns (inputs).
:param scored_data: The scored dataset (outputs).
:param scoring_transform: The scoring transform (outputs).
"""

entrypoint_name = 'Transforms.DatasetScorerEx'
inputs = {}
outputs = {}

if data is not None:
inputs['Data'] = try_set(
obj=data,
none_acceptable=False,
is_of_type=str)
if predictor_model is not None:
inputs['PredictorModel'] = try_set(
obj=predictor_model,
none_acceptable=False,
is_of_type=str)
if suffix is not None:
inputs['Suffix'] = try_set(
obj=suffix,
none_acceptable=True,
is_of_type=str)
if scored_data is not None:
outputs['ScoredData'] = try_set(
obj=scored_data,
none_acceptable=False,
is_of_type=str)
if scoring_transform is not None:
outputs['ScoringTransform'] = try_set(
obj=scoring_transform, none_acceptable=False, is_of_type=str)

input_variables = {
x for x in unlist(inputs.values())
if isinstance(x, str) and x.startswith("$")}
output_variables = {
x for x in unlist(outputs.values())
if isinstance(x, str) and x.startswith("$")}

entrypoint = EntryPoint(
name=entrypoint_name, inputs=inputs, outputs=outputs,
input_variables=input_variables,
output_variables=output_variables)
return entrypoint
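A brief usage sketch of the generated wrapper, mirroring the call sites added to pipeline.py below. The "$"-prefixed strings are entrypoint-graph variables, which the function collects into input_variables and output_variables; the variable names are illustrative.

from nimbusml.internal.entrypoints.transforms_datasetscorerex import \
    transforms_datasetscorerex

# Build a scoring node for the entrypoint graph.
score_node = transforms_datasetscorerex(
    data="$data",
    predictor_model="$predictor_model",
    scored_data="$scoredvectordata")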
10 changes: 5 additions & 5 deletions src/python/nimbusml/pipeline.py
@@ -40,8 +40,8 @@
models_regressionevaluator
from .internal.entrypoints.models_summarizer import models_summarizer
from .internal.entrypoints.models_schema import models_schema
-from .internal.entrypoints.transforms_datasetscorer import \
-    transforms_datasetscorer
+from .internal.entrypoints.transforms_datasetscorerex import \
+    transforms_datasetscorerex
from .internal.entrypoints.transforms_datasettransformscorer import \
transforms_datasettransformscorer
from .internal.entrypoints.transforms_featurecombiner import \
@@ -1772,7 +1772,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])

-score_node = transforms_datasetscorer(
+score_node = transforms_datasetscorerex(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredvectordata")
@@ -1815,7 +1815,7 @@

return out_data

-def get_schema(self, verbose=0, **params):
+def get_output_columns(self, verbose=0, **params):
"""
Returns the output list of columns for the fitted model.
:return: list .
@@ -2102,7 +2102,7 @@ def _predict(self, X, y=None,
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])

-score_node = transforms_datasetscorer(
+score_node = transforms_datasetscorerex(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredVectorData")
65 changes: 65 additions & 0 deletions src/python/nimbusml/tests/pipeline/test_csr_input.py
@@ -0,0 +1,65 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import os
import unittest

import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.preprocessing import DatasetTransformer
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from nimbusml.preprocessing.schema import ColumnDropper
from numpy.testing import assert_equal

class TestCsrInput(unittest.TestCase):

def test_predict_proba_on_csr(self):
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
cols = list(data.head(1).columns.values) # ordered data column names.

# train featurizer
featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})])
featurization_pipeline.fit(data)
# Note: the relative order of all columns is still the same as in raw data.
#print(featurization_pipeline.get_output_columns())

# Need to remove the extra columns before getting the csr_matrix featurized data, as a csr_matrix won't carry column name information.
csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']])
sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True)
# Note: the relative order of all columns is still the same.
#print(csr_featurization_pipeline.get_output_columns())

# train learner
# Note: order & number of feature columns for learner (parameter 'feature') should be the same as in csr_matrix above
cols.remove('row_num')
cols.remove('case')
feature_cols = cols
#print(feature_cols)
#['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum']
training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')])
training_pipeline.fit(data, output_predictor_model=True)

# load just a learner model
predictor_pipeline = Pipeline()
predictor_pipeline.load_model(training_pipeline.predictor_model)
# see the order of Feature.* columns that get passed to learner algo
#print(predictor_pipeline.get_output_columns())

# use just a learner model on csr_matrix featurized data
predictions = predictor_pipeline.predict_proba(sparse_featurized_data)
assert_equal(len(predictions), 248)
assert_equal(len(predictions[0]), 2)

# get feature contributions
fcc = predictor_pipeline.get_feature_contributions(sparse_featurized_data)
assert_equal(fcc.shape, (248, 30))

if __name__ == '__main__':
unittest.main()
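Thanks to the __main__ guard, the new test can be run directly, for example (assuming nimbusml and its test dependencies are installed):

python src/python/nimbusml/tests/pipeline/test_csr_input.py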

6 changes: 3 additions & 3 deletions src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
@@ -28,7 +28,7 @@ def test_get_schema_returns_correct_value_for_single_valued_columns(self):
pipeline.fit(df)
df = pipeline.transform(df)

-schema = pipeline.get_schema()
+schema = pipeline.get_output_columns()

self.assertTrue('c1' in schema)
self.assertTrue('c2' in schema)
@@ -39,7 +39,7 @@ def test_get_schema_returns_correct_value_for_vector_valued_columns(self):
pipeline = Pipeline([OneHotVectorizer() << 'c0'])
pipeline.fit(train_df)

-schema = pipeline.get_schema()
+schema = pipeline.get_output_columns()

self.assertTrue('c0.a' in schema)
self.assertTrue('c0.b' in schema)
@@ -55,7 +55,7 @@ def test_get_schema_does_not_work_when_predictor_is_part_of_model(self):
pipeline.fit(df)

try:
-schema = pipeline.get_schema()
+schema = pipeline.get_output_columns()
except Exception as e:
pass
else:
2 changes: 1 addition & 1 deletion src/python/setup.py
@@ -45,7 +45,7 @@
# Versions should comply with PEP440. For a discussion on
# single-sourcing the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
-version='1.4.2',
+version='1.5.0',

description='NimbusML',
long_description=long_description,
2 changes: 1 addition & 1 deletion version.txt
@@ -1 +1 @@
-1.4.2
+1.5.0