Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions release-next.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,22 @@
xf.fit(train_df)
result = xf.transform(train_df, as_csr=True)
```

- **Permutation Feature Importance for model interpretability.**

[PR#279](https://github.com/microsoft/NimbusML/pull/279)
Adds `permutation_feature_importance()` method to `Pipeline` and
predictor estimators, enabling evaluation of model-wide feature
importances on any dataset with the same schema as the dataset used
to fit the `Pipeline`.

```python
pipe = Pipeline([
LogisticRegressionBinaryClassifier(label='label', feature=['feature'])
])
pipe.fit(data)
pipe.permutation_feature_importance(data)
```

- **Initial implementation of LpScaler.**

Expand Down
24 changes: 12 additions & 12 deletions src/DotNetBridge/DotNetBridge.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,19 @@
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.ML" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.CpuMath" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.EntryPoints" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.Mkl.Components" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.ImageAnalytics" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.LightGBM" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.OnnxTransformer" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.Dnn" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.CpuMath" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.EntryPoints" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.Mkl.Components" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.ImageAnalytics" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.LightGBM" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.OnnxTransformer" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.Dnn" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.12-preview" />
<PackageReference Include="TensorFlow.NET" Version="0.10.10" />
<PackageReference Include="TensorFlow.NET" Version="0.11.3" />
<PackageReference Include="SciSharp.TensorFlow.Redist" Version="1.14.0" />
</ItemGroup>
</Project>
24 changes: 12 additions & 12 deletions src/Platforms/build.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.CpuMath" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.EntryPoints" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.Mkl.Components" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.ImageAnalytics" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.LightGBM" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.OnnxTransformer" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML.Dnn" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.16.0-preview" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.4.0-preview" />
<PackageReference Include="Microsoft.ML" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.CpuMath" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.EntryPoints" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.Mkl.Components" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.ImageAnalytics" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.LightGBM" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.OnnxTransformer" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.ML.Dnn" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.16.0-preview2" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.4.0-preview2" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.12-preview" />
<PackageReference Include="TensorFlow.NET" Version="0.10.10" />
<PackageReference Include="TensorFlow.NET" Version="0.11.3" />
<PackageReference Include="SciSharp.TensorFlow.Redist" Version="1.14.0" />
</ItemGroup>

Expand Down
3 changes: 3 additions & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@
<Compile Include="nimbusml\examples\Hinge.py" />
<Compile Include="nimbusml\examples\IidChangePointDetector.py" />
<Compile Include="nimbusml\examples\LinearSvmBinaryClassifier.py" />
<Compile Include="nimbusml\examples\PermutationFeatureImportance.py" />
<Compile Include="nimbusml\examples\Schema.py" />
<Compile Include="nimbusml\examples\SsaForecaster.py" />
<Compile Include="nimbusml\examples\SsaChangePointDetector.py" />
Expand Down Expand Up @@ -436,6 +437,7 @@
<Compile Include="nimbusml\internal\entrypoints\transforms_nooperation.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_optionalcolumncreator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_pcacalculator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_permutationfeatureimportance.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_predictedlabelcolumnoriginalvalueconverter.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_randomnumbergenerator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_rowrangefilter.py" />
Expand Down Expand Up @@ -682,6 +684,7 @@
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
<Compile Include="nimbusml\tests\idv\__init__.py" />
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\tests\pipeline\test_permutation_feature_importance.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_get_schema.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
Expand Down
8 changes: 7 additions & 1 deletion src/python/nimbusml/base_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,13 @@ def _invoke_inference_method(self, method, X, **params):

@trace
def get_feature_contributions(self, X, **params):
    """
    Run the ``'get_feature_contributions'`` inference method on ``X``.

    Thin wrapper: forwards ``X`` and any extra keyword arguments to
    ``self._invoke_inference_method`` and returns its result unchanged.

    :param X: data handed through to the inference method.
    :param params: additional keyword arguments, forwarded as-is.
    :return: whatever ``_invoke_inference_method`` returns.
    """
    # A single return only: the previous text carried the same call twice
    # (one-line and wrapped form), leaving the second copy unreachable.
    return self._invoke_inference_method('get_feature_contributions',
                                         X, **params)

@trace
def permutation_feature_importance(self, X, **params):
    """
    Run the ``'permutation_feature_importance'`` inference method on ``X``.

    Thin wrapper: forwards ``X`` and any extra keyword arguments to
    ``self._invoke_inference_method`` and returns its result unchanged.

    :param X: data handed through to the inference method.
    :param params: additional keyword arguments, forwarded as-is.
    :return: whatever ``_invoke_inference_method`` returns.
    """
    method_name = 'permutation_feature_importance'
    return self._invoke_inference_method(method_name, X, **params)

@trace
def predict(self, X, **params):
Expand Down
173 changes: 173 additions & 0 deletions src/python/nimbusml/examples/PermutationFeatureImportance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
###############################################################################
# Permutation Feature Importance (PFI)

# Permutation feature importance (PFI) is a technique to determine the global
# importance of features in a trained machine learning model. PFI is a simple
# yet powerful technique motivated by Breiman in section 10 of his Random
# Forests paper (Machine Learning, 2001). The advantage of the PFI method is
# that it is model agnostic - it works with any model that can be evaluated -
# and it can use any dataset, not just the training set, to compute feature
# importance metrics.

# PFI works by taking a labeled dataset, choosing a feature, and permuting the
# values for that feature across all the examples, so that each example now has
# a random value for the feature and the original values for all other
# features. The evaluation metric (e.g. NDCG) is then calculated for this
# modified dataset, and the change in the evaluation metric from the original
# dataset is computed. The larger the change in the evaluation metric, the more
# important the feature is to the model, i.e. the most important features are
# those that the model is most sensitive to. PFI works by performing this
# permutation analysis across all the features of a model, one after another.

# PFI is supported for binary classifiers, multiclass classifiers, regressors,
# and rankers.

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier, \
FastLinearClassifier, FastLinearRegressor
from nimbusml.preprocessing import ToKey
from numpy.testing import assert_almost_equal

# data input (as a FileDataStream)
# The path returned by as_filepath() is what read_csv() opens. The resulting
# classification_data stream is reused by both the binary and the multiclass
# examples below.
adult_path = get_dataset('uciadult_train').as_filepath()
classification_data = FileDataStream.read_csv(adult_path)
print(classification_data.head())
# Sample output of the print above (middle columns elided with "..."):
# label workclass education ... capital-loss hours-per-week
# 0 0 Private 11th ... 0 40
# 1 0 Private HS-grad ... 0 50
# 2 1 Local-gov Assoc-acdm ... 0 40
# 3 1 Private Some-college ... 0 40
# 4 0 ? Some-college ... 0 30

######################################
# PFI for Binary Classification models
######################################
# define the training pipeline with a binary classifier
# OneHotVectorizer is applied to the 'education' column; as the sample output
# below shows, PFI then reports every resulting slot (education.<value>) as a
# separate feature next to 'age'.
binary_pipeline = Pipeline([
OneHotVectorizer(columns=['education']),
LogisticRegressionBinaryClassifier(
feature=['age', 'education'], label='label')])

# train the model
binary_model = binary_pipeline.fit(classification_data)

# get permutation feature importance
# (evaluated here on the same data the model was trained on)
binary_pfi = binary_model.permutation_feature_importance(classification_data)

# Print PFI for each feature, ordered by most important features w.r.t. AUC.
# Since AUC is an increasing metric, the highest negative changes indicate the
# most important features.
# sort_values() is called without 'ascending', so the most negative changes
# are listed first (see the sample output below).
print("============== PFI for Binary Classification Model ==============")
print(binary_pfi.sort_values('AreaUnderRocCurve').head())
# Sample output (remaining metric columns elided with "..."):
# FeatureName AreaUnderRocCurve AreaUnderRocCurve.StdErr ...
# 0 age -0.081604 0.0 ...
# 6 education.Prof-school -0.012964 0.0 ...
# 10 education.Doctorate -0.012863 0.0 ...
# 8 education.Bachelors -0.010593 0.0 ...
# 2 education.HS-grad -0.005918 0.0 ...


###############################
# PFI for Classification models
###############################
# define the training pipeline with a (multiclass) classifier
# use 1 thread and no shuffling to force determinism
# Same features and label as the binary example above; only the learner
# differs.
multiclass_pipeline = Pipeline([
OneHotVectorizer(columns=['education']),
FastLinearClassifier(feature=['age', 'education'], label='label',
number_of_threads=1, shuffle=False)])

# train the model
multiclass_model = multiclass_pipeline.fit(classification_data)

# get permutation feature importance
# (evaluated here on the same data the model was trained on)
multiclass_pfi = multiclass_model.permutation_feature_importance(classification_data)

# Print PFI for each feature, ordered by most important features w.r.t. Macro
# accuracy. Since Macro accuracy is an increasing metric, the highest negative
# changes indicate the most important features.
# sort_values() is called without 'ascending', so the most negative changes
# are listed first (see the sample output below).
print("================== PFI for Classification Model ==================")
print(multiclass_pfi.sort_values('MacroAccuracy').head())
# Sample output (other metric columns elided with "..."):
# FeatureName MacroAccuracy ... MicroAccuracy ...
# 10 education.Doctorate -0.028233 ... -0.020 ...
# 0 age -0.001750 ... 0.002 ...
# 6 education.Prof-school -0.001750 ... 0.002 ...
# 9 education.Masters -0.001299 ... -0.002 ...
# 1 education.11th 0.000000 ... 0.000 ...

###########################
# PFI for Regression models
###########################
# load input data
infert_path = get_dataset('infert').as_filepath()
regression_data = FileDataStream.read_csv(infert_path)
print(regression_data.head())
# Sample output of the print above (some columns elided with "..."):
# age case education induced parity ... row_num spontaneous ...
# 0 26 1 0-5yrs 1 6 ... 1 2 ...
# 1 42 1 0-5yrs 1 1 ... 2 0 ...
# 2 39 1 0-5yrs 2 6 ... 3 0 ...
# 3 34 1 0-5yrs 2 4 ... 4 0 ...
# 4 35 1 6-11yrs 1 3 ... 5 1 ...

# define the training pipeline with a regressor
# use 1 thread and no shuffling to force determinism
# The regressor predicts 'age' from 'induced' and the one-hot encoded
# 'education' column.
regression_pipeline = Pipeline([
OneHotVectorizer(columns=['education']),
FastLinearRegressor(feature=['induced', 'education'], label='age',
number_of_threads=1, shuffle=False)])

# train the model
regression_model = regression_pipeline.fit(regression_data)

# get permutation feature importance
# (evaluated here on the same data the model was trained on)
regression_pfi = regression_model.permutation_feature_importance(regression_data)

# Print PFI for each feature, ordered by most important features w.r.t. MAE.
# Since MAE is a decreasing metric, the highest positive changes indicate the
# most important features.
# Hence ascending=False here, unlike the classification examples above.
print("==================== PFI for Regression Model ====================")
print(regression_pfi.sort_values('MeanAbsoluteError', ascending=False).head())
# Sample output (other metric columns elided with "..."):
# FeatureName MeanAbsoluteError ... RSquared RSquared.StdErr
# 3 education.12+ yrs 0.393451 ... -0.146338 0.0
# 0 induced 0.085804 ... -0.026168 0.0
# 1 education.0-5yrs 0.064460 ... -0.027587 0.0
# 2 education.6-11yrs -0.000047 ... 0.000059 0.0

########################
# PFI for Ranking models
########################
# load input data
ticket_path = get_dataset('gen_tickettrain').as_filepath()
ranking_data = FileDataStream.read_csv(ticket_path)
print(ranking_data.head())
# Sample output of the print above:
# rank group carrier price Class dep_day nbr_stops duration
# 0 2 1 AA 240 3 1 0 12.0
# 1 1 1 AA 300 3 0 1 15.0
# 2 1 1 AA 360 3 0 2 18.0
# 3 0 1 AA 540 2 0 0 12.0
# 4 1 1 AA 600 2 0 1 15.0

# define the training pipeline with a ranker
# ToKey is applied to 'group' before it is passed as group_id
# NOTE(review): presumably the ranker requires a key-typed group column -
# confirm against the LightGbmRanker documentation.
ranking_pipeline = Pipeline([
ToKey(columns=['group']),
LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
label='rank', group_id='group')])

# train the model
ranking_model = ranking_pipeline.fit(ranking_data)

# get permutation feature importance
# (evaluated here on the same data the model was trained on)
ranking_pfi = ranking_model.permutation_feature_importance(ranking_data)

# Print PFI for each feature, ordered by most important features w.r.t. DCG@1.
# Since DCG is an increasing metric, the highest negative changes indicate the
# most important features.
# sort_values() is called without 'ascending', so the most negative changes
# are listed first (see the sample output below).
print("===================== PFI for Ranking Model =====================")
print(ranking_pfi.sort_values('DCG@1').head())
# Sample output (other metric columns elided with "..."):
# Feature DCG@1 DCG@2 DCG@3 ... NDCG@1 NDCG@2 ...
# 0 Class -4.869096 -7.030914 -5.948893 ... -0.420238 -0.407281 ...
# 2 duration -2.344379 -3.595958 -3.956632 ... -0.232143 -0.231539 ...
# 1 dep_day 0.000000 0.000000 0.000000 ... 0.000000 0.000000 ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.PermutationFeatureImportance
"""

import numbers

from ..utils.entrypoints import EntryPoint
from ..utils.utils import try_set, unlist


def transforms_permutationfeatureimportance(
        data,
        predictor_model,
        metrics=None,
        use_feature_weight_filter=False,
        number_of_examples_to_use=None,
        permutation_count=1,
        **params):
    """
    **Description**
        Permutation Feature Importance (PFI)

    Builds the ``Transforms.PermutationFeatureImportance`` entry point.
    Arguments that are ``None`` are left out of the entry point; every
    other argument is passed through ``try_set`` before being stored.
    Extra keyword arguments (``params``) are accepted but not used.

    :param data: Input dataset (inputs).
    :param predictor_model: The path to the model file (inputs).
    :param use_feature_weight_filter: Use feature weights to pre-
        filter features (inputs).
    :param number_of_examples_to_use: Limit the number of examples to
        evaluate on (inputs).
    :param permutation_count: The number of permutations to perform
        (inputs).
    :param metrics: The PFI metrics (outputs).
    :return: the configured ``EntryPoint`` instance.
    """

    def _is_variable(value):
        # Graph variables are the string values that start with "$".
        return isinstance(value, str) and value.startswith("$")

    # (entry point key, supplied value, none_acceptable, is_of_type),
    # listed in the order the keys are inserted into ``inputs``.
    input_specs = (
        ('Data', data, False, str),
        ('PredictorModel', predictor_model, False, str),
        ('UseFeatureWeightFilter', use_feature_weight_filter, True, bool),
        ('NumberOfExamplesToUse', number_of_examples_to_use, True,
         numbers.Real),
        ('PermutationCount', permutation_count, True, numbers.Real),
    )

    inputs = {}
    for key, value, allow_none, expected_type in input_specs:
        if value is None:
            continue
        inputs[key] = try_set(
            obj=value,
            none_acceptable=allow_none,
            is_of_type=expected_type)

    outputs = {}
    if metrics is not None:
        outputs['Metrics'] = try_set(
            obj=metrics,
            none_acceptable=False,
            is_of_type=str)

    input_variables = set(filter(_is_variable, unlist(inputs.values())))
    output_variables = set(filter(_is_variable, unlist(outputs.values())))

    return EntryPoint(
        name='Transforms.PermutationFeatureImportance',
        inputs=inputs,
        outputs=outputs,
        input_variables=input_variables,
        output_variables=output_variables)
Loading