diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs deleted file mode 100644 index d167f3f9e3..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs +++ /dev/null @@ -1,120 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.SamplesUtils; - -namespace Samples.Dynamic -{ - public static class FeatureContributionCalculationTransform - { - public static void Example() - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Read the Housing regression dataset - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); - - // Create a pipeline. - // Concatenate the features to create a Feature vector. - // Then append a linear model, setting the "MedianHomeValue" column as the label of the dataset, - // the "Features" column produced by concatenation as the features column. - var transformPipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", - "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", - "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"); - var learner = mlContext.Regression.Trainers.Ols( - labelColumnName: "MedianHomeValue", featureColumnName: "Features"); - - var transformedData = transformPipeline.Fit(data).Transform(data); - - // Now we train the model and score it on the transformed data. 
- var model = learner.Fit(transformedData); - var scoredData = model.Transform(transformedData); - - // Create a Feature Contribution Calculator - // Calculate the feature contributions for all features given trained model parameters - // And don't normalize the contribution scores - var featureContributionCalculator = mlContext.Transforms.CalculateFeatureContribution(model, numberOfPositiveContributions: 11, normalize: false); - var outputData = featureContributionCalculator.Fit(scoredData).Transform(scoredData); - - // FeatureContributionCalculatingEstimator can be use as an intermediary step in a pipeline. - // The features retained by FeatureContributionCalculatingEstimator will be in the FeatureContribution column. - var pipeline = mlContext.Transforms.CalculateFeatureContribution(model, numberOfPositiveContributions: 11) - .Append(mlContext.Regression.Trainers.Ols(featureColumnName: "FeatureContributions")); - var outData = featureContributionCalculator.Fit(scoredData).Transform(scoredData); - - // Let's extract the weights from the linear model to use as a comparison - var weights = model.Model.Weights; - - // Let's now walk through the first ten records and see which feature drove the values the most - // Get prediction scores and contributions - var scoringEnumerator = mlContext.Data.CreateEnumerable(outputData, true).GetEnumerator(); - int index = 0; - Console.WriteLine("Label\tScore\tBiggestFeature\tValue\tWeight\tContribution"); - while (scoringEnumerator.MoveNext() && index < 10) - { - var row = scoringEnumerator.Current; - - // Get the feature index with the biggest contribution - var featureOfInterest = GetMostContributingFeature(row.FeatureContributions); - - // And the corresponding information about the feature - var value = row.Features[featureOfInterest]; - var contribution = row.FeatureContributions[featureOfInterest]; - var name = data.Schema[featureOfInterest + 1].Name; - var weight = weights[featureOfInterest]; - - 
Console.WriteLine("{0:0.00}\t{1:0.00}\t{2}\t{3:0.00}\t{4:0.00}\t{5:0.00}", - row.MedianHomeValue, - row.Score, - name, - value, - weight, - contribution - ); - - index++; - } - - // The output of the above code is: - // Label Score BiggestFeature Value Weight Contribution - // 24.00 27.74 RoomsPerDwelling 6.58 98.55 39.95 - // 21.60 23.85 RoomsPerDwelling 6.42 98.55 39.01 - // 34.70 29.29 RoomsPerDwelling 7.19 98.55 43.65 - // 33.40 27.17 RoomsPerDwelling 7.00 98.55 42.52 - // 36.20 27.68 RoomsPerDwelling 7.15 98.55 43.42 - // 28.70 23.13 RoomsPerDwelling 6.43 98.55 39.07 - // 22.90 22.71 RoomsPerDwelling 6.01 98.55 36.53 - // 27.10 21.72 RoomsPerDwelling 6.17 98.55 37.50 - // 16.50 18.04 RoomsPerDwelling 5.63 98.55 34.21 - // 18.90 20.14 RoomsPerDwelling 6.00 98.55 36.48 - } - - private static int GetMostContributingFeature(float[] featureContributions) - { - int index = 0; - float currentValue = float.NegativeInfinity; - for (int i = 0; i < featureContributions.Length; i++) - if (featureContributions[i] > currentValue) - { - currentValue = featureContributions[i]; - index = i; - } - return index; - } - - private sealed class HousingRegressionScoreAndContribution - { - public float MedianHomeValue { get; set; } - - [VectorType(11)] - public float[] Features { get; set; } - - public float Score { get; set; } - - [VectorType(4)] - public float[] FeatureContributions { get; set; } - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContribution.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContribution.cs new file mode 100644 index 0000000000..3a342f8809 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContribution.cs @@ -0,0 +1,115 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; + +namespace Samples.Dynamic +{ + public static class CalculateFeatureContribution + { + public static void Example() + { + // Create a 
new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(seed: 1); + + // Create a small dataset. + var samples = GenerateData(); + + // Convert training data to IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create a pipeline to concatenate the features into a feature vector and normalize it. + var transformPipeline = mlContext.Transforms.Concatenate("Features", + new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }) + .Append(mlContext.Transforms.NormalizeMeanVariance("Features")); + + // Fit the pipeline. + var transformer = transformPipeline.Fit(data); + + // Transform the data. + var transformedData = transformer.Transform(data); + + // Define a linear trainer. + var linearTrainer = mlContext.Regression.Trainers.Ols(); + + // Now we train the model on the transformed data. + var linearModel = linearTrainer.Fit(transformedData); + // Print the model parameters. + Console.WriteLine($"Linear Model Parameters"); + Console.WriteLine($"Bias: {linearModel.Model.Bias} Feature1: {linearModel.Model.Weights[0]} Feature2: {linearModel.Model.Weights[1]}"); + + // Define a feature contribution calculator for all the features, and don't normalize the contributions. + // These are "trivial estimators" and they don't need to fit to the data, so we can feed a subset. + var simpleScoredDataset = linearModel.Transform(mlContext.Data.TakeRows(transformedData, 1)); + var linearFeatureContributionCalculator = mlContext.Transforms.CalculateFeatureContribution(linearModel, normalize: false).Fit(simpleScoredDataset); + + // Create a transformer chain to describe the entire pipeline. + var scoringPipeline = transformer.Append(linearModel).Append(linearFeatureContributionCalculator); + + // Create the prediction engine to score single examples and compute their feature contributions. 
+ var predictionEngine = mlContext.Model.CreatePredictionEngine(scoringPipeline); + + // Score the first example and compute its feature contributions. + var prediction = predictionEngine.Predict(samples.First()); + + // Write out the prediction, with contributions. + // Note that for the linear model, the contribution of a feature in an example is the feature-weight*feature-value. + // The total prediction is thus the bias plus the feature contributions. + Console.WriteLine($"Label: {prediction.Label} Prediction: {prediction.Score}"); + Console.WriteLine($"Feature1: {prediction.Features[0]} Feature2: {prediction.Features[1]}"); + Console.WriteLine($"Feature Contributions: {prediction.FeatureContributions[0]} {prediction.FeatureContributions[1]}"); + + // Expected output: + // Linear Model Parameters + // Bias: -0.007505796 Feature1: 1.536963 Feature2: 3.031206 + // Label: 1.55184 Prediction: 1.389091 + // Feature1: -0.5053467 Feature2: 0.7169741 + // Feature Contributions: -0.7766994 2.173296 + } + + private class Data + { + public float Label { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + private class ScoredData : Data + { + public float Score { get; set; } + public float[] Features { get; set; } + public float[] FeatureContributions { get; set; } + } + + /// <summary> + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// </summary> + /// <param name="nExamples">The number of examples.</param> + /// <param name="bias">The bias, or offset, in the calculation of the label.</param> + /// <param name="weight1">The weight to multiply the first feature with to compute the label.</param> + /// <param name="weight2">The weight to multiply the second feature with to compute the label.</param> + /// <param name="seed">The seed for generating feature values and label noise.</param> + /// <returns>An enumerable of Data objects.</returns> 
+ private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + data.Label = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + yield return data; + } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContributionCalibrated.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContributionCalibrated.cs new file mode 100644 index 0000000000..c05e7e5468 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CalculateFeatureContributionCalibrated.cs @@ -0,0 +1,124 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; + +namespace Samples.Dynamic +{ + public static class CalculateFeatureContributionCalibrated + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset. + var samples = GenerateData(); + + // Convert training data to IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create a pipeline to concatenate the features into a feature vector and normalize it. + var transformPipeline = mlContext.Transforms.Concatenate("Features", + new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }) + .Append(mlContext.Transforms.NormalizeMeanVariance("Features")); + + // Fit the pipeline. + var transformer = transformPipeline.Fit(data); + + // Transform the data. 
+ var transformedData = transformer.Transform(data); + + // Define a linear trainer. + var linearTrainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(); + + // Now we train the model on the transformed data. + var linearModel = linearTrainer.Fit(transformedData); + // Print the model parameters. + Console.WriteLine($"Linear Model Parameters"); + Console.WriteLine("Bias: {0} Feature1: {1} Feature2: {2}", + linearModel.Model.SubModel.Bias, + linearModel.Model.SubModel.Weights[0], + linearModel.Model.SubModel.Weights[1]); + + // Define a feature contribution calculator for all the features, and don't normalize the contributions. + // These are "trivial estimators" and they don't need to fit to the data, so we can feed a subset. + var simpleScoredDataset = linearModel.Transform(mlContext.Data.TakeRows(transformedData, 1)); + var linearFeatureContributionCalculator = mlContext.Transforms.CalculateFeatureContribution(linearModel, normalize: false).Fit(simpleScoredDataset); + + // Create a transformer chain to describe the entire pipeline. + var scoringPipeline = transformer.Append(linearModel).Append(linearFeatureContributionCalculator); + + // Create the prediction engine to score single examples and compute their feature contributions. + var predictionEngine = mlContext.Model.CreatePredictionEngine(scoringPipeline); + + // Score the first example and compute its feature contributions. + var prediction = predictionEngine.Predict(samples.First()); + + // Write out the prediction, with contributions. + // Note that for the linear model, the contribution of a feature in an example is the feature-weight*feature-value. + // The raw prediction score (not the probability) is thus the bias plus the feature contributions. 
+ Console.WriteLine($"Label: {prediction.Label} Prediction-Score: {prediction.Score} Prediction-Probability: {prediction.Probability}"); + Console.WriteLine($"Feature1: {prediction.Features[0]} Feature2: {prediction.Features[1]}"); + Console.WriteLine($"Feature Contributions: {prediction.FeatureContributions[0]} {prediction.FeatureContributions[1]}"); + + // Expected output: + // Linear Model Parameters + // Bias: 0.003757346 Feature1: 9.070082 Feature2: 17.7816 + // Label: True Prediction-Score: 8.169167 Prediction-Probability: 0.9997168 + // Feature1: -0.5053467 Feature2: 0.7169741 + // Feature Contributions: -4.583536 12.74894 + } + + private class Data + { + public bool Label { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + private class ScoredData : Data + { + public float Score { get; set; } + + public float Probability { get; set; } + + public float[] Features { get; set; } + + public float[] FeatureContributions { get; set; } + } + + /// <summary> + /// Generate an enumerable of Data objects, creating the Boolean label by thresholding + /// a noisy linear combination of the features. + /// </summary> + /// <param name="nExamples">The number of examples.</param> + /// <param name="bias">The bias, or offset, in the calculation of the label.</param> + /// <param name="weight1">The weight to multiply the first feature with to compute the label.</param> + /// <param name="weight2">The weight to multiply the second feature with to compute the label.</param> + /// <param name="seed">The seed for generating feature values and label noise.</param> + /// <returns>An enumerable of Data objects.</returns> + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a Boolean label with noise. 
+ var value = bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5; + data.Label = Sigmoid(value) > 0.5; + yield return data; + } + } + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 03b6ce19b6..a74699c5a1 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - ReplaceMissingValues.Example(); + CalculateFeatureContribution.Example(); } } } diff --git a/src/Microsoft.ML.Data/Transforms/ExplainabilityCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExplainabilityCatalog.cs index 083d0f6380..cc861c0b9b 100644 --- a/src/Microsoft.ML.Data/Transforms/ExplainabilityCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExplainabilityCatalog.cs @@ -28,7 +28,7 @@ public static class ExplainabilityCatalog /// /// /// /// /// @@ -54,7 +54,7 @@ public static FeatureContributionCalculatingEstimator CalculateFeatureContributi /// /// /// /// ///