diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs deleted file mode 100644 index de98b1ddb0..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs +++ /dev/null @@ -1,120 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; - -namespace Microsoft.ML.Samples.Dynamic -{ - public static class FeatureSelectionTransform - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); - - // Data Preview - // 1. Label 0=benign, 1=malignant - // 2. Clump Thickness 1 - 10 - // 3. Uniformity of Cell Size 1 - 10 - // 4. Uniformity of Cell Shape 1 - 10 - // 5. Marginal Adhesion 1 - 10 - // 6. Single Epithelial Cell Size 1 - 10 - // 7. Bare Nuclei 1 - 10 - // 8. Bland Chromatin 1 - 10 - // 9. Normal Nucleoli 1 - 10 - // 10. Mitoses 1 - 10 - - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from - // all the feature columns into entries of a vector of a single column named "Features". - var loader = ml.Data.CreateTextLoader( - columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 9) }) - }, - hasHeader: true - ); - - // Then, we use the loader to load the data as an IDataView. - var data = loader.Load(dataFilePath); - - // Second, we define the transformations that we apply on the data. 
Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. - - // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default - // values than the specified count. This transformation can be used to remove slots with too many missing values. - var countSelectEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesCountSelect", inputColumnName: "Features", count: 695); - - // We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature - // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to - // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information - // between features and label. - var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5); - - // Now, we can put the previous two transformations together in a pipeline. - var pipeline = countSelectEst.Append(mutualInfoEst); - - // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. - var transformedData = pipeline.Fit(data).Transform(data); - - // Small helper to print the data inside a column, in the console. Only prints the first 10 rows. 
- Action>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - int count = 0; - foreach (var row in column) - { - foreach (var value in row.GetValues()) - Console.Write($"{value}\t"); - Console.WriteLine(""); - count++; - if (count >= 10) - break; - } - - Console.WriteLine("==================================================="); - }; - - // Print the data that results from the transformations. - var countSelectColumn = transformedData.GetColumn>(transformedData.Schema["FeaturesCountSelect"]); - var MISelectColumn = transformedData.GetColumn>(transformedData.Schema["FeaturesMISelect"]); - printHelper("FeaturesCountSelect", countSelectColumn); - printHelper("FeaturesMISelect", MISelectColumn); - - // Below is the output of the this code. We see that some slots habe been dropped by the first transformation. - // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutualinformation - // with the label column. - - // FeaturesCountSelect column obtained post-transformation. - // 5 4 4 5 7 3 2 1 - // 3 1 1 1 2 3 1 1 - // 6 8 8 1 3 3 7 1 - // 4 1 1 3 2 3 1 1 - // 8 10 10 8 7 9 7 1 - // 1 1 1 1 2 3 1 1 - // 2 1 2 1 2 3 1 1 - // 2 1 1 1 2 1 1 5 - // 4 2 1 1 2 2 1 1 - // 1 1 1 1 1 3 1 1 - // =================================================== - // FeaturesMISelect column obtained post-transformation. 
- // 4 4 7 3 2 - // 1 1 2 3 1 - // 8 8 3 3 7 - // 1 1 2 3 1 - // 10 10 7 9 7 - // 1 1 2 3 1 - // 1 2 2 3 1 - // 1 1 2 1 1 - // 2 1 2 2 1 - // 1 1 1 3 1 - // =================================================== - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs new file mode 100644 index 0000000000..8b27d23e0d --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -0,0 +1,100 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic +{ + public static class SelectFeaturesBasedOnCount + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + // Printing the columns of the input data. + Console.WriteLine($"NumericVector StringVector"); + foreach (var item in rawData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,NaN,6 A,WA,Male + // 4,5,6 A,,Female + // 4,5,6 A,NY, + // 4,NaN,NaN A,,Male + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // We will use the SelectFeaturesBasedOnCount to retain only those slots which have at least 'count' non-default values per slot. + var pipeline = + mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(outputColumnName: "NumericVector", count: 3) // Usage on numeric column. + .Append(mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(outputColumnName: "StringVector", count: 3)); // Usage on text column. 
+ + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVector StringVector"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,6 A,Male + // 4,6 A,Female + // 4,6 A, + // 4,NaN A,Male + } + + public class TransformedData + { + public float[] NumericVector { get; set; } + + public string[] StringVector { get; set; } + } + + public class InputData + { + [VectorType(3)] + public float[] NumericVector { get; set; } + + [VectorType(3)] + public string[] StringVector { get; set; } + } + + /// + /// Return a few rows of data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new InputData + { + NumericVector = new float[] { 4, float.NaN, 6 }, + StringVector = new string[] { "A", "WA", "Male"} + }, + new InputData + { + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "", "Female"} + }, + new InputData + { + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "NY", null} + }, + new InputData + { + NumericVector = new float[] { 4, float.NaN, float.NaN }, + StringVector = new string[] { "A", null, "Male"} + } + }; + return data; + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs new file mode 100644 index 0000000000..ae45fa375e --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs @@ -0,0 +1,103 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace 
Samples.Dynamic +{ + public static class SelectFeaturesBasedOnCountMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + // Printing the columns of the input data. + Console.WriteLine($"NumericVector StringVector"); + foreach (var item in rawData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,NaN,6 A,WA,Male + // 4,5,6 A,,Female + // 4,5,6 A,NY, + // 4,NaN,NaN A,,Male + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // We will use the SelectFeaturesBasedOnCount transform estimator, to retain only those slots which have + // at least 'count' non-default values per slot. + + // Multi column example. This pipeline transforms two columns using the provided parameters. + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( + new InputOutputColumnPair[] { new InputOutputColumnPair("NumericVector"), new InputOutputColumnPair("StringVector") }, + count: 3); + + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. 
+ Console.WriteLine($"NumericVector StringVector"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,6 A,Male + // 4,6 A,Female + // 4,6 A, + // 4,NaN A,Male + } + + private class TransformedData + { + public float[] NumericVector { get; set; } + + public string[] StringVector { get; set; } + } + + public class InputData + { + [VectorType(3)] + public float[] NumericVector { get; set; } + + [VectorType(3)] + public string[] StringVector { get; set; } + } + + /// + /// Returns a few rows of data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new InputData + { + NumericVector = new float[] { 4, float.NaN, 6 }, + StringVector = new string[] { "A", "WA", "Male"} + }, + new InputData + { + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "", "Female"} + }, + new InputData + { + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "NY", null} + }, + new InputData + { + NumericVector = new float[] { 4, float.NaN, float.NaN }, + StringVector = new string[] { "A", null, "Male"} + } + }; + return data; + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs new file mode 100644 index 0000000000..d7aa805ceb --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -0,0 +1,99 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic +{ + public static class SelectFeaturesBasedOnMutualInformation + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. 
It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + // Printing the columns of the input data. + Console.WriteLine($"Label NumericVector"); + foreach (var item in rawData) + Console.WriteLine("{0,-25} {1,-25}", item.Label, string.Join(",", item.NumericVector)); + + // Label NumericVector + // True 4,0,6 + // False 0,5,7 + // True 4,0,6 + // False 0,5,7 + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature + // vector based on highest mutual information between that slot and a specified label. + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( + outputColumnName: "NumericVector", labelColumnName: "Label", + slotsInOutput:2); + + // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVector"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25}", string.Join(",", item.NumericVector)); + + // NumericVector + // 4,0 + // 0,5 + // 4,0 + // 0,5 + } + + public class TransformedData + { + public float[] NumericVector { get; set; } + } + + public class NumericData + { + public bool Label; + + [VectorType(3)] + public float[] NumericVector { get; set; } + } + + /// + /// Returns a few rows of numeric data. 
+ /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + NumericVector = new float[] { 4, 0, 6 }, + }, + new NumericData + { + Label = false, + NumericVector = new float[] { 0, 5, 7 }, + }, + new NumericData + { + Label = true, + NumericVector = new float[] { 4, 0, 6 }, + }, + new NumericData + { + Label = false, + NumericVector = new float[] { 0, 5, 7 }, + } + }; + return data; + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs new file mode 100644 index 0000000000..d9543cee7e --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic +{ + public static class SelectFeaturesBasedOnMutualInformationMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + // Printing the columns of the input data. 
+ Console.WriteLine($"NumericVectorA NumericVectorB"); + foreach (var item in rawData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVectorA), string.Join(",", item.NumericVectorB)); + + // NumericVectorA NumericVectorB + // 4,0,6 7,8,9 + // 0,5,7 7,9,0 + // 4,0,6 7,8,9 + // 0,5,7 7,8,0 + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature + // vector based on highest mutual information between that slot and a specified label. + + // Multi column example: This pipeline transforms two columns using the provided parameters. + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( + new InputOutputColumnPair[] { new InputOutputColumnPair("NumericVectorA"), new InputOutputColumnPair("NumericVectorB") }, + labelColumnName: "Label", + slotsInOutput: 4); + + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVectorA NumericVectorB"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVectorA), string.Join(",", item.NumericVectorB)); + + // NumericVectorA NumericVectorB + // 4,0,6 9 + // 0,5,7 0 + // 4,0,6 9 + // 0,5,7 0 + } + + private class TransformedData + { + public float[] NumericVectorA { get; set; } + + public float[] NumericVectorB { get; set; } + } + + public class NumericData + { + public bool Label; + + [VectorType(3)] + public float[] NumericVectorA { get; set; } + + [VectorType(3)] + public float[] NumericVectorB { get; set; } + } + + /// + /// Returns a few rows of numeric data. 
+ /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + NumericVectorA = new float[] { 4, 0, 6 }, + NumericVectorB = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + NumericVectorA = new float[] { 0, 5, 7 }, + NumericVectorB = new float[] { 7, 9, 0 }, + }, + new NumericData + { + Label = true, + NumericVectorA = new float[] { 4, 0, 6 }, + NumericVectorB = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + NumericVectorA = new float[] { 0, 5, 7 }, + NumericVectorB = new float[] { 7, 8, 0 }, + } + }; + return data; + } + } +} diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 1b9d35d251..c115836f22 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -24,7 +24,7 @@ public static class FeatureSelectionCatalog /// /// /// /// /// @@ -41,6 +41,13 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. + /// + /// + /// + /// + /// public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, InputOutputColumnPair[] columns, string labelColumnName = MutualInfoSelectDefaults.LabelColumn, @@ -56,13 +63,6 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// /// The transform's catalog. /// Describes the parameters of the feature selection process for each column pair. 
- /// - /// - /// - /// - /// [BestFriend] internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog, params CountFeatureSelectingEstimator.ColumnOptions[] columns) @@ -76,7 +76,7 @@ internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this T /// /// /// /// /// @@ -90,6 +90,13 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra /// The transform's catalog. /// Specifies the names of the columns on which to apply the transformation. /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. + /// + /// + /// + /// + /// public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog, InputOutputColumnPair[] columns, long count = CountSelectDefaults.Count)