From 7545d5aa339f53485b1bcea97b805973d45c49aa Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Wed, 3 Apr 2019 00:13:00 +0000 Subject: [PATCH 1/6] samples for FeatureSelection transform estimators --- .../Dynamic/FeatureSelectionTransform.cs | 120 ---------------- .../SelectFeaturesBasedOnCount.cs | 117 ++++++++++++++++ .../SelectFeaturesBasedOnMutualInformation.cs | 129 ++++++++++++++++++ .../FeatureSelectionCatalog.cs | 25 ++-- 4 files changed, 262 insertions(+), 129 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs deleted file mode 100644 index de98b1ddb0..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs +++ /dev/null @@ -1,120 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; - -namespace Microsoft.ML.Samples.Dynamic -{ - public static class FeatureSelectionTransform - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); - - // Data Preview - // 1. Label 0=benign, 1=malignant - // 2. Clump Thickness 1 - 10 - // 3. Uniformity of Cell Size 1 - 10 - // 4. Uniformity of Cell Shape 1 - 10 - // 5. Marginal Adhesion 1 - 10 - // 6. Single Epithelial Cell Size 1 - 10 - // 7. Bare Nuclei 1 - 10 - // 8. Bland Chromatin 1 - 10 - // 9. Normal Nucleoli 1 - 10 - // 10. Mitoses 1 - 10 - - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from - // all the feature columns into entries of a vector of a single column named "Features". - var loader = ml.Data.CreateTextLoader( - columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 9) }) - }, - hasHeader: true - ); - - // Then, we use the loader to load the data as an IDataView. - var data = loader.Load(dataFilePath); - - // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. - - // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default - // values than the specified count. This transformation can be used to remove slots with too many missing values. - var countSelectEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesCountSelect", inputColumnName: "Features", count: 695); - - // We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature - // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to - // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information - // between features and label. - var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5); - - // Now, we can put the previous two transformations together in a pipeline. - var pipeline = countSelectEst.Append(mutualInfoEst); - - // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. - var transformedData = pipeline.Fit(data).Transform(data); - - // Small helper to print the data inside a column, in the console. Only prints the first 10 rows. - Action>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - int count = 0; - foreach (var row in column) - { - foreach (var value in row.GetValues()) - Console.Write($"{value}\t"); - Console.WriteLine(""); - count++; - if (count >= 10) - break; - } - - Console.WriteLine("==================================================="); - }; - - // Print the data that results from the transformations. - var countSelectColumn = transformedData.GetColumn>(transformedData.Schema["FeaturesCountSelect"]); - var MISelectColumn = transformedData.GetColumn>(transformedData.Schema["FeaturesMISelect"]); - printHelper("FeaturesCountSelect", countSelectColumn); - printHelper("FeaturesMISelect", MISelectColumn); - - // Below is the output of the this code. We see that some slots habe been dropped by the first transformation. - // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutualinformation - // with the label column. - - // FeaturesCountSelect column obtained post-transformation. - // 5 4 4 5 7 3 2 1 - // 3 1 1 1 2 3 1 1 - // 6 8 8 1 3 3 7 1 - // 4 1 1 3 2 3 1 1 - // 8 10 10 8 7 9 7 1 - // 1 1 1 1 2 3 1 1 - // 2 1 2 1 2 3 1 1 - // 2 1 1 1 2 1 1 5 - // 4 2 1 1 2 2 1 1 - // 1 1 1 1 1 3 1 1 - // =================================================== - // FeaturesMISelect column obtained post-transformation. - // 4 4 7 3 2 - // 1 1 2 3 1 - // 8 8 3 3 7 - // 1 1 2 3 1 - // 10 10 7 9 7 - // 1 1 2 3 1 - // 1 2 2 3 1 - // 1 1 2 1 1 - // 2 1 2 2 1 - // 1 1 1 3 1 - // =================================================== - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs new file mode 100644 index 0000000000..3a0aff4f35 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -0,0 +1,117 @@ +using System; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class SelectFeaturesBasedOnCount + { + private static readonly int printRowCount = 4; + + public static void Example() + { + // Downloading a classification dataset from github.com/dotnet/machinelearning. + // It will be stored in the same path as the executable + string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); + + // Data Preview + // 1. Label 0=benign, 1=malignant + // 2. Clump Thickness 1 - 10 + // 3. Uniformity of Cell Size 1 - 10 + // 4. Uniformity of Cell Shape 1 - 10 + // 5. Marginal Adhesion 1 - 10 + // 6. Single Epithelial Cell Size 1 - 10 + // 7. Bare Nuclei 1 - 10 + // 8. Bland Chromatin 1 - 10 + // 9. Normal Nucleoli 1 - 10 + // 10. Mitoses 1 - 10 + + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from + // all the feature columns into entries of a vector of a single column named "Features". + var loader = mlContext.Data.CreateTextLoader( + columns: new[] + { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("GroupA", DataKind.Single, new [] { new TextLoader.Range(1, 3) }), + new TextLoader.Column("GroupB", DataKind.Single, new [] { new TextLoader.Range(4, 6) }), + new TextLoader.Column("GroupC", DataKind.Single, new [] { new TextLoader.Range(7, 9) }), + }, + hasHeader: true + ); + + // Then, we use the loader to load the data as an IDataView. + var data = loader.Load(dataFilePath); + + Console.WriteLine("Contents of column 'GroupB'"); + PrintDataColumn(data, "GroupB"); + // 5 7 10 + // 1 2 2 + // 1 3 4 + // 3 2 1 + + // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data + // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + + // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default + // values than the specified count. This transformation can be used to remove slots with too many missing values. + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( + outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", count: 695); + + // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. + var transformedData = pipeline.Fit(data).Transform(data); + + Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); + PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); + // Note, SelectFeaturesBasedOnCount retained only 2 slots (out of 3). + // 5 7 + // 1 2 + // 1 3 + // 3 2 + + // Multi column example : This pipeline uses two columns for transformation + pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + count: 695); + + transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true).Take(printRowCount); + Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + foreach (var item in convertedData) + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + + // 5 7 3 2 1 + // 1 2 3 1 1 + // 1 3 3 7 1 + // 3 2 3 1 1 + } + + private static void PrintDataColumn(IDataView transformedData, string columnName) + { + var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); + + int count = 0; + foreach (var row in countSelectColumn) + { + for (var i = 0; i < row.Length; i++) + Console.Write($"{row[i]} "); + Console.WriteLine(); + + count += 1; + if (count >= printRowCount) + break; + } + } + + private class TransformedData + { + public float[] GroupB { get; set; } + + public float[] GroupC { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs new file mode 100644 index 0000000000..c0599b14b7 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -0,0 +1,129 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class SelectFeaturesBasedOnMutualInformation + { + private static readonly int printRowCount = 4; + + public static void Example() + { + // Downloading a classification dataset from github.com/dotnet/machinelearning. + // It will be stored in the same path as the executable + string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); + + // Data Preview + // 1. Label 0=benign, 1=malignant + // 2. Clump Thickness 1 - 10 + // 3. Uniformity of Cell Size 1 - 10 + // 4. Uniformity of Cell Shape 1 - 10 + // 5. Marginal Adhesion 1 - 10 + // 6. Single Epithelial Cell Size 1 - 10 + // 7. Bare Nuclei 1 - 10 + // 8. Bland Chromatin 1 - 10 + // 9. Normal Nucleoli 1 - 10 + // 10. Mitoses 1 - 10 + + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from + // all the feature columns into entries of a vector of a single column named "Features". + var loader = mlContext.Data.CreateTextLoader( + columns: new[] + { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("GroupA", DataKind.Single, new [] { new TextLoader.Range(1, 3) }), + new TextLoader.Column("GroupB", DataKind.Single, new [] { new TextLoader.Range(4, 6) }), + new TextLoader.Column("GroupC", DataKind.Single, new [] { new TextLoader.Range(7, 9) }), + }, + hasHeader: true + ); + + // Then, we use the loader to load the data as an IDataView. + var data = loader.Load(dataFilePath); + + Console.WriteLine("Contents of column 'GroupB'"); + PrintDataColumn(data, "GroupB"); + // 5 7 10 + // 1 2 2 + // 1 3 4 + // 3 2 1 + + Console.WriteLine("Contents of column 'GroupC'"); + PrintDataColumn(data, "GroupC"); + // 3 2 1 + // 3 1 1 + // 3 7 1 + // 3 1 1 + + // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data + // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + + // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature + // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to + // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information + // between features and label. + + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( + outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", labelColumnName: "Label", slotsInOutput: 2); + + // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. + var transformedData = pipeline.Fit(data).Transform(data); + + Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); + PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); + // Note, SelectFeaturesBasedOnMutualInformation retained 2 slots (out of 3). + // 7 10 + // 2 2 + // 3 4 + // 2 1 + + // Multi column example : This pipeline uses two columns for transformation + pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + labelColumnName: "Label", + slotsInOutput:4); + + transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true).Take(printRowCount); + Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + foreach (var item in convertedData) + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + + // 7 10 3 2 + // 2 2 3 1 + // 3 4 3 7 + // 2 1 3 1 + } + + private static void PrintDataColumn(IDataView transformedData, string columnName) + { + var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); + + int count = 0; + foreach (var row in countSelectColumn) + { + for (var i = 0; i < row.Length; i++) + Console.Write($"{row[i]} "); + Console.WriteLine(); + + count += 1; + if (count >= printRowCount) + break; + } + } + + private class TransformedData + { + public float[] GroupB { get; set; } + + public float[] GroupC { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 1b9d35d251..0cdea6b269 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -24,7 +24,7 @@ public static class FeatureSelectionCatalog /// /// /// /// /// @@ -41,6 +41,13 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. + /// + /// + /// + /// + /// public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, InputOutputColumnPair[] columns, string labelColumnName = MutualInfoSelectDefaults.LabelColumn, @@ -56,13 +63,6 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// /// The transform's catalog. /// Describes the parameters of the feature selection process for each column pair. - /// - /// - /// - /// - /// [BestFriend] internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog, params CountFeatureSelectingEstimator.ColumnOptions[] columns) @@ -76,7 +76,7 @@ internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this T /// /// /// /// /// @@ -90,6 +90,13 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra /// The transform's catalog. /// Specifies the names of the columns on which to apply the transformation. /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. + /// + /// + /// + /// + /// public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog, InputOutputColumnPair[] columns, long count = CountSelectDefaults.Count) From fdcc01254b02a8c74ff5d0b2565bca1b5e5290d6 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Wed, 3 Apr 2019 23:13:09 +0000 Subject: [PATCH 2/6] fix review comments --- .../SelectFeaturesBasedOnCount.cs | 135 ++++++++-------- .../SelectFeaturesBasedOnCountMultiColumn.cs | 128 +++++++++++++++ .../SelectFeaturesBasedOnMutualInformation.cs | 146 ++++++++---------- ...uresBasedOnMutualInformationMultiColumn.cs | 111 +++++++++++++ .../FeatureSelectionCatalog.cs | 8 +- 5 files changed, 371 insertions(+), 157 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs index 3a0aff4f35..a1cad6af94 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -1,117 +1,110 @@ using System; -using System.Linq; +using System.Collections.Generic; using Microsoft.ML.Data; namespace Microsoft.ML.Samples.Dynamic { public static class SelectFeaturesBasedOnCount { - private static readonly int printRowCount = 4; - public static void Example() { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); - - // Data Preview - // 1. Label 0=benign, 1=malignant - // 2. Clump Thickness 1 - 10 - // 3. Uniformity of Cell Size 1 - 10 - // 4. Uniformity of Cell Shape 1 - 10 - // 5. Marginal Adhesion 1 - 10 - // 6. Single Epithelial Cell Size 1 - 10 - // 7. Bare Nuclei 1 - 10 - // 8. Bland Chromatin 1 - 10 - // 9. Normal Nucleoli 1 - 10 - // 10. Mitoses 1 - 10 - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); - // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from - // all the feature columns into entries of a vector of a single column named "Features". - var loader = mlContext.Data.CreateTextLoader( - columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("GroupA", DataKind.Single, new [] { new TextLoader.Range(1, 3) }), - new TextLoader.Column("GroupB", DataKind.Single, new [] { new TextLoader.Range(4, 6) }), - new TextLoader.Column("GroupC", DataKind.Single, new [] { new TextLoader.Range(7, 9) }), - }, - hasHeader: true - ); - - // Then, we use the loader to load the data as an IDataView. - var data = loader.Load(dataFilePath); + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + var data = mlContext.Data.LoadFromEnumerable(rawData); Console.WriteLine("Contents of column 'GroupB'"); PrintDataColumn(data, "GroupB"); - // 5 7 10 - // 1 2 2 - // 1 3 4 - // 3 2 1 - + // 4 NaN 6 + // 4 5 6 + // 4 5 6 + // 4 NaN NaN + // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. - // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default - // values than the specified count. This transformation can be used to remove slots with too many missing values. + // We will use the SelectFeaturesBasedOnCount transform estimator, to retain only those slots which have + // at least 'count' non-default values per slot. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", count: 695); + outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", count: 3); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); - // Note, SelectFeaturesBasedOnCount retained only 2 slots (out of 3). - // 5 7 - // 1 2 - // 1 3 - // 3 2 - - // Multi column example : This pipeline uses two columns for transformation - pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, - count: 695); - - transformedData = pipeline.Fit(data).Transform(data); - - var convertedData = mlContext.Data.CreateEnumerable(transformedData, true).Take(printRowCount); - Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); - foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); - - // 5 7 3 2 1 - // 1 2 3 1 1 - // 1 3 3 7 1 - // 3 2 3 1 1 + // 4 6 + // 4 6 + // 4 6 + // 4 NaN } private static void PrintDataColumn(IDataView transformedData, string columnName) { var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); - int count = 0; foreach (var row in countSelectColumn) { for (var i = 0; i < row.Length; i++) - Console.Write($"{row[i]} "); + Console.Write($"{row[i]}\t"); Console.WriteLine(); - - count += 1; - if (count >= printRowCount) - break; } } - private class TransformedData + public class NumericData { + public bool Label; + + [VectorType(3)] + public float[] GroupA { get; set; } + + [VectorType(3)] public float[] GroupB { get; set; } + [VectorType(3)] public float[] GroupC { get; set; } } + + /// + /// Returns a few rows of numeric data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, float.NaN, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 5, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 5, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, float.NaN, float.NaN }, + GroupC = new float[] { 7, 8, 9 }, + } + }; + return data; + } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs new file mode 100644 index 0000000000..0b73a6d0fd --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs @@ -0,0 +1,128 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class SelectFeaturesBasedOnCountMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + var data = mlContext.Data.LoadFromEnumerable(rawData); + + Console.WriteLine("Contents of column 'GroupB'"); + PrintDataColumn(data, "GroupB"); + // 4 NaN 6 + // 4 5 6 + // 4 5 6 + // 4 NaN NaN + + Console.WriteLine("Contents of column 'GroupC'"); + PrintDataColumn(data, "GroupC"); + // NaN 8 9 + // NaN 8 9 + // NaN 8 9 + // 7 8 9 + + // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data + // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + + // We will use the SelectFeaturesBasedOnCount transform estimator, to retain only those slots which have + // at least 'count' non-default values per slot. + + // Multi column example : This pipeline uses two columns for transformation + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + count: 3); + + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + foreach (var item in convertedData) + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + // 4 6 8 9 + // 4 6 8 9 + // 4 6 8 9 + // 4 NaN 8 9 + } + + private static void PrintDataColumn(IDataView transformedData, string columnName) + { + var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); + + foreach (var row in countSelectColumn) + { + for (var i = 0; i < row.Length; i++) + Console.Write($"{row[i]}\t"); + Console.WriteLine(); + } + } + + private class TransformedData + { + public float[] GroupB { get; set; } + + public float[] GroupC { get; set; } + } + + public class NumericData + { + public bool Label; + + [VectorType(3)] + public float[] GroupA { get; set; } + + [VectorType(3)] + public float[] GroupB { get; set; } + + [VectorType(3)] + public float[] GroupC { get; set; } + } + + /// + /// Returns a few rows of numeric data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, float.NaN, 6 }, + GroupC = new float[] { float.NaN, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 5, 6 }, + GroupC = new float[] { float.NaN, 8, 9 }, + }, + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 5, 6 }, + GroupC = new float[] { float.NaN, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, float.NaN, float.NaN }, + GroupC = new float[] { 7, 8, 9 }, + } + }; + return data; + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs index c0599b14b7..a4b1f4d6b4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -1,129 +1,111 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML.Data; namespace Microsoft.ML.Samples.Dynamic { public static class SelectFeaturesBasedOnMutualInformation { - private static readonly int printRowCount = 4; - public static void Example() { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); - - // Data Preview - // 1. Label 0=benign, 1=malignant - // 2. Clump Thickness 1 - 10 - // 3. Uniformity of Cell Size 1 - 10 - // 4. Uniformity of Cell Shape 1 - 10 - // 5. Marginal Adhesion 1 - 10 - // 6. Single Epithelial Cell Size 1 - 10 - // 7. Bare Nuclei 1 - 10 - // 8. Bland Chromatin 1 - 10 - // 9. Normal Nucleoli 1 - 10 - // 10. Mitoses 1 - 10 - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); - // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from - // all the feature columns into entries of a vector of a single column named "Features". - var loader = mlContext.Data.CreateTextLoader( - columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("GroupA", DataKind.Single, new [] { new TextLoader.Range(1, 3) }), - new TextLoader.Column("GroupB", DataKind.Single, new [] { new TextLoader.Range(4, 6) }), - new TextLoader.Column("GroupC", DataKind.Single, new [] { new TextLoader.Range(7, 9) }), - }, - hasHeader: true - ); - - // Then, we use the loader to load the data as an IDataView. - var data = loader.Load(dataFilePath); + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); - Console.WriteLine("Contents of column 'GroupB'"); - PrintDataColumn(data, "GroupB"); - // 5 7 10 - // 1 2 2 - // 1 3 4 - // 3 2 1 + Console.WriteLine("Contents of two columns 'Label' and 'GroupB'."); + foreach (var item in rawData) + Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.GroupB)); + // True 4 0 6 + // False 0 5 7 + // True 4 0 6 + // False 0 5 7 - Console.WriteLine("Contents of column 'GroupC'"); - PrintDataColumn(data, "GroupC"); - // 3 2 1 - // 3 1 1 - // 3 7 1 - // 3 1 1 - - // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + var data = mlContext.Data.LoadFromEnumerable(rawData); // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature - // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to - // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information - // between features and label. + // vector based on highest mutual information between that slot and a specified label. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", labelColumnName: "Label", slotsInOutput: 2); + outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", labelColumnName: "Label", + slotsInOutput:2); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); - // Note, SelectFeaturesBasedOnMutualInformation retained 2 slots (out of 3). - // 7 10 - // 2 2 - // 3 4 - // 2 1 - - // Multi column example : This pipeline uses two columns for transformation - pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, - labelColumnName: "Label", - slotsInOutput:4); - - transformedData = pipeline.Fit(data).Transform(data); - - var convertedData = mlContext.Data.CreateEnumerable(transformedData, true).Take(printRowCount); - Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); - foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); - - // 7 10 3 2 - // 2 2 3 1 - // 3 4 3 7 - // 2 1 3 1 + // 4 0 + // 0 5 + // 4 0 + // 0 5 } private static void PrintDataColumn(IDataView transformedData, string columnName) { var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); - int count = 0; foreach (var row in countSelectColumn) { for (var i = 0; i < row.Length; i++) Console.Write($"{row[i]} "); Console.WriteLine(); - - count += 1; - if (count >= printRowCount) - break; } } - private class TransformedData + public class NumericData { + public bool Label; + + [VectorType(3)] + public float[] GroupA { get; set; } + + [VectorType(3)] public float[] GroupB { get; set; } + [VectorType(3)] public float[] GroupC { get; set; } } + + /// + /// Returns a few rows of numeric data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 0, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 0, 5, 7 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 0, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 0, 5, 7 }, + GroupC = new float[] { 7, 8, 9 }, + } + }; + return data; + } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs new file mode 100644 index 0000000000..c33d406d18 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class SelectFeaturesBasedOnMutualInformationMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + Console.WriteLine("Contents of columns 'Label', 'GroupB' and 'GroupC'."); + foreach (var item in rawData) + Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + // True 4 0 6 7 8 9 + // False 0 5 7 7 9 0 + // True 4 0 6 7 8 9 + // False 0 5 7 7 8 0 + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature + // vector based on highest mutual information between that slot and a specified label. + + // Multi column example : This pipeline uses two columns for transformation. + var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + labelColumnName: "Label", + slotsInOutput: 4); + + var transformedData = pipeline.Fit(data).Transform(data); + + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + foreach (var item in convertedData) + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + + // Here, we see SelectFeaturesBasedOnMutualInformation selected 4 slots. (3 slots from the 'GroupB' column and 1 slot from the 'GroupC' column.) + // 4 0 6 9 + // 0 5 7 0 + // 4 0 6 9 + // 0 5 7 0 + } + + private class TransformedData + { + public float[] GroupB { get; set; } + + public float[] GroupC { get; set; } + } + + public class NumericData + { + public bool Label; + + [VectorType(3)] + public float[] GroupA { get; set; } + + [VectorType(3)] + public float[] GroupB { get; set; } + + [VectorType(3)] + public float[] GroupC { get; set; } + } + + /// + /// Returns a few rows of numeric data. + /// + public static IEnumerable GetData() + { + var data = new List + { + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 0, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 0, 5, 7 }, + GroupC = new float[] { 7, 9, 0 }, + }, + new NumericData + { + Label = true, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 4, 0, 6 }, + GroupC = new float[] { 7, 8, 9 }, + }, + new NumericData + { + Label = false, + GroupA = new float[] { 1, 2, 3 }, + GroupB = new float[] { 0, 5, 7 }, + GroupC = new float[] { 7, 8, 0 }, + } + }; + return data; + } + } +} diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 0cdea6b269..c115836f22 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -24,7 +24,7 @@ public static class FeatureSelectionCatalog /// /// /// /// /// @@ -44,7 +44,7 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// /// /// /// /// @@ -76,7 +76,7 @@ internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this T /// /// /// /// /// @@ -93,7 +93,7 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra /// /// /// /// /// From c4533bca2f7955a90b5fe15925a1c3e37bc47844 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 4 Apr 2019 18:44:14 +0000 Subject: [PATCH 3/6] fix review comments --- .../SelectFeaturesBasedOnCount.cs | 100 +++++++++-------- .../SelectFeaturesBasedOnCountMultiColumn.cs | 106 ++++++------------ .../SelectFeaturesBasedOnMutualInformation.cs | 35 ++---- ...uresBasedOnMutualInformationMultiColumn.cs | 42 +++---- 4 files changed, 117 insertions(+), 166 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs index a1cad6af94..43db2df365 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class SelectFeaturesBasedOnCount { @@ -16,92 +17,93 @@ public static void Example() var rawData = GetData(); var data = mlContext.Data.LoadFromEnumerable(rawData); - Console.WriteLine("Contents of column 'GroupB'"); - PrintDataColumn(data, "GroupB"); - // 4 NaN 6 - // 4 5 6 - // 4 5 6 - // 4 NaN NaN + var convertedData = mlContext.Data.CreateEnumerable(data, true); - // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); + foreach (var item in convertedData) + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + // 4 NaN 6 A WA Male + // 4 5 6 A Female + // 4 5 6 A NY + // 4 NaN NaN A Male - // We will use the SelectFeaturesBasedOnCount transform estimator, to retain only those slots which have - // at least 'count' non-default values per slot. + // We will use the SelectFeaturesBasedOnCount to retain only those slots which have at least 'count' non-default values per slot. + + // Usage on numeric column. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", count: 3); + outputColumnName: "FeaturesSelectedGroupA", inputColumnName: "GroupA", count: 3); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); - PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); + Console.WriteLine("Contents of column 'FeaturesSelectedGroupA'"); + var featuresSelectedGroupA = transformedData.GetColumn(transformedData.Schema["FeaturesSelectedGroupA"]); + foreach (var row in featuresSelectedGroupA) + { + for (var i = 0; i < row.Length; i++) + Console.Write($"{row[i]}\t"); + Console.WriteLine(); + } // 4 6 // 4 6 // 4 6 // 4 NaN - } - private static void PrintDataColumn(IDataView transformedData, string columnName) - { - var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); + // Usage on text column. + pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( + outputColumnName: "FeaturesSelectedInfo", inputColumnName: "Info", count: 3); + + transformedData = pipeline.Fit(data).Transform(data); - foreach (var row in countSelectColumn) + Console.WriteLine("Contents of column 'FeaturesSelectedInfo'"); + var featuresSelectedInfo = transformedData.GetColumn(transformedData.Schema["FeaturesSelectedInfo"]); + foreach (var row in featuresSelectedInfo) { for (var i = 0; i < row.Length; i++) Console.Write($"{row[i]}\t"); Console.WriteLine(); } + // A Male + // A Female + // A + // A Male } - public class NumericData + public class InputData { - public bool Label; - [VectorType(3)] public float[] GroupA { get; set; } [VectorType(3)] - public float[] GroupB { get; set; } - - [VectorType(3)] - public float[] GroupC { get; set; } + public string[] Info { get; set; } } /// - /// Returns a few rows of numeric data. + /// Return a few rows of data. /// - public static IEnumerable GetData() + public static IEnumerable GetData() { - var data = new List + var data = new List { - new NumericData + new InputData { - Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, float.NaN, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, float.NaN, 6 }, + Info = new string[] { "A", "WA", "Male"} }, - new NumericData + new InputData { - Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 5, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 5, 6 }, + Info = new string[] { "A", "", "Female"} }, - new NumericData + new InputData { - Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 5, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 5, 6 }, + Info = new string[] { "A", "NY", null} }, - new NumericData + new InputData { - Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, float.NaN, float.NaN }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, float.NaN, float.NaN }, + Info = new string[] { "A", null, "Male"} } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs index 0b73a6d0fd..871fe9531e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class SelectFeaturesBasedOnCountMultiColumn { @@ -14,112 +15,79 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - var data = mlContext.Data.LoadFromEnumerable(rawData); - - Console.WriteLine("Contents of column 'GroupB'"); - PrintDataColumn(data, "GroupB"); - // 4 NaN 6 - // 4 5 6 - // 4 5 6 - // 4 NaN NaN - Console.WriteLine("Contents of column 'GroupC'"); - PrintDataColumn(data, "GroupC"); - // NaN 8 9 - // NaN 8 9 - // NaN 8 9 - // 7 8 9 + Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); + foreach (var item in rawData) + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + // 4 NaN 6 A WA Male + // 4 5 6 A Female + // 4 5 6 A NY + // 4 NaN NaN A Male - // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. + var data = mlContext.Data.LoadFromEnumerable(rawData); // We will use the SelectFeaturesBasedOnCount transform estimator, to retain only those slots which have // at least 'count' non-default values per slot. - // Multi column example : This pipeline uses two columns for transformation + // Multi column example. This pipeline transform two columns using the provided parameters. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupA"), new InputOutputColumnPair("Info") }, count: 3); var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); - // 4 6 8 9 - // 4 6 8 9 - // 4 6 8 9 - // 4 NaN 8 9 - } - - private static void PrintDataColumn(IDataView transformedData, string columnName) - { - var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); - - foreach (var row in countSelectColumn) - { - for (var i = 0; i < row.Length; i++) - Console.Write($"{row[i]}\t"); - Console.WriteLine(); - } + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + // 4 6 A Male + // 4 6 A Female + // 4 6 A + // 4 NaN A Male } private class TransformedData { - public float[] GroupB { get; set; } + public float[] GroupA { get; set; } - public float[] GroupC { get; set; } + public string[] Info { get; set; } } - public class NumericData + public class InputData { - public bool Label; - [VectorType(3)] public float[] GroupA { get; set; } [VectorType(3)] - public float[] GroupB { get; set; } - - [VectorType(3)] - public float[] GroupC { get; set; } + public string[] Info { get; set; } } /// - /// Returns a few rows of numeric data. + /// Returns a few rows of data. /// - public static IEnumerable GetData() + public static IEnumerable GetData() { - var data = new List + var data = new List { - new NumericData + new InputData { - Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, float.NaN, 6 }, - GroupC = new float[] { float.NaN, 8, 9 }, + GroupA = new float[] { 4, float.NaN, 6 }, + Info = new string[] { "A", "WA", "Male"} }, - new NumericData + new InputData { - Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 5, 6 }, - GroupC = new float[] { float.NaN, 8, 9 }, + GroupA = new float[] { 4, 5, 6 }, + Info = new string[] { "A", "", "Female"} }, - new NumericData + new InputData { - Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 5, 6 }, - GroupC = new float[] { float.NaN, 8, 9 }, + GroupA = new float[] { 4, 5, 6 }, + Info = new string[] { "A", "NY", null} }, - new NumericData + new InputData { - Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, float.NaN, float.NaN }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, float.NaN, float.NaN }, + Info = new string[] { "A", null, "Male"} } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs index a4b1f4d6b4..9967c94142 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class SelectFeaturesBasedOnMutualInformation { @@ -15,9 +16,9 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of two columns 'Label' and 'GroupB'."); + Console.WriteLine("Contents of two columns 'Label' and 'GroupA'."); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.GroupB)); + Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.GroupA)); // True 4 0 6 // False 0 5 7 // True 4 0 6 @@ -29,14 +30,14 @@ public static void Example() // vector based on highest mutual information between that slot and a specified label. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesSelectedGroupB", inputColumnName: "GroupB", labelColumnName: "Label", + outputColumnName: "FeaturesSelectedGroupA", inputColumnName: "GroupA", labelColumnName: "Label", slotsInOutput:2); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'FeaturesSelectedGroupB'"); - PrintDataColumn(transformedData, "FeaturesSelectedGroupB"); + Console.WriteLine("Contents of column 'FeaturesSelectedGroupA'"); + PrintDataColumn(transformedData, "FeaturesSelectedGroupA"); // 4 0 // 0 5 // 4 0 @@ -61,12 +62,6 @@ public class NumericData [VectorType(3)] public float[] GroupA { get; set; } - - [VectorType(3)] - public float[] GroupB { get; set; } - - [VectorType(3)] - public float[] GroupC { get; set; } } /// @@ -79,30 +74,22 @@ public static IEnumerable GetData() new NumericData { Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 0, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 0, 6 }, }, new NumericData { Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 0, 5, 7 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 0, 5, 7 }, }, new NumericData { Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 0, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 0, 6 }, }, new NumericData { Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 0, 5, 7 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 0, 5, 7 }, } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs index c33d406d18..71ac19b207 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class SelectFeaturesBasedOnMutualInformationMultiColumn { @@ -15,9 +16,9 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of columns 'Label', 'GroupB' and 'GroupC'."); + Console.WriteLine("Contents of columns 'Label', 'GroupA' and 'GroupB'."); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.GroupA), string.Join(" ", item.GroupB)); // True 4 0 6 7 8 9 // False 0 5 7 7 9 0 // True 4 0 6 7 8 9 @@ -28,18 +29,18 @@ public static void Example() // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature // vector based on highest mutual information between that slot and a specified label. - // Multi column example : This pipeline uses two columns for transformation. + // Multi column example : This pipeline transform two columns using the provided parameters. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupB"), new InputOutputColumnPair("GroupC") }, + new InputOutputColumnPair[] { new InputOutputColumnPair("GroupA"), new InputOutputColumnPair("GroupB") }, labelColumnName: "Label", slotsInOutput: 4); var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'GroupB' and 'GroupC'."); + Console.WriteLine("Contents of two columns 'GroupA' and 'GroupB'."); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupB), string.Join(" ", item.GroupC)); + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupA), string.Join(" ", item.GroupB)); // Here, we see SelectFeaturesBasedOnMutualInformation selected 4 slots. (3 slots from the 'GroupB' column and 1 slot from the 'GroupC' column.) // 4 0 6 9 @@ -50,9 +51,9 @@ public static void Example() private class TransformedData { - public float[] GroupB { get; set; } + public float[] GroupA { get; set; } - public float[] GroupC { get; set; } + public float[] GroupB { get; set; } } public class NumericData @@ -64,9 +65,6 @@ public class NumericData [VectorType(3)] public float[] GroupB { get; set; } - - [VectorType(3)] - public float[] GroupC { get; set; } } /// @@ -79,30 +77,26 @@ public static IEnumerable GetData() new NumericData { Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 0, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 0, 6 }, + GroupB = new float[] { 7, 8, 9 }, }, new NumericData { Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 0, 5, 7 }, - GroupC = new float[] { 7, 9, 0 }, + GroupA = new float[] { 0, 5, 7 }, + GroupB = new float[] { 7, 9, 0 }, }, new NumericData { Label = true, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 4, 0, 6 }, - GroupC = new float[] { 7, 8, 9 }, + GroupA = new float[] { 4, 0, 6 }, + GroupB = new float[] { 7, 8, 9 }, }, new NumericData { Label = false, - GroupA = new float[] { 1, 2, 3 }, - GroupB = new float[] { 0, 5, 7 }, - GroupC = new float[] { 7, 8, 0 }, + GroupA = new float[] { 0, 5, 7 }, + GroupB = new float[] { 7, 8, 0 }, } }; return data; From d44c6a2833c1c030185ff2ffd4196926f8e08d2c Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 4 Apr 2019 20:51:38 +0000 Subject: [PATCH 4/6] review comments --- .../SelectFeaturesBasedOnCount.cs | 36 +++++++++---------- .../SelectFeaturesBasedOnCountMultiColumn.cs | 34 +++++++++--------- .../SelectFeaturesBasedOnMutualInformation.cs | 20 +++++------ ...uresBasedOnMutualInformationMultiColumn.cs | 36 +++++++++---------- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs index 43db2df365..ea40853c54 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -19,9 +19,9 @@ public static void Example() var convertedData = mlContext.Data.CreateEnumerable(data, true); - Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); + Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); // 4 NaN 6 A WA Male // 4 5 6 A Female // 4 5 6 A NY @@ -31,13 +31,13 @@ public static void Example() // Usage on numeric column. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesSelectedGroupA", inputColumnName: "GroupA", count: 3); + outputColumnName: "NumericVector", count: 3); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'FeaturesSelectedGroupA'"); - var featuresSelectedGroupA = transformedData.GetColumn(transformedData.Schema["FeaturesSelectedGroupA"]); + Console.WriteLine("Contents of column 'NumericVector'"); + var featuresSelectedGroupA = transformedData.GetColumn(transformedData.Schema["NumericVector"]); foreach (var row in featuresSelectedGroupA) { for (var i = 0; i < row.Length; i++) @@ -51,12 +51,12 @@ public static void Example() // Usage on text column. pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "FeaturesSelectedInfo", inputColumnName: "Info", count: 3); + outputColumnName: "StringVector", count: 3); transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'FeaturesSelectedInfo'"); - var featuresSelectedInfo = transformedData.GetColumn(transformedData.Schema["FeaturesSelectedInfo"]); + Console.WriteLine("Contents of column 'StringVector'"); + var featuresSelectedInfo = transformedData.GetColumn(transformedData.Schema["StringVector"]); foreach (var row in featuresSelectedInfo) { for (var i = 0; i < row.Length; i++) @@ -72,10 +72,10 @@ public static void Example() public class InputData { [VectorType(3)] - public float[] GroupA { get; set; } + public float[] NumericVector { get; set; } [VectorType(3)] - public string[] Info { get; set; } + public string[] StringVector { get; set; } } /// @@ -87,23 +87,23 @@ public static IEnumerable GetData() { new InputData { - GroupA = new float[] { 4, float.NaN, 6 }, - Info = new string[] { "A", "WA", "Male"} + NumericVector = new float[] { 4, float.NaN, 6 }, + StringVector = new string[] { "A", "WA", "Male"} }, new InputData { - GroupA = new float[] { 4, 5, 6 }, - Info = new string[] { "A", "", "Female"} + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "", "Female"} }, new InputData { - GroupA = new float[] { 4, 5, 6 }, - Info = new string[] { "A", "NY", null} + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "NY", null} }, new InputData { - GroupA = new float[] { 4, float.NaN, float.NaN }, - Info = new string[] { "A", null, "Male"} + NumericVector = new float[] { 4, float.NaN, float.NaN }, + StringVector = new string[] { "A", null, "Male"} } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs index 871fe9531e..a1e217b089 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs @@ -16,9 +16,9 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); + Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); foreach (var item in rawData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); // 4 NaN 6 A WA Male // 4 5 6 A Female // 4 5 6 A NY @@ -31,15 +31,15 @@ public static void Example() // Multi column example. This pipeline transform two columns using the provided parameters. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupA"), new InputOutputColumnPair("Info") }, + new InputOutputColumnPair[] { new InputOutputColumnPair("NumericVector"), new InputOutputColumnPair("StringVector") }, count: 3); var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'GroupA' and 'Info'."); + Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.GroupA), string.Join("\t", item.Info)); + Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); // 4 6 A Male // 4 6 A Female // 4 6 A @@ -48,18 +48,18 @@ public static void Example() private class TransformedData { - public float[] GroupA { get; set; } + public float[] NumericVector { get; set; } - public string[] Info { get; set; } + public string[] StringVector { get; set; } } public class InputData { [VectorType(3)] - public float[] GroupA { get; set; } + public float[] NumericVector { get; set; } [VectorType(3)] - public string[] Info { get; set; } + public string[] StringVector { get; set; } } /// @@ -71,23 +71,23 @@ public static IEnumerable GetData() { new InputData { - GroupA = new float[] { 4, float.NaN, 6 }, - Info = new string[] { "A", "WA", "Male"} + NumericVector = new float[] { 4, float.NaN, 6 }, + StringVector = new string[] { "A", "WA", "Male"} }, new InputData { - GroupA = new float[] { 4, 5, 6 }, - Info = new string[] { "A", "", "Female"} + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "", "Female"} }, new InputData { - GroupA = new float[] { 4, 5, 6 }, - Info = new string[] { "A", "NY", null} + NumericVector = new float[] { 4, 5, 6 }, + StringVector = new string[] { "A", "NY", null} }, new InputData { - GroupA = new float[] { 4, float.NaN, float.NaN }, - Info = new string[] { "A", null, "Male"} + NumericVector = new float[] { 4, float.NaN, float.NaN }, + StringVector = new string[] { "A", null, "Male"} } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs index 9967c94142..7937eed0a5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -16,9 +16,9 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of two columns 'Label' and 'GroupA'."); + Console.WriteLine("Contents of two columns 'Label' and 'NumericVector'."); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.GroupA)); + Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.NumericVector)); // True 4 0 6 // False 0 5 7 // True 4 0 6 @@ -30,14 +30,14 @@ public static void Example() // vector based on highest mutual information between that slot and a specified label. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesSelectedGroupA", inputColumnName: "GroupA", labelColumnName: "Label", + outputColumnName: "NumericVector", labelColumnName: "Label", slotsInOutput:2); // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'FeaturesSelectedGroupA'"); - PrintDataColumn(transformedData, "FeaturesSelectedGroupA"); + Console.WriteLine("Contents of column 'NumericVector'"); + PrintDataColumn(transformedData, "NumericVector"); // 4 0 // 0 5 // 4 0 @@ -61,7 +61,7 @@ public class NumericData public bool Label; [VectorType(3)] - public float[] GroupA { get; set; } + public float[] NumericVector { get; set; } } /// @@ -74,22 +74,22 @@ public static IEnumerable GetData() new NumericData { Label = true, - GroupA = new float[] { 4, 0, 6 }, + NumericVector = new float[] { 4, 6, 0 }, }, new NumericData { Label = false, - GroupA = new float[] { 0, 5, 7 }, + NumericVector = new float[] { 0, 7, 5 }, }, new NumericData { Label = true, - GroupA = new float[] { 4, 0, 6 }, + NumericVector = new float[] { 4, 6, 0 }, }, new NumericData { Label = false, - GroupA = new float[] { 0, 5, 7 }, + NumericVector = new float[] { 0, 7, 5 }, } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs index 71ac19b207..0679cec9b7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs @@ -16,9 +16,9 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of columns 'Label', 'GroupA' and 'GroupB'."); + Console.WriteLine("Contents of columns 'Label', 'NumericVectorA' and 'NumericVectorB'."); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.GroupA), string.Join(" ", item.GroupB)); + Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.NumericVectorA), string.Join(" ", item.NumericVectorB)); // True 4 0 6 7 8 9 // False 0 5 7 7 9 0 // True 4 0 6 7 8 9 @@ -31,18 +31,18 @@ public static void Example() // Multi column example : This pipeline transform two columns using the provided parameters. var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - new InputOutputColumnPair[] { new InputOutputColumnPair("GroupA"), new InputOutputColumnPair("GroupB") }, + new InputOutputColumnPair[] { new InputOutputColumnPair("NumericVectorA"), new InputOutputColumnPair("NumericVectorB") }, labelColumnName: "Label", slotsInOutput: 4); var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'GroupA' and 'GroupB'."); + Console.WriteLine("Contents of two columns 'NumericVectorA' and 'NumericVectorB'."); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.GroupA), string.Join(" ", item.GroupB)); + Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.NumericVectorA), string.Join(" ", item.NumericVectorB)); - // Here, we see SelectFeaturesBasedOnMutualInformation selected 4 slots. (3 slots from the 'GroupB' column and 1 slot from the 'GroupC' column.) + // Here, we see SelectFeaturesBasedOnMutualInformation selected 4 slots. // 4 0 6 9 // 0 5 7 0 // 4 0 6 9 @@ -51,9 +51,9 @@ public static void Example() private class TransformedData { - public float[] GroupA { get; set; } + public float[] NumericVectorA { get; set; } - public float[] GroupB { get; set; } + public float[] NumericVectorB { get; set; } } public class NumericData @@ -61,10 +61,10 @@ public class NumericData public bool Label; [VectorType(3)] - public float[] GroupA { get; set; } + public float[] NumericVectorA { get; set; } [VectorType(3)] - public float[] GroupB { get; set; } + public float[] NumericVectorB { get; set; } } /// @@ -77,26 +77,26 @@ public static IEnumerable GetData() new NumericData { Label = true, - GroupA = new float[] { 4, 0, 6 }, - GroupB = new float[] { 7, 8, 9 }, + NumericVectorA = new float[] { 4, 0, 6 }, + NumericVectorB = new float[] { 7, 8, 9 }, }, new NumericData { Label = false, - GroupA = new float[] { 0, 5, 7 }, - GroupB = new float[] { 7, 9, 0 }, + NumericVectorA = new float[] { 0, 5, 7 }, + NumericVectorB = new float[] { 7, 9, 0 }, }, new NumericData { Label = true, - GroupA = new float[] { 4, 0, 6 }, - GroupB = new float[] { 7, 8, 9 }, + NumericVectorA = new float[] { 4, 0, 6 }, + NumericVectorB = new float[] { 7, 8, 9 }, }, new NumericData { Label = false, - GroupA = new float[] { 0, 5, 7 }, - GroupB = new float[] { 7, 8, 0 }, + NumericVectorA = new float[] { 0, 5, 7 }, + NumericVectorB = new float[] { 7, 8, 0 }, } }; return data; From 51b85078e23932da8b953057486249dcd5e704e9 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 4 Apr 2019 23:32:39 +0000 Subject: [PATCH 5/6] take care of review comments --- .../SelectFeaturesBasedOnCount.cs | 72 ++++++++----------- .../SelectFeaturesBasedOnCountMultiColumn.cs | 31 ++++---- .../SelectFeaturesBasedOnMutualInformation.cs | 44 ++++++------ ...uresBasedOnMutualInformationMultiColumn.cs | 31 ++++---- 4 files changed, 89 insertions(+), 89 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs index ea40853c54..8b27d23e0d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCount.cs @@ -15,58 +15,46 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - var data = mlContext.Data.LoadFromEnumerable(rawData); - var convertedData = mlContext.Data.CreateEnumerable(data, true); + // Printing the columns of the input data. + Console.WriteLine($"NumericVector StringVector"); + foreach (var item in rawData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,NaN,6 A,WA,Male + // 4,5,6 A,,Female + // 4,5,6 A,NY, + // 4,NaN,NaN A,,Male - Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); - foreach (var item in convertedData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); - // 4 NaN 6 A WA Male - // 4 5 6 A Female - // 4 5 6 A NY - // 4 NaN NaN A Male + var data = mlContext.Data.LoadFromEnumerable(rawData); // We will use the SelectFeaturesBasedOnCount to retain only those slots which have at least 'count' non-default values per slot. + var pipeline = + mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(outputColumnName: "NumericVector", count: 3) // Usage on numeric column. + .Append(mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(outputColumnName: "StringVector", count: 3)); // Usage on text column. - // Usage on numeric column. - var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "NumericVector", count: 3); - - // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'NumericVector'"); - var featuresSelectedGroupA = transformedData.GetColumn(transformedData.Schema["NumericVector"]); - foreach (var row in featuresSelectedGroupA) - { - for (var i = 0; i < row.Length; i++) - Console.Write($"{row[i]}\t"); - Console.WriteLine(); - } - // 4 6 - // 4 6 - // 4 6 - // 4 NaN + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVector StringVector"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); - // Usage on text column. - pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( - outputColumnName: "StringVector", count: 3); + // NumericVector StringVector + // 4,6 A,Male + // 4,6 A,Female + // 4,6 A, + // 4,NaN A,Male + } - transformedData = pipeline.Fit(data).Transform(data); + public class TransformedData + { + public float[] NumericVector { get; set; } - Console.WriteLine("Contents of column 'StringVector'"); - var featuresSelectedInfo = transformedData.GetColumn(transformedData.Schema["StringVector"]); - foreach (var row in featuresSelectedInfo) - { - for (var i = 0; i < row.Length; i++) - Console.Write($"{row[i]}\t"); - Console.WriteLine(); - } - // A Male - // A Female - // A - // A Male + public string[] StringVector { get; set; } } public class InputData diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs index a1e217b089..ae45fa375e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnCountMultiColumn.cs @@ -16,13 +16,16 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); + // Printing the columns of the input data. + Console.WriteLine($"NumericVector StringVector"); foreach (var item in rawData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); - // 4 NaN 6 A WA Male - // 4 5 6 A Female - // 4 5 6 A NY - // 4 NaN NaN A Male + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,NaN,6 A,WA,Male + // 4,5,6 A,,Female + // 4,5,6 A,NY, + // 4,NaN,NaN A,,Male var data = mlContext.Data.LoadFromEnumerable(rawData); @@ -37,13 +40,17 @@ public static void Example() var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'NumericVector' and 'StringVector'."); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVector StringVector"); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t\t{1}", string.Join("\t", item.NumericVector), string.Join("\t", item.StringVector)); - // 4 6 A Male - // 4 6 A Female - // 4 6 A - // 4 NaN A Male + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVector), string.Join(",", item.StringVector)); + + // NumericVector StringVector + // 4,6 A,Male + // 4,6 A,Female + // 4,6 A, + // 4,NaN A,Male } private class TransformedData diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs index 7937eed0a5..0b9232e5e6 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -16,9 +16,11 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of two columns 'Label' and 'NumericVector'."); + // Printing the columns of the input data. + Console.WriteLine($"Label NumericVector"); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}", item.Label, string.Join(" ", item.NumericVector)); + Console.WriteLine("{0,-25} {1,-25}", item.Label, string.Join(",", item.NumericVector)); + // True 4 0 6 // False 0 5 7 // True 4 0 6 @@ -28,7 +30,6 @@ public static void Example() // We define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature // vector based on highest mutual information between that slot and a specified label. - var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( outputColumnName: "NumericVector", labelColumnName: "Label", slotsInOutput:2); @@ -36,24 +37,23 @@ public static void Example() // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. var transformedData = pipeline.Fit(data).Transform(data); - Console.WriteLine("Contents of column 'NumericVector'"); - PrintDataColumn(transformedData, "NumericVector"); - // 4 0 - // 0 5 - // 4 0 - // 0 5 + var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVector"); + foreach (var item in convertedData) + Console.WriteLine("{0,-25}", string.Join(",", item.NumericVector)); + + // NumericVector + // 4,0 + // 0,5 + // 4,0 + // 0,5 } - private static void PrintDataColumn(IDataView transformedData, string columnName) + public class TransformedData { - var countSelectColumn = transformedData.GetColumn(transformedData.Schema[columnName]); - - foreach (var row in countSelectColumn) - { - for (var i = 0; i < row.Length; i++) - Console.Write($"{row[i]} "); - Console.WriteLine(); - } + public float[] NumericVector { get; set; } } public class NumericData @@ -74,22 +74,22 @@ public static IEnumerable GetData() new NumericData { Label = true, - NumericVector = new float[] { 4, 6, 0 }, + NumericVector = new float[] { 4, 0, 6 }, }, new NumericData { Label = false, - NumericVector = new float[] { 0, 7, 5 }, + NumericVector = new float[] { 0, 5, 7 }, }, new NumericData { Label = true, - NumericVector = new float[] { 4, 6, 0 }, + NumericVector = new float[] { 4, 0, 6 }, }, new NumericData { Label = false, - NumericVector = new float[] { 0, 7, 5 }, + NumericVector = new float[] { 0, 5, 7 }, } }; return data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs index 0679cec9b7..d9543cee7e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformationMultiColumn.cs @@ -16,13 +16,16 @@ public static void Example() // Get a small dataset as an IEnumerable and convert it to an IDataView. var rawData = GetData(); - Console.WriteLine("Contents of columns 'Label', 'NumericVectorA' and 'NumericVectorB'."); + // Printing the columns of the input data. + Console.WriteLine($"NumericVectorA NumericVectorB"); foreach (var item in rawData) - Console.WriteLine("{0}\t\t{1}\t\t{2}", item.Label, string.Join(" ", item.NumericVectorA), string.Join(" ", item.NumericVectorB)); - // True 4 0 6 7 8 9 - // False 0 5 7 7 9 0 - // True 4 0 6 7 8 9 - // False 0 5 7 7 8 0 + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVectorA), string.Join(",", item.NumericVectorB)); + + // NumericVectorA NumericVectorB + // 4,0,6 7,8,9 + // 0,5,7 7,9,0 + // 4,0,6 7,8,9 + // 0,5,7 7,8,0 var data = mlContext.Data.LoadFromEnumerable(rawData); @@ -38,15 +41,17 @@ public static void Example() var transformedData = pipeline.Fit(data).Transform(data); var convertedData = mlContext.Data.CreateEnumerable(transformedData, true); - Console.WriteLine("Contents of two columns 'NumericVectorA' and 'NumericVectorB'."); + + // Printing the columns of the transformed data. + Console.WriteLine($"NumericVectorA NumericVectorB"); foreach (var item in convertedData) - Console.WriteLine("{0}\t\t{1}", string.Join(" ", item.NumericVectorA), string.Join(" ", item.NumericVectorB)); + Console.WriteLine("{0,-25} {1,-25}", string.Join(",", item.NumericVectorA), string.Join(",", item.NumericVectorB)); - // Here, we see SelectFeaturesBasedOnMutualInformation selected 4 slots. - // 4 0 6 9 - // 0 5 7 0 - // 4 0 6 9 - // 0 5 7 0 + // NumericVectorA NumericVectorB + // 4,0,6 9 + // 0,5,7 0 + // 4,0,6 9 + // 0,5,7 0 } private class TransformedData From cb09d87a86e83f03369e058668cba581d8a9b249 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 4 Apr 2019 23:40:04 +0000 Subject: [PATCH 6/6] fix copy paste output error --- .../SelectFeaturesBasedOnMutualInformation.cs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs index 0b9232e5e6..d7aa805ceb 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/FeatureSelection/SelectFeaturesBasedOnMutualInformation.cs @@ -21,10 +21,11 @@ public static void Example() foreach (var item in rawData) Console.WriteLine("{0,-25} {1,-25}", item.Label, string.Join(",", item.NumericVector)); - // True 4 0 6 - // False 0 5 7 - // True 4 0 6 - // False 0 5 7 + // Label NumericVector + // True 4,0,6 + // False 0,5,7 + // True 4,0,6 + // False 0,5,7 var data = mlContext.Data.LoadFromEnumerable(rawData);