diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs index 113b4794fb..de98b1ddb0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs @@ -55,7 +55,7 @@ public static void Example() // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information // between features and label. var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumn: "Label", slotsInOutput: 5); + outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5); // Now, we can put the previous two transformations together in a pipeline. var pipeline = countSelectEst.Append(mutualInfoEst); diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index 4e719a4cd9..7d16a7936e 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -10,7 +10,6 @@ using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; -using Microsoft.ML.EntryPoints; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Transforms.FeatureSelection; @@ -54,23 +53,25 @@ public sealed class ColumnOptions public readonly string Name; /// Name of the column to transform. public readonly string InputColumnName; - /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. - public readonly long MinCount; + /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. 
+ public readonly long Count; /// /// Describes the parameters of the feature selection process for a column pair. /// /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. - public ColumnOptions(string name, string inputColumnName = null, long minCount = Defaults.Count) + /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. + + public ColumnOptions(string name, string inputColumnName = null, long count = Defaults.Count) { Name = name; Contracts.CheckValue(Name, nameof(Name)); InputColumnName = inputColumnName ?? name; Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); - MinCount = minCount; + Contracts.CheckParam(count >= 0, nameof(count), "Must be non-negative."); + Count = count; } } @@ -183,7 +184,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); host.CheckUserArg(options.Count > 0, nameof(options.Count)); - var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, minCount: options.Count)).ToArray(); + var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, count: options.Count)).ToArray(); return new CountFeatureSelectingEstimator(env, columnOptions).Fit(input).Transform(input) as IDataTransform; } @@ -206,11 +207,11 @@ private static void CreateDropAndCopyColumns(ColumnOptions[] columnOptions, int selectedCount[i] = 0; for (int j = 0; j < score.Length; j++) { - if (score[j] < columnOptions[i].MinCount) + if (score[j] < columnOptions[i].Count) { // Adjacent slots are combined into a single range. 
int min = j; - while (j < score.Length && score[j] < columnOptions[i].MinCount) + while (j < score.Length && score[j] < columnOptions[i].Count) j++; int max = j - 1; slots.Add((min, max)); diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 5217043a50..32a977be53 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -14,9 +14,9 @@ public static class FeatureSelectionCatalog { /// /// The transform's catalog. - /// Name of the column to use for labels. + /// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. + /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. /// Specifies the names of the input columns for the transformation, and their respective output column names. 
/// /// @@ -26,20 +26,20 @@ public static class FeatureSelectionCatalog /// /// public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, - string labelColumn = MutualInfoSelectDefaults.LabelColumn, + string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, - int numBins = MutualInfoSelectDefaults.NumBins, + int numberOfBins = MutualInfoSelectDefaults.NumBins, params ColumnOptions[] columns) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumn, slotsInOutput, numBins, + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins, ColumnOptions.ConvertToValueTuples(columns)); /// /// The transform's catalog. /// Name of the column resulting from the transformation of . /// Name of column to transform. If set to , the value of the will be used as source. - /// Name of the column to use for labels. + /// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. + /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. 
/// /// /// public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, string outputColumnName, string inputColumnName = null, - string labelColumn = MutualInfoSelectDefaults.LabelColumn, + string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, - int numBins = MutualInfoSelectDefaults.NumBins) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumn, slotsInOutput, numBins); + int numberOfBins = MutualInfoSelectDefaults.NumBins) + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins); /// /// The transform's catalog. diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs index 4c0066e1b9..05d67116a2 100644 --- a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs @@ -66,7 +66,7 @@ internal sealed class Options : TransformInputBase /// The environment to use. /// Name of the column to use for labels. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. + /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. /// Specifies the names of the input columns for the transformation, and their respective output column names. 
/// /// @@ -78,7 +78,7 @@ internal sealed class Options : TransformInputBase internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env, string labelColumn = Defaults.LabelColumn, int slotsInOutput = Defaults.SlotsInOutput, - int numBins = Defaults.NumBins, + int numberOfBins = Defaults.NumBins, params (string outputColumnName, string inputColumnName)[] columns) { Contracts.CheckValue(env, nameof(env)); @@ -87,12 +87,12 @@ internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env, _host.CheckUserArg(Utils.Size(columns) > 0, nameof(columns)); _host.CheckUserArg(slotsInOutput > 0, nameof(slotsInOutput)); _host.CheckNonWhiteSpace(labelColumn, nameof(labelColumn)); - _host.Check(numBins > 1, "numBins must be greater than 1."); + _host.Check(numberOfBins > 1, "numberOfBins must be greater than 1."); _columns = columns; _labelColumn = labelColumn; _slotsInOutput = slotsInOutput; - _numBins = numBins; + _numBins = numberOfBins; } /// diff --git a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs index f5c143d793..55d802bc83 100644 --- a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs @@ -42,7 +42,7 @@ public void FeatureSelectionWorkout() var est = new WordBagEstimator(ML, "bag_of_words", "text") .AppendCacheCheckpoint(ML) .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10) - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumn: "label"))); + .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label"))); var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv"); using (var ch = Env.Start("save")) @@ -115,11 +115,11 @@ public void CountFeatureSelectionWorkout() var data = 
ML.Data.Cache(reader.Load(new MultiFileSource(dataPath)).AsDynamic); var columns = new[] { - new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", minCount: 1), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", minCount: 690), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", minCount: 100), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", minCount: 690), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", minCount: 100) + new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100), + new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690), + new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100) }; var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1) .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns)); @@ -182,8 +182,8 @@ public void MutualInformationSelectionWorkout() var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; - var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label") - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumn: "Label", slotsInOutput: 2, numBins: 100, + var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label") + 
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100, columns: new ColumnOptions[] { ("out1", "VectorFloat"), ("out2", "VectorDouble") @@ -220,7 +220,7 @@ public void TestMutualInformationOldSavingAndLoading() var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic; - var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label"); + var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label"); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);