From 6207a54ee8c1f983eef0974d5903be1ab52d3233 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 22 Mar 2019 11:36:30 -0700 Subject: [PATCH 1/4] multicolumn mapping for some estimators --- .../ConversionsExtensionsCatalog.cs | 62 ++++++++++++++++++- .../Transforms/ExtensionsCatalog.cs | 26 ++++++++ .../CategoricalCatalog.cs | 48 ++++++++++++++ .../ExtensionsCatalog.cs | 40 ++++++++---- .../FeatureSelectionCatalog.cs | 41 ++++++------ .../Transformers/FeatureSelectionTests.cs | 6 +- .../Transformers/NAIndicatorTests.cs | 24 +++++-- 7 files changed, 208 insertions(+), 39 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 4792dd61d4..d755265ce5 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -65,6 +65,20 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers DataKind outputKind = ConvertDefaults.DefaultOutputKind) => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) }); + /// + /// Changes column type of the input columns. + /// + /// The conversion transform's catalog. + /// Specifies the names of the columns on which to apply the transformation. + /// The expected kind of the output column. + public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, + InputOutputColumnPair[] columns, + DataKind outputKind = ConvertDefaults.DefaultOutputKind) + { + var columnOptions = columns.Select(x => new TypeConvertingEstimator.ColumnOptions(x.OutputColumnName, outputKind, x.InputColumnName)).ToArray(); + return new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + } + /// /// Changes column type of the input column. 
/// @@ -89,6 +103,14 @@ internal static TypeConvertingEstimator ConvertType(this TransformsCatalog.Conve public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null) => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName); + /// + /// Convert the key types back to their original values. + /// + /// The conversion transform's catalog. + /// Specifies the names of the columns on which to apply the transformation. + public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns) + => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + /// /// Convert the key types (name of the column specified in the first item of the tuple) back to their original values /// (named as specified in the second item of the tuple). @@ -127,6 +149,21 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector) => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector); + /// + /// Maps columns of key types or key values into columns of floating point vectors. + /// + /// The conversion transform's catalog. + /// Specifies the names of the columns on which to apply the transformation. + /// Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them. + /// This is only relevant when the input column is a vector of keys. 
+ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, + InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector) + { + var columnOptions = columns.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputCountVector)).ToArray(); + return new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + + } + /// /// Converts value types into . /// @@ -158,7 +195,30 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData); /// - /// Converts value types into , optionally loading the keys to use from . + /// Converts value types into . + /// + /// The conversion transform's catalog. + /// Specifies the names of the columns on which to apply the transformation. + /// Maximum number of keys to keep per column when auto-training. + /// How items should be ordered when vectorized. If choosen they will be in the order encountered. + /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// Whether key value annotations should be text, regardless of the actual input type. + /// The data view containing the terms. If specified, this should be a single column data + /// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined + /// from the input data upon fitting. 
+ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, + InputOutputColumnPair[] columns, + int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys, + ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality, + bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText, + IDataView keyData = null) + { + var columnOptions = columns.Select(x => new ValueToKeyMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText)).ToArray(); + return new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData); + } + + /// + /// Converts value types into , optionally loading the keys to use from . /// /// The conversion transform's catalog. /// The data columns to map to keys. diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index c4b97d0dea..501c35cebe 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -8,6 +8,32 @@ namespace Microsoft.ML { + /// + /// Specifies input and output column names for a transformation. + /// + public sealed class InputOutputColumnPair + { + /// + /// Name of the column to transform. If set to , the value of the will be used as source. + /// + public readonly string InputColumnName; + /// + /// Name of the column resulting from the transformation of . + /// + public readonly string OutputColumnName; + + /// + /// Specifies input and output column names for a transformation. + /// + /// Name of the column resulting from the transformation of . + /// Name of the column to transform. If set to , the value of the will be used as source. 
+ public InputOutputColumnPair(string outputColumnName, string inputColumnName = null) + { + InputColumnName = inputColumnName; + OutputColumnName = outputColumnName; + } + } + /// /// Specifies input and output column names for a transformation. /// diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs index 22ebfe7890..8ac7b6c4a5 100644 --- a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs +++ b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -40,6 +41,28 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate => new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData); + /// + /// Convert text columns into one-hot encoded vectors. + /// + /// The transform catalog + /// Specifies the names of the columns on which to apply the transformation. + /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector. + /// Maximum number of terms to keep per column when auto-training. + /// How items should be ordered when vectorized. If choosen they will be in the order encountered. + /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// Specifies an ordering for the encoding. If specified, this should be a single column data view, + /// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting. 
+ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog, + InputOutputColumnPair[] columns, + OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind, + int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys, + ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality, + IDataView keyData = null) + { + var columnOptions = columns.Select(x => new OneHotEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality)).ToArray(); + return new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData); + } + /// /// Convert several text column into one-hot encoded vectors. /// @@ -88,6 +111,31 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata => new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) }); + /// + /// Convert text columns into hash-based one-hot encoded vector columns. + /// + /// The transform catalog + /// Specifies the names of the columns on which to apply the transformation. + /// The conversion mode. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Hashing seed. + /// Whether the position of each term should be included in the hash. + /// During hashing we constuct mappings between original values and the produced hash values. + /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values. 
-1 retains all input values mapping to each hash. + public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog, + InputOutputColumnPair[] columns, + OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator, + int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits, + uint seed = OneHotHashEncodingEstimator.Defaults.Seed, + bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing, + int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts) + { + var columnOptions = columns.Select(x => new OneHotHashEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)).ToArray(); + return new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + } + /// /// Convert several text column into hash-based one-hot encoded vectors. /// diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index 55659fbcb9..7e4d2a1d99 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -9,17 +10,6 @@ namespace Microsoft.ML { public static class ExtensionsCatalog { - /// - /// Creates a new output column, of boolean type, with the same number of slots as the input column. The value in the output column - /// is true if the value in the input column is missing. - /// - /// The transform extensions' catalog. - /// The names of the input columns of the transformation and the corresponding names for the output columns. 
- [BestFriend] - internal static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, - params ColumnOptions[] columns) - => new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); - /// /// Creates a new output column, or replaces the source with a new column /// (depending on whether the is given a value, or left to null) @@ -41,6 +31,15 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor string inputColumnName = null) => new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName); + /// + /// Creates a new output column, of boolean type, with the same number of slots as the input column. The value in the output column + /// is true if the value in the input column is missing. + /// + /// The transform extensions' catalog. + /// Specifies the names of the columns on which to apply the transformation. + public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns) + => new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + /// /// Creates a new output column, or replaces the source with a new column /// (depending on whether the is given a value, or left to null) @@ -69,6 +68,25 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) => new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new MissingValueReplacingEstimator.ColumnOptions(outputColumnName, inputColumnName, replacementMode, imputeBySlot) }); + /// + /// Creates a new output column, identical to the input column for everything but the missing values. + /// The missing values of the input column, in this new column are replaced with . 
+ /// + /// The transform extensions' catalog. + /// Specifies the names of the columns on which to apply the transformation. + /// The type of replacement to use as specified in + /// If true, per-slot imputation of replacement is performed. + /// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors, + /// where imputation is always for the entire column. + public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, + InputOutputColumnPair[] columns, + MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, + bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) + { + var columnOptions = columns.Select(x => new MissingValueReplacingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, replacementMode, imputeBySlot)).ToArray(); + return new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + } + /// /// Creates a new output column, identical to the input column for everything but the missing values. /// The missing values of the input column, in this new column are replaced with . diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 3c59b738a4..be1dc7a511 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -14,10 +15,11 @@ public static class FeatureSelectionCatalog { /// /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. 
/// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// Specifies the names of the input columns for the transformation, and their respective output column names. /// /// /// /// /// - [BestFriend] - internal static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, + public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, + string outputColumnName, string inputColumnName = null, string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, - int numberOfBins = MutualInfoSelectDefaults.NumBins, - params ColumnOptions[] columns) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins, - ColumnOptions.ConvertToValueTuples(columns)); + int numberOfBins = MutualInfoSelectDefaults.NumBins) + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins); /// /// The transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. + /// Specifies the names of the input columns for the transformation, and their respective output column names. /// The name of the label column. /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. 
/// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, - string outputColumnName, string inputColumnName = null, + InputOutputColumnPair[] columns, string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, int numberOfBins = MutualInfoSelectDefaults.NumBins) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins); + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins, + columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); /// /// The transform's catalog. @@ -87,5 +80,17 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra string inputColumnName = null, long count = CountSelectDefaults.Count) => new CountFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, count); + + /// + /// The transform's catalog. + /// Specifies the names of the columns on which to apply the transformation. + /// If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved. 
+ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog, + InputOutputColumnPair[] columns, + long count = CountSelectDefaults.Count) + { + var columnOptions = columns.Select(x => new CountFeatureSelectingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, count)).ToArray(); + return new CountFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + } } } diff --git a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs index 7bcf688e1c..a7e8c685cf 100644 --- a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs @@ -182,9 +182,9 @@ public void MutualInformationSelectionWorkout() var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label") .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100, - columns: new ColumnOptions[] { - ("out1", "VectorFloat"), - ("out2", "VectorDouble") + columns: new[] { + new InputOutputColumnPair("out1", "VectorFloat"), + new InputOutputColumnPair("out2", "VectorDouble") })); TestEstimatorCore(est, data); diff --git a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs index fe1b0a8d9f..a5bb6f7c1c 100644 --- a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs @@ -44,7 +44,12 @@ public void NAIndicatorWorkout() }; var dataView = ML.Data.LoadFromEnumerable(data); - var pipe = ML.Transforms.IndicateMissingValues(new ColumnOptions[] { ("NAA", "A"), ("NAB", "B"), ("NAC", "C"), ("NAD", "D") }); + var pipe = ML.Transforms.IndicateMissingValues(new[] { + new 
InputOutputColumnPair("NAA", "A"), + new InputOutputColumnPair("NAB", "B"), + new InputOutputColumnPair("NAC", "C"), + new InputOutputColumnPair("NAD", "D") + }); TestEstimatorCore(pipe, dataView); Done(); } @@ -67,7 +72,12 @@ public void TestOldSavingAndLoading() }; var dataView = ML.Data.LoadFromEnumerable(data); - var pipe = ML.Transforms.IndicateMissingValues(new ColumnOptions[] { ("NAA", "A"), ("NAB", "B"), ("NAC", "C"), ("NAD", "D") }); + var pipe = ML.Transforms.IndicateMissingValues(new[] { + new InputOutputColumnPair("NAA", "A"), + new InputOutputColumnPair("NAB", "B"), + new InputOutputColumnPair("NAC", "C"), + new InputOutputColumnPair("NAD", "D") + }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); using (var ms = new MemoryStream()) @@ -92,10 +102,12 @@ public void NAIndicatorFileOutput() var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); - var est = ML.Transforms.IndicateMissingValues(new ColumnOptions[] + var est = ML.Transforms.IndicateMissingValues(new[] { - ("A", "ScalarFloat"), ("B", "ScalarDouble"), - ("C", "VectorFloat"), ("D", "VectorDoulbe") + new InputOutputColumnPair("A", "ScalarFloat"), + new InputOutputColumnPair("B", "ScalarDouble"), + new InputOutputColumnPair("C", "VectorFloat"), + new InputOutputColumnPair("D", "VectorDoulbe") }); TestEstimatorCore(est, data, invalidInput: invalidData); @@ -125,7 +137,7 @@ public void NAIndicatorMetadataTest() var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotEncoding("CatA", "A"); - var newpipe = pipe.Append(ML.Transforms.IndicateMissingValues(("NAA", "CatA"))); + var newpipe = pipe.Append(ML.Transforms.IndicateMissingValues("NAA", "CatA")); var result = newpipe.Fit(dataView).Transform(dataView); 
Assert.True(result.Schema.TryGetColumnIndex("NAA", out var col)); // Check that the column is normalized. From b4c3ea054dc55024390fc8478629f6031922da04 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 22 Mar 2019 13:49:44 -0700 Subject: [PATCH 2/4] review comment --- src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index 501c35cebe..f0de57e403 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -29,7 +29,7 @@ public sealed class InputOutputColumnPair /// Name of the column to transform. If set to , the value of the will be used as source. public InputOutputColumnPair(string outputColumnName, string inputColumnName = null) { - InputColumnName = inputColumnName; + InputColumnName = inputColumnName ?? outputColumnName; OutputColumnName = outputColumnName; } } From 802e4de79822d44a04eefc13c0ac9a4780185e3e Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 22 Mar 2019 15:54:41 -0700 Subject: [PATCH 3/4] merging ML.ColumnOptions and ML.InputOutputColumnPair --- docs/code/MlNetCookBook.md | 2 +- .../ConversionsExtensionsCatalog.cs | 34 +++++------------ .../Transforms/ExtensionsCatalog.cs | 37 ++----------------- .../ExtensionsCatalog.cs | 8 ++-- .../ConversionsCatalog.cs | 4 +- .../NormalizerCatalog.cs | 4 +- .../Text/TextCatalog.cs | 4 +- .../CookbookSamplesDynamicApi.cs | 2 +- ...PlantClassificationWithStringLabelTests.cs | 2 +- .../TensorflowTests.cs | 16 ++++---- .../KeyToBinaryVectorEstimatorTest.cs | 11 ++++-- .../Transformers/NormalizerTests.cs | 20 +++++----- .../Transformers/ValueMappingTests.cs | 6 +-- 13 files changed, 55 insertions(+), 95 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 727ed415e2..f509ebfe57 100644 --- a/docs/code/MlNetCookBook.md 
+++ b/docs/code/MlNetCookBook.md @@ -424,7 +424,7 @@ var pipeline = // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated()) // Apply the inverse conversion from 'PredictedLabel' column back to string value. - .Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Data"))); + .Append(mlContext.Transforms.Conversion.MapKeyToValue("Data", "PredictedLabel")); // Train the model. var model = pipeline.Fit(trainData); diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index d755265ce5..ec61cbed5f 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -111,22 +111,6 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns) => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); - /// - /// Convert the key types (name of the column specified in the first item of the tuple) back to their original values - /// (named as specified in the second item of the tuple). - /// - /// The conversion transform's catalog - /// The pairs of input and output columns. - /// - /// - /// - /// - [BestFriend] - internal static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, params ColumnOptions[] columns) - => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); - /// /// Maps key types or key values into a floating point vector. 
/// @@ -218,7 +202,7 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co } /// - /// Converts value types into , optionally loading the keys to use from . + /// Converts value types into , optionally loading the keys to use from . /// /// The conversion transform's catalog. /// The data columns to map to keys. @@ -292,11 +276,11 @@ public static ValueMappingEstimator MapValue MapValue( this TransformsCatalog.ConversionTransforms catalog, IEnumerable> keyValuePairs, - params ColumnOptions[] columns) + params InputOutputColumnPair[] columns) { var keys = keyValuePairs.Select(pair => pair.Key); var values = keyValuePairs.Select(pair => pair.Value); - return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, ColumnOptions.ConvertToValueTuples(columns)); + return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, InputOutputColumnPair.ConvertToValueTuples(columns)); } /// @@ -320,12 +304,12 @@ internal static ValueMappingEstimator MapValue> keyValuePairs, bool treatValuesAsKeyType, - params ColumnOptions[] columns) + params InputOutputColumnPair[] columns) { var keys = keyValuePairs.Select(pair => pair.Key); var values = keyValuePairs.Select(pair => pair.Value); return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, treatValuesAsKeyType, - ColumnOptions.ConvertToValueTuples(columns)); + InputOutputColumnPair.ConvertToValueTuples(columns)); } /// @@ -381,12 +365,12 @@ public static ValueMappingEstimator MapValue MapValue( this TransformsCatalog.ConversionTransforms catalog, IEnumerable> keyValuePairs, - params ColumnOptions[] columns) + params InputOutputColumnPair[] columns) { var keys = keyValuePairs.Select(pair => pair.Key); var values = keyValuePairs.Select(pair => pair.Value); return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, - ColumnOptions.ConvertToValueTuples(columns)); + 
InputOutputColumnPair.ConvertToValueTuples(columns)); } /// @@ -437,8 +421,8 @@ public static ValueMappingEstimator MapValue( [BestFriend] internal static ValueMappingEstimator MapValue( this TransformsCatalog.ConversionTransforms catalog, - IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params ColumnOptions[] columns) + IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params InputOutputColumnPair[] columns) => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn.Name, valueColumn.Name, - ColumnOptions.ConvertToValueTuples(columns)); + InputOutputColumnPair.ConvertToValueTuples(columns)); } } diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index f0de57e403..d5b1867fde 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -32,40 +32,11 @@ public InputOutputColumnPair(string outputColumnName, string inputColumnName = n InputColumnName = inputColumnName ?? outputColumnName; OutputColumnName = outputColumnName; } - } - - /// - /// Specifies input and output column names for a transformation. - /// - [BestFriend] - internal sealed class ColumnOptions - { - private readonly string _outputColumnName; - private readonly string _inputColumnName; - - /// - /// Specifies input and output column names for a transformation. - /// - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. - public ColumnOptions(string outputColumnName, string inputColumnName = null) - { - _outputColumnName = outputColumnName; - _inputColumnName = inputColumnName ?? outputColumnName; - } - - /// - /// Instantiates a from a tuple of input and output column names. 
- /// - public static implicit operator ColumnOptions((string outputColumnName, string inputColumnName) value) - { - return new ColumnOptions(value.outputColumnName, value.inputColumnName); - } [BestFriend] - internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(ColumnOptions[] infos) + internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(InputOutputColumnPair[] infos) { - return infos.Select(info => (info._outputColumnName, info._inputColumnName)).ToArray(); + return infos.Select(info => (info.OutputColumnName, info.InputColumnName)).ToArray(); } } @@ -104,8 +75,8 @@ public static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, /// /// [BestFriend] - internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params ColumnOptions[] columns) - => new ColumnCopyingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); + internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params InputOutputColumnPair[] columns) + => new ColumnCopyingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// Concatenates columns together. 
diff --git a/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs b/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs index ea203359a5..af6b4e89b8 100644 --- a/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs @@ -32,8 +32,8 @@ public static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalo /// ]]> /// [BestFriend] - internal static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalog catalog, params ColumnOptions[] columns) - => new ImageGrayscalingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); + internal static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalog catalog, params InputOutputColumnPair[] columns) + => new ImageGrayscalingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// Loads the images from the into memory. @@ -80,8 +80,8 @@ public static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, s /// ]]> /// [BestFriend] - internal static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, string imageFolder, params ColumnOptions[] columns) - => new ImageLoadingEstimator(CatalogUtils.GetEnvironment(catalog), imageFolder, ColumnOptions.ConvertToValueTuples(columns)); + internal static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, string imageFolder, params InputOutputColumnPair[] columns) + => new ImageLoadingEstimator(CatalogUtils.GetEnvironment(catalog), imageFolder, InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// The transform's catalog. 
diff --git a/src/Microsoft.ML.Transforms/ConversionsCatalog.cs b/src/Microsoft.ML.Transforms/ConversionsCatalog.cs index 406cef8d2d..7409fc591b 100644 --- a/src/Microsoft.ML.Transforms/ConversionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ConversionsCatalog.cs @@ -20,8 +20,8 @@ public static class ConversionsCatalog /// Specifies the output and input columns on which the transformation should be applied. [BestFriend] internal static KeyToBinaryVectorMappingEstimator MapKeyToBinaryVector(this TransformsCatalog.ConversionTransforms catalog, - params ColumnOptions[] columns) - => new KeyToBinaryVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); + params InputOutputColumnPair[] columns) + => new KeyToBinaryVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// Convert the key types back to binary vector. diff --git a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs index 657e58bf32..a71e3834a7 100644 --- a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs @@ -43,8 +43,8 @@ public static NormalizingEstimator Normalize(this TransformsCatalog catalog, [BestFriend] internal static NormalizingEstimator Normalize(this TransformsCatalog catalog, NormalizingEstimator.NormalizationMode mode, - params ColumnOptions[] columns) - => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), mode, ColumnOptions.ConvertToValueTuples(columns)); + params InputOutputColumnPair[] columns) + => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), mode, InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// Normalize (rescale) columns according to specified custom parameters. 
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 4aa28da763..bf811045ab 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -75,8 +75,8 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this [BestFriend] internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, - params ColumnOptions[] columns) - => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns)); + params InputOutputColumnPair[] columns) + => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, InputOutputColumnPair.ConvertToValueTuples(columns)); /// /// Normalizes incoming text in by changing case, removing diacritical marks, punctuation marks and/or numbers diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index e2a71c27cd..cb5c35793a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -187,7 +187,7 @@ private ITransformer TrainOnIris(string irisDataPath) // [2] -9.709775 float // Apply the inverse conversion from 'PredictedLabel' column back to string value. 
- var finalPipeline = pipeline.Append(mlContext.Transforms.Conversion.MapKeyToValue(("Data", "PredictedLabel"))); + var finalPipeline = pipeline.Append(mlContext.Transforms.Conversion.MapKeyToValue("Data", "PredictedLabel")); dataPreview = finalPipeline.Preview(trainData); return finalPipeline.Fit(trainData); diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index 010a973741..1c05497e55 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -39,7 +39,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest() .AppendCacheCheckpoint(mlContext) .Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated( new SdcaCalibratedMulticlassTrainer.Options { NumberOfThreads = 1 })) - .Append(mlContext.Transforms.Conversion.MapKeyToValue(("Plant", "PredictedLabel"))); + .Append(mlContext.Transforms.Conversion.MapKeyToValue("Plant", "PredictedLabel")); // Train the pipeline var trainedModel = pipe.Fit(trainData); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 20f97d63a8..9cab97c2e0 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -391,7 +391,7 @@ public void TensorFlowTransformInceptionTest() ); var data = reader.Load(new MultiFileSource(dataFile)); - var images = mlContext.Transforms.LoadImages(imageFolder, ("ImageReal", "ImagePath")).Fit(data).Transform(data); + var images = mlContext.Transforms.LoadImages("ImageReal", imageFolder, "ImagePath").Fit(data).Transform(data); var cropped = mlContext.Transforms.ResizeImages("ImageCropped", 224, 224, 
"ImageReal").Fit(images).Transform(images); var pixels = mlContext.Transforms.ExtractPixels(inputName, "ImageCropped", interleavePixelColors: true).Fit(cropped).Transform(cropped); var tf = mlContext.Model.LoadTensorFlowModel(modelLocation).ScoreTensorFlowModel(outputName, inputName, true).Fit(pixels).Transform(pixels); @@ -507,7 +507,7 @@ public void TensorFlowTransformMNISTConvTest() var trainData = reader.Load(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Load(GetDataPath(TestDatasets.mnistOneClass.testFilename)); - var pipe = mlContext.Transforms.CopyColumns(("reshape_input", "Placeholder")) + var pipe = mlContext.Transforms.CopyColumns("reshape_input", "Placeholder") .Append(mlContext.Model.LoadTensorFlowModel("mnist_model/frozen_saved_model.pb").ScoreTensorFlowModel(new[] { "Softmax", "dense/Relu" }, new[] { "Placeholder", "reshape_input" })) .Append(mlContext.Transforms.Concatenate("Features", "Softmax", "dense/Relu")) .Append(mlContext.MulticlassClassification.Trainers.LightGbm("Label", "Features")); @@ -662,7 +662,7 @@ private void ExecuteTFTransformMNISTConvTrainingTest(bool shuffle, int? 
shuffleS preprocessedTestData = testData; } - var pipe = mlContext.Transforms.CopyColumns(("Features", "Placeholder")) + var pipe = mlContext.Transforms.CopyColumns("Features", "Placeholder") .Append(mlContext.Model.LoadTensorFlowModel(modelLocation).RetrainTensorFlowModel( inputColumnNames: new[] { "Features" }, outputColumnNames: new[] { "Prediction" }, @@ -729,7 +729,7 @@ public void TensorFlowTransformMNISTConvSavedModelTest() var trainData = reader.Load(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Load(GetDataPath(TestDatasets.mnistOneClass.testFilename)); - var pipe = mlContext.Transforms.CopyColumns(("reshape_input", "Placeholder")) + var pipe = mlContext.Transforms.CopyColumns("reshape_input", "Placeholder") .Append(mlContext.Model.LoadTensorFlowModel("mnist_model").ScoreTensorFlowModel(new[] { "Softmax", "dense/Relu" }, new[] { "Placeholder", "reshape_input" })) .Append(mlContext.Transforms.Concatenate("Features", new[] { "Softmax", "dense/Relu" })) .Append(mlContext.MulticlassClassification.Trainers.LightGbm("Label", "Features")); @@ -898,7 +898,7 @@ public void TensorFlowTransformCifarSavedModel() new TextLoader.Column("Name", DataKind.String, 1), } ); - var images = mlContext.Transforms.LoadImages(imageFolder, ("ImageReal", "ImagePath")).Fit(data).Transform(data); + var images = mlContext.Transforms.LoadImages("ImageReal", imageFolder, "ImagePath").Fit(data).Transform(data); var cropped = mlContext.Transforms.ResizeImages("ImageCropped", imageWidth, imageHeight, "ImageReal").Fit(images).Transform(images); var pixels = mlContext.Transforms.ExtractPixels("Input", "ImageCropped", interleavePixelColors: true).Fit(cropped).Transform(cropped); IDataView trans = tensorFlowModel.ScoreTensorFlowModel("Output", "Input").Fit(pixels).Transform(pixels); @@ -1000,7 +1000,7 @@ public void TensorFlowSentimentClassificationTest() // The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensoFlow and gets the 
classification scores. var estimator = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text") .Append(mlContext.Transforms.Conversion.MapValue(lookupMap, lookupMap.Schema["Words"], lookupMap.Schema["Ids"], - new ColumnOptions[] { ("Features", "TokenizedWords") })); + new[] { new InputOutputColumnPair("Features", "TokenizedWords") })); var model = estimator.Fit(dataView); var dataPipe = mlContext.Model.CreatePredictionEngine(model); @@ -1008,7 +1008,7 @@ public void TensorFlowSentimentClassificationTest() // c.f. https://github.com/dotnet/machinelearning-testdata/blob/master/Microsoft.ML.TensorFlow.TestModels/sentiment_model/README.md string modelLocation = @"sentiment_model"; var pipelineModel = mlContext.Model.LoadTensorFlowModel(modelLocation).ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }) - .Append(mlContext.Transforms.CopyColumns(("Prediction", "Prediction/Softmax"))) + .Append(mlContext.Transforms.CopyColumns("Prediction", "Prediction/Softmax")) .Fit(dataView); var tfEnginePipe = mlContext.Model.CreatePredictionEngine(pipelineModel); @@ -1052,7 +1052,7 @@ public void TensorFlowStringTest() var dataview = mlContext.Data.CreateTextLoader().Load(new MultiFileSource(null)); var pipeline = tensorFlowModel.ScoreTensorFlowModel(new[] { "Original_A", "Joined_Splited_Text" }, new[] { "A", "B" }) - .Append(mlContext.Transforms.CopyColumns(("AOut", "Original_A"), ("BOut", "Joined_Splited_Text"))); + .Append(mlContext.Transforms.CopyColumns(new[] { new InputOutputColumnPair("AOut", "Original_A"), new InputOutputColumnPair("BOut", "Joined_Splited_Text") })); var transformer = mlContext.Model.CreatePredictionEngine(pipeline.Fit(dataview)); var input = new TextInput diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs index 0b9d1dd190..4dfe700764 100644 --- 
a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs @@ -51,7 +51,7 @@ public void KeyToBinaryVectorWorkout() new ValueToKeyMappingEstimator.ColumnOptions("TermC", "C", addKeyValueAnnotationsAsText:true) }).Fit(dataView).Transform(dataView); - var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(("CatA", "TermA"), ("CatC", "TermC")); + var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(new[] { new InputOutputColumnPair("CatA", "TermA"), new InputOutputColumnPair("CatC", "TermC") }); TestEstimatorCore(pipe, dataView); Done(); } @@ -105,7 +105,12 @@ public void TestMetadataPropagation() var termTransformer = termEst.Fit(dataView); dataView = termTransformer.Transform(dataView); - var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(("CatA", "TA"), ("CatB", "TB"), ("CatC", "TC"), ("CatD", "TD")); + var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(new[] { + new InputOutputColumnPair("CatA", "TA"), + new InputOutputColumnPair("CatB", "TB"), + new InputOutputColumnPair("CatC", "TC"), + new InputOutputColumnPair("CatD", "TD") + }); var result = pipe.Fit(dataView).Transform(dataView); ValidateMetadata(result); @@ -155,7 +160,7 @@ public void TestOldSavingAndLoading() }); var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); - var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(("CatA", "TermA"), ("CatB", "TermB"), ("CatC", "TermC")); + var pipe = ML.Transforms.Conversion.MapKeyToBinaryVector(new[] { new InputOutputColumnPair("CatA", "TermA"), new InputOutputColumnPair("CatB", "TermB"), new InputOutputColumnPair("CatC", "TermC") }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); using (var ms = new MemoryStream()) diff --git a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs index 54cb4da52a..5b0b01abc7 100644 
--- a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs @@ -225,7 +225,7 @@ public void SimpleConstructorsAndExtensions() var est1 = new NormalizingEstimator(Env, "float4"); var est2 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MinMax, ("float4", "float4")); var est3 = new NormalizingEstimator(Env, new NormalizingEstimator.MinMaxColumnOptions("float4")); - var est4 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.MinMax, ("float4", "float4")); + var est4 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MinMax); var est5 = ML.Transforms.Normalize("float4"); var data1 = est1.Fit(data).Transform(data); @@ -246,7 +246,7 @@ public void SimpleConstructorsAndExtensions() // Tests for MeanVariance var est6 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MeanVariance, ("float4", "float4")); var est7 = new NormalizingEstimator(Env, new NormalizingEstimator.MeanVarianceColumnOptions("float4")); - var est8 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.MeanVariance, ("float4", "float4")); + var est8 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MeanVariance); var data6 = est6.Fit(data).Transform(data); var data7 = est7.Fit(data).Transform(data); @@ -259,7 +259,7 @@ public void SimpleConstructorsAndExtensions() // Tests for LogMeanVariance var est9 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.LogMeanVariance, ("float4", "float4")); var est10 = new NormalizingEstimator(Env, new NormalizingEstimator.LogMeanVarianceColumnOptions("float4")); - var est11 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.LogMeanVariance, ("float4", "float4")); + var est11 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.LogMeanVariance); var data9 = est9.Fit(data).Transform(data); var data10 = 
est10.Fit(data).Transform(data); @@ -272,7 +272,7 @@ public void SimpleConstructorsAndExtensions() // Tests for Binning var est12 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.Binning, ("float4", "float4")); var est13 = new NormalizingEstimator(Env, new NormalizingEstimator.BinningColumnOptions("float4")); - var est14 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.Binning, ("float4", "float4")); + var est14 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.Binning); var data12 = est12.Fit(data).Transform(data); var data13 = est13.Fit(data).Transform(data); @@ -285,7 +285,7 @@ public void SimpleConstructorsAndExtensions() // Tests for SupervisedBinning var est15 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.SupervisedBinning, ("float4", "float4")); var est16 = new NormalizingEstimator(Env, new NormalizingEstimator.SupervisedBinningColumOptions("float4")); - var est17 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.SupervisedBinning, ("float4", "float4")); + var est17 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.SupervisedBinning); var data15 = est15.Fit(data).Transform(data); var data16 = est16.Fit(data).Transform(data); @@ -314,11 +314,11 @@ public void NormalizerExperimentalExtensions() var data = loader.Load(dataPath); // Normalizer Extensions - var est1 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.MinMax, ("float4", "float4")); - var est2 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.MeanVariance, ("float4", "float4")); - var est3 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.LogMeanVariance, ("float4", "float4")); - var est4 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.Binning, ("float4", "float4")); - var est5 = ML.Transforms.Normalize(NormalizingEstimator.NormalizationMode.SupervisedBinning, ("float4", "float4")); + var est1 
= ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MinMax); + var est2 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MeanVariance); + var est3 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.LogMeanVariance); + var est4 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.Binning); + var est5 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.SupervisedBinning); // Normalizer Extensions (Experimental) var est6 = ML.Transforms.NormalizeMinMax("float4", "float4"); diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index f6de4553db..090ab7e9d4 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -515,7 +515,7 @@ public void ValueMappingWorkout() }; // Workout on value mapping - var est = ML.Transforms.Conversion.MapValue(keyValuePairs, new ColumnOptions[] { ("D", "A"), ("E", "B"), ("F", "C") }); + var est = ML.Transforms.Conversion.MapValue(keyValuePairs, new[] { new InputOutputColumnPair("D", "A"), new InputOutputColumnPair("E", "B"), new InputOutputColumnPair("F", "C") }); TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView); } @@ -534,7 +534,7 @@ public void ValueMappingValueTypeIsVectorWorkout() }; // Workout on value mapping - var est = ML.Transforms.Conversion.MapValue(keyValuePairs, new ColumnOptions[] { ("D", "A"), ("E", "B"), ("F", "C") }); + var est = ML.Transforms.Conversion.MapValue(keyValuePairs, new[] { new InputOutputColumnPair("D", "A"), new InputOutputColumnPair("E", "B"), new InputOutputColumnPair("F", "C") }); TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView); } @@ -555,7 +555,7 @@ public void ValueMappingInputIsVectorWorkout() }; var est = 
ML.Transforms.Text.TokenizeIntoWords("TokenizeB", "B") - .Append(ML.Transforms.Conversion.MapValue(keyValuePairs, new ColumnOptions[] { ("VecB", "TokenizeB") })); + .Append(ML.Transforms.Conversion.MapValue("VecB", keyValuePairs, "TokenizeB")); TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView); } From 4e142952d11f8739aaa4fcdcbcbffa83b2d91447 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 25 Mar 2019 14:15:54 -0700 Subject: [PATCH 4/4] review comments, adding check for non null --- .../ConversionsExtensionsCatalog.cs | 37 +++++++++++++++---- .../Transforms/ExtensionsCatalog.cs | 12 ++++-- .../ExtensionsCatalog.cs | 13 ++++++- .../CategoricalCatalog.cs | 9 ++++- .../ConversionsCatalog.cs | 7 +++- .../ExtensionsCatalog.cs | 11 +++++- .../FeatureSelectionCatalog.cs | 11 +++++- .../NormalizerCatalog.cs | 7 +++- .../Text/TextCatalog.cs | 6 ++- .../Transformers/CategoricalTests.cs | 6 +-- 10 files changed, 94 insertions(+), 25 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index ec61cbed5f..e8edbc9c9d 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Linq; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -75,8 +76,10 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers InputOutputColumnPair[] columns, DataKind outputKind = ConvertDefaults.DefaultOutputKind) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new TypeConvertingEstimator.ColumnOptions(x.OutputColumnName, outputKind, x.InputColumnName)).ToArray(); - return new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), 
columnOptions); + return new TypeConvertingEstimator(env, columnOptions); } /// @@ -109,7 +112,11 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co /// The conversion transform's catalog. /// Specifies the names of the columns on which to apply the transformation. public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns) - => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new KeyToValueMappingEstimator(env, columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + } /// /// Maps key types or key values into a floating point vector. @@ -143,8 +150,10 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputCountVector)).ToArray(); - return new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + return new KeyToVectorMappingEstimator(env, columnOptions); } @@ -197,8 +206,10 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText, IDataView keyData = null) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new 
ValueToKeyMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText)).ToArray(); - return new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData); + return new ValueToKeyMappingEstimator(env, columnOptions, keyData); } /// @@ -278,9 +289,11 @@ internal static ValueMappingEstimator MapValue> keyValuePairs, params InputOutputColumnPair[] columns) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var keys = keyValuePairs.Select(pair => pair.Key); var values = keyValuePairs.Select(pair => pair.Value); - return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, InputOutputColumnPair.ConvertToValueTuples(columns)); + return new ValueMappingEstimator(env, keys, values, InputOutputColumnPair.ConvertToValueTuples(columns)); } /// @@ -306,9 +319,11 @@ internal static ValueMappingEstimator MapValue pair.Key); var values = keyValuePairs.Select(pair => pair.Value); - return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, treatValuesAsKeyType, + return new ValueMappingEstimator(env, keys, values, treatValuesAsKeyType, InputOutputColumnPair.ConvertToValueTuples(columns)); } @@ -367,9 +382,11 @@ internal static ValueMappingEstimator MapValue> keyValuePairs, params InputOutputColumnPair[] columns) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var keys = keyValuePairs.Select(pair => pair.Key); var values = keyValuePairs.Select(pair => pair.Value); - return new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, + return new ValueMappingEstimator(env, keys, values, InputOutputColumnPair.ConvertToValueTuples(columns)); } @@ -422,7 +439,11 @@ public static ValueMappingEstimator MapValue( internal static ValueMappingEstimator MapValue( this TransformsCatalog.ConversionTransforms catalog, IDataView 
lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params InputOutputColumnPair[] columns) - => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn.Name, valueColumn.Name, + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new ValueMappingEstimator(env, lookupMap, keyColumn.Name, valueColumn.Name, InputOutputColumnPair.ConvertToValueTuples(columns)); + } } } diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index d5b1867fde..2b84be4bc7 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -4,6 +4,7 @@ using System.Linq; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -16,11 +17,11 @@ public sealed class InputOutputColumnPair /// /// Name of the column to transform. If set to , the value of the will be used as source. /// - public readonly string InputColumnName; + public string InputColumnName { get; } /// /// Name of the column resulting from the transformation of . /// - public readonly string OutputColumnName; + public string OutputColumnName { get; } /// /// Specifies input and output column names for a transformation. @@ -29,6 +30,7 @@ public sealed class InputOutputColumnPair /// Name of the column to transform. If set to , the value of the will be used as source. public InputOutputColumnPair(string outputColumnName, string inputColumnName = null) { + Contracts.CheckNonEmpty(outputColumnName, nameof(outputColumnName)); InputColumnName = inputColumnName ?? 
outputColumnName; OutputColumnName = outputColumnName; } @@ -76,7 +78,11 @@ public static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, /// [BestFriend] internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params InputOutputColumnPair[] columns) - => new ColumnCopyingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new ColumnCopyingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// Concatenates columns together. diff --git a/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs b/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs index af6b4e89b8..5c7ab27db5 100644 --- a/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Image; namespace Microsoft.ML @@ -33,7 +34,11 @@ public static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalo /// [BestFriend] internal static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalog catalog, params InputOutputColumnPair[] columns) - => new ImageGrayscalingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new ImageGrayscalingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// Loads the images from the into memory. 
@@ -81,7 +86,11 @@ public static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, s /// [BestFriend] internal static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, string imageFolder, params InputOutputColumnPair[] columns) - => new ImageLoadingEstimator(CatalogUtils.GetEnvironment(catalog), imageFolder, InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new ImageLoadingEstimator(env, imageFolder, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// The transform's catalog. diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs index 8ac7b6c4a5..4697824165 100644 --- a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs +++ b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs @@ -4,6 +4,7 @@ using System.Linq; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -59,8 +60,10 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality, IDataView keyData = null) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new OneHotEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality)).ToArray(); - return new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData); + return new OneHotEncodingEstimator(env, columnOptions, keyData); } /// @@ -132,8 +135,10 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing, int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts) { + 
var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new OneHotHashEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)).ToArray(); - return new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + return new OneHotHashEncodingEstimator(env, columnOptions); } /// diff --git a/src/Microsoft.ML.Transforms/ConversionsCatalog.cs b/src/Microsoft.ML.Transforms/ConversionsCatalog.cs index 7409fc591b..7ea68ff33f 100644 --- a/src/Microsoft.ML.Transforms/ConversionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ConversionsCatalog.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -21,7 +22,11 @@ public static class ConversionsCatalog [BestFriend] internal static KeyToBinaryVectorMappingEstimator MapKeyToBinaryVector(this TransformsCatalog.ConversionTransforms catalog, params InputOutputColumnPair[] columns) - => new KeyToBinaryVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new KeyToBinaryVectorMappingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// Convert the key types back to binary vector. 
diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index 7e4d2a1d99..30685d8067 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -4,6 +4,7 @@ using System.Linq; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -38,7 +39,11 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// The transform extensions' catalog. /// Specifies the names of the columns on which to apply the transformation. public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns) - => new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new MissingValueIndicatorEstimator(env, columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + } /// /// Creates a new output column, or replaces the source with a new column @@ -83,8 +88,10 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new MissingValueReplacingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, replacementMode, imputeBySlot)).ToArray(); - return new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + return new MissingValueReplacingEstimator(env, columnOptions); } /// diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs 
b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index be1dc7a511..1b9d35d251 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -4,6 +4,7 @@ using System.Linq; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -45,8 +46,12 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, int numberOfBins = MutualInfoSelectDefaults.NumBins) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins, + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new MutualInformationFeatureSelectingEstimator(env, labelColumnName, slotsInOutput, numberOfBins, columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); + } /// /// The transform's catalog. 
@@ -89,8 +94,10 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra InputOutputColumnPair[] columns, long count = CountSelectDefaults.Count) { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); var columnOptions = columns.Select(x => new CountFeatureSelectingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, count)).ToArray(); - return new CountFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions); + return new CountFeatureSelectingEstimator(env, columnOptions); } } } diff --git a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs index a71e3834a7..dd12c452c4 100644 --- a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs @@ -1,4 +1,5 @@ using Microsoft.ML.Data; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; namespace Microsoft.ML @@ -44,7 +45,11 @@ public static NormalizingEstimator Normalize(this TransformsCatalog catalog, internal static NormalizingEstimator Normalize(this TransformsCatalog catalog, NormalizingEstimator.NormalizationMode mode, params InputOutputColumnPair[] columns) - => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), mode, InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new NormalizingEstimator(env, mode, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// Normalize (rescale) columns according to specified custom parameters. 
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index bf811045ab..e2baf1578e 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -76,7 +76,11 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, params InputOutputColumnPair[] columns) - => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, InputOutputColumnPair.ConvertToValueTuples(columns)); + { + var env = CatalogUtils.GetEnvironment(catalog); + env.CheckValue(columns, nameof(columns)); + return new TokenizingByCharactersEstimator(env, useMarkerCharacters, InputOutputColumnPair.ConvertToValueTuples(columns)); + } /// /// Normalizes incoming text in by changing case, removing diacritical marks, punctuation marks and/or numbers diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index 5161071a1a..5d27514eb1 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -300,9 +300,9 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{ - new OneHotEncodingEstimator.ColumnOptions("TermA", "A"), - new OneHotEncodingEstimator.ColumnOptions("TermB", "B"), - new OneHotEncodingEstimator.ColumnOptions("TermC", "C") + new InputOutputColumnPair("TermA", "A"), + new InputOutputColumnPair("TermB", "B"), + new InputOutputColumnPair("TermC", "C") }); var result = 
pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);