diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/Hash.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/Hash.cs
new file mode 100644
index 0000000000..5b472f88a3
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/Hash.cs
@@ -0,0 +1,95 @@
+using System;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    // This example demonstrates hashing of categorical string and integer data types.
+    public static class Hash
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext(seed: 1);
+
+            // Get a small dataset as an IEnumerable.
+            var rawData = new[] {
+                new DataPoint() { Category = "MLB" , Age = 18 },
+                new DataPoint() { Category = "NFL" , Age = 14 },
+                new DataPoint() { Category = "NFL" , Age = 15 },
+                new DataPoint() { Category = "MLB" , Age = 18 },
+                new DataPoint() { Category = "MLS" , Age = 14 },
+            };
+
+            var data = mlContext.Data.LoadFromEnumerable(rawData);
+
+            // Construct the pipeline that would hash the two columns and store the results in new columns.
+            // The first transform hashes the string column and the second transform hashes the integer column.
+            //
+            // Hashing is not a reversible operation, so there is no way to retrieve the original value from the hashed value.
+            // Sometimes, for debugging, or model explainability, users will need to know what values in the original columns generated
+            // the values in the hashed columns, since the algorithms will mostly use the hashed values for further computations.
+            // The Hash method will preserve the mapping from the original values to the hashed values in the Annotations of the
+            // newly created column (column populated with the hashed values).
+            //
+            // Setting the maximumNumberOfInverts parameter to -1 will preserve the full map.
+            // If that parameter is left to the default 0 value, the mapping is not preserved.
+            var pipeline = mlContext.Transforms.Conversion.Hash("CategoryHashed", "Category", numberOfBits: 16, maximumNumberOfInverts: -1)
+                .Append(mlContext.Transforms.Conversion.Hash("AgeHashed", "Age", numberOfBits: 8));
+
+            // Let's fit our pipeline, and then apply it to the same data.
+            var transformer = pipeline.Fit(data);
+            var transformedData = transformer.Transform(data);
+
+            // Convert the post transformation from the IDataView format to an IEnumerable<TransformedDataPoint> for easy consumption.
+            var convertedData = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformedData, true);
+
+            Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
+            foreach (var item in convertedData)
+                Console.WriteLine($"{item.Category}\t {item.CategoryHashed}\t\t {item.Age}\t {item.AgeHashed}");
+
+            // Expected data after the transformation.
+            //
+            // Category CategoryHashed   Age   AgeHashed
+            // MLB      36206            18    127
+            // NFL      19015            14    62
+            // NFL      19015            15    43
+            // MLB      36206            18    127
+            // MLS      6013             14    62
+
+            // For the Category column, where we set the maximumNumberOfInverts parameter, the names of the original categories,
+            // and their correspondence with the generated hash values is preserved in the Annotations in the format of indices and values.
+            // the indices array will have the hashed values, and the corresponding element, position-wise, in the values array will
+            // contain the original value.
+            //
+            // See below for an example on how to retrieve the mapping.
+            var slotNames = new VBuffer<ReadOnlyMemory<char>>();
+            transformedData.Schema["CategoryHashed"].Annotations.GetValue("KeyValues", ref slotNames);
+
+            var indices = slotNames.GetIndices();
+            var categoryNames = slotNames.GetValues();
+
+            for (int i = 0; i < indices.Length; i++)
+                Console.WriteLine($"The original value of the {indices[i]} category is {categoryNames[i]}");
+
+            // Output Data
+            //
+            // The original value of the 6012 category is MLS
+            // The original value of the 19014 category is NFL
+            // The original value of the 36205 category is MLB
+        }
+
+        private class DataPoint
+        {
+            public string Category;
+            public uint Age;
+        }
+
+        private class TransformedDataPoint : DataPoint
+        {
+            public uint CategoryHashed;
+            public uint AgeHashed;
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
index a316dd1abf..3409c7a085 100644
--- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
+++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -3,6 +3,7 @@
   <PropertyGroup>
     <TargetFramework>netcoreapp2.1</TargetFramework>
     <OutputType>Exe</OutputType>
+    <NoWarn>649</NoWarn>
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
index 2aa61a1b14..f84e2f6e11 100644
--- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
+++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
@@ -27,6 +27,13 @@ public static class ConversionsExtensionsCatalog
         /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one.
         /// <paramref name="maximumNumberOfInverts"/> Specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
         /// 0 does not retain any input values. -1 retains all input values mapping to each hash.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[Hash](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/Hash.cs)]
+        /// ]]></format>
+        /// </example>
+
         public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null,
             int numberOfBits = HashDefaults.NumberOfBits, int maximumNumberOfInverts = HashDefaults.MaximumNumberOfInverts)
             => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, numberOfBits, maximumNumberOfInverts);