1+ using System ;
2+ using Microsoft . ML . Data ;
3+
4+ namespace Microsoft . ML . Samples . Dynamic
5+ {
/// <summary>
/// Sample demonstrating how to hash categorical string and integer columns, and how to
/// look up the original values behind the hashes through the column annotations.
/// </summary>
public static class Hash
{
    /// <summary>
    /// Loads a five-row in-memory dataset, hashes its Category and Age columns into new
    /// columns, prints the resulting rows, and then prints the hash-to-original-value
    /// mapping that was kept for the Category column.
    /// </summary>
    public static void Example()
    {
        // The ML context is the starting point for ML.NET operations. It can be used for
        // exception tracking and logging, and it is also the source of randomness.
        var context = new MLContext(seed: 1);

        // A small in-memory dataset to run the transforms on.
        var samples = new[]
        {
            new DataPoint { Category = "MLB", Age = 18 },
            new DataPoint { Category = "NFL", Age = 14 },
            new DataPoint { Category = "NFL", Age = 15 },
            new DataPoint { Category = "MLB", Age = 18 },
            new DataPoint { Category = "MLS", Age = 14 },
        };

        var inputView = context.Data.LoadFromEnumerable(samples);

        // Build a pipeline that hashes both columns and writes the results to new columns:
        // the first estimator hashes the string column, the second hashes the integer column.
        //
        // Hashing cannot be reversed, so the original value cannot be retrieved from the
        // hashed value alone. For debugging or model explainability it is sometimes
        // necessary to know which original values produced the values in the hashed
        // columns, because downstream algorithms mostly work with the hashed values.
        // The Hash method can therefore keep the mapping from original values to hashed
        // values in the Annotations of the newly created (hashed) column.
        //
        // Passing -1 for maximumNumberOfInverts keeps the full map. Leaving the parameter
        // at its default of 0 keeps no mapping at all (as is done for AgeHashed below).
        var categoryHasher = context.Transforms.Conversion.Hash(
            "CategoryHashed", "Category", numberOfBits: 16, maximumNumberOfInverts: -1);
        var ageHasher = context.Transforms.Conversion.Hash("AgeHashed", "Age", numberOfBits: 8);
        var pipeline = categoryHasher.Append(ageHasher);

        // Fit the pipeline and apply it to the very same data.
        var hashedView = pipeline.Fit(inputView).Transform(inputView);

        // Turn the transformed IDataView into an IEnumerable<TransformedDataPoint> so it is
        // easy to iterate over and print.
        var rows = context.Data.CreateEnumerable<TransformedDataPoint>(hashedView, true);

        Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
        foreach (var row in rows)
        {
            Console.WriteLine($"{row.Category} \t {row.CategoryHashed} \t \t {row.Age} \t {row.AgeHashed} ");
        }

        // Expected data after the transformation.
        //
        // Category CategoryHashed   Age     AgeHashed
        // MLB      36206            18      127
        // NFL      19015            14      62
        // NFL      19015            15      43
        // MLB      36206            18      127
        // MLS      6013             14      62

        // Because maximumNumberOfInverts was set for the Category column, the original
        // category names and their correspondence with the generated hash values are kept
        // in the column Annotations as a pair of arrays: the indices array holds the
        // hashed values, and the element at the same position in the values array holds
        // the original value.
        //
        // The code below shows how to retrieve that mapping.
        var keyValues = default(VBuffer<ReadOnlyMemory<char>>);
        hashedView.Schema["CategoryHashed"].Annotations.GetValue("KeyValues", ref keyValues);

        var slotIndices = keyValues.GetIndices();
        var originalValues = keyValues.GetValues();

        for (int slot = 0; slot < slotIndices.Length; slot++)
        {
            Console.WriteLine($"The original value of the {slotIndices[slot]} category is {originalValues[slot]} ");
        }

        // Output Data
        //
        // The original value of the 6012 category is MLS
        // The original value of the 19014 category is NFL
        // The original value of the 36205 category is MLB
        //
        // NOTE(review): each index printed here is one less than the matching
        // CategoryHashed value in the table above - presumably the hashed key values are
        // one-based while the annotation indices are zero-based; confirm against the
        // ML.NET key type documentation.
    }

    /// <summary>Input row: a categorical string and an unsigned integer age.</summary>
    private class DataPoint
    {
        // String column that is hashed into CategoryHashed.
        public string Category;

        // Integer column that is hashed into AgeHashed.
        public uint Age;
    }

    /// <summary>Output row: the input columns plus the two hashed columns.</summary>
    private class TransformedDataPoint : DataPoint
    {
        // Hash of Category produced by the first transform in the pipeline.
        public uint CategoryHashed;

        // Hash of Age produced by the second transform in the pipeline.
        public uint AgeHashed;
    }
}
95+ }
0 commit comments