@@ -9,8 +9,8 @@ public static class Hash
99 {
1010 public static void Example ( )
1111 {
12- // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
13- // as well as the source of randomness.
12+ // Create a new ML context, for ML.NET operations. It can be used for
13+ // exception tracking and logging, as well as the source of randomness.
1414 var mlContext = new MLContext ( seed : 1 ) ;
1515
1616 // Get a small dataset as an IEnumerable.
@@ -24,30 +24,40 @@ public static void Example()
2424
2525 var data = mlContext . Data . LoadFromEnumerable ( rawData ) ;
2626
27- // Construct the pipeline that would hash the two columns and store the results in new columns.
28- // The first transform hashes the string column and the second transform hashes the integer column.
27+ // Construct the pipeline that would hash the two columns and store the
28+ // results in new columns. The first transform hashes the string column
29+ // and the second transform hashes the integer column.
2930 //
30- // Hashing is not a reversible operation, so there is no way to retrive the original value from the hashed value.
31- // Sometimes, for debugging, or model explainability, users will need to know what values in the original columns generated
32- // the values in the hashed columns, since the algorithms will mostly use the hashed values for further computations.
33- // The Hash method will preserve the mapping from the original values to the hashed values in the Annotations of the
34- // newly created column (column populated with the hashed values).
35- //
36- // Setting the maximumNumberOfInverts parameters to -1 will preserve the full map.
37- // If that parameter is left to the default 0 value, the mapping is not preserved.
38- var pipeline = mlContext . Transforms . Conversion . Hash ( "CategoryHashed" , "Category" , numberOfBits : 16 , maximumNumberOfInverts : - 1 )
39- . Append ( mlContext . Transforms . Conversion . Hash ( "AgeHashed" , "Age" , numberOfBits : 8 ) ) ;
31+ // Hashing is not a reversible operation, so there is no way to retrieve
32+ // the original value from the hashed value. Sometimes, for debugging,
33+ // or model explainability, users will need to know what values in the
34+ // original columns generated the values in the hashed columns, since
35+ // the algorithms will mostly use the hashed values for further
36+ // computations. The Hash method will preserve the mapping from the
37+ // original values to the hashed values in the Annotations of the newly
38+ // created column (column populated with the hashed values).
39+ //
40+ // Setting the maximumNumberOfInverts parameter to -1 will preserve the
41+ // full map. If that parameter is left at the default value of 0, the
42+ // mapping is not preserved.
43+ var pipeline = mlContext . Transforms . Conversion . Hash ( "CategoryHashed" ,
44+ "Category" , numberOfBits : 16 , maximumNumberOfInverts : - 1 )
45+ . Append ( mlContext . Transforms . Conversion . Hash ( "AgeHashed" , "Age" ,
46+ numberOfBits : 8 ) ) ;
4047
4148 // Let's fit our pipeline, and then apply it to the same data.
4249 var transformer = pipeline . Fit ( data ) ;
4350 var transformedData = transformer . Transform ( data ) ;
4451
45- // Convert the post transformation from the IDataView format to an IEnumerable<TransformedData> for easy consumption.
46- var convertedData = mlContext . Data . CreateEnumerable < TransformedDataPoint > ( transformedData , true ) ;
52+ // Convert the post transformation from the IDataView format to an
53+ // IEnumerable<TransformedDataPoint> for easy consumption.
54+ var convertedData = mlContext . Data . CreateEnumerable <
55+ TransformedDataPoint > ( transformedData , true ) ;
4756
4857 Console . WriteLine ( "Category CategoryHashed\t Age\t AgeHashed" ) ;
4958 foreach ( var item in convertedData )
50- Console . WriteLine ( $ "{ item . Category } \t { item . CategoryHashed } \t \t { item . Age } \t { item . AgeHashed } ") ;
59+ Console . WriteLine ( $ "{ item . Category } \t { item . CategoryHashed } \t \t " +
60+ $ "{ item . Age } \t { item . AgeHashed } ") ;
5161
5262 // Expected data after the transformation.
5363 //
@@ -58,20 +68,24 @@ public static void Example()
5868 // MLB 36206 18 127
5969 // MLS 6013 14 62
6070
61- // For the Category column, where we set the maximumNumberOfInverts parameter, the names of the original categories,
62- // and their correspondance with the generated hash values is preserved in the Annotations in the format of indices and values.
63- // the indices array will have the hashed values, and the corresponding element, position-wise, in the values array will
64- // contain the original value.
71+ // For the Category column, where we set the maximumNumberOfInverts
72+ // parameter, the names of the original categories, and their
73+ // correspondence with the generated hash values are preserved in the
74+ // Annotations in the format of indices and values. The indices array
75+ // will have the hashed values, and the corresponding element,
76+ // position-wise, in the values array will contain the original value.
6577 //
6678 // See below for an example on how to retrieve the mapping.
6779 var slotNames = new VBuffer < ReadOnlyMemory < char > > ( ) ;
68- transformedData . Schema [ "CategoryHashed" ] . Annotations . GetValue ( "KeyValues" , ref slotNames ) ;
80+ transformedData . Schema [ "CategoryHashed" ] . Annotations . GetValue (
81+ "KeyValues" , ref slotNames ) ;
6982
7083 var indices = slotNames . GetIndices ( ) ;
7184 var categoryNames = slotNames . GetValues ( ) ;
7285
7386 for ( int i = 0 ; i < indices . Length ; i ++ )
74- Console . WriteLine ( $ "The original value of the { indices [ i ] } category is { categoryNames [ i ] } ") ;
87+ Console . WriteLine ( $ "The original value of the { indices [ i ] } " +
88+ $ "category is { categoryNames [ i ] } ") ;
7589
7690 // Output Data
7791 //
0 commit comments