Skip to content

Commit 9d79ab3

Browse files
authored
Samples for categorical transform estimators (#3179)
* categorical transform estimators * review comments * fix review comments * modify samples namespace
1 parent 24645ff commit 9d79ab3

File tree

5 files changed

+267
-42
lines changed

5 files changed

+267
-42
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs

Lines changed: 34 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
using System;
22
using System.Collections.Generic;
3-
using System.Linq;
3+
using Microsoft.ML;
44
using Microsoft.ML.Data;
55
using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
66

7-
namespace Microsoft.ML.Samples.Dynamic
7+
namespace Samples.Dynamic
88
{
99
public static class OneHotEncoding
1010
{
@@ -17,53 +17,39 @@ public static void Example()
1717
// Get a small dataset as an IEnumerable.
1818
var samples = new List<DataPoint>()
1919
{
20-
new DataPoint(){ Label = 0, Education = "0-5yrs" },
21-
new DataPoint(){ Label = 1, Education = "0-5yrs" },
22-
new DataPoint(){ Label = 45, Education = "6-11yrs" },
23-
new DataPoint(){ Label = 50, Education = "6-11yrs" },
24-
new DataPoint(){ Label = 50, Education = "11-15yrs" },
20+
new DataPoint(){ Education = "0-5yrs" },
21+
new DataPoint(){ Education = "0-5yrs" },
22+
new DataPoint(){ Education = "6-11yrs" },
23+
new DataPoint(){ Education = "6-11yrs" },
24+
new DataPoint(){ Education = "11-15yrs" },
2525
};
2626

2727
// Convert training data to IDataView.
28-
var trainData = mlContext.Data.LoadFromEnumerable(samples);
28+
var data = mlContext.Data.LoadFromEnumerable(samples);
2929

3030
// A pipeline for one hot encoding the Education column.
31-
var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag);
32-
// Fit to data.
33-
var bagTransformer = bagPipeline.Fit(trainData);
31+
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education");
3432

35-
// Get transformed data
36-
var bagTransformedData = bagTransformer.Transform(trainData);
37-
// Getting the data of the newly created column, so we can preview it.
38-
var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded");
33+
// Fit and transform the data.
34+
var oneHotEncodedData = pipeline.Fit(data).Transform(data);
3935

36+
PrintDataColumn(oneHotEncodedData, "EducationOneHotEncoded");
37+
// We have 3 slots, because there are three categories in the 'Education' column.
38+
// 1 0 0
39+
// 1 0 0
40+
// 0 1 0
41+
// 0 1 0
42+
// 0 0 1
43+
44+
// A pipeline for one hot encoding the Education column (using keying).
4045
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
41-
// Fit to data.
42-
var keyTransformer = keyPipeline.Fit(trainData);
4346

44-
// Get transformed data
45-
var keyTransformedData = keyTransformer.Transform(trainData);
46-
// Getting the data of the newly created column, so we can preview it.
47-
var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded");
47+
// Fit and Transform data.
48+
oneHotEncodedData = keyPipeline.Fit(data).Transform(data);
4849

49-
Console.WriteLine("One Hot Encoding based on the bagging strategy.");
50-
foreach (var row in bagEncodedColumn)
51-
{
52-
for (var i = 0; i < row.Length; i++)
53-
Console.Write($"{row[i]} ");
54-
}
55-
56-
// data column obtained post-transformation.
57-
// Since there are only two categories in the Education column of the trainData, the output vector
58-
// for one hot will have two slots.
59-
//
60-
// 0 0 0
61-
// 0 0 0
62-
// 0 0 1
63-
// 0 0 1
64-
// 0 1 0
50+
var keyEncodedColumn = oneHotEncodedData.GetColumn<uint>("EducationOneHotEncoded");
6551

66-
Console.WriteLine("One Hot Encoding with key type output.");
52+
Console.WriteLine("One Hot Encoding of single column 'Education', with key type output.");
6753
foreach (var element in keyEncodedColumn)
6854
Console.WriteLine(element);
6955

@@ -72,13 +58,20 @@ public static void Example()
7258
// 2
7359
// 2
7460
// 3
75-
7661
}
62+
private static void PrintDataColumn(IDataView transformedData, string columnName)
63+
{
64+
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
7765

66+
foreach (var row in countSelectColumn)
67+
{
68+
for (var i = 0; i < row.Length; i++)
69+
Console.Write($"{row[i]}\t");
70+
Console.WriteLine();
71+
}
72+
}
7873
private class DataPoint
7974
{
80-
public float Label { get; set; }
81-
8275
public string Education { get; set; }
8376
}
8477
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML;
4+
5+
namespace Samples.Dynamic
6+
{
7+
public static class OneHotEncodingMultiColumn
8+
{
9+
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Get a small dataset as an IEnumerable.
16+
var samples = new List<DataPoint>()
17+
{
18+
new DataPoint(){ Education = "0-5yrs", ZipCode = "98005" },
19+
new DataPoint(){ Education = "0-5yrs", ZipCode = "98052" },
20+
new DataPoint(){ Education = "6-11yrs", ZipCode = "98005" },
21+
new DataPoint(){ Education = "6-11yrs", ZipCode = "98052" },
22+
new DataPoint(){ Education = "11-15yrs", ZipCode = "98005" },
23+
};
24+
25+
// Convert training data to IDataView.
26+
var data = mlContext.Data.LoadFromEnumerable(samples);
27+
28+
// Multi column example : A pipeline for one hot encoding two columns 'Education' and 'ZipCode'
29+
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotEncoding(
30+
new InputOutputColumnPair[] {
31+
new InputOutputColumnPair("Education"),
32+
new InputOutputColumnPair("ZipCode"),
33+
});
34+
35+
// Fit and Transform data.
36+
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
37+
38+
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
39+
40+
Console.WriteLine("One Hot Encoding of two columns 'Education' and 'ZipCode'.");
41+
foreach (var item in convertedData)
42+
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
43+
44+
// 1 0 0 1 0
45+
// 1 0 0 0 1
46+
// 0 1 0 1 0
47+
// 0 1 0 0 1
48+
// 0 0 1 1 0
49+
}
50+
51+
private class DataPoint
52+
{
53+
public string Education { get; set; }
54+
55+
public string ZipCode { get; set; }
56+
}
57+
58+
private class TransformedData
59+
{
60+
public float[] Education { get; set; }
61+
62+
public float[] ZipCode { get; set; }
63+
}
64+
}
65+
}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML;
4+
using Microsoft.ML.Data;
5+
using Microsoft.ML.Transforms;
6+
7+
namespace Samples.Dynamic
8+
{
9+
public static class OneHotHashEncoding
10+
{
11+
public static void Example()
12+
{
13+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
14+
// as well as the source of randomness.
15+
var mlContext = new MLContext();
16+
17+
// Get a small dataset as an IEnumerable.
18+
var samples = new List<DataPoint>()
19+
{
20+
new DataPoint(){ Education = "0-5yrs" },
21+
new DataPoint(){ Education = "0-5yrs" },
22+
new DataPoint(){ Education = "6-11yrs" },
23+
new DataPoint(){ Education = "6-11yrs" },
24+
new DataPoint(){ Education = "11-15yrs" },
25+
};
26+
27+
// Convert training data to IDataView.
28+
var data = mlContext.Data.LoadFromEnumerable(samples);
29+
30+
// A pipeline for one hot hash encoding the 'Education' column.
31+
var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education", numberOfBits: 3);
32+
33+
// Fit and transform the data.
34+
var hashEncodedData = pipeline.Fit(data).Transform(data);
35+
36+
PrintDataColumn(hashEncodedData, "EducationOneHotHashEncoded");
37+
// We have 8 slots, because we used numberOfBits = 3 (2^3 = 8).
38+
39+
// 0 0 0 1 0 0 0 0
40+
// 0 0 0 1 0 0 0 0
41+
// 0 0 0 0 1 0 0 0
42+
// 0 0 0 0 1 0 0 0
43+
// 0 0 0 0 0 0 0 1
44+
45+
// A pipeline for one hot hash encoding the 'Education' column (using keying strategy).
46+
var keyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education",
47+
outputKind: OneHotEncodingEstimator.OutputKind.Key,
48+
numberOfBits: 3);
49+
50+
// Fit and transform the data.
51+
var hashKeyEncodedData = keyPipeline.Fit(data).Transform(data);
52+
53+
// Getting the data of the newly created column, so we can preview it.
54+
var keyEncodedColumn = hashKeyEncodedData.GetColumn<uint>("EducationOneHotHashEncoded");
55+
56+
Console.WriteLine("One Hot Hash Encoding of single column 'Education', with key type output.");
57+
foreach (var element in keyEncodedColumn)
58+
Console.WriteLine(element);
59+
60+
// 4
61+
// 4
62+
// 5
63+
// 5
64+
// 8
65+
}
66+
67+
private static void PrintDataColumn(IDataView transformedData, string columnName)
68+
{
69+
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
70+
71+
foreach (var row in countSelectColumn)
72+
{
73+
for (var i = 0; i < row.Length; i++)
74+
Console.Write($"{row[i]}\t");
75+
Console.WriteLine();
76+
}
77+
}
78+
79+
private class DataPoint
80+
{
81+
public string Education { get; set; }
82+
}
83+
}
84+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML;
4+
5+
namespace Samples.Dynamic
6+
{
7+
public static class OneHotHashEncodingMultiColumn
8+
{
9+
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Get a small dataset as an IEnumerable.
16+
var samples = new List<DataPoint>()
17+
{
18+
new DataPoint(){ Education = "0-5yrs", ZipCode = "98005" },
19+
new DataPoint(){ Education = "0-5yrs", ZipCode = "98052" },
20+
new DataPoint(){ Education = "6-11yrs", ZipCode = "98005" },
21+
new DataPoint(){ Education = "6-11yrs", ZipCode = "98052" },
22+
new DataPoint(){ Education = "11-15yrs", ZipCode = "98005" },
23+
};
24+
25+
// Convert training data to IDataView.
26+
var data = mlContext.Data.LoadFromEnumerable(samples);
27+
28+
// Multi column example : A pipeline for one hot hash encoding two columns 'Education' and 'ZipCode'
29+
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
30+
new InputOutputColumnPair[] { new InputOutputColumnPair("Education"), new InputOutputColumnPair("ZipCode") },
31+
numberOfBits: 3);
32+
33+
// Fit and Transform the data.
34+
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
35+
36+
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
37+
38+
Console.WriteLine("One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.");
39+
foreach (var item in convertedData)
40+
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
41+
42+
// We have 8 slots per column, because we used numberOfBits = 3 (2^3 = 8).
43+
44+
// 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
45+
// 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
46+
// 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
47+
// 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
48+
// 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
49+
}
50+
51+
private class DataPoint
52+
{
53+
public string Education { get; set; }
54+
55+
public string ZipCode { get; set; }
56+
}
57+
58+
private class TransformedData
59+
{
60+
public float[] Education { get; set; }
61+
62+
public float[] ZipCode { get; set; }
63+
}
64+
}
65+
}

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public static class CategoricalCatalog
2929
/// <example>
3030
/// <format type="text/markdown">
3131
/// <![CDATA[
32-
/// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
32+
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
3333
/// ]]></format>
3434
/// </example>
3535
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
@@ -53,6 +53,12 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
5353
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
5454
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
5555
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
56+
/// <example>
57+
/// <format type="text/markdown">
58+
/// <![CDATA[
59+
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncodingMultiColumn.cs)]
60+
/// ]]></format>
61+
/// </example>
5662
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
5763
InputOutputColumnPair[] columns,
5864
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
@@ -103,6 +109,12 @@ internal static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Ca
103109
/// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
104110
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
105111
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
112+
/// <example>
113+
/// <format type="text/markdown">
114+
/// <![CDATA[
115+
/// [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncoding.cs)]
116+
/// ]]></format>
117+
/// </example>
106118
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
107119
string outputColumnName,
108120
string inputColumnName = null,
@@ -127,6 +139,12 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
127139
/// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
128140
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
129141
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
142+
/// <example>
143+
/// <format type="text/markdown">
144+
/// <![CDATA[
145+
/// [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncodingMultiColumn.cs)]
146+
/// ]]></format>
147+
/// </example>
130148
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
131149
InputOutputColumnPair[] columns,
132150
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,

0 commit comments

Comments
 (0)