Skip to content

Commit a4f9ee8

Browse files
committed
review comments
1 parent d646f2e commit a4f9ee8

File tree

5 files changed

+165
-89
lines changed

5 files changed

+165
-89
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs

Lines changed: 17 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -27,24 +27,13 @@ public static void Example()
2727
var data = mlContext.Data.LoadFromEnumerable(samples);
2828

2929
// A pipeline for one hot encoding the Education column.
30-
var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education");
31-
32-
// Fit and transform the data.
33-
var oneHotEncodedData = bagPipeline.Fit(data).Transform(data);
34-
35-
// Getting the data of the newly created column, so we can preview it.
36-
var encodedDataColumn = oneHotEncodedData.GetColumn<float[]>("EducationOneHotEncoded");
30+
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education");
3731

38-
Console.WriteLine("One Hot Encoding of single column 'Education'");
39-
foreach (var row in encodedDataColumn)
40-
{
41-
for (var i = 0; i < row.Length; i++)
42-
Console.Write($"{row[i]} ");
43-
Console.WriteLine();
44-
}
32+
// Fit and transform the data.
33+
var oneHotEncodedData = pipeline.Fit(data).Transform(data);
4534

35+
PrintDataColumn(oneHotEncodedData, "EducationOneHotEncoded");
4636
// We have 3 slots, because there are three categories in the 'Education' column.
47-
4837
// 1 0 0
4938
// 1 0 0
5039
// 0 1 0
@@ -53,12 +42,11 @@ public static void Example()
5342

5443
// A pipeline for one hot encoding the Education column (using keying).
5544
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
56-
45+
5746
// Fit and Transform data.
58-
var keyTransformer = keyPipeline.Fit(data).Transform(data);
47+
oneHotEncodedData = keyPipeline.Fit(data).Transform(data);
5948

60-
// Getting the data of the newly created column, so we can preview it.
61-
var keyEncodedColumn = keyTransformer.GetColumn<uint>("EducationOneHotEncoded");
49+
var keyEncodedColumn = oneHotEncodedData.GetColumn<uint>("EducationOneHotEncoded");
6250

6351
Console.WriteLine("One Hot Encoding of single column 'Education', with key type output.");
6452
foreach (var element in keyEncodedColumn)
@@ -69,30 +57,18 @@ public static void Example()
6957
// 2
7058
// 2
7159
// 3
72-
73-
// Multi column example : A pipeline for one hot encoding two columns 'Education' and 'ZipCode'
74-
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotEncoding(
75-
new InputOutputColumnPair[] {
76-
new InputOutputColumnPair("Education"),
77-
new InputOutputColumnPair("ZipCode"),
78-
});
79-
80-
// Fit and Transform data.
81-
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
82-
83-
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
84-
85-
Console.WriteLine("One Hot Encoding of two columns 'Education' and 'ZipCode'.");
86-
foreach (var item in convertedData)
87-
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
88-
89-
// 1 0 0 1 0
90-
// 1 0 0 0 1
91-
// 0 1 0 1 0
92-
// 0 1 0 0 1
93-
// 0 0 1 1 0
9460
}
61+
private static void PrintDataColumn(IDataView transformedData, string columnName)
62+
{
63+
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
9564

65+
foreach (var row in countSelectColumn)
66+
{
67+
for (var i = 0; i < row.Length; i++)
68+
Console.Write($"{row[i]}\t");
69+
Console.WriteLine();
70+
}
71+
}
9672
private class DataPoint
9773
{
9874
public float Label { get; set; }
@@ -101,12 +77,5 @@ private class DataPoint
10177

10278
public string ZipCode { get; set; }
10379
}
104-
105-
private class TransformedData
106-
{
107-
public float[] Education { get; set; }
108-
109-
public float[] ZipCode { get; set; }
110-
}
11180
}
11281
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
namespace Microsoft.ML.Samples.Dynamic
5+
{
6+
public static class OneHotEncodingMultiColumn
7+
{
8+
public static void Example()
9+
{
10+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
11+
// as well as the source of randomness.
12+
var mlContext = new MLContext();
13+
14+
// Get a small dataset as an IEnumerable.
15+
var samples = new List<DataPoint>()
16+
{
17+
new DataPoint(){ Label = 0, Education = "0-5yrs", ZipCode = "98005" },
18+
new DataPoint(){ Label = 1, Education = "0-5yrs", ZipCode = "98052" },
19+
new DataPoint(){ Label = 45, Education = "6-11yrs", ZipCode = "98005" },
20+
new DataPoint(){ Label = 50, Education = "6-11yrs", ZipCode = "98052" },
21+
new DataPoint(){ Label = 50, Education = "11-15yrs", ZipCode = "98005" },
22+
};
23+
24+
// Convert training data to IDataView.
25+
var data = mlContext.Data.LoadFromEnumerable(samples);
26+
27+
// Multi column example : A pipeline for one hot encoding two columns 'Education' and 'ZipCode'
28+
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotEncoding(
29+
new InputOutputColumnPair[] {
30+
new InputOutputColumnPair("Education"),
31+
new InputOutputColumnPair("ZipCode"),
32+
});
33+
34+
// Fit and Transform data.
35+
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
36+
37+
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
38+
39+
Console.WriteLine("One Hot Encoding of two columns 'Education' and 'ZipCode'.");
40+
foreach (var item in convertedData)
41+
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
42+
43+
// 1 0 0 1 0
44+
// 1 0 0 0 1
45+
// 0 1 0 1 0
46+
// 0 1 0 0 1
47+
// 0 0 1 1 0
48+
}
49+
50+
private class DataPoint
51+
{
52+
public float Label { get; set; }
53+
54+
public string Education { get; set; }
55+
56+
public string ZipCode { get; set; }
57+
}
58+
59+
private class TransformedData
60+
{
61+
public float[] Education { get; set; }
62+
63+
public float[] ZipCode { get; set; }
64+
}
65+
}
66+
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncoding.cs

Lines changed: 14 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,22 +27,12 @@ public static void Example()
2727
var data = mlContext.Data.LoadFromEnumerable(samples);
2828

2929
// A pipeline for one hot hash encoding the 'Education' column.
30-
var bagPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education", numberOfBits: 3);
30+
var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education", numberOfBits: 3);
3131

3232
// Fit and transform the data.
33-
var hashEncodedData = bagPipeline.Fit(data).Transform(data);
34-
35-
// Getting the data of the newly created column, so we can preview it.
36-
var encodedColumn = hashEncodedData.GetColumn<float[]>("EducationOneHotHashEncoded");
37-
38-
Console.WriteLine("One Hot Hash Encoding of single column 'Education', based on the bagging strategy.");
39-
foreach (var row in encodedColumn)
40-
{
41-
for (var i = 0; i < row.Length; i++)
42-
Console.Write($"{row[i]} ");
43-
Console.WriteLine();
44-
}
33+
var hashEncodedData = pipeline.Fit(data).Transform(data);
4534

35+
PrintDataColumn(hashEncodedData, "EducationOneHotHashEncoded");
4636
// We have 8 slots, because we used numberOfBits = 3.
4737

4838
// 0 0 0 1 0 0 0 0
@@ -51,7 +41,7 @@ public static void Example()
5141
// 0 0 0 0 1 0 0 0
5242
// 0 0 0 0 0 0 0 1
5343

54-
// A pipeline for one hot hash encoding the Education column (using keying strategy).
44+
// A pipeline for one hot hash encoding the 'Education' column (using keying strategy).
5545
var keyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education",
5646
outputKind: OneHotEncodingEstimator.OutputKind.Key,
5747
numberOfBits: 3);
@@ -71,26 +61,18 @@ public static void Example()
7161
// 5
7262
// 5
7363
// 8
64+
}
7465

75-
// Multi column example : A pipeline for one hot has encoding two columns 'Education' and 'ZipCode'
76-
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
77-
new InputOutputColumnPair[] { new InputOutputColumnPair("Education"), new InputOutputColumnPair("ZipCode") },
78-
numberOfBits: 3);
79-
80-
// Fit and Transform the data.
81-
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
82-
83-
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
84-
85-
Console.WriteLine("One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.");
86-
foreach (var item in convertedData)
87-
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
66+
private static void PrintDataColumn(IDataView transformedData, string columnName)
67+
{
68+
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
8869

89-
// 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
90-
// 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
91-
// 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
92-
// 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
93-
// 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
70+
foreach (var row in countSelectColumn)
71+
{
72+
for (var i = 0; i < row.Length; i++)
73+
Console.Write($"{row[i]}\t");
74+
Console.WriteLine();
75+
}
9476
}
9577

9678
private class DataPoint
@@ -101,12 +83,5 @@ private class DataPoint
10183

10284
public string ZipCode { get; set; }
10385
}
104-
105-
private class TransformedData
106-
{
107-
public float[] Education { get; set; }
108-
109-
public float[] ZipCode { get; set; }
110-
}
11186
}
11287
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
namespace Microsoft.ML.Samples.Dynamic
5+
{
6+
public static class OneHotHashEncodingMultiColumn
7+
{
8+
public static void Example()
9+
{
10+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
11+
// as well as the source of randomness.
12+
var mlContext = new MLContext();
13+
14+
// Get a small dataset as an IEnumerable.
15+
var samples = new List<DataPoint>()
16+
{
17+
new DataPoint(){ Label = 0, Education = "0-5yrs", ZipCode = "98005" },
18+
new DataPoint(){ Label = 1, Education = "0-5yrs", ZipCode = "98052" },
19+
new DataPoint(){ Label = 45, Education = "6-11yrs", ZipCode = "98005" },
20+
new DataPoint(){ Label = 50, Education = "6-11yrs", ZipCode = "98052" },
21+
new DataPoint(){ Label = 50, Education = "11-15yrs", ZipCode = "98005" },
22+
};
23+
24+
// Convert training data to IDataView.
25+
var data = mlContext.Data.LoadFromEnumerable(samples);
26+
27+
// Multi column example : A pipeline for one hot hash encoding two columns 'Education' and 'ZipCode'
28+
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
29+
new InputOutputColumnPair[] { new InputOutputColumnPair("Education"), new InputOutputColumnPair("ZipCode") },
30+
numberOfBits: 3);
31+
32+
// Fit and Transform the data.
33+
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
34+
35+
var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
36+
37+
Console.WriteLine("One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.");
38+
foreach (var item in convertedData)
39+
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
40+
41+
// We have 8 slots per encoded column (16 values per row below), because we used numberOfBits = 3.
42+
43+
// 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
44+
// 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
45+
// 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
46+
// 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
47+
// 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
48+
}
49+
50+
private class DataPoint
51+
{
52+
public float Label { get; set; }
53+
54+
public string Education { get; set; }
55+
56+
public string ZipCode { get; set; }
57+
}
58+
59+
private class TransformedData
60+
{
61+
public float[] Education { get; set; }
62+
63+
public float[] ZipCode { get; set; }
64+
}
65+
}
66+
}

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
5656
/// <example>
5757
/// <format type="text/markdown">
5858
/// <![CDATA[
59-
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
59+
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncodingMultiColumn.cs)]
6060
/// ]]></format>
6161
/// </example>
6262
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
@@ -142,7 +142,7 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
142142
/// <example>
143143
/// <format type="text/markdown">
144144
/// <![CDATA[
145-
/// [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncoding.cs)]
145+
/// [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncodingMultiColumn.cs)]
146146
/// ]]></format>
147147
/// </example>
148148
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,

0 commit comments

Comments
 (0)