Skip to content

Commit edfd10f

Browse files
Fixes #3992 and corner case of inputColumnNames on FeaturizeText (#4211)
* Fixed issue #3992 with TextFeaturizer when no inputColumnName is provided, and when 'null' is passed explicitly as inputColumnNames.
* Added Tests.
* Fixed a minor mistake in documentation.
1 parent 862ae84 commit edfd10f

File tree

2 files changed

+94
-2
lines changed

2 files changed

+94
-2
lines changed

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using Microsoft.ML.Data;
6+
using Microsoft.ML.Internal.Utilities;
67
using Microsoft.ML.Runtime;
78
using Microsoft.ML.Transforms.Text;
89

@@ -39,7 +40,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
3940
outputColumnName, inputColumnName);
4041

4142
/// <summary>
42-
/// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into featurized float array that represents normalized counts of n-grams and char-grams.
43+
/// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into featurized vector of <see cref="System.Single"/> that represents normalized counts of n-grams and char-grams.
4344
/// </summary>
4445
/// <remarks>This transform can operate over several columns.</remarks>
4546
/// <param name="catalog">The text-related transform's catalog.</param>
@@ -62,7 +63,8 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
6263
TextFeaturizingEstimator.Options options,
6364
params string[] inputColumnNames)
6465
=> new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
65-
outputColumnName, inputColumnNames, options);
66+
outputColumnName, Utils.Size(inputColumnNames) == 0 ? new[] { outputColumnName } : inputColumnNames,
67+
options);
6668

6769
/// <summary>
6870
/// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes by splitting text into sequences of characters

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ private class TestClass
3333
public float[] Features = null;
3434
}
3535

36+
private class TestClass2 // Row type used by the tests that call FeaturizeText without usable input column names.
37+
{
38+
public string Features; // Raw text to featurize; the tests also use "Features" as the output column name.
39+
public string[] OutputTokens; // Left null by the tests; same name as the OutputTokensColumnName they configure.
40+
}
41+
3642
[Fact]
3743
public void TextFeaturizerWithPredefinedStopWordRemoverTest()
3844
{
@@ -80,6 +86,90 @@ public void TextFeaturizerWithWordFeatureExtractorTest()
8086
Assert.Equal(expected, prediction.Features);
8187
}
8288

89+
[Fact]
90+
public void TextFeaturizerWithWordFeatureExtractorWithNullInputNamesTest() // Checks the featurized output when inputColumnNames is passed explicitly as null.
91+
{
92+
var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
93+
new TestClass2() { Features = "This is another example", OutputTokens=null } };
94+
var dataView = ML.Data.LoadFromEnumerable(data);
95+
96+
var options = new TextFeaturizingEstimator.Options()
97+
{
98+
WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
99+
CharFeatureExtractor = null, // presumably turns off char-gram features so only word features remain -- TODO confirm
100+
Norm = TextFeaturizingEstimator.NormFunction.None, // presumably leaves raw counts, matching the 0/1 expectations below -- TODO confirm
101+
OutputTokensColumnName = "OutputTokens"
102+
};
103+
104+
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, null); // Explicit null for the params inputColumnNames argument; per the TextCatalog change in this commit the output column name is then used as the input column.
105+
dataView = pipeline.Fit(dataView).Transform(dataView);
106+
107+
VBuffer<float> features = default;
108+
float[][] transformed = { null, null }; // One slot per input row (two rows in data).
109+
110+
var expected = new float[][] { // Eight values per row; slots appear to follow first-occurrence order of the distinct words -- TODO confirm.
111+
new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
112+
new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
113+
};
114+
115+
using (var cursor = dataView.GetRowCursor(dataView.Schema))
116+
{
117+
var i = 0;
118+
while (cursor.MoveNext())
119+
{
120+
var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]); // NOTE(review): the getter is re-created for every row; it could be obtained once before the loop.
121+
featureGetter(ref features);
122+
transformed[i] = features.DenseValues().ToArray(); // Copy the row's values out of the reused VBuffer.
123+
i++;
124+
}
125+
}
126+
127+
Assert.Equal(expected[0], transformed[0]);
128+
Assert.Equal(expected[1], transformed[1]);
129+
}
130+
131+
[Fact]
132+
public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames() // Checks the featurized output when the inputColumnNames argument is omitted entirely.
133+
{
134+
var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
135+
new TestClass2() { Features = "This is another example", OutputTokens=null } };
136+
var dataView = ML.Data.LoadFromEnumerable(data);
137+
138+
var options = new TextFeaturizingEstimator.Options()
139+
{
140+
WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
141+
CharFeatureExtractor = null, // presumably turns off char-gram features so only word features remain -- TODO confirm
142+
Norm = TextFeaturizingEstimator.NormFunction.None, // presumably leaves raw counts, matching the 0/1 expectations below -- TODO confirm
143+
OutputTokensColumnName = "OutputTokens"
144+
};
145+
146+
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options); // No inputColumnNames supplied, so the params array is empty; per the TextCatalog change in this commit the output column name is then used as the input column.
147+
dataView = pipeline.Fit(dataView).Transform(dataView);
148+
149+
VBuffer<float> features = default;
150+
float[][] transformed = { null, null }; // One slot per input row (two rows in data).
151+
152+
var expected = new float[][] { // Same expected values as the explicit-null test in this file.
153+
new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
154+
new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
155+
};
156+
157+
using (var cursor = dataView.GetRowCursor(dataView.Schema))
158+
{
159+
var i = 0;
160+
while (cursor.MoveNext())
161+
{
162+
var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]); // NOTE(review): the getter is re-created for every row; it could be obtained once before the loop.
163+
featureGetter(ref features);
164+
transformed[i] = features.DenseValues().ToArray(); // Copy the row's values out of the reused VBuffer.
165+
i++;
166+
}
167+
}
168+
169+
Assert.Equal(expected[0], transformed[0]);
170+
Assert.Equal(expected[1], transformed[1]);
171+
}
172+
83173
[Fact]
84174
public void TextFeaturizerWithCharFeatureExtractorTest()
85175
{

0 commit comments

Comments
 (0)