Fixes #3992 and corner case of inputColumnNames on FeaturizeText (#4211)

antoniovs1029 · web-flow · commit edfd10f131ab · 2019-09-19T09:25:30.000-07:00
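* Fixed issue #3992 in FeaturizeText (TextFeaturizingEstimator): when no inputColumnNames are provided, or when 'null' is passed explicitly as inputColumnNames, the outputColumnName is now used as the input column. * Added tests for both cases. * Fixed a minor mistake in the FeaturizeText documentation.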
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -3,6 +3,7 @@
 // See the LICENSE file in the project root for more information.
 
 using Microsoft.ML.Data;
+using Microsoft.ML.Internal.Utilities;
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Transforms.Text;
 
@@ -39,7 +40,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
                 outputColumnName, inputColumnName);
 
         /// <summary>
-        ///  Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into featurized float array that represents normalized counts of n-grams and char-grams.
+        ///  Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into a featurized vector of <see cref="System.Single"/> that represents normalized counts of n-grams and char-grams.
         /// </summary>
         /// <remarks>This transform can operate over several columns.</remarks>
         /// <param name="catalog">The text-related transform's catalog.</param>
@@ -62,7 +63,8 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
             TextFeaturizingEstimator.Options options,
             params string[] inputColumnNames)
             => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
-                outputColumnName, inputColumnNames, options);
+                outputColumnName, Utils.Size(inputColumnNames) == 0  ? new[] { outputColumnName } : inputColumnNames,
+                options);
 
         /// <summary>
         /// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes by splitting text into sequences of characters
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -33,6 +33,12 @@ private class TestClass
             public float[] Features = null;
         }
 
+        private class TestClass2 // Input row type for the tests that call FeaturizeText with null / no input column names.
+        {
+            public string Features = null; // Raw input text; the tests also use "Features" as the output column name.
+            public string[] OutputTokens = null; // Left null in the input rows; same name as options.OutputTokensColumnName.
+        }
+
         [Fact]
         public void TextFeaturizerWithPredefinedStopWordRemoverTest()
         {
@@ -80,6 +86,90 @@ public void TextFeaturizerWithWordFeatureExtractorTest()
             Assert.Equal(expected, prediction.Features);
         }
 
+        // Regression test for #3992: an explicit 'null' for inputColumnNames must featurize the output column itself.
+        [Fact]
+        public void TextFeaturizerWithWordFeatureExtractorWithNullInputNamesTest()
+        {
+            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens = null },
+                               new TestClass2() { Features = "This is another example", OutputTokens = null } };
+            var dataView = ML.Data.LoadFromEnumerable(data);
+
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
+                CharFeatureExtractor = null,
+                Norm = TextFeaturizingEstimator.NormFunction.None,
+                OutputTokensColumnName = "OutputTokens"
+            };
+
+            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, null); // explicit null input names
+            dataView = pipeline.Fit(dataView).Transform(dataView);
+
+            VBuffer<float> features = default;
+            float[][] transformed = { null, null };
+
+            var expected = new float[][] {
+                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
+                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
+            };
+
+            using (var cursor = dataView.GetRowCursor(dataView.Schema))
+            {
+                // The getter is bound to the cursor, so create it once instead of once per row.
+                var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
+                for (var i = 0; cursor.MoveNext(); i++)
+                {
+                    featureGetter(ref features);
+                    transformed[i] = features.DenseValues().ToArray();
+                }
+            }
+
+            Assert.Equal(expected[0], transformed[0]);
+            Assert.Equal(expected[1], transformed[1]);
+        }
+
+        // Regression test for #3992: omitting inputColumnNames entirely must featurize the output column itself.
+        [Fact]
+        public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
+        {
+            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens = null },
+                               new TestClass2() { Features = "This is another example", OutputTokens = null } };
+            var dataView = ML.Data.LoadFromEnumerable(data);
+
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
+                CharFeatureExtractor = null,
+                Norm = TextFeaturizingEstimator.NormFunction.None,
+                OutputTokensColumnName = "OutputTokens"
+            };
+
+            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options); // no input names supplied at all
+            dataView = pipeline.Fit(dataView).Transform(dataView);
+
+            VBuffer<float> features = default;
+            float[][] transformed = { null, null };
+
+            var expected = new float[][] {
+                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
+                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
+            };
+
+            using (var cursor = dataView.GetRowCursor(dataView.Schema))
+            {
+                // The getter is bound to the cursor, so create it once instead of once per row.
+                var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
+                for (var i = 0; cursor.MoveNext(); i++)
+                {
+                    featureGetter(ref features);
+                    transformed[i] = features.DenseValues().ToArray();
+                }
+            }
+
+            Assert.Equal(expected[0], transformed[0]);
+            Assert.Equal(expected[1], transformed[1]);
+        }
+
         [Fact]
         public void TextFeaturizerWithCharFeatureExtractorTest()
         {