docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs (new file)
@@ -0,0 +1,90 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceHashedNgrams
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as being the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute n-grams using hashing." },
new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." },
new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of n-grams and hashes it as an index into a vector of given bit length." },
new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric hashed n-gram features.
// The following call to 'ProduceHashedNgrams' requires the tokenized text/string as input.
// This is achieved by calling 'TokenizeIntoWords' first, followed by 'ProduceHashedNgrams'.
// Note that the length of the output feature vector depends on the 'numberOfBits' setting:
// the vector has 2^numberOfBits slots (2^5 = 32 here).
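// 'ProduceHashedNgrams' takes a key type as input, so the tokens are first converted with 'MapValueToKey'.
// 'maximumNumberOfInverts: 1' keeps a mapping from each hash slot back to the original n-gram text,
// which is what makes the slot names retrieved further below human-readable.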
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
.Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens",
numberOfBits: 5,
ngramLength: 3,
useAllLengths: false,
maximumNumberOfInverts: 1));

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}");
sfilipi (Member), Apr 3, 2019:
similar comment to the other file, is this needed? #Resolved

Contributor Author:
yes.
In reply to: 271819189


// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in NgramFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}

// Print the first 10 feature values.
Console.Write("Features: ");
for (int i = 0; i < 10; i++)
Console.Write($"{prediction.NgramFeatures[i]:F4} ");

// Expected output:
// Number of Features: 32
// N-grams: This|is|an example|to|compute compute|n-grams|using n-grams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence N-gram|is|a ...
// Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ...
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] NgramFeatures { get; set; }
}
}
}
docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs (new file)
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceNgrams
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as being the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute n-grams." },
new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." },
new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of n-grams." },
new TextData(){ Text = "Each position in the vector corresponds to a particular n-gram." },
new TextData(){ Text = "The value at each position corresponds to," },
new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" },
new TextData(){ Text = "the inverse of the number of documents that contain the n-gram (Idf)," },
new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric n-gram features.
// The following call to 'ProduceNgrams' requires the tokenized text/string as input.
// This is achieved by calling 'TokenizeIntoWords' first, followed by 'ProduceNgrams'.
// Note that the length of the output feature vector depends on the n-gram settings.
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
// 'ProduceNgrams' takes key type as input. Converting the tokens into key type using 'MapValueToKey'.
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
sfilipi (Member), Apr 3, 2019, on:
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
add one line of comment on why this is here. #Resolved

Contributor, on the same line:
This seems like a holdover from the internal codebase. I wonder if we should consider doing a breaking change to move this key-type conversion into the ProduceNgrams operation. The question is what other use cases do we expect to see?

.Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens",
ngramLength: 3,
useAllLengths: false,
weighting: NgramExtractingEstimator.WeightingCriteria.Tf));
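// Note: 'weighting: Tf' above makes each slot hold the raw count of the corresponding n-gram in the row;
// 'Idf' and 'TfIdf' are the other available 'NgramExtractingEstimator.WeightingCriteria' options.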

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the n-gram features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}");
sfilipi (Member), Apr 3, 2019:
is this necessary? #Resolved

Contributor Author:
yes.
In reply to: 271817449


// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in NgramFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}
Contributor Author:
This is the concern for me right now. There is no way to get this metadata through the Transformer or through the prediction engine. The only way is through the IDataView obtained from the .Transform call.

Contributor:
If you have to transform the data anyway, why not just print off the transformed IDV instead of off a prediction? You can limit it to one row with a TakeRows filter.
In reply to: 271476465
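A minimal sketch of the reviewer's suggestion, assuming the same mlContext, transformedDataView, and "NgramFeatures" column as in this sample (this code is illustrative and not part of the submitted file):

// Keep only the first row of the transformed data and print its feature values
// directly from the IDataView, without a prediction engine.
var singleRow = mlContext.Data.TakeRows(transformedDataView, 1);
var featureColumn = singleRow.GetColumn<VBuffer<float>>(singleRow.Schema["NgramFeatures"]);
foreach (var features in featureColumn)
{
    foreach (var value in features.DenseValues())
        Console.Write($"{value:F4} ");
    Console.WriteLine();
}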


// Print the first 10 feature values.
Console.Write("Features: ");
sfilipi (Member), Apr 3, 2019, on:
Console.Write("Features: ");
I'd remove unnecessary printings.

Contributor Author:
Actually, I'd like to keep it because it's clearer to read each line with this prefix on the console.
In reply to: 271816037

for (int i = 0; i < 10; i++)
Console.Write($"{prediction.NgramFeatures[i]:F4} ");

// Expected output:
// Number of Features: 52
// N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ...
// Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ...
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] NgramFeatures { get; set; }
}
}
}
src/Microsoft.ML.Transforms/Text/TextCatalog.cs (9 changes: 8 additions, 1 deletion)
@@ -217,7 +217,7 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LpNormalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs?range=1-5,11-74)]
/// [!code-csharp[ProduceNgrams](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs)]
/// ]]>
/// </format>
/// </example>
@@ -450,6 +450,13 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
/// <param name="rehashUnigrams">Whether to rehash unigrams.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ProduceHashedNgrams](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs)]
/// ]]>
/// </format>
/// </example>
public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string[] inputColumnNames = null,
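A minimal usage sketch of the overload documented above, assuming an existing MLContext named mlContext and a key-typed "Tokens" column as produced in the samples (the column and variable names are illustrative, not taken from the diff):

// Hash trigrams from the "Tokens" column into a 2^5 = 32 slot feature vector.
var estimator = mlContext.Transforms.Text.ProduceHashedNgrams(
    outputColumnName: "NgramFeatures",
    inputColumnNames: new[] { "Tokens" },
    numberOfBits: 5,
    ngramLength: 3,
    // Retain one original value per hash slot so the n-gram text is available as slot names.
    maximumNumberOfInverts: 1);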