From 0853b2730976af20cae878722f43d4e05065173a Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Wed, 3 Apr 2019 13:39:29 -0700
Subject: [PATCH 1/5] Created sample for 'LatentDirichletAllocation' API.

---
 .../Dynamic/LdaTransform.cs                   | 61 ---------------
 .../Text/LatentDirichletAllocation.cs         | 74 +++++++++++++++++++
 .../Text/TextCatalog.cs                       |  2 +-
 3 files changed, 75 insertions(+), 62 deletions(-)
 delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
 create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
deleted file mode 100644
index 239e7d93ac..0000000000
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
+++ /dev/null
@@ -1,61 +0,0 @@
-﻿using System;
-using System.Collections.Generic;
-using Microsoft.ML.Data;
-
-namespace Microsoft.ML.Samples.Dynamic
-{
-    public static class LatentDirichletAllocationTransform
-    {
-        public static void Example()
-        {
-            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
-            // as well as the source of randomness.
-            var ml = new MLContext();
-
-            // Get a small dataset as an IEnumerable and then read it as a ML.NET data set.
-            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
-            var trainData = ml.Data.LoadFromEnumerable(data);
-
-            // Preview of one of the columns of the the topics data. 
-            // The Review column contains the keys associated with a particular body of text.  
-            //
-            // Review                               
-            // "animals birds cats dogs fish horse" 
-            // "horse birds house fish duck cats"   
-            // "car truck driver bus pickup"       
-            // "car truck driver bus pickup horse"
-
-            string review = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review);
-            string ldaFeatures = "LdaFeatures";
-
-            // A pipeline for featurizing the "Review" column
-            var pipeline = ml.Transforms.Text.ProduceWordBags(review).
-                Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numberOfTopics: 3));
-
-            // The transformed data
-            var transformer = pipeline.Fit(trainData);
-            var transformed_data = transformer.Transform(trainData);
-
-            // Column obtained after processing the input.
-            var ldaFeaturesColumn = transformed_data.GetColumn<VBuffer<float>>(transformed_data.Schema[ldaFeatures]);
-
-            Console.WriteLine($"{ldaFeatures} column obtained post-transformation.");
-            foreach (var featureRow in ldaFeaturesColumn)
-            {
-                foreach (var value in featureRow.GetValues())
-                    Console.Write($"{value} ");
-                Console.WriteLine("");
-            }
-
-            Console.WriteLine("===================================================");
-
-            // LdaFeatures column obtained post-transformation.
-            // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3.
-
-            //0.1818182 0.4545455 0.3636364
-            //0.3636364 0.1818182 0.4545455
-            //0.2222222 0.2222222 0.5555556
-            //0.2727273 0.09090909 0.6363636
-        }
-    }
-}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
new file mode 100644
index 0000000000..550af3e600
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -0,0 +1,74 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class LatentDirichletAllocation
+    {
+        public static void Example()
+        {
+            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<TextData>()
+            {
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic model." },
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." },
+                new TextData(){ Text = "I like to eat broccoli and banana." },
+                new TextData(){ Text = "I eat a banana in the breakfast." },
+                new TextData(){ Text = "This car is expensive compared to last week." },
+                new TextData(){ Text = "This car was $X last week." },
+            };
+
+            // Convert training data to IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for featurizing the text/string using LatentDirichletAllocation API.
+            // To be more accurate in computing the LDA features, the pipeline first normalizes text and removes stop words
+            // before passing tokens to LatentDirichletAllocation.
+            var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text")
+                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText"))
+                .Append(mlContext.Transforms.Text.RemoveStopWords("Tokens"))
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
+                .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens"))
+                .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3));
+
+            // Fit to data.
+            var transformer = pipeline.Fit(dataview);
+
+            // Create the prediction engine to get the LDA features extracted from the text.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(transformer);
+
+            // Convert the sample text into LDA features and print it.
+            PrintPredictions(predictionEngine.Predict(samples[0]));
+            PrintPredictions(predictionEngine.Predict(samples[1]));
+
+            // Features obtained post-transformation.
+            // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3.
+
+            //  Topic1  Topic2  Topic3
+            //  0.6364  0.3636  0.0000
+            //  0.4118  0.1765  0.4118
+        }
+
+        private static void PrintPredictions(TransformedTextData prediction)
+        {
+            for (int i = 0; i < prediction.Features.Length; i++)
+                Console.Write($"{prediction.Features[i]:F4}  ");
+            Console.WriteLine();
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public float[] Features { get; set; }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index db412be77c..5ddd402b2a 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -509,7 +509,7 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs)]
+        /// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs)]
         /// ]]>
         /// </format>
         /// </example>

From ba42f311efb7470093690fbaa91c19c648e6495d Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Wed, 3 Apr 2019 13:40:50 -0700
Subject: [PATCH 2/5] Updated sample text.

---
 .../Dynamic/Transforms/Text/LatentDirichletAllocation.cs        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
index 550af3e600..528de94046 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -19,7 +19,7 @@ public static void Example()
                 new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." },
                 new TextData(){ Text = "I like to eat broccoli and banana." },
                 new TextData(){ Text = "I eat a banana in the breakfast." },
-                new TextData(){ Text = "This car is expensive compared to last week." },
+                new TextData(){ Text = "This car is expensive compared to last week's price." },
                 new TextData(){ Text = "This car was $X last week." },
             };
 

From 621ed991c7d53a57ac3a76d6df29f5e4d463b3b5 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Wed, 3 Apr 2019 14:07:15 -0700
Subject: [PATCH 3/5] Addressed reviewers' comments: use RemoveDefaultStopWords, refresh expected output.

---
 .../Dynamic/Transforms/Text/LatentDirichletAllocation.cs    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
index 528de94046..6a5ee5e601 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -31,7 +31,7 @@ public static void Example()
             // before passing tokens to LatentDirichletAllocation.
             var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text")
                 .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText"))
-                .Append(mlContext.Transforms.Text.RemoveStopWords("Tokens"))
+                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens"))
                 .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
                 .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens"))
                 .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3));
@@ -50,8 +50,8 @@ public static void Example()
             // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3.
 
             //  Topic1  Topic2  Topic3
-            //  0.6364  0.3636  0.0000
-            //  0.4118  0.1765  0.4118
+            //  0.6364  0.2727  0.0909
+            //  0.5455  0.1818  0.2727
         }
 
         private static void PrintPredictions(TransformedTextData prediction)

From cc2d80a8de55e64b24cae648bd2c64172182d542 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Wed, 3 Apr 2019 18:14:10 -0700
Subject: [PATCH 4/5] Addressed reviewers' comments: fix sample sentences, rename NormalizedText column.

---
 .../Transforms/Text/LatentDirichletAllocation.cs   | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
index 6a5ee5e601..bf0f95aa97 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -15,10 +15,10 @@ public static void Example()
             // Create a small dataset as an IEnumerable.
             var samples = new List<TextData>()
             {
-                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic model." },
-                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." },
-                new TextData(){ Text = "I like to eat broccoli and banana." },
-                new TextData(){ Text = "I eat a banana in the breakfast." },
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic models." },
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic models." },
+                new TextData(){ Text = "I like to eat broccoli and bananas." },
+                new TextData(){ Text = "I eat bananas for breakfast." },
                 new TextData(){ Text = "This car is expensive compared to last week's price." },
                 new TextData(){ Text = "This car was $X last week." },
             };
@@ -28,9 +28,9 @@ public static void Example()
 
             // A pipeline for featurizing the text/string using LatentDirichletAllocation API.
             // To be more accurate in computing the LDA features, the pipeline first normalizes text and removes stop words
-            // before passing tokens to LatentDirichletAllocation.
-            var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text")
-                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText"))
+            // before passing tokens (the individual words, lower cased, with common words removed) to LatentDirichletAllocation.
+            var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
+                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
                 .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens"))
                 .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
                 .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens"))

From 883eaa392c5110bf646c129cfbbafabf1246f828 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Thu, 4 Apr 2019 10:25:34 -0700
Subject: [PATCH 5/5] Addressed reviewers' comments: rename PrintLdaFeatures, make data classes private.

---
 .../Transforms/Text/LatentDirichletAllocation.cs       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
index bf0f95aa97..3efe734f06 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -43,8 +43,8 @@ public static void Example()
             var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(transformer);
 
             // Convert the sample text into LDA features and print it.
-            PrintPredictions(predictionEngine.Predict(samples[0]));
-            PrintPredictions(predictionEngine.Predict(samples[1]));
+            PrintLdaFeatures(predictionEngine.Predict(samples[0]));
+            PrintLdaFeatures(predictionEngine.Predict(samples[1]));
 
             // Features obtained post-transformation.
             // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3.
@@ -54,19 +54,19 @@ public static void Example()
             //  0.5455  0.1818  0.2727
         }
 
-        private static void PrintPredictions(TransformedTextData prediction)
+        private static void PrintLdaFeatures(TransformedTextData prediction)
         {
             for (int i = 0; i < prediction.Features.Length; i++)
                 Console.Write($"{prediction.Features[i]:F4}  ");
             Console.WriteLine();
         }
 
-        public class TextData
+        private class TextData
         {
             public string Text { get; set; }
         }
 
-        public class TransformedTextData : TextData
+        private class TransformedTextData : TextData
         {
             public float[] Features { get; set; }
         }