From 15be23ce2cd48f8639a7b7acd75f2b25a1da04e7 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Fri, 29 Mar 2019 12:38:32 -0700
Subject: [PATCH 1/3] Created sample for 'ApplyWordEmbedding' API.

---
 .../Text/ApplyCustomWordEmbedding.cs          | 79 +++++++++++++++++++
 .../Transforms/Text/ApplyWordEmbedding.cs     | 68 ++++++++++++++++
 .../Text/TextCatalog.cs                       |  4 +-
 3 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
 create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
new file mode 100644
index 0000000000..297df0dffb
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
@@ -0,0 +1,79 @@
+﻿using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class ApplyCustomWordEmbedding
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
+            // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
+            // The empty list is only needed to pass input schema to the pipeline.
+            var emptySamples = new List<TextData>();
+
+            // Convert sample list to an empty IDataView.
+            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
+
+            var pathToCustomModel = @".\custommodel.txt";
+            using (StreamWriter file = new StreamWriter(pathToCustomModel, false))
+            {
+
+                file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not confirm to the '<word> <float> <float> <float>' pattern, and is therefore ignored");
+                file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f));
+                file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f));
+                file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f));
+                file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f));
+            }
+
+            // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model.
+            // The 'ApplyWordEmbedding' computes, for each embedding dimension, the minimum, average and maximum over the tokens found in the model.
+            // Tokens in the 'custommodel.txt' model are represented as 3-dimension vectors.
+            // Therefore, the output is a 9-dimension vector laid out as [min, avg, max].
+            //
+            // The 'ApplyWordEmbedding' API requires vector of text as input.
+            // The pipeline first normalizes and tokenizes text then applies word embedding transformation.
+            var textPipeline = mlContext.Transforms.Text.NormalizeText("Text")
+                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text"))
+                .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", pathToCustomModel, "Tokens"));
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(emptyDataView);
+
+            // Create the prediction engine to get the embedding vector from the input text/string.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Call the prediction API to convert the text into embedding vector.
+            var data = new TextData() { Text = "This is a greate product. I would like to buy it again."  };
+            var prediction = predictionEngine.Predict(data);
+
+            // Print the length of the embedding vector.
+            Console.WriteLine($"Number of Features: {prediction.Features.Length}");
+
+            // Print the embedding vector.
+            Console.Write("Features: ");
+            foreach (var f in prediction.Features)
+                Console.Write($"{f:F4} ");
+
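+            //  Expected output (only three model words contribute; 'product.' presumably keeps its trailing period and so is not found in the model):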
+            //   Number of Features: 9
+            //   Features: -1.0000 0.0000 -100.0000 0.0000 34.0000 -25.6667 1.0000 100.0000 20.0000
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public float[] Features { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
new file mode 100644
index 0000000000..b7530be587
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
@@ -0,0 +1,68 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class ApplyWordEmbedding
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
+            // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
+            // The empty list is only needed to pass input schema to the pipeline.
+            var emptySamples = new List<TextData>();
+
+            // Convert sample list to an empty IDataView.
+            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
+
+            // A pipeline for converting text into a 150-dimension embedding vector using pretrained 'SentimentSpecificWordEmbedding' model.
+            // The 'ApplyWordEmbedding' computes, for each embedding dimension, the minimum, average and maximum over the tokens found in the model.
+            // Tokens in the 'SentimentSpecificWordEmbedding' model are represented as 50-dimension vectors.
+            // Therefore, the output is a 150-dimension vector laid out as [min, avg, max].
+            //
+            // The 'ApplyWordEmbedding' API requires vector of text as input.
+            // The pipeline first normalizes and tokenizes text then applies word embedding transformation.
+            var textPipeline = mlContext.Transforms.Text.NormalizeText("Text")
+                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text"))
+                .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", "Tokens", 
+                    Transforms.Text.WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(emptyDataView);
+
+            // Create the prediction engine to get the embedding vector from the input text/string.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Call the prediction API to convert the text into embedding vector.
+            var data = new TextData() { Text = "This is a greate product. I would like to buy it again."  };
+            var prediction = predictionEngine.Predict(data);
+
+            // Print the length of the embedding vector.
+            Console.WriteLine($"Number of Features: {prediction.Features.Length}");
+
+            // Print the embedding vector.
+            Console.Write("Features: ");
+            foreach (var f in prediction.Features)
+                Console.Write($"{f:F4} ");
+
+            //  Expected output:
+            //   Number of Features: 150
+            //   Features: -1.2489 0.2384 -1.3034 -0.9135 -3.4978 -0.1784 -1.3823 -0.3863 -2.5262 -0.8950 ...
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public float[] Features { get; set; }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index 2be9e4dd7d..1d0ef69a33 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -125,7 +125,7 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs)]
+        /// [!code-csharp[ApplyWordEmbedding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs)]
         /// ]]>
         /// </format>
         /// </example>
@@ -143,7 +143,7 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs)]
+        /// [!code-csharp[ApplyWordEmbedding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs)]
         /// ]]>
         /// </format>
         /// </example>

From 58e2d4be07734fcebb4b01034e31a6fda1109da9 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Fri, 29 Mar 2019 18:24:20 -0700
Subject: [PATCH 2/3] Addressed reviewers' comments.

---
 .../Transforms/Text/ApplyCustomWordEmbedding.cs   | 15 ++++++++-------
 .../Dynamic/Transforms/Text/ApplyWordEmbedding.cs |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
index 297df0dffb..4ac4ab18da 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
@@ -21,15 +21,16 @@ public static void Example()
             // Convert sample list to an empty IDataView.
             var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
 
+            // Write a custom 3-dimensional word embedding model with 4 words.
+            // Each line follows '<word> <float> <float> <float>' pattern.
+            // Lines that do not conform to the pattern are ignored.
             var pathToCustomModel = @".\custommodel.txt";
             using (StreamWriter file = new StreamWriter(pathToCustomModel, false))
             {
-
-                file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not confirm to the '<word> <float> <float> <float>' pattern, and is therefore ignored");
-                file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f));
-                file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f));
-                file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f));
-                file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f));
+                file.WriteLine("great 1.0 2.0 3.0");
+                file.WriteLine("product -1.0 -2.0 -3.0");
+                file.WriteLine("like -1 100.0 -100");
+                file.WriteLine("buy 0 0 20");
             }
 
             // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model.
@@ -50,7 +51,7 @@ public static void Example()
             var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
 
             // Call the prediction API to convert the text into embedding vector.
-            var data = new TextData() { Text = "This is a greate product. I would like to buy it again."  };
+            var data = new TextData() { Text = "This is a great product. I would like to buy it again."  };
             var prediction = predictionEngine.Predict(data);
 
             // Print the length of the embedding vector.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
index b7530be587..0a58a2da07 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
@@ -39,7 +39,7 @@ public static void Example()
             var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
 
             // Call the prediction API to convert the text into embedding vector.
-            var data = new TextData() { Text = "This is a greate product. I would like to buy it again."  };
+            var data = new TextData() { Text = "This is a great product. I would like to buy it again."  };
             var prediction = predictionEngine.Predict(data);
 
             // Print the length of the embedding vector.

From a3ec5d3870a39c08206309dd50286bd717e2f2f2 Mon Sep 17 00:00:00 2001
From: Zeeshan Ahmed <zeahmed@microsoft.com>
Date: Mon, 1 Apr 2019 10:39:12 -0700
Subject: [PATCH 3/3] Deleted old embedding sample.

---
 .../Dynamic/WordEmbeddingTransform.cs         | 109 ------------------
 1 file changed, 109 deletions(-)
 delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs
deleted file mode 100644
index 1830b3e171..0000000000
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs
+++ /dev/null
@@ -1,109 +0,0 @@
-﻿using System;
-using System.Collections.Generic;
-using System.IO;
-using Microsoft.ML.Data;
-using Microsoft.ML.Transforms.Text;
-namespace Microsoft.ML.Samples.Dynamic
-{
-    public static class WordEmbeddingTransform
-    {
-        public static void Example()
-        {
-            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
-            // as well as the source of randomness.
-            var ml = new MLContext();
-
-            // Get a small dataset as an IEnumerable and convert to IDataView.
-            var data = SamplesUtils.DatasetUtils.GetSentimentData();
-            var trainData = ml.Data.LoadFromEnumerable(data);
-
-            // Preview of the data.
-            //
-            // Sentiment    SentimentText
-            // true         Best game I've ever played.
-            // false        ==RUDE== Dude, 2.
-            // true          Until the next game, this is the best Xbox game!
-
-            // Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords.
-            var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
-                .Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText"))
-                .Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
-
-            var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);
-            // Preview of the CleanWords column obtained after processing SentimentText.
-            var cleanWords = wordsDataview.GetColumn<ReadOnlyMemory<char>[]>(wordsDataview.Schema["CleanWords"]);
-            Console.WriteLine($" CleanWords column obtained post-transformation.");
-            foreach (var featureRow in cleanWords)
-            {
-                foreach (var value in featureRow)
-                    Console.Write($"{value} ");
-                Console.WriteLine("");
-            }
-
-            Console.WriteLine("===================================================");
-            // best game ive played
-            // == rude == dude 2
-            // game best xbox game
-
-            // Small helper to print wordembeddings in the console. 
-            Action<string, IEnumerable<float[]>> printEmbeddings = (columnName, column) =>
-            {
-                Console.WriteLine($"{columnName} column obtained post-transformation.");
-                foreach (var featureRow in column)
-                {
-                    foreach (var value in featureRow)
-                        Console.Write($"{value} ");
-                    Console.WriteLine("");
-                }
-
-                Console.WriteLine("===================================================");
-            };
-
-            // Let's apply pretrained word embedding model GloVeTwitter25D.
-            // 25D means each word mapped into 25 dimensional space, basically each word represented by 25 float values.
-            var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords",
-                WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D);
-
-            // We also have option to apply custom word embedding models.
-            // Let's first create one.
-            // Format is following:
-            // First line is ignored if it is a header for your file.
-            // Each next line contains a single word followed by either a tab or space, and a list of floats also separated by a tab or space.
-            // Size of array of floats should be same for whole file.
-            var pathToCustomModel = @".\custommodel.txt";
-            using (StreamWriter file = new StreamWriter(pathToCustomModel, false))
-            {
-
-                file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not conform to the '<word> <float> <float> <float>' pattern, and is therefore ignored");
-                file.WriteLine("xbox" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f));
-                file.WriteLine("game" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f));
-                file.WriteLine("dude" + " " + string.Join(" ", -1f, 100.0f, -100f));
-                file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f));
-            }
-            // Now let's add custom embedding on top of same words.
-            var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords"));
-
-            // And do all required transformations.
-            var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview);
-
-            var customEmbeddings = embeddingDataview.GetColumn<float[]>(embeddingDataview.Schema["CustomEmbeddings"]);
-            printEmbeddings("GloveEmbeddings", customEmbeddings);
-
-            // -1  -2   -3  -0.5   -1  8.5  0   0   20
-            // -1 100 -100    -1  100 -100 -1 100 -100
-            //  1  -2   -3 -0.25 -0.5 4.25  1   2   20
-            // As you can see above we output 9 values for each line
-            // We go through each word present in row and extract 3 floats for it (if we can find that word in model).
-            // First 3 floats in output values represent minimum values (for each dimension) for extracted values. 
-            // Second set of 3 floats in output represent average (for each dimension) for extracted values.
-            // Third set of 3 floats in output represent maximum values (for each dimension) for extracted values.
-            // Preview of GloveEmbeddings.
-            var gloveEmbeddings = embeddingDataview.GetColumn<float[]>(embeddingDataview.Schema["GloveEmbeddings"]);
-            printEmbeddings("GloveEmbeddings", gloveEmbeddings);
-            // 0.23166 0.048825 0.26878 -1.3945 -0.86072 -0.026778 0.84075 -0.81987 -1.6681 -1.0658 -0.30596 0.50974 ...
-            //-0.094905 0.61109 0.52546 - 0.2516 0.054786 0.022661 1.1801 0.33329 - 0.85388 0.15471 - 0.5984 0.4364  ...
-            // 0.23166 0.048825 0.26878 - 1.3945 - 0.30044 - 0.16523 0.47251 0.10276 - 0.20978 - 0.68094 - 0.30596  ...
-
-        }
-    }
-}