From 15be23ce2cd48f8639a7b7acd75f2b25a1da04e7 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Fri, 29 Mar 2019 12:38:32 -0700 Subject: [PATCH 1/3] Created sample for 'ApplyWordEmbedding' API. --- .../Text/ApplyCustomWordEmbedding.cs | 79 +++++++++++++++++++ .../Transforms/Text/ApplyWordEmbedding.cs | 68 ++++++++++++++++ .../Text/TextCatalog.cs | 4 +- 3 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs new file mode 100644 index 0000000000..297df0dffb --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -0,0 +1,79 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ApplyCustomWordEmbedding + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. 
+ var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + var pathToCustomModel = @".\custommodel.txt"; + using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) + { + + file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not confirm to the ' ' pattern, and is therefore ignored"); + file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); + file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); + file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f)); + file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f)); + } + + // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model. + // The 'ApplyWordEmbedding' computes the minimum, average and maximum values for each token's embedding vector. + // Tokens in 'custommodel.txt' model are represented as 3-dimension vector. + // Therefore, the output is of 9-dimension [min, avg, max]. + // + // The 'ApplyWordEmbedding' API requires vector of text as input. + // The pipeline first normalizes and tokenizes text then applies word embedding transformation. + var textPipeline = mlContext.Transforms.Text.NormalizeText("Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")) + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", pathToCustomModel, "Tokens")); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the embedding vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into embedding vector. + var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the embedding vector. 
+ Console.WriteLine($"Number of Features: {prediction.Features.Length}"); + + // Print the embedding vector. + Console.Write("Features: "); + foreach (var f in prediction.Features) + Console.Write($"{f:F4} "); + + // Expected output: + // Number of Features: 9 + // Features: -1.0000 0.0000 -100.0000 0.0000 34.0000 -25.6667 1.0000 100.0000 20.0000 + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] Features { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs new file mode 100644 index 0000000000..b7530be587 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ApplyWordEmbedding + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for converting text into a 150-dimension embedding vector using pretrained 'SentimentSpecificWordEmbedding' model. + // The 'ApplyWordEmbedding' computes the minimum, average and maximum values for each token's embedding vector. 
+ // Tokens in 'SentimentSpecificWordEmbedding' model are represented as 50-dimension vector. + // Therefore, the output is of 150-dimension [min, avg, max]. + // + // The 'ApplyWordEmbedding' API requires vector of text as input. + // The pipeline first normalizes and tokenizes text then applies word embedding transformation. + var textPipeline = mlContext.Transforms.Text.NormalizeText("Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")) + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", "Tokens", + Transforms.Text.WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the embedding vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into embedding vector. + var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the embedding vector. + Console.WriteLine($"Number of Features: {prediction.Features.Length}"); + + // Print the embedding vector. + Console.Write("Features: "); + foreach (var f in prediction.Features) + Console.Write($"{f:F4} "); + + // Expected output: + // Number of Features: 150 + // Features: -1.2489 0.2384 -1.3034 -0.9135 -3.4978 -0.1784 -1.3823 -0.3863 -2.5262 -0.8950 ... 
+ } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] Features { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 2be9e4dd7d..1d0ef69a33 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -125,7 +125,7 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text /// /// /// /// /// @@ -143,7 +143,7 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T /// /// /// /// /// From 58e2d4be07734fcebb4b01034e31a6fda1109da9 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Fri, 29 Mar 2019 18:24:20 -0700 Subject: [PATCH 2/3] Addressed reviewers' comments. --- .../Transforms/Text/ApplyCustomWordEmbedding.cs | 15 ++++++++------- .../Dynamic/Transforms/Text/ApplyWordEmbedding.cs | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs index 297df0dffb..4ac4ab18da 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -21,15 +21,16 @@ public static void Example() // Convert sample list to an empty IDataView. var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + // Write a custom 3-dimensional word embedding model with 4 words. + // Each line follows '<word> <float> <float> <float>' pattern. + // Lines that do not conform to the pattern are ignored. var pathToCustomModel = @".\custommodel.txt"; using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) { - - file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. 
This first line in this file does not confirm to the ' ' pattern, and is therefore ignored"); - file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); - file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); - file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f)); - file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f)); + file.WriteLine("great 1.0 2.0 3.0"); + file.WriteLine("product -1.0 -2.0 -3.0"); + file.WriteLine("like -1 100.0 -100"); + file.WriteLine("buy 0 0 20"); } // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model. @@ -50,7 +51,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); // Call the prediction API to convert the text into embedding vector. - var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var data = new TextData() { Text = "This is a great product. I would like to buy it again." }; var prediction = predictionEngine.Predict(data); // Print the length of the embedding vector. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs index b7530be587..0a58a2da07 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -39,7 +39,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); // Call the prediction API to convert the text into embedding vector. - var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var data = new TextData() { Text = "This is a great product. I would like to buy it again." }; var prediction = predictionEngine.Predict(data); // Print the length of the embedding vector. 
From a3ec5d3870a39c08206309dd50286bd717e2f2f2 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 1 Apr 2019 10:39:12 -0700 Subject: [PATCH 3/3] Deleted old embedding sample. --- .../Dynamic/WordEmbeddingTransform.cs | 109 ------------------ 1 file changed, 109 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs deleted file mode 100644 index 1830b3e171..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs +++ /dev/null @@ -1,109 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms.Text; -namespace Microsoft.ML.Samples.Dynamic -{ - public static class WordEmbeddingTransform - { - public static void Example() - { - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // Get a small dataset as an IEnumerable and convert to IDataView. - var data = SamplesUtils.DatasetUtils.GetSentimentData(); - var trainData = ml.Data.LoadFromEnumerable(data); - - // Preview of the data. - // - // Sentiment SentimentText - // true Best game I've ever played. - // false ==RUDE== Dude, 2. - // true Until the next game, this is the best Xbox game! - - // Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords. 
- var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText")) - .Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); - - var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData); - // Preview of the CleanWords column obtained after processing SentimentText. - var cleanWords = wordsDataview.GetColumn[]>(wordsDataview.Schema["CleanWords"]); - Console.WriteLine($" CleanWords column obtained post-transformation."); - foreach (var featureRow in cleanWords) - { - foreach (var value in featureRow) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - // best game ive played - // == rude == dude 2 - // game best xbox game - - // Small helper to print wordembeddings in the console. - Action> printEmbeddings = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - foreach (var featureRow in column) - { - foreach (var value in featureRow) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - }; - - // Let's apply pretrained word embedding model GloVeTwitter25D. - // 25D means each word mapped into 25 dimensional space, basically each word represented by 25 float values. - var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords", - WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D); - - // We also have option to apply custom word embedding models. - // Let's first create one. - // Format is following: - // First line is ignored if it is a header for your file. - // Each next line contains a single word followed by either a tab or space, and a list of floats also separated by a tab or space. 
- // Size of array of floats should be same for whole file. - var pathToCustomModel = @".\custommodel.txt"; - using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) - { - - file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not conform to the ' ' pattern, and is therefore ignored"); - file.WriteLine("xbox" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); - file.WriteLine("game" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); - file.WriteLine("dude" + " " + string.Join(" ", -1f, 100.0f, -100f)); - file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f)); - } - // Now let's add custom embedding on top of same words. - var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords")); - - // And do all required transformations. - var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview); - - var customEmbeddings = embeddingDataview.GetColumn(embeddingDataview.Schema["CustomEmbeddings"]); - printEmbeddings("GloveEmbeddings", customEmbeddings); - - // -1 -2 -3 -0.5 -1 8.5 0 0 20 - // -1 100 -100 -1 100 -100 -1 100 -100 - // 1 -2 -3 -0.25 -0.5 4.25 1 2 20 - // As you can see above we output 9 values for each line - // We go through each word present in row and extract 3 floats for it (if we can find that word in model). - // First 3 floats in output values represent minimum values (for each dimension) for extracted values. - // Second set of 3 floats in output represent average (for each dimension) for extracted values. - // Third set of 3 floats in output represent maximum values (for each dimension) for extracted values. - // Preview of GloveEmbeddings. 
- var gloveEmbeddings = embeddingDataview.GetColumn(embeddingDataview.Schema["GloveEmbeddings"]); - printEmbeddings("GloveEmbeddings", gloveEmbeddings); - // 0.23166 0.048825 0.26878 -1.3945 -0.86072 -0.026778 0.84075 -0.81987 -1.6681 -1.0658 -0.30596 0.50974 ... - //-0.094905 0.61109 0.52546 - 0.2516 0.054786 0.022661 1.1801 0.33329 - 0.85388 0.15471 - 0.5984 0.4364 ... - // 0.23166 0.048825 0.26878 - 1.3945 - 0.30044 - 0.16523 0.47251 0.10276 - 0.20978 - 0.68094 - 0.30596 ... - - } - } -}