From 0853b2730976af20cae878722f43d4e05065173a Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 13:39:29 -0700 Subject: [PATCH 1/5] Created sample for 'LatentDirichletAllocation' API. --- .../Dynamic/LdaTransform.cs | 61 --------------- .../Text/LatentDirichletAllocation.cs | 74 +++++++++++++++++++ .../Text/TextCatalog.cs | 2 +- 3 files changed, 75 insertions(+), 62 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs deleted file mode 100644 index 239e7d93ac..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs +++ /dev/null @@ -1,61 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; - -namespace Microsoft.ML.Samples.Dynamic -{ - public static class LatentDirichletAllocationTransform - { - public static void Example() - { - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // Get a small dataset as an IEnumerable and then read it as a ML.NET data set. - IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); - var trainData = ml.Data.LoadFromEnumerable(data); - - // Preview of one of the columns of the the topics data. - // The Review column contains the keys associated with a particular body of text. 
- // - // Review - // "animals birds cats dogs fish horse" - // "horse birds house fish duck cats" - // "car truck driver bus pickup" - // "car truck driver bus pickup horse" - - string review = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review); - string ldaFeatures = "LdaFeatures"; - - // A pipeline for featurizing the "Review" column - var pipeline = ml.Transforms.Text.ProduceWordBags(review). - Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numberOfTopics: 3)); - - // The transformed data - var transformer = pipeline.Fit(trainData); - var transformed_data = transformer.Transform(trainData); - - // Column obtained after processing the input. - var ldaFeaturesColumn = transformed_data.GetColumn<VBuffer<float>>(transformed_data.Schema[ldaFeatures]); - - Console.WriteLine($"{ldaFeatures} column obtained post-transformation."); - foreach (var featureRow in ldaFeaturesColumn) - { - foreach (var value in featureRow.GetValues()) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - - // LdaFeatures column obtained post-transformation. - // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3. 
- - //0.1818182 0.4545455 0.3636364 - //0.3636364 0.1818182 0.4545455 - //0.2222222 0.2222222 0.5555556 - //0.2727273 0.09090909 0.6363636 - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs new file mode 100644 index 0000000000..550af3e600 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs @@ -0,0 +1,74 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class LatentDirichletAllocation + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List<TextData>() + { + new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic model." }, + new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." }, + new TextData(){ Text = "I like to eat broccoli and banana." }, + new TextData(){ Text = "I eat a banana in the breakfast." }, + new TextData(){ Text = "This car is expensive compared to last week." }, + new TextData(){ Text = "This car was $X last week." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for featurizing the text/string using LatentDirichletAllocation API. + // To be more accurate in computing the LDA features, the pipeline first normalizes text and removes stop words + // before passing tokens to LatentDirichletAllocation. 
+ var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText")) + .Append(mlContext.Transforms.Text.RemoveStopWords("Tokens")) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens")) + .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3)); + + // Fit to data. + var transformer = pipeline.Fit(dataview); + + // Create the prediction engine to get the LDA features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(transformer); + + // Convert the sample text into LDA features and print it. + PrintPredictions(predictionEngine.Predict(samples[0])); + PrintPredictions(predictionEngine.Predict(samples[1])); + + // Features obtained post-transformation. + // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3. 
+ + // Topic1 Topic2 Topic3 + // 0.6364 0.3636 0.0000 + // 0.4118 0.1765 0.4118 + } + + private static void PrintPredictions(TransformedTextData prediction) + { + for (int i = 0; i < prediction.Features.Length; i++) + Console.Write($"{prediction.Features[i]:F4} "); + Console.WriteLine(); + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] Features { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index db412be77c..5ddd402b2a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -509,7 +509,7 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog /// /// /// /// /// From ba42f311efb7470093690fbaa91c19c648e6495d Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 13:40:50 -0700 Subject: [PATCH 2/5] Updated comment. --- .../Dynamic/Transforms/Text/LatentDirichletAllocation.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs index 550af3e600..528de94046 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs @@ -19,7 +19,7 @@ public static void Example() new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." }, new TextData(){ Text = "I like to eat broccoli and banana." }, new TextData(){ Text = "I eat a banana in the breakfast." }, - new TextData(){ Text = "This car is expensive compared to last week." }, + new TextData(){ Text = "This car is expensive compared to last week's price." 
}, new TextData(){ Text = "This car was $X last week." }, }; From 621ed991c7d53a57ac3a76d6df29f5e4d463b3b5 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 14:07:15 -0700 Subject: [PATCH 3/5] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/LatentDirichletAllocation.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs index 528de94046..6a5ee5e601 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs @@ -31,7 +31,7 @@ public static void Example() // before passing tokens to LatentDirichletAllocation. var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text") .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText")) - .Append(mlContext.Transforms.Text.RemoveStopWords("Tokens")) + .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens")) .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens")) .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3)); @@ -50,8 +50,8 @@ public static void Example() // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3. // Topic1 Topic2 Topic3 - // 0.6364 0.3636 0.0000 - // 0.4118 0.1765 0.4118 + // 0.6364 0.2727 0.0909 + // 0.5455 0.1818 0.2727 } private static void PrintPredictions(TransformedTextData prediction) From cc2d80a8de55e64b24cae648bd2c64172182d542 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 18:14:10 -0700 Subject: [PATCH 4/5] Addressed reviewers' comments. 
--- .../Transforms/Text/LatentDirichletAllocation.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs index 6a5ee5e601..bf0f95aa97 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs @@ -15,10 +15,10 @@ public static void Example() // Create a small dataset as an IEnumerable. var samples = new List<TextData>() { - new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic model." }, - new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic model." }, - new TextData(){ Text = "I like to eat broccoli and banana." }, - new TextData(){ Text = "I eat a banana in the breakfast." }, + new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic models." }, + new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic models." }, + new TextData(){ Text = "I like to eat broccoli and bananas." }, + new TextData(){ Text = "I eat bananas for breakfast." }, new TextData(){ Text = "This car is expensive compared to last week's price." }, new TextData(){ Text = "This car was $X last week." }, }; @@ -28,9 +28,9 @@ public static void Example() // A pipeline for featurizing the text/string using LatentDirichletAllocation API. // To be more accurate in computing the LDA features, the pipeline first normalizes text and removes stop words - // before passing tokens to LatentDirichletAllocation. - var pipeline = mlContext.Transforms.Text.NormalizeText("normText", "Text") - .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "normText")) + // before passing tokens (the individual words, lower cased, with common words removed) to LatentDirichletAllocation. 
+ var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText")) .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens")) .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens")) From 883eaa392c5110bf646c129cfbbafabf1246f828 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 4 Apr 2019 10:25:34 -0700 Subject: [PATCH 5/5] Addressed reviewers' comments. --- .../Transforms/Text/LatentDirichletAllocation.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs index bf0f95aa97..3efe734f06 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs @@ -43,8 +43,8 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(transformer); // Convert the sample text into LDA features and print it. - PrintPredictions(predictionEngine.Predict(samples[0])); - PrintPredictions(predictionEngine.Predict(samples[1])); + PrintLdaFeatures(predictionEngine.Predict(samples[0])); + PrintLdaFeatures(predictionEngine.Predict(samples[1])); // Features obtained post-transformation. // For LatentDirichletAllocation, we had specified numTopic:3. Hence each prediction has been featurized as a vector of floats with length 3. 
@@ -54,19 +54,19 @@ public static void Example() // 0.5455 0.1818 0.2727 } - private static void PrintPredictions(TransformedTextData prediction) + private static void PrintLdaFeatures(TransformedTextData prediction) { for (int i = 0; i < prediction.Features.Length; i++) Console.Write($"{prediction.Features[i]:F4} "); Console.WriteLine(); } - public class TextData + private class TextData { public string Text { get; set; } } - public class TransformedTextData : TextData + private class TransformedTextData : TextData { public float[] Features { get; set; } }