From 80ee90a2e1cf0436f3d41edadf75af8efe7fc132 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 2 Apr 2019 13:04:02 -0700 Subject: [PATCH 1/8] Created samples for 'ProduceNgrams' and 'ProduceHashedNgrams' APIs. --- .../Transforms/Text/ProduceHashedNgrams.cs | 69 +++++++++++++++ .../Dynamic/Transforms/Text/ProduceNgrams.cs | 88 +++++++++++++++++++ .../Text/TextCatalog.cs | 9 +- 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs new file mode 100644 index 0000000000..516695deef --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -0,0 +1,69 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ProduceHashedNgrams + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List() + { + new TextData(){ Text = "This is an example to compute Ngrams using hashing." }, + new TextData(){ Text = "Ngram is a sequence of 'N' consecutive words/tokens." }, + new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of Ngrams and hashes it as an index into a vector of given bit length." }, + new TextData(){ Text = "The hashing schem reduces the size of the output feature vector" }, + new TextData(){ Text = "which is useful in case when number of Ngrams is very large." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into numeric hashed Ngram features. + // The following call to 'ProduceHashedNgrams' requires the tokenized text/string as input. + // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceHashedNgrams'. + // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", numberOfBits: 8, ngramLength: 3, useAllLengths: false)); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + + // Create the prediction engine to get the features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Convert the text into numeric features. + var prediction = predictionEngine.Predict(samples[0]); + + // Print the length of the feature vector. + Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); + + // Print the first 10 feature values. + Console.Write("Features: "); + for (int i = 0; i < 10; i++) + Console.Write($"{prediction.NgramFeatures[i]:F4} "); + + // Expected output: + // Number of Features: 256 + // Features: 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] NgramFeatures { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs new file mode 100644 index 0000000000..884ce2ad58 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -0,0 +1,88 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ProduceNgrams + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List() + { + new TextData(){ Text = "This is an example to compute Ngrams." }, + new TextData(){ Text = "Ngram is a sequence of 'N' consecutive words/tokens." }, + new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of Ngrams." }, + new TextData(){ Text = "Each position in the vector corresponds to a particular Ngram." }, + new TextData(){ Text = "The value at each position corresponds to," }, + new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, + new TextData(){ Text = "the inverse of the number of documents that contain the Ngram (Idf), or." }, + new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into numeric Ngram features. + // The following call to 'ProduceNgrams' requires the tokenized text/string as input. + // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceNgrams'. + // Please note that the length of the output feature vector depends on the Ngram settings. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") + .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) + .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens", + ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf)); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + var transformedDataView = textTransformer.Transform(dataview); + + // Create the prediction engine to get the Ngram features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Convert the text into numeric features. + var prediction = predictionEngine.Predict(samples[0]); + + // Print the length of the feature vector. + Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); + + // Preview of the produced . + VBuffer> slotNames = default; + transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); + var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); + var slots = slotNames.GetValues(); + Console.Write("Ngrams: "); + foreach (var featureRow in NgramFeaturesColumn) + { + foreach (var item in featureRow.Items()) + Console.Write($"{slots[item.Key]} "); + Console.WriteLine(); + } + + // Print the first 10 feature values. + Console.Write("Features: "); + for (int i = 0; i < 10; i++) + Console.Write($"{prediction.NgramFeatures[i]:F4} "); + + // Expected output: + // Number of Features: 332 + // Ngrams: This|is|an is|an|example an|example|to example|to|compute to|compute|Ngrams. Ngram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... + // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] NgramFeatures { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 86dafb8807..3b7e3c70dd 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -217,7 +217,7 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog /// /// /// /// /// @@ -450,6 +450,13 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. /// Whether to rehash unigrams. + /// + /// + /// + /// + /// public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string[] inputColumnNames = null, From fe6503335b1a563d5aa14e7d8f91fdec2a83b752 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 10:46:41 -0700 Subject: [PATCH 2/8] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/ProduceNgrams.cs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs index 884ce2ad58..87a8a5e410 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -23,7 +23,7 @@ public static void Example() new TextData(){ Text = "The value at each position corresponds to," }, new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, new TextData(){ Text = "the inverse of the number of documents that contain the Ngram (Idf), or." }, - new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." }, + new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." }, }; // Convert training data to IDataView. @@ -34,9 +34,12 @@ public static void Example() // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceNgrams'. // Please note that the length of the output feature vector depends on the Ngram settings. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") + // 'ProduceNgrams' takes key type as input. Converting the tokens into key type using 'MapValueToKey'. .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens", - ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf)); + ngramLength: 3, + useAllLengths: false, + weighting: NgramExtractingEstimator.WeightingCriteria.Tf)); // Fit to data. var textTransformer = textPipeline.Fit(dataview); @@ -51,7 +54,9 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); - // Preview of the produced . + // Preview of the produced Ngrams. + // Get the slot names from the column's metadata. + // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); From 4ddaf5ccfa1fcd4966660b406174064ddb233cec Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 13:57:22 -0700 Subject: [PATCH 3/8] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/ProduceHashedNgrams.cs | 11 +++++++---- .../Dynamic/Transforms/Text/ProduceNgrams.cs | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index 516695deef..419573292c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -19,7 +19,7 @@ public static void Example() new TextData(){ Text = "This is an example to compute Ngrams using hashing." }, new TextData(){ Text = "Ngram is a sequence of 'N' consecutive words/tokens." }, new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of Ngrams and hashes it as an index into a vector of given bit length." }, - new TextData(){ Text = "The hashing schem reduces the size of the output feature vector" }, + new TextData(){ Text = "The hashing reduces the size of the output feature vector" }, new TextData(){ Text = "which is useful in case when number of Ngrams is very large." }, }; @@ -32,7 +32,10 @@ public static void Example() // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) - .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", numberOfBits: 8, ngramLength: 3, useAllLengths: false)); + .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", + numberOfBits: 5, + ngramLength: 3, + useAllLengths: false, maximumNumberOfInverts: -1)); // Fit to data. var textTransformer = textPipeline.Fit(dataview); @@ -52,8 +55,8 @@ public static void Example() Console.Write($"{prediction.NgramFeatures[i]:F4} "); // Expected output: - // Number of Features: 256 - // Features: 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 + // Number of Features: 32 + // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 } public class TextData diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs index 87a8a5e410..a5606a25ef 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -22,7 +22,7 @@ public static void Example() new TextData(){ Text = "Each position in the vector corresponds to a particular Ngram." }, new TextData(){ Text = "The value at each position corresponds to," }, new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, - new TextData(){ Text = "the inverse of the number of documents that contain the Ngram (Idf), or." }, + new TextData(){ Text = "the inverse of the number of documents that contain the Ngram (Idf)," }, new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." }, }; @@ -75,7 +75,7 @@ public static void Example() Console.Write($"{prediction.NgramFeatures[i]:F4} "); // Expected output: - // Number of Features: 332 + // Number of Features: 52 // Ngrams: This|is|an is|an|example an|example|to example|to|compute to|compute|Ngrams. Ngram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... } From 4db1cd7c39a09663bf5f0689f50eb649ebf6e4e5 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 13:59:14 -0700 Subject: [PATCH 4/8] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/ProduceHashedNgrams.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index 419573292c..465dae8671 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -33,9 +33,7 @@ public static void Example() var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", - numberOfBits: 5, - ngramLength: 3, - useAllLengths: false, maximumNumberOfInverts: -1)); + numberOfBits: 5, ngramLength: 3, useAllLengths: false)); // Fit to data. var textTransformer = textPipeline.Fit(dataview); From 12e6d9d540710348b0b90d3d89e332aa5fc2338a Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 16:08:03 -0700 Subject: [PATCH 5/8] Addressed reviewers' comments. --- .../Transforms/Text/ProduceHashedNgrams.cs | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index 465dae8671..ee6e1de642 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -33,10 +33,14 @@ public static void Example() var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", - numberOfBits: 5, ngramLength: 3, useAllLengths: false)); + numberOfBits: 5, + ngramLength: 3, + useAllLengths: false, + maximumNumberOfInverts: 1)); // Fit to data. var textTransformer = textPipeline.Fit(dataview); + var transformedDataView = textTransformer.Transform(dataview); // Create the prediction engine to get the features extracted from the text. var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); @@ -47,6 +51,21 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); + // Preview of the produced Ngrams. + // Get the slot names from the column's metadata. + // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + VBuffer> slotNames = default; + transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); + var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); + var slots = slotNames.GetValues(); + Console.Write("Ngrams: "); + foreach (var featureRow in NgramFeaturesColumn) + { + foreach (var item in featureRow.Items()) + Console.Write($"{slots[item.Key]} "); + Console.WriteLine(); + } + // Print the first 10 feature values. Console.Write("Features: "); for (int i = 0; i < 10; i++) @@ -54,7 +73,8 @@ public static void Example() // Expected output: // Number of Features: 32 - // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 + // Ngrams: This|is|an example|to|compute compute|Ngrams|using Ngrams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence Ngram|is|a ... + // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ... } public class TextData From 91eed9c1f34509e7c672af7e9c6f7af00b3194eb Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 16:16:11 -0700 Subject: [PATCH 6/8] Addressed reviewers' comments. --- .../Transforms/Text/ProduceHashedNgrams.cs | 18 ++++++------- .../Dynamic/Transforms/Text/ProduceNgrams.cs | 26 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index ee6e1de642..6cbdd8b471 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -16,17 +16,17 @@ public static void Example() // Create a small dataset as an IEnumerable. var samples = new List() { - new TextData(){ Text = "This is an example to compute Ngrams using hashing." }, - new TextData(){ Text = "Ngram is a sequence of 'N' consecutive words/tokens." }, - new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of Ngrams and hashes it as an index into a vector of given bit length." }, + new TextData(){ Text = "This is an example to compute n-grams using hashing." }, + new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." }, + new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of n-grams and hashes it as an index into a vector of given bit length." }, new TextData(){ Text = "The hashing reduces the size of the output feature vector" }, - new TextData(){ Text = "which is useful in case when number of Ngrams is very large." }, + new TextData(){ Text = "which is useful in case when number of n-grams is very large." }, }; // Convert training data to IDataView. var dataview = mlContext.Data.LoadFromEnumerable(samples); - // A pipeline for converting text into numeric hashed Ngram features. + // A pipeline for converting text into numeric hashed n-gram features. // The following call to 'ProduceHashedNgrams' requires the tokenized text/string as input. // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceHashedNgrams'. // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. @@ -51,14 +51,14 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); - // Preview of the produced Ngrams. + // Preview of the produced n-grams. // Get the slot names from the column's metadata. // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); var slots = slotNames.GetValues(); - Console.Write("Ngrams: "); + Console.Write("N-grams: "); foreach (var featureRow in NgramFeaturesColumn) { foreach (var item in featureRow.Items()) @@ -73,8 +73,8 @@ public static void Example() // Expected output: // Number of Features: 32 - // Ngrams: This|is|an example|to|compute compute|Ngrams|using Ngrams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence Ngram|is|a ... - // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ... + // N-grams: This|is|an example|to|compute compute|n-grams|using n-grams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence N-gram|is|a ... + // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ... } public class TextData diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs index a5606a25ef..40653043e1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -16,23 +16,23 @@ public static void Example() // Create a small dataset as an IEnumerable. var samples = new List() { - new TextData(){ Text = "This is an example to compute Ngrams." }, - new TextData(){ Text = "Ngram is a sequence of 'N' consecutive words/tokens." }, - new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of Ngrams." }, - new TextData(){ Text = "Each position in the vector corresponds to a particular Ngram." }, + new TextData(){ Text = "This is an example to compute n-grams." }, + new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." }, + new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of n-grams." }, + new TextData(){ Text = "Each position in the vector corresponds to a particular n-gram." }, new TextData(){ Text = "The value at each position corresponds to," }, - new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, - new TextData(){ Text = "the inverse of the number of documents that contain the Ngram (Idf)," }, + new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" }, + new TextData(){ Text = "the inverse of the number of documents that contain the n-gram (Idf)," }, new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." }, }; // Convert training data to IDataView. var dataview = mlContext.Data.LoadFromEnumerable(samples); - // A pipeline for converting text into numeric Ngram features. + // A pipeline for converting text into numeric n-gram features. // The following call to 'ProduceNgrams' requires the tokenized text/string as input. // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceNgrams'. - // Please note that the length of the output feature vector depends on the Ngram settings. + // Please note that the length of the output feature vector depends on the n-gram settings. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") // 'ProduceNgrams' takes key type as input. Converting the tokens into key type using 'MapValueToKey'. .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) @@ -45,7 +45,7 @@ public static void Example() var textTransformer = textPipeline.Fit(dataview); var transformedDataView = textTransformer.Transform(dataview); - // Create the prediction engine to get the Ngram features extracted from the text. + // Create the prediction engine to get the n-gram features extracted from the text. var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); // Convert the text into numeric features. @@ -54,14 +54,14 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); - // Preview of the produced Ngrams. + // Preview of the produced n-grams. // Get the slot names from the column's metadata. // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); var slots = slotNames.GetValues(); - Console.Write("Ngrams: "); + Console.Write("N-grams: "); foreach (var featureRow in NgramFeaturesColumn) { foreach (var item in featureRow.Items()) @@ -76,8 +76,8 @@ public static void Example() // Expected output: // Number of Features: 52 - // Ngrams: This|is|an is|an|example an|example|to example|to|compute to|compute|Ngrams. Ngram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... - // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... + // N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... + // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... } public class TextData From c177c6b0a33e1b1fda874b5518be78dfa94bf1b5 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 4 Apr 2019 10:31:53 -0700 Subject: [PATCH 7/8] Changed input/output classes to private. --- .../Dynamic/Transforms/Text/ProduceHashedNgrams.cs | 4 ++-- .../Dynamic/Transforms/Text/ProduceNgrams.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index 6cbdd8b471..404ee7e0e0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -77,12 +77,12 @@ public static void Example() // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ... } - public class TextData + private class TextData { public string Text { get; set; } } - public class TransformedTextData : TextData + private class TransformedTextData : TextData { public float[] NgramFeatures { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs index 40653043e1..742f297205 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -80,12 +80,12 @@ public static void Example() // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... } - public class TextData + private class TextData { public string Text { get; set; } } - public class TransformedTextData : TextData + private class TransformedTextData : TextData { public float[] NgramFeatures { get; set; } } From 8b5001cefe0ee982ef4e361af331795e71f114e4 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 4 Apr 2019 12:29:27 -0700 Subject: [PATCH 8/8] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/ProduceHashedNgrams.cs | 2 +- .../Dynamic/Transforms/Text/ProduceNgrams.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs index 404ee7e0e0..952c751309 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs @@ -53,7 +53,7 @@ public static void Example() // Preview of the produced n-grams. // Get the slot names from the column's metadata. - // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + // The slot names for a vector column corresponds to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs index 742f297205..dd26441b06 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs @@ -56,7 +56,7 @@ public static void Example() // Preview of the produced n-grams. // Get the slot names from the column's metadata. - // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + // The slot names for a vector column corresponds to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["NgramFeatures"]);