-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Created samples for 'ProduceNgrams' and 'ProduceHashedNgrams' APIs. #3177
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
80ee90a
fe65033
4ddaf5c
4db1cd7
12e6d9d
91eed9c
c177c6b
8b5001c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using Microsoft.ML.Data; | ||
| using Microsoft.ML.Transforms.Text; | ||
|
|
||
| namespace Microsoft.ML.Samples.Dynamic | ||
| { | ||
| public static class ProduceHashedNgrams | ||
| { | ||
| public static void Example() | ||
| { | ||
| // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
| // as well as the source of randomness. | ||
| var mlContext = new MLContext(); | ||
|
|
||
| // Create a small dataset as an IEnumerable. | ||
| var samples = new List<TextData>() | ||
| { | ||
| new TextData(){ Text = "This is an example to compute n-grams using hashing." }, | ||
| new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." }, | ||
| new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of n-grams and hashes it as an index into a vector of given bit length." }, | ||
| new TextData(){ Text = "The hashing reduces the size of the output feature vector" }, | ||
| new TextData(){ Text = "which is useful in case when number of n-grams is very large." }, | ||
| }; | ||
|
|
||
| // Convert training data to IDataView. | ||
| var dataview = mlContext.Data.LoadFromEnumerable(samples); | ||
|
|
||
| // A pipeline for converting text into numeric hashed n-gram features. | ||
| // The following call to 'ProduceHashedNgrams' requires the tokenized text/string as input. | ||
| // This is acheived by calling 'TokenizeIntoWords' first followed by 'ProduceHashedNgrams'. | ||
| // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. | ||
| var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") | ||
| .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) | ||
| .Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens", | ||
| numberOfBits: 5, | ||
| ngramLength: 3, | ||
| useAllLengths: false, | ||
| maximumNumberOfInverts: 1)); | ||
|
|
||
| // Fit to data. | ||
| var textTransformer = textPipeline.Fit(dataview); | ||
| var transformedDataView = textTransformer.Transform(dataview); | ||
|
|
||
| // Create the prediction engine to get the features extracted from the text. | ||
| var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); | ||
|
|
||
| // Convert the text into numeric features. | ||
| var prediction = predictionEngine.Predict(samples[0]); | ||
|
|
||
| // Print the length of the feature vector. | ||
| Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); | ||
|
|
||
| // Preview of the produced n-grams. | ||
| // Get the slot names from the column's metadata. | ||
| // The slot names for a vector column corresponds to the names associated with each position in the vector. | ||
| VBuffer<ReadOnlyMemory<char>> slotNames = default; | ||
| transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); | ||
| var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]); | ||
| var slots = slotNames.GetValues(); | ||
| Console.Write("N-grams: "); | ||
| foreach (var featureRow in NgramFeaturesColumn) | ||
| { | ||
| foreach (var item in featureRow.Items()) | ||
| Console.Write($"{slots[item.Key]} "); | ||
| Console.WriteLine(); | ||
| } | ||
|
|
||
| // Print the first 10 feature values. | ||
| Console.Write("Features: "); | ||
| for (int i = 0; i < 10; i++) | ||
| Console.Write($"{prediction.NgramFeatures[i]:F4} "); | ||
|
|
||
| // Expected output: | ||
| // Number of Features: 32 | ||
| // N-grams: This|is|an example|to|compute compute|n-grams|using n-grams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence N-gram|is|a ... | ||
| // Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ... | ||
| } | ||
|
|
||
| private class TextData | ||
| { | ||
| public string Text { get; set; } | ||
| } | ||
|
|
||
| private class TransformedTextData : TextData | ||
| { | ||
| public float[] NgramFeatures { get; set; } | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using Microsoft.ML.Data; | ||
| using Microsoft.ML.Transforms.Text; | ||
|
|
||
| namespace Microsoft.ML.Samples.Dynamic | ||
| { | ||
| public static class ProduceNgrams | ||
| { | ||
| public static void Example() | ||
| { | ||
| // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
| // as well as the source of randomness. | ||
| var mlContext = new MLContext(); | ||
|
|
||
| // Create a small dataset as an IEnumerable. | ||
| var samples = new List<TextData>() | ||
| { | ||
| new TextData(){ Text = "This is an example to compute n-grams." }, | ||
| new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." }, | ||
| new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of n-grams." }, | ||
| new TextData(){ Text = "Each position in the vector corresponds to a particular n-gram." }, | ||
| new TextData(){ Text = "The value at each position corresponds to," }, | ||
| new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" }, | ||
| new TextData(){ Text = "the inverse of the number of documents that contain the n-gram (Idf)," }, | ||
| new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." }, | ||
| }; | ||
|
|
||
| // Convert training data to IDataView. | ||
| var dataview = mlContext.Data.LoadFromEnumerable(samples); | ||
|
|
||
| // A pipeline for converting text into numeric n-gram features. | ||
| // The following call to 'ProduceNgrams' requires the tokenized text/string as input. | ||
| // This is achieved by calling 'TokenizeIntoWords' first followed by 'ProduceNgrams'. | ||
| // Please note that the length of the output feature vector depends on the n-gram settings. | ||
| var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") | ||
| // 'ProduceNgrams' takes key type as input. Converting the tokens into key type using 'MapValueToKey'. | ||
| .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
add one line of comment on why this is here. #Resolved
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This seems like a holdover from the internal codebase. I wonder if we should consider doing a breaking change to move this keytype conversion into the ProduceNGrams operation. The question is what other use cases do we expect to see? |
||
| .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens", | ||
| ngramLength: 3, | ||
| useAllLengths: false, | ||
| weighting: NgramExtractingEstimator.WeightingCriteria.Tf)); | ||
|
|
||
| // Fit to data. | ||
| var textTransformer = textPipeline.Fit(dataview); | ||
| var transformedDataView = textTransformer.Transform(dataview); | ||
|
|
||
| // Create the prediction engine to get the n-gram features extracted from the text. | ||
| var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); | ||
|
|
||
| // Convert the text into numeric features. | ||
| var prediction = predictionEngine.Predict(samples[0]); | ||
|
|
||
| // Print the length of the feature vector. | ||
| Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this necessary? #Resolved
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| // Preview of the produced n-grams. | ||
| // Get the slot names from the column's metadata. | ||
| // The slot names for a vector column correspond to the names associated with each position in the vector. | ||
| VBuffer<ReadOnlyMemory<char>> slotNames = default; | ||
| transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); | ||
| var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]); | ||
| var slots = slotNames.GetValues(); | ||
| Console.Write("N-grams: "); | ||
| foreach (var featureRow in NgramFeaturesColumn) | ||
| { | ||
| foreach (var item in featureRow.Items()) | ||
| Console.Write($"{slots[item.Key]} "); | ||
| Console.WriteLine(); | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the concern for me right now. There is no way to get this metadata through the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you have to transform the data anyway, why not just print off the transformed IDV instead of off a prediction? You can limit it to one row with a TakeRows filter. In reply to: 271476465 [](ancestors = 271476465) |
||
|
|
||
| // Print the first 10 feature values. | ||
| Console.Write("Features: "); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
i'd remove unnecessary printings
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I'd like to keep it because its more clear to read each line with this prefix on console. In reply to: 271816037 [](ancestors = 271816037) |
||
| for (int i = 0; i < 10; i++) | ||
| Console.Write($"{prediction.NgramFeatures[i]:F4} "); | ||
|
|
||
| // Expected output: | ||
| // Number of Features: 52 | ||
| // N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... | ||
| // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... | ||
| } | ||
|
|
||
| private class TextData | ||
| { | ||
| public string Text { get; set; } | ||
| } | ||
|
|
||
| private class TransformedTextData : TextData | ||
| { | ||
| public float[] NgramFeatures { get; set; } | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
similar comment to the other file, is this needed? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes.
In reply to: 271819189 [](ancestors = 271819189)