docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs (new file)
@@ -0,0 +1,90 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceHashedNgrams
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as being the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute n-grams using hashing." },
new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." },
new TextData(){ Text = "ML.NET's ProduceHashedNgrams API produces count of n-grams and hashes it as an index into a vector of given bit length." },
new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric hashed n-gram features.
// The following call to 'ProduceHashedNgrams' requires the tokenized text/string as input.
// This is achieved by calling 'TokenizeIntoWords' first, followed by 'ProduceHashedNgrams'.
// Note that the length of the output feature vector depends on the 'numberOfBits' setting:
// the vector has 2^numberOfBits slots (2^5 = 32 here).
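// 'ProduceHashedNgrams' takes a key type as input, so the tokens are first converted with 'MapValueToKey'.
// 'maximumNumberOfInverts: 1' keeps a mapping from each hash slot back to the original n-gram text,
// which is what makes the slot names retrieved further below human-readable.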
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
.Append(mlContext.Transforms.Text.ProduceHashedNgrams("NgramFeatures", "Tokens",
numberOfBits: 5,
ngramLength: 3,
useAllLengths: false,
maximumNumberOfInverts: 1));

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}");
sfilipi (Member), Apr 3, 2019:
similar comment to the other file, is this needed? #Resolved

Contributor Author:
yes.
In reply to: 271819189


// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in NgramFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}

// Print the first 10 feature values.
Console.Write("Features: ");
for (int i = 0; i < 10; i++)
Console.Write($"{prediction.NgramFeatures[i]:F4} ");

// Expected output:
// Number of Features: 32
// N-grams: This|is|an example|to|compute compute|n-grams|using n-grams|using|hashing. an|example|to is|an|example a|sequence|of of|'N'|consecutive is|a|sequence N-gram|is|a ...
// Features: 0.0000 0.0000 2.0000 0.0000 0.0000 1.0000 0.0000 0.0000 1.0000 0.0000 ...
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] NgramFeatures { get; set; }
}
}
}
docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs (new file)
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceNgrams
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as being the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute n-grams." },
new TextData(){ Text = "N-gram is a sequence of 'N' consecutive words/tokens." },
new TextData(){ Text = "ML.NET's ProduceNgrams API produces vector of n-grams." },
new TextData(){ Text = "Each position in the vector corresponds to a particular n-gram." },
new TextData(){ Text = "The value at each position corresponds to," },
new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" },
new TextData(){ Text = "the inverse of the number of documents that contain the n-gram (Idf)," },
new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric n-gram features.
// The following call to 'ProduceNgrams' requires the tokenized text/string as input.
// This is achieved by calling 'TokenizeIntoWords' first, followed by 'ProduceNgrams'.
// Note that the length of the output feature vector depends on the n-gram settings.
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
// 'ProduceNgrams' takes key type as input. Converting the tokens into key type using 'MapValueToKey'.
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
sfilipi (Member), Apr 3, 2019, on:
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
add one line of comment on why this is here. #Resolved

Contributor, on the same line:
This seems like a holdover from the internal codebase. I wonder if we should consider doing a breaking change to move this key-type conversion into the ProduceNgrams operation. The question is what other use cases do we expect to see?

.Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens",
ngramLength: 3,
useAllLengths: false,
weighting: NgramExtractingEstimator.WeightingCriteria.Tf));
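// Note: 'weighting: Tf' above makes each slot hold the raw count of the corresponding n-gram in the row;
// 'Idf' and 'TfIdf' are the other available 'NgramExtractingEstimator.WeightingCriteria' options.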

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the n-gram features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.NgramFeatures.Length}");
sfilipi (Member), Apr 3, 2019:
is this necessary? #Resolved

Contributor Author:
yes.
In reply to: 271817449


// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["NgramFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in NgramFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}
Contributor Author:
This is the concern for me right now. There is no way to get this metadata through the Transformer or through the prediction engine. The only way is through the IDataView obtained from the .Transform call.

Contributor:
If you have to transform the data anyway, why not just print off the transformed IDV instead of off a prediction? You can limit it to one row with a TakeRows filter.
In reply to: 271476465
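A minimal sketch of the reviewer's suggestion, assuming the same mlContext, transformedDataView, and "NgramFeatures" column as in this sample (this code is illustrative and not part of the submitted file):

// Keep only the first row of the transformed data and print its feature values
// directly from the IDataView, without a prediction engine.
var singleRow = mlContext.Data.TakeRows(transformedDataView, 1);
var featureColumn = singleRow.GetColumn<VBuffer<float>>(singleRow.Schema["NgramFeatures"]);
foreach (var features in featureColumn)
{
    foreach (var value in features.DenseValues())
        Console.Write($"{value:F4} ");
    Console.WriteLine();
}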


// Print the first 10 feature values.
Console.Write("Features: ");
sfilipi (Member), Apr 3, 2019, on:
Console.Write("Features: ");
I'd remove unnecessary printings.

Contributor Author:
Actually, I'd like to keep it because it's clearer to read each line with this prefix on the console.
In reply to: 271816037

for (int i = 0; i < 10; i++)
Console.Write($"{prediction.NgramFeatures[i]:F4} ");

// Expected output:
// Number of Features: 52
// N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ...
// Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ...
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] NgramFeatures { get; set; }
}
}
}
src/Microsoft.ML.Transforms/Text/TextCatalog.cs (9 changes: 8 additions, 1 deletion)
@@ -217,7 +217,7 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LpNormalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs?range=1-5,11-74)]
/// [!code-csharp[ProduceNgrams](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceNgrams.cs)]
/// ]]>
/// </format>
/// </example>
@@ -450,6 +450,13 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
/// <param name="rehashUnigrams">Whether to rehash unigrams.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ProduceHashedNgrams](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedNgrams.cs)]
/// ]]>
/// </format>
/// </example>
public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string[] inputColumnNames = null,
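A minimal usage sketch of the overload documented above, assuming an existing MLContext named mlContext and a key-typed "Tokens" column as produced in the samples (the column and variable names are illustrative, not taken from the diff):

// Hash trigrams from the "Tokens" column into a 2^5 = 32 slot feature vector.
var estimator = mlContext.Transforms.Text.ProduceHashedNgrams(
    outputColumnName: "NgramFeatures",
    inputColumnNames: new[] { "Tokens" },
    numberOfBits: 5,
    ngramLength: 3,
    // Retain one original value per hash slot so the n-gram text is available as slot names.
    maximumNumberOfInverts: 1);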