@@ -24,7 +24,7 @@ public static void Example()
// false ==RUDE== Dude, 2.
// true Until the next game, this is the best Xbox game!

-// A pipeline to tokenize text as characters and then combine them together into ngrams
+// A pipeline to tokenize text as characters and then combine them together into n-grams
// The pipeline uses the default settings to featurize.

var charsPipeline = ml.Transforms.Text.TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false);
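To make the sample's intent concrete, here is a minimal, self-contained sketch of character-level n-gram featurization with the dynamic ML.NET API. The toy input class, the sample sentences, and the `ProduceNgrams` step that follows the character tokenizer are illustrative assumptions, not a copy of the sample file being diffed.

```csharp
using System;
using Microsoft.ML;

// Toy input type; the real sample reads sentiment data with a "SentimentText" column.
public class SentimentRow
{
    public string SentimentText { get; set; }
}

public static class CharNgramSketch
{
    public static void Main()
    {
        var ml = new MLContext();
        var data = ml.Data.LoadFromEnumerable(new[]
        {
            new SentimentRow { SentimentText = "Dude, this is great!" },
            new SentimentRow { SentimentText = "Until the next game, this is the best Xbox game!" }
        });

        // Tokenize the text into characters (stored as keys), then roll consecutive
        // characters up into n-grams and count them into a numeric feature vector.
        var charsPipeline = ml.Transforms.Text
            .TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false)
            .Append(ml.Transforms.Text.ProduceNgrams("CharFeatures", "Chars"));

        var transformed = charsPipeline.Fit(data).Transform(data);
        foreach (var column in transformed.Schema)
            Console.WriteLine(column.Name);
    }
}
```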
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/InvertHashUtils.cs
@@ -166,7 +166,7 @@ public int GetHashCode(Pair obj)
/// <param name="copier">For copying input values into a value to actually store. Useful for
/// types of objects where it is possible to do a comparison relatively quickly on some sort
/// of "unsafe" object, but for which when we decide to actually store it we need to provide
-/// a "safe" version of the object. Utilized in the ngram hash transform, for example.</param>
+/// a "safe" version of the object. Utilized in the n-gram hash transform, for example.</param>
public InvertHashCollector(int slots, int maxCount, ValueMapper<T, StringBuilder> mapper,
IEqualityComparer<T> comparer, ValueMapper<T, T> copier = null)
{
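The `copier` remark above describes a general pattern: compare candidate values through a cheap, transient view, and only pay for a defensive copy once a value is actually stored. Below is a hypothetical, library-independent sketch of that idea; the span-based buffer reuse is an assumption for illustration, not the transform's actual internals.

```csharp
using System;
using System.Collections.Generic;

// Hypothetical sketch of "compare on an unsafe view, copy a safe value on store".
// Incoming tokens arrive as spans over a reused buffer, so they must not be kept directly.
public sealed class SlotValueCollector
{
    private readonly List<string> _stored = new List<string>();

    public void Offer(ReadOnlySpan<char> candidate)
    {
        // The comparison can run against the transient span: no allocation needed.
        foreach (var existing in _stored)
        {
            if (candidate.SequenceEqual(existing.AsSpan()))
                return; // already present, nothing to copy
        }

        // Only once we decide to keep the value do we materialize a "safe" owned copy.
        _stored.Add(candidate.ToString());
    }

    public IReadOnlyList<string> Values => _stored;
}
```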
38 changes: 19 additions & 19 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
@@ -303,14 +303,14 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
}

/// <summary>
-/// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given text.
-/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+/// Produces a bag of counts of n-grams (sequences of consecutive words) in a given text.
+/// It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
-/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
-/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
-/// <param name="maximumNgramsCount">Maximum number of ngrams to store in the dictionary.</param>
+/// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
+/// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
+/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
public static Vector<float> ProduceWordBags(this Scalar<string> input,
int ngramLength = 1,
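For readers unfamiliar with the static-pipe surface, the dictionary-based bag of n-grams documented above has a dynamic-API counterpart, `Transforms.Text.ProduceWordBags`. A minimal sketch, assuming a single `Text` column and illustrative parameter values:

```csharp
using System;
using Microsoft.ML;

// Toy row type; the column name "Text" is an assumption for this sketch.
public class TextRow
{
    public string Text { get; set; }
}

public static class WordBagSketch
{
    public static void Main()
    {
        var ml = new MLContext();
        var data = ml.Data.LoadFromEnumerable(new[]
        {
            new TextRow { Text = "the quick brown fox" },
            new TextRow { Text = "the quick red fox" }
        });

        // Builds a dictionary of all unigrams and bigrams seen during Fit; each row is
        // then represented as counts indexed by the n-gram's id in that dictionary.
        var pipeline = ml.Transforms.Text.ProduceWordBags(
            "Features", "Text", ngramLength: 2, useAllLengths: true);

        var featurized = pipeline.Fit(data).Transform(data);
        Console.WriteLine(featurized.Schema["Features"].Type);
    }
}
```

Because a dictionary is built, every distinct n-gram (up to `maximumNgramsCount`) gets its own slot and slot names remain human-readable.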
@@ -393,14 +393,14 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
}

/// <summary>
-/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given text.
-/// It does so by hashing each ngram and using the hash value as the index in the bag.
+/// Produces a bag of counts of n-grams (sequences of consecutive words of length 1-n) in a given text.
+/// It does so by hashing each n-gram and using the hash value as the index in the bag.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
/// <param name="ngramLength">Ngram length.</param>
-/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
-/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
+/// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
+/// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="useOrderedHashing">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
/// <param name="maximumNumberOfInverts">During hashing we constuct mappings between original values and the produced hash values.
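The hashing variant trades the dictionary for a fixed-size hash space. A sketch of the corresponding dynamic-API call, `Transforms.Text.ProduceHashedWordBags`, with assumed column names and illustrative parameter choices:

```csharp
using Microsoft.ML;

public static class HashedWordBagSketch
{
    // Featurize a "Text" column by hashing every 1- and 2-gram into a 2^16-slot
    // count vector; no dictionary is built, so memory does not grow with vocabulary.
    public static ITransformer Fit(MLContext ml, IDataView data)
    {
        var pipeline = ml.Transforms.Text.ProduceHashedWordBags(
            "Features", "Text",
            numberOfBits: 16,
            ngramLength: 2,
            useAllLengths: true);

        return pipeline.Fit(data);
    }
}
```

With 16 bits the output has 2^16 slots regardless of vocabulary size; distinct n-grams may collide, which is the price paid for bounded memory.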
@@ -418,7 +418,7 @@ public static Vector<float> ProduceHashedWordBags(this Scalar<string> input,
}

/// <summary>
-/// Extensions for statically typed ngram estimator.
+/// Extensions for statically typed n-gram estimator.
/// </summary>
public static class NgramEstimatorStaticExtensions
{
@@ -482,16 +482,16 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
}

/// <summary>
-/// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given tokenized text.
-/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+/// Produces a bag of counts of n-grams (sequences of consecutive words ) in a given tokenized text.
+/// It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
///
/// /// <see cref="ProduceNgrams"/> is different from <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/>
/// in a way that <see cref="ProduceNgrams"/> takes tokenized text as input while <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/> tokenizes text internally.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
-/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
-/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
+/// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
+/// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
public static Vector<float> ProduceNgrams<TKey>(this VarVector<Key<TKey, string>> input,
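As the remarks above stress, `ProduceNgrams` consumes text that has already been tokenized into key-typed tokens. A dynamic-API sketch of that preparation (tokenize, map tokens to keys, then count n-grams); the column names and parameter values are assumptions for illustration:

```csharp
using Microsoft.ML;

public static class NgramFromTokensSketch
{
    // Tokenize into words, map the tokens to keys, then count n-grams over the keys.
    public static IDataView Featurize(MLContext ml, IDataView data)
    {
        var pipeline = ml.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
            .Append(ml.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(ml.Transforms.Text.ProduceNgrams(
                "Features", "Tokens",
                ngramLength: 2,
                useAllLengths: false));

        return pipeline.Fit(data).Transform(data);
    }
}
```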
@@ -504,7 +504,7 @@ public static Vector<float> ProduceNgrams<TKey>(this VarVector<Key<TKey, string>
}

/// <summary>
-/// Extensions for statically typed ngram hash estimator.
+/// Extensions for statically typed n-gram hash estimator.
/// </summary>
public static class NgramHashEstimatorStaticExtensions
{
@@ -568,17 +568,17 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
}

/// <summary>
-/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text.
-/// It does so by hashing each ngram and using the hash value as the index in the bag.
+/// Produces a bag of counts of n-grams (sequences of n consecutive words of length 1-n) in a given tokenized text.
+/// It does so by hashing each n-gram and using the hash value as the index in the bag.
///
/// <see cref="ProduceHashedNgrams"/> is different from <see cref="WordHashBagEstimatorStaticExtensions.ProduceHashedWordBags"/>
/// in a way that <see cref="ProduceHashedNgrams"/> takes tokenized text as input while <see cref="WordHashBagEstimatorStaticExtensions.ProduceHashedWordBags"/> tokenizes text internally.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
/// <param name="ngramLength">Ngram length.</param>
-/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
-/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
+/// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
+/// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="useOrderedHashing">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
/// <param name="maximumNumberOfInverts">During hashing we constuct mappings between original values and the produced hash values.
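The hashed counterpart works over the same key-typed tokens; setting `maximumNumberOfInverts: 1` asks the transform to keep a hash-to-n-gram mapping so that feature slot names stay inspectable. Again a sketch with assumed column names and parameter values:

```csharp
using Microsoft.ML;

public static class HashedNgramSketch
{
    // Hash each 1- and 2-gram of key-typed tokens into a 2^10-slot count vector.
    public static IDataView Featurize(MLContext ml, IDataView data)
    {
        var pipeline = ml.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
            .Append(ml.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(ml.Transforms.Text.ProduceHashedNgrams(
                "Features", "Tokens",
                numberOfBits: 10,
                ngramLength: 2,
                maximumNumberOfInverts: 1));

        return pipeline.Fit(data).Transform(data);
    }
}
```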
2 changes: 1 addition & 1 deletion src/Microsoft.ML.StaticPipe/TransformsStatic.cs
@@ -1535,7 +1535,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
}
}
/// <summary>
-/// Accept text data and converts it to array which represent combinations of ngram/skip-gram token counts.
+/// Accept text data and converts it to array which represent combinations of n-gram/skip-gram token counts.
/// </summary>
/// <param name="input">Input data.</param>
/// <param name="otherInputs">Additional data.</param>
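The estimator documented here corresponds to the dynamic `Transforms.Text.FeaturizeText`, which bundles normalization, tokenization, and word- and character-level n-gram extraction behind defaults. A minimal sketch, assuming columns named `Text` and `Features`:

```csharp
using Microsoft.ML;

public static class FeaturizeTextSketch
{
    // One estimator that normalizes, tokenizes, and extracts word and character
    // n-gram counts from the "Text" column into a single "Features" vector.
    public static IEstimator<ITransformer> Build(MLContext ml) =>
        ml.Transforms.Text.FeaturizeText("Features", "Text");
}
```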