Skip to content

Commit 9d9a3d9

Browse files
authored
Scrub n-gram hashing and n-gram (#2898)
* Address comments and also handle WordBags and HashedWordBags
1 parent abc2d63 commit 9d9a3d9

File tree

14 files changed

+296
-429
lines changed

14 files changed

+296
-429
lines changed

docs/code/MlNetCookBook.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ var pipeline =
772772

773773
// NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices.
774774
.Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage",
775-
ngramLength: 2, allLengths: false))
775+
ngramLength: 2, useAllLengths: false))
776776

777777
// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
778778
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))

docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public static void NgramTransform()
6161
// 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
6262
// 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...
6363
// Preview of the CharsTwoGrams column obtained after processing the input.
64-
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_onechars.Schema["CharsUnigrams"]);
64+
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_twochars.Schema["CharsTwograms"]);
6565
transformedData_twochars.Schema["CharsTwograms"].GetSlotNames(ref slotNames);
6666
printHelper("CharsTwograms", charsTwoGramColumn, slotNames);
6767

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

Lines changed: 63 additions & 63 deletions
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs

Lines changed: 43 additions & 72 deletions
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/NgramTransform.cs

Lines changed: 66 additions & 59 deletions
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 40 additions & 156 deletions
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ public WordBagEstimator.Options WordFeatureExtractor
141141
extractor = new NgramExtractorTransform.NgramExtractorArguments();
142142
extractor.NgramLength = _wordFeatureExtractor.NgramLength;
143143
extractor.SkipLength = _wordFeatureExtractor.SkipLength;
144-
extractor.AllLengths = _wordFeatureExtractor.AllLengths;
144+
extractor.UseAllLengths = _wordFeatureExtractor.UseAllLengths;
145145
extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount;
146146
extractor.Weighting = _wordFeatureExtractor.Weighting;
147147
}
@@ -173,7 +173,7 @@ public WordBagEstimator.Options CharFeatureExtractor
173173
extractor = new NgramExtractorTransform.NgramExtractorArguments();
174174
extractor.NgramLength = _charFeatureExtractor.NgramLength;
175175
extractor.SkipLength = _charFeatureExtractor.SkipLength;
176-
extractor.AllLengths = _charFeatureExtractor.AllLengths;
176+
extractor.UseAllLengths = _charFeatureExtractor.UseAllLengths;
177177
extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount;
178178
extractor.Weighting = _charFeatureExtractor.Weighting;
179179
}
@@ -187,7 +187,7 @@ public WordBagEstimator.Options CharFeatureExtractor
187187
public Options()
188188
{
189189
WordFeatureExtractor = new WordBagEstimator.Options();
190-
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false };
190+
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false };
191191
}
192192
}
193193

src/Microsoft.ML.Transforms/Text/WordBagTransform.cs

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ internal sealed class Column : ManyToOneColumn
5454

5555
[Argument(ArgumentType.AtMostOnce,
5656
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
57-
ShortName = "all")]
58-
public bool? AllLengths;
57+
Name = "AllLengths", ShortName = "all")]
58+
public bool? UseAllLengths;
5959

6060
[Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")]
6161
public int[] MaxNumTerms = null;
@@ -76,7 +76,7 @@ internal static Column Parse(string str)
7676
internal bool TryUnparse(StringBuilder sb)
7777
{
7878
Contracts.AssertValue(sb);
79-
if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
79+
if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
8080
Weighting != null)
8181
{
8282
return false;
@@ -123,7 +123,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
123123
MaxNumTerms = options.MaxNumTerms,
124124
NgramLength = options.NgramLength,
125125
SkipLength = options.SkipLength,
126-
AllLengths = options.AllLengths,
126+
UseAllLengths = options.UseAllLengths,
127127
Weighting = options.Weighting,
128128
Columns = new NgramExtractorTransform.Column[options.Columns.Length]
129129
};
@@ -146,7 +146,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
146146
NgramLength = column.NgramLength,
147147
SkipLength = column.SkipLength,
148148
Weighting = column.Weighting,
149-
AllLengths = column.AllLengths
149+
UseAllLengths = column.UseAllLengths
150150
};
151151
}
152152

@@ -175,8 +175,9 @@ internal sealed class Column : OneToOneColumn
175175
public int? SkipLength;
176176

177177
[Argument(ArgumentType.AtMostOnce, HelpText =
178-
"Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")]
179-
public bool? AllLengths;
178+
"Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
179+
Name = "AllLengths", ShortName = "all")]
180+
public bool? UseAllLengths;
180181

181182
// REVIEW: This argument is actually confusing. If you set only one value, we will use this value for all ngrams respectively; for example,
182183
// if we specify 3 ngrams we will have maxNumTerms * 3. It also picks the first value from this array to run the term transform, so if you specify
@@ -200,7 +201,7 @@ internal static Column Parse(string str)
200201
internal bool TryUnparse(StringBuilder sb)
201202
{
202203
Contracts.AssertValue(sb);
203-
if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
204+
if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
204205
Weighting != null)
205206
{
206207
return false;
@@ -225,11 +226,11 @@ internal abstract class ArgumentsBase
225226

226227
[Argument(ArgumentType.AtMostOnce,
227228
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
228-
ShortName = "all")]
229-
public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths;
229+
Name = "AllLengths", ShortName = "all")]
230+
public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;
230231

231232
[Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")]
232-
public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms };
233+
public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount };
233234

234235
[Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")]
235236
public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting;
@@ -315,7 +316,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
315316
termArgs =
316317
new ValueToKeyMappingTransformer.Options()
317318
{
318-
MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
319+
MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumNgramsCount,
319320
Columns = new ValueToKeyMappingTransformer.Column[termCols.Count]
320321
};
321322
}
@@ -347,7 +348,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
347348
ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name,
348349
column.NgramLength ?? options.NgramLength,
349350
column.SkipLength ?? options.SkipLength,
350-
column.AllLengths ?? options.AllLengths,
351+
column.UseAllLengths ?? options.UseAllLengths,
351352
column.Weighting ?? options.Weighting,
352353
column.MaxNumTerms ?? options.MaxNumTerms,
353354
isTermCol[iinfo] ? column.Name : column.Source
@@ -380,7 +381,7 @@ internal static IDataTransform Create(IHostEnvironment env, NgramExtractorArgume
380381
Columns = extractorCols,
381382
NgramLength = extractorArgs.NgramLength,
382383
SkipLength = extractorArgs.SkipLength,
383-
AllLengths = extractorArgs.AllLengths,
384+
UseAllLengths = extractorArgs.UseAllLengths,
384385
MaxNumTerms = extractorArgs.MaxNumTerms,
385386
Weighting = extractorArgs.Weighting
386387
};

src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
129129
Ordered = column.Ordered,
130130
MaximumNumberOfInverts = column.MaximumNumberOfInverts,
131131
FriendlyNames = options.Columns[iinfo].Source,
132-
AllLengths = column.AllLengths
132+
UseAllLengths = column.UseAllLengths
133133
};
134134
}
135135

@@ -138,7 +138,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
138138
var featurizeArgs =
139139
new NgramHashExtractingTransformer.Options
140140
{
141-
AllLengths = options.AllLengths,
141+
UseAllLengths = options.UseAllLengths,
142142
NumberOfBits = options.NumberOfBits,
143143
NgramLength = options.NgramLength,
144144
SkipLength = options.SkipLength,
@@ -189,8 +189,8 @@ internal abstract class ColumnBase : ManyToOneColumn
189189

190190
[Argument(ArgumentType.AtMostOnce,
191191
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
192-
ShortName = "all", SortOrder = 4)]
193-
public bool? AllLengths;
192+
Name = "AllLengths", ShortName = "all", SortOrder = 4)]
193+
public bool? UseAllLengths;
194194
}
195195

196196
internal sealed class Column : ColumnBase
@@ -279,8 +279,8 @@ internal abstract class ArgumentsBase
279279

280280
[Argument(ArgumentType.AtMostOnce,
281281
HelpText = "Whether to include all ngram lengths up to ngramLength or only ngramLength",
282-
ShortName = "all", SortOrder = 4)]
283-
public bool AllLengths = true;
282+
Name = "AllLengths", ShortName = "all", SortOrder = 4)]
283+
public bool UseAllLengths = true;
284284
}
285285

286286
internal static class DefaultArguments
@@ -291,7 +291,7 @@ internal static class DefaultArguments
291291
public const uint Seed = 314489979;
292292
public const bool Ordered = true;
293293
public const int MaximumNumberOfInverts = 0;
294-
public const bool AllLengths = true;
294+
public const bool UseAllLengths = true;
295295
}
296296

297297
[TlcModule.Component(Name = "NGramHash", FriendlyName = "NGram Hash Extractor Transform", Alias = "NGramHashExtractorTransform,NGramHashExtractor",
@@ -369,7 +369,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
369369
new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo],
370370
column.NgramLength ?? options.NgramLength,
371371
column.SkipLength ?? options.SkipLength,
372-
column.AllLengths ?? options.AllLengths,
372+
column.UseAllLengths ?? options.UseAllLengths,
373373
column.NumberOfBits ?? options.NumberOfBits,
374374
column.Seed ?? options.Seed,
375375
column.Ordered ?? options.Ordered,
@@ -439,7 +439,7 @@ internal static IDataTransform Create(NgramHashExtractorArguments extractorArgs,
439439
MaximumNumberOfInverts = extractorArgs.MaximumNumberOfInverts,
440440
Ordered = extractorArgs.Ordered,
441441
Seed = extractorArgs.Seed,
442-
AllLengths = extractorArgs.AllLengths
442+
UseAllLengths = extractorArgs.UseAllLengths
443443
};
444444

445445
return Create(h, options, input, termLoaderArgs);

0 commit comments

Comments
 (0)