Skip to content

Commit 91a8703

Browse files
authored
Polish char- and word-level tokenizers & stopword removers (#2916)
* Polish char-level tokenizers * Polish word-level tokenizers * Scrub stopword removers
1 parent fa9268d commit 91a8703

File tree

19 files changed

+58
-87
lines changed

19 files changed

+58
-87
lines changed

docs/code/MlNetCookBook.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -775,12 +775,12 @@ var pipeline =
775775
ngramLength: 2, useAllLengths: false))
776776

777777
// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
778-
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
778+
.Append(mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("MessageChars", "Message"))
779779
.Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
780780
ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))
781781

782782
// NLP pipeline 4: word embeddings.
783-
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
783+
.Append(mlContext.Transforms.Text.TokenizeIntoWords("TokenizedMessage", "NormalizedMessage"))
784784
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
785785
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
786786

docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ public static void Example()
3030
// making use of default settings.
3131
string defaultColumnName = "DefaultKeys";
3232
// REVIEW create through the catalog extension
33-
var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
33+
var default_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
3434
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));
3535

3636
// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
3737
// We can change the maximumNumberOfKeys to limit how many keys will get generated out of the set of words,
3838
// and condition the order in which they get evaluated by changing keyOrdinality from the default ByOccurrence (the order in which they are encountered)
3939
// to value/alphabetically.
4040
string customizedColumnName = "CustomizedKeys";
41-
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
41+
var customized_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
4242
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maximumNumberOfKeys: 10, keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue));
4343

4444
// The transformed data.

docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public static void NgramTransform()
2626
// A pipeline to tokenize text as characters and then combine them together into ngrams
2727
// The pipeline uses the default settings to featurize.
2828

29-
var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters: false);
29+
var charsPipeline = ml.Transforms.Text.TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false);
3030
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
3131
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
3232
var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);

docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public static void Example()
2525

2626
// Let's take SentimentText column and break it into vector of words.
2727
string originalTextColumnName = "Words";
28-
var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
28+
var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName);
2929

3030
// Default pipeline will apply default stop word remover which is based on a predefined set of words for certain languages.
3131
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));

docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public static void Example()
6868
j.Features = features;
6969
};
7070

71-
var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
71+
var engine = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text")
7272
.Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
7373
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
7474
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))

docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public static void Example()
2626

2727
// Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords.
2828
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
29-
.Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
29+
.Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText"))
3030
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
3131

3232
var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
5555
/// </summary>
5656
/// <param name="input">The column to apply to.</param>
5757
/// <param name="separators">The separators to use (uses space character by default).</param>
58-
public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
58+
public static VarVector<string> TokenizeIntoWords(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
5959
}
6060

6161
/// <summary>
@@ -109,7 +109,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
109109
/// </summary>
110110
/// <param name="input">The column to apply to.</param>
111111
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
112-
public static VarVector<Key<ushort, string>> TokenizeIntoCharacters(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
112+
public static VarVector<Key<ushort, string>> TokenizeIntoCharactersAsKeys(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
113113
}
114114

115115
/// <summary>
@@ -162,8 +162,8 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
162162
/// Remove stop words from incoming text.
163163
/// </summary>
164164
/// <param name="input">The column to apply to.</param>
165-
/// <param name="language">Langauge of the input text.</param>
166-
public static VarVector<string> RemoveStopwords(this VarVector<string> input,
165+
/// <param name="language">Language of the input text. It will be used to retrieve a built-in stopword list.</param>
166+
public static VarVector<string> RemoveDefaultStopWords(this VarVector<string> input,
167167
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new OutPipelineColumn(input, language);
168168
}
169169

src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ private static VersionInfo GetVersionInfo()
133133
/// <summary>
134134
/// Defines the behavior of the transformer.
135135
/// </summary>
136-
public IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();
136+
internal IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();
137137

138138
private readonly StopWordsRemovingEstimator.ColumnOptions[] _columns;
139139
private static volatile NormStr.Pool[] _stopWords;
@@ -828,7 +828,7 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory<char> stopwords, string d
828828
/// <summary>
829829
/// The names of the input output column pairs on which this transformation is applied.
830830
/// </summary>
831-
public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
831+
internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
832832

833833
/// <summary>
834834
/// Custom stopword remover removes specified list of stop words.

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 8 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
5555
/// <param name="catalog">The text-related transform's catalog.</param>
5656
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
5757
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
58-
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
59-
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
58+
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
59+
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
60+
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
6061
string outputColumnName,
6162
string inputColumnName = null,
6263
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters)
@@ -67,10 +68,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
6768
/// Tokenize incoming text in input columns and output the tokens as output columns.
6869
/// </summary>
6970
/// <param name="catalog">The text-related transform's catalog.</param>
70-
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
71+
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
72+
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
7173
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
7274

73-
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
75+
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
7476
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
7577
params ColumnOptions[] columns)
7678
=> new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns));
@@ -157,29 +159,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
157159
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
158160
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
159161
/// <param name="separators">The separators to use (uses space character by default).</param>
160-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
162+
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
161163
string outputColumnName,
162164
string inputColumnName = null,
163165
char[] separators = null)
164166
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);
165167

166-
/// <summary>
167-
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
168-
/// </summary>
169-
/// <param name="catalog">The text-related transform's catalog.</param>
170-
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
171-
/// <param name="separators">The separators to use (uses space character by default).</param>
172-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
173-
(string outputColumnName, string inputColumnName)[] columns,
174-
char[] separators = null)
175-
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);
176-
177168
/// <summary>
178169
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
179170
/// </summary>
180171
/// <param name="catalog">The text-related transform's catalog.</param>
181172
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
182-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
173+
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
183174
params WordTokenizingEstimator.ColumnOptions[] columns)
184175
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
185176

@@ -243,24 +234,6 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
243234
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
244235
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language);
245236

246-
/// <summary>
247-
/// Removes stop words from incoming token streams in input columns
248-
/// and outputs the token streams without stop words as output columns.
249-
/// </summary>
250-
/// <param name="catalog">The text-related transform's catalog.</param>
251-
/// <param name="columns">Pairs of columns to remove stop words on.</param>
252-
/// <param name="language">Langauge of the input text columns <paramref name="columns"/>.</param>
253-
/// <example>
254-
/// <format type="text/markdown">
255-
/// <![CDATA[
256-
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
257-
/// ]]></format>
258-
/// </example>
259-
public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog,
260-
(string outputColumnName, string inputColumnName)[] columns,
261-
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
262-
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, language);
263-
264237
/// <summary>
265238
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
266239
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
@@ -281,24 +254,6 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
281254
params string[] stopwords)
282255
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);
283256

284-
/// <summary>
285-
/// Removes stop words from incoming token streams in input columns
286-
/// and outputs the token streams without stop words as output columns.
287-
/// </summary>
288-
/// <param name="catalog">The text-related transform's catalog.</param>
289-
/// <param name="columns">Pairs of columns to remove stop words on.</param>
290-
/// <param name="stopwords">Array of words to remove.</param>
291-
/// <example>
292-
/// <format type="text/markdown">
293-
/// <![CDATA[
294-
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
295-
/// ]]></format>
296-
/// </example>
297-
public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog,
298-
(string outputColumnName, string inputColumnName)[] columns,
299-
params string[] stopwords)
300-
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, stopwords);
301-
302257
/// <summary>
303258
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
304259
/// and outputs bag of word vector as <paramref name="outputColumnName"/>

0 commit comments

Comments
 (0)