Skip to content

Commit 0d6f036

Browse files
authored
Scrub text normalizer (#2918)
1 parent 40abffc commit 0d6f036

File tree

6 files changed

+72
-63
lines changed

6 files changed

+72
-63
lines changed

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ private sealed class OutPipelineColumn : Scalar<string>
176176
{
177177
public readonly Scalar<string> Input;
178178

179-
public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
179+
public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
180180
: base(new Reconciler(textCase, keepDiacritics, keepPunctuations, keepNumbers), input)
181181
{
182182
Input = input;
@@ -185,12 +185,12 @@ public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseNorm
185185

186186
private sealed class Reconciler : EstimatorReconciler, IEquatable<Reconciler>
187187
{
188-
private readonly TextNormalizingEstimator.CaseNormalizationMode _textCase;
188+
private readonly TextNormalizingEstimator.CaseMode _textCase;
189189
private readonly bool _keepDiacritics;
190190
private readonly bool _keepPunctuations;
191191
private readonly bool _keepNumbers;
192192

193-
public Reconciler(TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
193+
public Reconciler(TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
194194
{
195195
_textCase = textCase;
196196
_keepDiacritics = keepDiacritics;
@@ -227,15 +227,15 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
227227
/// Normalizes input text by changing case, removing diacritical marks, punctuation marks and/or numbers.
228228
/// </summary>
229229
/// <param name="input">The column to apply to.</param>
230-
/// <param name="textCase">Casing text using the rules of the invariant culture.</param>
230+
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
231231
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
232232
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
233233
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
234234
public static Scalar<string> NormalizeText(this Scalar<string> input,
235-
TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizingEstimator.CaseNormalizationMode.Lower,
235+
TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
236236
bool keepDiacritics = false,
237237
bool keepPunctuations = true,
238-
bool keepNumbers = true) => new OutPipelineColumn(input, textCase, keepDiacritics, keepPunctuations, keepNumbers);
238+
bool keepNumbers = true) => new OutPipelineColumn(input, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
239239
}
240240

241241
/// <summary>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -83,19 +83,19 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
8383
/// <param name="catalog">The text-related transform's catalog.</param>
8484
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
8585
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
86-
/// <param name="textCase">Casing text using the rules of the invariant culture.</param>
86+
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
8787
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
8888
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
8989
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
9090
public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.TextTransforms catalog,
9191
string outputColumnName,
9292
string inputColumnName = null,
93-
TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizeDefaults.TextCase,
93+
TextNormalizingEstimator.CaseMode caseMode = TextNormalizeDefaults.Mode,
9494
bool keepDiacritics = TextNormalizeDefaults.KeepDiacritics,
9595
bool keepPunctuations = TextNormalizeDefaults.KeepPunctuations,
9696
bool keepNumbers = TextNormalizeDefaults.KeepNumbers)
9797
=> new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
98-
outputColumnName, inputColumnName, textCase, keepDiacritics, keepPunctuations, keepNumbers);
98+
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
9999

100100
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
101101
/// <param name="catalog">The text-related transform's catalog.</param>

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424

2525
namespace Microsoft.ML.Transforms.Text
2626
{
27-
using CaseNormalizationMode = TextNormalizingEstimator.CaseNormalizationMode;
27+
using CaseMode = TextNormalizingEstimator.CaseMode;
2828
// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
2929
// of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
3030
// integer index mapping through hashing) as an option.
@@ -100,7 +100,7 @@ internal sealed class Arguments : TransformInputBase
100100
public bool UsePredefinedStopWordRemover = false;
101101

102102
[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 5)]
103-
public CaseNormalizationMode TextCase = TextNormalizingEstimator.Defaults.TextCase;
103+
public CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode;
104104

105105
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 6)]
106106
public bool KeepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics;
@@ -142,7 +142,7 @@ public sealed class Options
142142
/// <summary>
143143
/// Casing used for the text.
144144
/// </summary>
145-
public CaseNormalizationMode TextCase { get; set; } = CaseNormalizationMode.Lower;
145+
public CaseMode TextCase { get; set; } = CaseMode.Lower;
146146
/// <summary>
147147
/// Whether to keep diacritical marks or remove them.
148148
/// </summary>
@@ -203,7 +203,7 @@ private sealed class TransformApplierParams
203203
public readonly NormFunction VectorNormalizer;
204204
public readonly Language Language;
205205
public readonly bool UsePredefinedStopWordRemover;
206-
public readonly CaseNormalizationMode TextCase;
206+
public readonly CaseMode TextCase;
207207
public readonly bool KeepDiacritics;
208208
public readonly bool KeepPunctuations;
209209
public readonly bool KeepNumbers;
@@ -241,7 +241,7 @@ public bool NeedsNormalizeTransform
241241
get
242242
{
243243
return
244-
TextCase != CaseNormalizationMode.None ||
244+
TextCase != CaseMode.None ||
245245
!KeepDiacritics ||
246246
!KeepPunctuations ||
247247
!KeepNumbers;
@@ -275,7 +275,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
275275
{
276276
var host = parent._host;
277277
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage));
278-
host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), parent.OptionalSettings.TextCase));
278+
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
279279
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
280280
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
281281
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;

0 commit comments

Comments
 (0)