diff --git a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json
index 048d707214..16bc5cab17 100644
--- a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json
+++ b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json
@@ -75,7 +75,8 @@
"SentenceSimilarity",
"ObjectDetection",
"QuestionAnswering",
- "NamedEntityRecognition"
+ "NamedEntityRecognition",
+ "NormalizeText"
]
},
"nugetDependencies": {
@@ -114,7 +115,8 @@
"Microsoft.ML.Transforms.Image",
"Microsoft.ML.Trainers.FastTree",
"Microsoft.ML.TorchSharp",
- "Microsoft.ML.Trainers.LightGbm"
+ "Microsoft.ML.Trainers.LightGbm",
+ "Microsoft.ML.Transforms.Text.TextNormalizingEstimator"
]
}
},
@@ -198,7 +200,11 @@
"scoreThreshold",
"steps",
"initLearningRate",
- "weightDecay"
+ "weightDecay",
+ "mode",
+ "keepPunctuations",
+ "keepDiacritics",
+ "keepNumbers"
]
},
"argumentType": {
diff --git a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json
new file mode 100644
index 0000000000..f37eb61f6a
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json
@@ -0,0 +1,34 @@
+{
+ "$schema": "./search-space-schema.json#",
+ "name": "normalize_text_option",
+ "search_space": [
+ {
+ "name": "InputColumnName",
+ "type": "string"
+ },
+ {
+ "name": "OutputColumnName",
+ "type": "string"
+ },
+ {
+ "name": "Mode",
+ "type": "caseMode",
+ "default": "CaseMode.Lower"
+ },
+ {
+ "name": "KeepDiacritics",
+ "type": "boolean",
+ "default": false
+ },
+ {
+ "name": "KeepPunctuations",
+ "type": "boolean",
+ "default": true
+ },
+ {
+ "name": "KeepNumbers",
+ "type": "boolean",
+ "default": true
+ }
+ ]
+}
diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json
index c0512afad1..c7a2a33b58 100644
--- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json
+++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json
@@ -66,6 +66,14 @@
"DataKind.DateTimeOffset"
]
},
+ "caseMode": {
+ "type": "string",
+ "enum": [
+ "CaseMode.Lower",
+ "CaseMode.Upper",
+ "CaseMode.None"
+ ]
+ },
"bertArchitectureArray": {
"type": "array",
"items": {
@@ -90,7 +98,7 @@
"$ref": "#/definitions/dnnModelFactoryArray"
},
{
- "$ref": "#/definitions/imageClassificationArchArray"
+ "$ref": "#/definitions/imageClassificationArchArray"
},
{
"$ref": "#/definitions/boolArray"
@@ -168,7 +176,8 @@
"sentence_similarity_option",
"object_detection_option",
"question_answering_option",
- "named_entity_recognition_option"
+ "named_entity_recognition_option",
+ "normalize_text_option"
]
},
"option_name": {
@@ -241,7 +250,11 @@
"TopKAnswers",
"TargetType",
"PredictionColumnName",
- "KeyData"
+ "KeyData",
+ "Mode",
+ "KeepPunctuations",
+ "KeepDiacritics",
+ "KeepNumbers"
]
},
"option_type": {
@@ -261,7 +274,8 @@
"bertArchitecture",
"imageClassificationArchType",
"dataKind",
- "dataView"
+ "dataView",
+ "caseMode"
]
}
},
diff --git a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json
index 0fc42d3aa6..560b4b9002 100644
--- a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json
+++ b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json
@@ -180,6 +180,39 @@
"usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data" ],
"searchOption": "featurize_text_option"
},
+ {
+ "functionName": "NormalizeText",
+ "estimatorTypes": [ "Text" ],
+ "arguments": [
+ {
+ "argumentName": "outputColumnName",
+ "argumentType": "string"
+ },
+ {
+ "argumentName": "inputColumnName",
+ "argumentType": "string"
+ },
+ {
+ "argumentName": "mode",
+ "argumentType": "caseMode"
+ },
+ {
+ "argumentName": "keepDiacritics",
+ "argumentType": "boolean"
+ },
+ {
+ "argumentName": "keepPunctuations",
+ "argumentType": "boolean"
+ },
+ {
+ "argumentName": "keepNumbers",
+ "argumentType": "boolean"
+ }
+ ],
+ "nugetDependencies": [ "Microsoft.ML" ],
+ "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text.TextNormalizingEstimator"],
+ "searchOption": "normalize_text_option"
+ },
{
"functionName": "ConvertType",
"estimatorTypes": [ "Conversion" ],
diff --git a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json
index b512e32fc0..e862215264 100644
--- a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json
+++ b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json
@@ -18,6 +18,10 @@
{
"name": "KeyData",
"type": "dataView"
+ },
+ {
+ "name": "Mode",
+ "type": "caseMode"
}
]
}
diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj
index 669afe59de..b7a2d92c3d 100644
--- a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj
+++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj
@@ -69,6 +69,9 @@
+
+
+
@@ -86,7 +89,7 @@
-
+
diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs
new file mode 100644
index 0000000000..b421381a63
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs
@@ -0,0 +1,18 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Microsoft.ML.AutoML.CodeGen
+{
+    /// <summary>Builds the NormalizeText transform from the values carried by a <see cref="NormalizeTextOption"/>.</summary>
+    internal partial class NormalizeText
+    {
+        public override IEstimator BuildFromOption(MLContext context, NormalizeTextOption param) =>
+            context.Transforms.Text.NormalizeText(param.OutputColumnName, param.InputColumnName,
+                param.Mode, param.KeepDiacritics, param.KeepPunctuations, param.KeepNumbers);
+    }
+}
diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs
index 85ed140019..f46906ac57 100644
--- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs
+++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs
@@ -58,6 +58,7 @@ public void Execute(GeneratorExecutionContext context)
"imageClassificationArchType" => "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture",
"dataKind" => "Microsoft.ML.Data.DataKind",
"dataView" => "Microsoft.ML.IDataView",
+ "caseMode" => "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode",
_ => throw new ArgumentException("unknown type"),
};
@@ -78,6 +79,7 @@ public void Execute(GeneratorExecutionContext context)
(_, "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture") => defaultToken.GetValue(),
(_, "Microsoft.ML.Data.DataKind") => defaultToken.GetValue(),
(_, "Microsoft.ML.IDataView") => defaultToken.GetValue(),
+ (_, "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode") => defaultToken.GetValue(),
(_, _) => throw new ArgumentException("unknown"),
};
diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs
index 409937611b..487e9f734d 100644
--- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs
+++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs
@@ -35,6 +35,8 @@ public virtual string TransformText()
using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture;
using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture;
using DataKind = Microsoft.ML.Data.DataKind;
+using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode;
+
#nullable enable
namespace ");
diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt
index bbfad4ecb5..6f03586299 100644
--- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt
+++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt
@@ -13,6 +13,8 @@ using Anchor = Microsoft.ML.Transforms.Image.ImageResizingEstimator.Anchor;
using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture;
using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture;
using DataKind = Microsoft.ML.Data.DataKind;
+using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode;
+
#nullable enable
namespace <#=NameSpace#>