diff --git a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json index 048d707214..16bc5cab17 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json @@ -75,7 +75,8 @@ "SentenceSimilarity", "ObjectDetection", "QuestionAnswering", - "NamedEntityRecognition" + "NamedEntityRecognition", + "NormalizeText" ] }, "nugetDependencies": { @@ -114,7 +115,8 @@ "Microsoft.ML.Transforms.Image", "Microsoft.ML.Trainers.FastTree", "Microsoft.ML.TorchSharp", - "Microsoft.ML.Trainers.LightGbm" + "Microsoft.ML.Trainers.LightGbm", + "Microsoft.ML.Transforms.Text.TextNormalizingEstimator" ] } }, @@ -198,7 +200,11 @@ "scoreThreshold", "steps", "initLearningRate", - "weightDecay" + "weightDecay", + "mode", + "keepPunctuations", + "keepDiacritics", + "keepNumbers" ] }, "argumentType": { diff --git a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json new file mode 100644 index 0000000000..f37eb61f6a --- /dev/null +++ b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json @@ -0,0 +1,34 @@ +{ + "$schema": "./search-space-schema.json#", + "name": "normalize_text_option", + "search_space": [ + { + "name": "InputColumnName", + "type": "string" + }, + { + "name": "OutputColumnName", + "type": "string" + }, + { + "name": "Mode", + "type": "caseMode", + "default": "CaseMode.Lower" + }, + { + "name": "KeepDiacritics", + "type": "boolean", + "default": false + }, + { + "name": "KeepPunctuations", + "type": "boolean", + "default": true + }, + { + "name": "KeepNumbers", + "type": "boolean", + "default": true + } + ] +} diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json index c0512afad1..c7a2a33b58 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json @@ -66,6 +66,14 @@ "DataKind.DateTimeOffset" ] }, + "caseMode": { + "type": "string", + "enum": [ + "CaseMode.Lower", + "CaseMode.Upper", + "CaseMode.None" + ] + }, "bertArchitectureArray": { "type": "array", "items": { @@ -90,7 +98,7 @@ "$ref": "#/definitions/dnnModelFactoryArray" }, { - "$ref": "#/definitions/imageClassificationArchArray" + "$ref": "#/definitions/imageClassificationArchArray" }, { "$ref": "#/definitions/boolArray" @@ -168,7 +176,8 @@ "sentence_similarity_option", "object_detection_option", "question_answering_option", - "named_entity_recognition_option" + "named_entity_recognition_option", + "normalize_text_option" ] }, "option_name": { @@ -241,7 +250,11 @@ "TopKAnswers", "TargetType", "PredictionColumnName", - "KeyData" + "KeyData", + "Mode", + "KeepPunctuations", + "KeepDiacritics", + "KeepNumbers" ] }, "option_type": { @@ -261,7 +274,8 @@ "bertArchitecture", "imageClassificationArchType", "dataKind", - "dataView" + "dataView", + "caseMode" ] } }, diff --git a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json index 0fc42d3aa6..560b4b9002 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json +++ b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json @@ -180,6 +180,39 @@ "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data" ], "searchOption": "featurize_text_option" }, + { + "functionName": "NormalizeText", + "estimatorTypes": [ "Text" ], + "arguments": [ + { + "argumentName": "outputColumnName", + "argumentType": "string" + }, + { + "argumentName": "inputColumnName", + "argumentType": "string" + }, + { + "argumentName": "mode", + "argumentType": "caseMode" + }, + { + "argumentName": "keepDiacritics", + "argumentType": "boolean" + }, + { + "argumentName": "keepPunctuations", + "argumentType": "boolean" + }, + { + "argumentName": "keepNumbers", + "argumentType": "boolean" + } + ], + "nugetDependencies": [ "Microsoft.ML" ], + "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text.TextNormalizingEstimator"], + "searchOption": "normalize_text_option" + }, { "functionName": "ConvertType", "estimatorTypes": [ "Conversion" ], diff --git a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json index b512e32fc0..e862215264 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json @@ -18,6 +18,10 @@ { "name": "KeyData", "type": "dataView" + }, + { + "name": "Mode", + "type": "caseMode" } ] } diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj index 669afe59de..b7a2d92c3d 100644 --- a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj +++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj @@ -69,6 +69,9 @@ + + + @@ -86,7 +89,7 @@ - + diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs new file mode 100644 index 0000000000..b421381a63 --- /dev/null +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.AutoML.CodeGen +{ + internal partial class NormalizeText + { + public override IEstimator BuildFromOption(MLContext context, NormalizeTextOption param) + { + return context.Transforms.Text.NormalizeText(param.OutputColumnName, param.InputColumnName, param.Mode, param.KeepDiacritics, param.KeepPunctuations, param.KeepNumbers); + } + } +} diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs index 85ed140019..f46906ac57 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs @@ -58,6 +58,7 @@ public void Execute(GeneratorExecutionContext context) "imageClassificationArchType" => "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture", "dataKind" => "Microsoft.ML.Data.DataKind", "dataView" => "Microsoft.ML.IDataView", + "caseMode" => "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode", _ => throw new ArgumentException("unknown type"), }; @@ -78,6 +79,7 @@ public void Execute(GeneratorExecutionContext context) (_, "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture") => defaultToken.GetValue(), (_, "Microsoft.ML.Data.DataKind") => defaultToken.GetValue(), (_, "Microsoft.ML.IDataView") => defaultToken.GetValue(), + (_, "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode") => defaultToken.GetValue(), (_, _) => throw new ArgumentException("unknown"), }; diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs index 409937611b..487e9f734d 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs @@ -35,6 +35,8 @@ public virtual string TransformText() using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture; using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture; using DataKind = Microsoft.ML.Data.DataKind; +using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode; + #nullable enable namespace "); diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt index bbfad4ecb5..6f03586299 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt @@ -13,6 +13,8 @@ using Anchor = Microsoft.ML.Transforms.Image.ImageResizingEstimator.Anchor; using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture; using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture; using DataKind = Microsoft.ML.Data.DataKind; +using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode; + #nullable enable namespace <#=NameSpace#>