From caf8f6467153dec49d427366667949fa880230eb Mon Sep 17 00:00:00 2001 From: zewditu Hailemariam Date: Mon, 12 Feb 2024 18:50:24 -0800 Subject: [PATCH 1/5] Add text normalizer transformer to AutoML --- .../CodeGen/normalize_text_search_space.json | 33 +++++++++++++++++++ .../CodeGen/search-space-schema.json | 13 ++++++-- .../CodeGen/transformer-estimators.json | 33 +++++++++++++++++++ .../CodeGen/type_converter_search_space.json | 5 +++ .../Microsoft.ML.AutoML.csproj | 6 +++- .../Estimators/NormalizeText.cs | 18 ++++++++++ .../SearchSpaceGenerator.cs | 3 ++ 7 files changed, 107 insertions(+), 4 deletions(-) create mode 100644 src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json create mode 100644 src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs diff --git a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json new file mode 100644 index 0000000000..a3bbee5a3b --- /dev/null +++ b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json @@ -0,0 +1,33 @@ +{ + "$schema": "./search-space-schema.json#", + "name": "normalize_text_option", + "search_space": [ + { + "name": "InputColumnName", + "type": "string" + }, + { + "name": "OutputColumnName", + "type": "string" + }, + { + "name": "Mode", + "type": "caseMode" + }, + { + "name": "KeepDiacritics", + "type": "boolean", + "default": false + }, + { + "name": "KeepPunctuations", + "type": "boolean", + "default": false + }, + { + "name": "KeepNumbers", + "type": "boolean", + "default": false + } + ] +} diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json index c0512afad1..24567a3100 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json @@ -168,7 +168,8 @@ "sentence_similarity_option", "object_detection_option", "question_answering_option", - "named_entity_recognition_option" + "named_entity_recognition_option", + "normalize_text_option" ] }, "option_name": { @@ -241,7 +242,12 @@ "TopKAnswers", "TargetType", "PredictionColumnName", - "KeyData" + "KeyData", + "Mode", + "KeepPunctuations", + "KeepDiacritics", + "KeepNumbers" + ] }, "option_type": { @@ -261,7 +267,8 @@ "bertArchitecture", "imageClassificationArchType", "dataKind", - "dataView" + "dataView", + "caseMode" ] } }, diff --git a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json index 0fc42d3aa6..59bfe0c0be 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json +++ b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json @@ -180,6 +180,39 @@ "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data" ], "searchOption": "featurize_text_option" }, + { + "functionName": "NormalizeText", + "estimatorTypes": [ "Text" ], + "arguments": [ + { + "argumentName": "outputColumnName", + "argumentType": "string" + }, + { + "argumentName": "inputColumnName", + "argumentType": "string" + }, + { + "argumentName": "mode", + "argumentType": "caseMode" + }, + { + "argumentName": "keepDiacritics", + "argumentType": "boolean" + }, + { + "argumentName": "keepPunctuations", + "argumentType": "boolean" + }, + { + "argumentName": "keepNumbers", + "argumentType": "boolean" + } + ], + "nugetDependencies": [ "Microsoft.ML" ], + "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text" ], + "searchOption": "normalize_text_option" + }, { "functionName": "ConvertType", "estimatorTypes": [ "Conversion" ], diff --git a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json index b512e32fc0..43828be0aa 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json @@ -18,6 +18,11 @@ { "name": "KeyData", "type": "dataView" + }, + + { + "name": "Mode", + "type": "caseMode" } ] } diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj index 669afe59de..cc9a61220e 100644 --- a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj +++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj @@ -69,6 +69,10 @@ + + + + @@ -86,7 +90,7 @@ - + diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs new file mode 100644 index 0000000000..b421381a63 --- /dev/null +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/NormalizeText.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.AutoML.CodeGen +{ + internal partial class NormalizeText + { + public override IEstimator BuildFromOption(MLContext context, NormalizeTextOption param) + { + return context.Transforms.Text.NormalizeText(param.OutputColumnName, param.InputColumnName, param.Mode, param.KeepDiacritics, param.KeepPunctuations, param.KeepNumbers); + } + } +} diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs index 85ed140019..4cc636a0d4 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs @@ -58,6 +58,7 @@ public void Execute(GeneratorExecutionContext context) "imageClassificationArchType" => "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture", "dataKind" => "Microsoft.ML.Data.DataKind", "dataView" => "Microsoft.ML.IDataView", + "caseMode" => "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode", _ => throw new ArgumentException("unknown type"), }; @@ -78,6 +79,8 @@ public void Execute(GeneratorExecutionContext context) (_, "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture") => defaultToken.GetValue(), (_, "Microsoft.ML.Data.DataKind") => defaultToken.GetValue(), (_, "Microsoft.ML.IDataView") => defaultToken.GetValue(), + (_, "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode") => defaultToken.GetValue(), + (_, _) => throw new ArgumentException("unknown"), }; From 64d9b2c2d7b8e25de7c23f11c18f3b805242f42b Mon Sep 17 00:00:00 2001 From: zewditu Hailemariam Date: Mon, 12 Feb 2024 20:13:21 -0800 Subject: [PATCH 2/5] clean --- .../CodeGen/estimator-schema.json | 12 +++++++++--- .../CodeGen/search-space-schema.json | 11 +++++++++-- .../CodeGen/transformer-estimators.json | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json index 048d707214..16bc5cab17 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json @@ -75,7 +75,8 @@ "SentenceSimilarity", "ObjectDetection", "QuestionAnswering", - "NamedEntityRecognition" + "NamedEntityRecognition", + "NormalizeText" ] }, "nugetDependencies": { @@ -114,7 +115,8 @@ "Microsoft.ML.Transforms.Image", "Microsoft.ML.Trainers.FastTree", "Microsoft.ML.TorchSharp", - "Microsoft.ML.Trainers.LightGbm" + "Microsoft.ML.Trainers.LightGbm", + "Microsoft.ML.Transforms.Text.TextNormalizingEstimator" ] } }, @@ -198,7 +200,11 @@ "scoreThreshold", "steps", "initLearningRate", - "weightDecay" + "weightDecay", + "mode", + "keepPunctuations", + "keepDiacritics", + "keepNumbers" ] }, "argumentType": { diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json index 24567a3100..c7a2a33b58 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json @@ -66,6 +66,14 @@ "DataKind.DateTimeOffset" ] }, + "caseMode": { + "type": "string", + "enum": [ + "CaseMode.Lower", + "CaseMode.Upper", + "CaseMode.None" + ] + }, "bertArchitectureArray": { "type": "array", "items": { @@ -90,7 +98,7 @@ "$ref": "#/definitions/dnnModelFactoryArray" }, { - "$ref": "#/definitions/imageClassificationArchArray" + "$ref": "#/definitions/imageClassificationArchArray" }, { "$ref": "#/definitions/boolArray" @@ -247,7 +255,6 @@ "KeepPunctuations", "KeepDiacritics", "KeepNumbers" - ] }, "option_type": { diff --git a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json index 59bfe0c0be..560b4b9002 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json +++ b/src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json @@ -210,7 +210,7 @@ } ], "nugetDependencies": [ "Microsoft.ML" ], - "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text" ], + "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text.TextNormalizingEstimator"], "searchOption": "normalize_text_option" }, { From 60c4e04c1efd72711ed6157d4cfb50c5287d3ba6 Mon Sep 17 00:00:00 2001 From: zewditu Hailemariam Date: Mon, 12 Feb 2024 22:47:51 -0800 Subject: [PATCH 3/5] clean --- src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json | 1 - src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj | 1 - .../Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs | 1 - 3 files changed, 3 deletions(-) diff --git a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json index 43828be0aa..e862215264 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json @@ -19,7 +19,6 @@ "name": "KeyData", "type": "dataView" }, - { "name": "Mode", "type": "caseMode" diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj index cc9a61220e..b7a2d92c3d 100644 --- a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj +++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj @@ -69,7 +69,6 @@ - diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs index 4cc636a0d4..f46906ac57 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs @@ -80,7 +80,6 @@ public void Execute(GeneratorExecutionContext context) (_, "Microsoft.ML.Data.DataKind") => defaultToken.GetValue(), (_, "Microsoft.ML.IDataView") => defaultToken.GetValue(), (_, "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode") => defaultToken.GetValue(), - (_, _) => throw new ArgumentException("unknown"), }; From 5e3a0f272bff5b38e3fc553a89eff2a6e3384b8d Mon Sep 17 00:00:00 2001 From: zewditu Hailemariam Date: Tue, 13 Feb 2024 11:15:08 -0800 Subject: [PATCH 4/5] Add default --- .../CodeGen/normalize_text_search_space.json | 3 ++- .../Template/SearchSpace.cs | 2 ++ .../Template/SearchSpace.tt | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json index a3bbee5a3b..86ebc7bff3 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json @@ -12,7 +12,8 @@ }, { "name": "Mode", - "type": "caseMode" + "type": "caseMode", + "default": "CaseMode.None" }, { "name": "KeepDiacritics", diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs index 409937611b..487e9f734d 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs @@ -35,6 +35,8 @@ public virtual string TransformText() using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture; using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture; using DataKind = Microsoft.ML.Data.DataKind; +using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode; + #nullable enable namespace "); diff --git a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt index bbfad4ecb5..6f03586299 100644 --- a/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt +++ b/tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt @@ -13,6 +13,8 @@ using Anchor = Microsoft.ML.Transforms.Image.ImageResizingEstimator.Anchor; using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture; using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture; using DataKind = Microsoft.ML.Data.DataKind; +using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode; + #nullable enable namespace <#=NameSpace#> From 5a2b8a0ec9b79bbd87d37e102198df1dbe869d7c Mon Sep 17 00:00:00 2001 From: zewditu Hailemariam Date: Tue, 13 Feb 2024 11:21:45 -0800 Subject: [PATCH 5/5] Follow default pattern of ML.Net --- .../CodeGen/normalize_text_search_space.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json index 86ebc7bff3..f37eb61f6a 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json @@ -13,7 +13,7 @@ { "name": "Mode", "type": "caseMode", - "default": "CaseMode.None" + "default": "CaseMode.Lower" }, { "name": "KeepDiacritics", @@ -23,12 +23,12 @@ { "name": "KeepPunctuations", "type": "boolean", - "default": false + "default": true }, { "name": "KeepNumbers", "type": "boolean", - "default": false + "default": true } ] }