From 3f23bbe0cfbe7e8ecf99bfe4985cec0d1e0750b0 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 9 Nov 2022 22:32:17 -0800 Subject: [PATCH 1/2] implement sentence similarity sweepable estimator --- .../CodeGen/estimator-schema.json | 4 +- .../CodeGen/search-space-schema.json | 3 +- .../sentence_similarity_search_space.json | 45 +++++++++++++++++++ .../CodeGen/trainer-estimators.json | 19 ++++---- .../Microsoft.ML.AutoML.csproj | 1 - .../Estimators/SentenceSimilarity.cs | 28 ++++++++++++ 6 files changed, 86 insertions(+), 14 deletions(-) create mode 100644 src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json create mode 100644 src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs diff --git a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json index 44ee31b892..68d731a6a1 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json @@ -71,7 +71,8 @@ "DnnFeaturizerImage", "Naive", "ForecastBySsa", - "TextClassifcation" + "TextClassifcation", + "SentenceSimilarity" ] }, "nugetDependencies": { @@ -109,6 +110,7 @@ "Microsoft.ML.Vision", "Microsoft.ML.Transforms.Image", "Microsoft.ML.Trainers.FastTree", + "Microsoft.ML.TorchSharp", "Microsoft.ML.Trainers.LightGbm" ] } diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json index c7ef37e7be..22432ebf80 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json @@ -126,7 +126,8 @@ "image_classification_option", "matrix_factorization_option", "dnn_featurizer_image_option", - "text_classification_option" + "text_classification_option", + "sentence_similarity_option" ] }, "option_name": { diff --git a/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json new file mode 100644 index 0000000000..37157b5010 --- /dev/null +++ b/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json @@ -0,0 +1,45 @@ +{ + "$schema": "./search-space-schema.json#", + "name": "sentence_similarity_option", + "search_space": [ + { + "name": "LabelColumnName", + "type": "string", + "default": "Label" + }, + { + "name": "Sentence1ColumnName", + "type": "string", + "default": "Sentence1" + }, + { + "name": "Sentence2ColumnName", + "type": "string" + }, + { + "name": "ScoreColumnName", + "type": "string", + "default": "Score" + }, + { + "name": "OutputColumnName", + "type": "string", + "default": "PredictedLabel" + }, + { + "name": "BatchSize", + "type": "integer", + "default": 32 + }, + { + "name": "MaxEpochs", + "type": "integer", + "default": 10 + }, + { + "name": "Architecture", + "type": "bertArchitecture", + "default": "BertArchitecture.Roberta" + } + ] +} diff --git a/src/Microsoft.ML.AutoML/CodeGen/trainer-estimators.json b/src/Microsoft.ML.AutoML/CodeGen/trainer-estimators.json index 4deeb9b804..a4ab25b60b 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/trainer-estimators.json +++ b/src/Microsoft.ML.AutoML/CodeGen/trainer-estimators.json @@ -306,7 +306,7 @@ "argumentType": "boolean" } ], - "nugetDependencies": ["Microsoft.ML"], + "nugetDependencies": [ "Microsoft.ML" ], "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Trainers" ], "searchOption": "lbfgs_option" }, @@ -514,20 +514,17 @@ { "functionName": "TextClassifcation", "estimatorTypes": [ "MultiClassification" ], - "arguments": [ - { - "argumentName": "labelColumnName", - "argumentType": "string" - }, - { - "argumentName": "sentence1ColumnName", - "argumentType": "string" - } - ], "nugetDependencies": [ "Microsoft.ML", "Microsoft.ML.TorchSharp" ], "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Trainers", "Microsoft.ML.TorchSharp" ], "searchOption": "text_classification_option" }, + { + "functionName": "SentenceSimilarity", + "estimatorTypes": [ "Regression" ], + "nugetDependencies": [ "Microsoft.ML", "Microsoft.ML.TorchSharp" ], + "usingStatements": [ "Microsoft.ML", "Microsoft.ML.Trainers", "Microsoft.ML.TorchSharp" ], + "searchOption": "sentence_similarity_option" + }, { "functionName": "ForecastBySsa", "estimatorTypes": [ "Forecasting" ], diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj index 1c03c24792..562ee0410e 100644 --- a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj +++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj @@ -66,7 +66,6 @@ - diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs new file mode 100644 index 0000000000..f12191d6b8 --- /dev/null +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs @@ -0,0 +1,28 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Reflection; +using System.Text; +using Microsoft.ML.TorchSharp; + +namespace Microsoft.ML.AutoML.CodeGen +{ + internal partial class SentenceSimilarityRegression + { + public override IEstimator BuildFromOption(MLContext context, SentenceSimilarityOption param) + { + return context.Regression.Trainers.SentenceSimilarity( + labelColumnName: param.LabelColumnName, + sentence1ColumnName: param.Sentence1ColumnName, + scoreColumnName: param.ScoreColumnName, + sentence2ColumnName: param.Sentence2ColumnName, + outputColumnName: param.OutputColumnName, + batchSize: param.BatchSize, + maxEpochs: param.MaxEpochs, + architecture: param.Architecture); + } + } +} From 560e1472d6171b7b065e8c0b930092c8b532206d Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 9 Nov 2022 22:58:03 -0800 Subject: [PATCH 2/2] fix build error --- .../CodeGen/sentence_similarity_search_space.json | 5 ----- .../SweepableEstimator/Estimators/SentenceSimilarity.cs | 1 - 2 files changed, 6 deletions(-) diff --git a/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json index 37157b5010..0ad05a1858 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/sentence_similarity_search_space.json @@ -21,11 +21,6 @@ "type": "string", "default": "Score" }, - { - "name": "OutputColumnName", - "type": "string", - "default": "PredictedLabel" - }, { "name": "BatchSize", "type": "integer", diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs index f12191d6b8..24ae6cb3a3 100644 --- a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/SentenceSimilarity.cs @@ -19,7 +19,6 @@ public override IEstimator BuildFromOption(MLContext context, Sent sentence1ColumnName: param.Sentence1ColumnName, scoreColumnName: param.ScoreColumnName, sentence2ColumnName: param.Sentence2ColumnName, - outputColumnName: param.OutputColumnName, batchSize: param.BatchSize, maxEpochs: param.MaxEpochs, architecture: param.Architecture);