From 8479f016ea74ca57fb1b3aebde6039088d706322 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 10 Dec 2020 14:03:03 -0800 Subject: [PATCH 1/4] Fixed bug --- .../Experiment/Runners/CrossValSummaryRunner.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs index 0079be3ade..401d2c3247 100644 --- a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs +++ b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs @@ -159,6 +159,16 @@ private static TMetrics GetAverageMetrics(IEnumerable metrics, TMetric private static double[] GetAverageOfNonNaNScoresInNestedEnumerable(IEnumerable> results) { + if (results.Contains(null)) + { + // If any of the nested enumerables is null, we can't take the average from it. + // This is expected to happen on Multiclass metrics where the TopKAccuracyForAllK + // array can be null if the topKPredictionCount isn't a valid number. + // In that case all of the results sub arrays will be null anyway, and so + // returning null is the expected solution. 
+ return null; + } + double[] arr = new double[results.ElementAt(0).Count()]; for (int i = 0; i < arr.Length; i++) { From f082851128c6a756fa12ef813ec5d50f7037d97c Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 10 Dec 2020 14:03:29 -0800 Subject: [PATCH 2/4] Added test --- .../Microsoft.ML.AutoML.Tests/AutoFitTests.cs | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs index 14771bb329..cdbdf213ea 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -42,19 +42,48 @@ public void AutoFitBinaryTest() Assert.NotNull(result.BestRun.TrainerName); } - [Fact] - public void AutoFitMultiTest() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void AutoFitMultiTest(bool useNumberOfCVFolds) { var context = new MLContext(0); var columnInference = context.Auto().InferColumns(DatasetUtil.TrivialMulticlassDatasetPath, DatasetUtil.TrivialMulticlassDatasetLabel); var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); var trainData = textLoader.Load(DatasetUtil.TrivialMulticlassDatasetPath); - var result = context.Auto() - .CreateMulticlassClassificationExperiment(0) - .Execute(trainData, 5, DatasetUtil.TrivialMulticlassDatasetLabel); - Assert.True(result.BestRun.Results.First().ValidationMetrics.MicroAccuracy >= 0.7); - var scoredData = result.BestRun.Results.First().Model.Transform(trainData); - Assert.Equal(NumberDataViewType.Single, scoredData.Schema[DefaultColumnNames.PredictedLabel].Type); + + if(useNumberOfCVFolds) + { + // When setting numberOfCVFolds + // The results object is a CrossValidationExperimentResults<> object + uint numberOfCVFolds = 5; + var result = context.Auto() + .CreateMulticlassClassificationExperiment(0) + .Execute(trainData, numberOfCVFolds, DatasetUtil.TrivialMulticlassDatasetLabel); + + 
Assert.True(result.BestRun.Results.First().ValidationMetrics.MicroAccuracy >= 0.7); + var scoredData = result.BestRun.Results.First().Model.Transform(trainData); + Assert.Equal(NumberDataViewType.Single, scoredData.Schema[DefaultColumnNames.PredictedLabel].Type); + } + else + { + // When using this API, if the trainset is under the + // crossValRowCountThreshold, AutoML will also perform CrossValidation + // but through a very different path than the one above, + // through a CrossValSummaryRunner and will return + // a different type of object as "result" which would now be + // simply an ExperimentResult<> object + + int crossValRowCountThreshold = 15000; + trainData = context.Data.TakeRows(trainData, crossValRowCountThreshold); + var result = context.Auto() + .CreateMulticlassClassificationExperiment(0) + .Execute(trainData, DatasetUtil.TrivialMulticlassDatasetLabel); + + Assert.True(result.BestRun.ValidationMetrics.MicroAccuracy >= 0.7); + var scoredData = result.BestRun.Model.Transform(trainData); + Assert.Equal(NumberDataViewType.Single, scoredData.Schema[DefaultColumnNames.PredictedLabel].Type); + } } [TensorFlowFact] From e9958353b1def7b67c484accbd64dbe6a183425a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 10 Dec 2020 15:09:41 -0800 Subject: [PATCH 3/4] Filter out null enumerables --- .../Experiment/Runners/CrossValSummaryRunner.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs index 401d2c3247..9c382468a6 100644 --- a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs +++ b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs @@ -159,16 +159,19 @@ private static TMetrics GetAverageMetrics(IEnumerable metrics, TMetric private static double[] GetAverageOfNonNaNScoresInNestedEnumerable(IEnumerable> results) { - if (results.Contains(null)) + if 
(results.All(result => result == null)) { - // If any of the nested enumerables is null, we can't take the average from it. + // If all nested enumerables are null, we say the average is a null enumerable as well. // This is expected to happen on Multiclass metrics where the TopKAccuracyForAllK // array can be null if the topKPredictionCount isn't a valid number. - // In that case all of the results sub arrays will be null anyway, and so + // In that case all of the "results" enumerables will be null anyway, and so // returning null is the expected solution. return null; } + // In case there are only some null elements, we'll ignore them: + results = results.Where(result => result != null); + double[] arr = new double[results.ElementAt(0).Count()]; for (int i = 0; i < arr.Length; i++) { From ade2592e719cb4674f08e58e867653fec9d1738a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 10 Dec 2020 15:10:00 -0800 Subject: [PATCH 4/4] Nits in test --- test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs index cdbdf213ea..86670d5630 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -52,7 +52,7 @@ public void AutoFitMultiTest(bool useNumberOfCVFolds) var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); var trainData = textLoader.Load(DatasetUtil.TrivialMulticlassDatasetPath); - if(useNumberOfCVFolds) + if (useNumberOfCVFolds) { // When setting numberOfCVFolds // The results object is a CrossValidationExperimentResults<> object @@ -67,7 +67,7 @@ public void AutoFitMultiTest(bool useNumberOfCVFolds) } else { - // When using this API, if the trainset is under the + // When using this other API, if the trainset is under the // crossValRowCountThreshold, AutoML will also perform CrossValidation // but through a very 
different path than the one above, // through a CrossValSummaryRunner and will return @@ -75,7 +75,7 @@ public void AutoFitMultiTest(bool useNumberOfCVFolds) // simply an ExperimentResult<> object int crossValRowCountThreshold = 15000; - trainData = context.Data.TakeRows(trainData, crossValRowCountThreshold); + trainData = context.Data.TakeRows(trainData, crossValRowCountThreshold - 1); var result = context.Auto() .CreateMulticlassClassificationExperiment(0) .Execute(trainData, DatasetUtil.TrivialMulticlassDatasetLabel);