Use Timer and ctx.CancelExecution() to fix AutoML max-time experiment bug (#5445)

mstfbl · web-flow · commit 600d48dec1f8 · 2020-11-03T10:40:17.000-08:00
* Use ctx.CalncelExecution() to fix AutoML max-time experiment bug

* Added unit test for checking canceled experiment

* Nit fix

* Different run time on Linux

* Review

* Testing four ouput

* Used reflection to test for contexts being canceled

* Reviews

* Reviews

* Added main MLContext listener-timer

* Added PRNG on _context, held onto timers for avoiding GC

* Addressed reviews

* Unit test edits

* Increase run time of experiment to guarantee probabilities

* Edited unit test to check produced schema of next run model's predictions

* Remove scheme check as different CI builds result in varying schemas

* Decrease max experiment time unit test time

* Added Timers

* Increase second timer time, edit unit test

* Added try catch for OperationCanceledException in Execute()

* Add AggregateException try catch to slow unit tests for parallel testing

* Reviews

* Final reviews

* Added LightGBMFact to binary classification test

* Removed extra Operation Stopped exception try catch

* Add back OperationCanceledException to Experiment.cs
diff --git a/src/Microsoft.ML.AutoML/Experiment/Experiment.cs b/src/Microsoft.ML.AutoML/Experiment/Experiment.cs
@@ -7,6 +7,8 @@
 using System.Diagnostics;
 using System.IO;
 using System.Linq;
+using System.Threading;
+using Microsoft.ML.Data;
 using Microsoft.ML.Runtime;
 
 namespace Microsoft.ML.AutoML
@@ -25,6 +27,11 @@ internal class Experiment<TRunDetail, TMetrics> where TRunDetail : RunDetail
         private readonly IRunner<TRunDetail> _runner;
         private readonly IList<SuggestedPipelineRunDetail> _history;
         private readonly IChannel _logger;
+        private Timer _maxExperimentTimeTimer;
+        private Timer _mainContextCanceledTimer;
+        private bool _experimentTimerExpired;
+        private MLContext _currentModelMLContext;
+        private Random _newContextSeedGenerator;
 
         public Experiment(MLContext context,
             TaskKind task,
@@ -49,60 +56,125 @@ public Experiment(MLContext context,
             _datasetColumnInfo = datasetColumnInfo;
             _runner = runner;
             _logger = logger;
+            _experimentTimerExpired = false;
+        }
+
+        private void MaxExperimentTimeExpiredEvent(object state)
+        {
+            // If at least one model was run, end experiment immediately.
+            // Else, wait for first model to run before experiment is concluded.
+            _experimentTimerExpired = true;
+            if (_history.Any(r => r.RunSucceeded))
+            {
+                _logger.Warning("Allocated time for Experiment of {0} seconds has elapsed with {1} models run. Ending experiment...",
+                    _experimentSettings.MaxExperimentTimeInSeconds, _history.Count());
+                _currentModelMLContext.CancelExecution();
+            }
+        }
+
+        private void MainContextCanceledEvent(object state)
+        {
+            // If the main MLContext is canceled, cancel the ongoing model training and MLContext.
+            if ((_context.Model.GetEnvironment() as ICancelable).IsCanceled)
+            {
+                _logger.Warning("Main MLContext has been canceled. Ending experiment...");
+                // Stop timer to prevent restarting and prevent continuous calls to
+                // MainContextCanceledEvent
+                _mainContextCanceledTimer.Change(Timeout.Infinite, Timeout.Infinite);
+                _currentModelMLContext.CancelExecution();
+            }
         }
 
         public IList<TRunDetail> Execute()
         {
-            var stopwatch = Stopwatch.StartNew();
             var iterationResults = new List<TRunDetail>();
+            // Create a timer for the max duration of experiment. When given time has
+            // elapsed, MaxExperimentTimeExpiredEvent is called to interrupt training
+            // of current model. Timer is not used if no experiment time is given, or
+            // is not a positive number.
+            if (_experimentSettings.MaxExperimentTimeInSeconds > 0)
+            {
+                _maxExperimentTimeTimer = new Timer(
+                    new TimerCallback(MaxExperimentTimeExpiredEvent), null,
+                    _experimentSettings.MaxExperimentTimeInSeconds * 1000, Timeout.Infinite
+                );
+            }
+            // If given max duration of experiment is 0, only 1 model will be trained.
+            // _experimentSettings.MaxExperimentTimeInSeconds is of type uint, it is
+            // either 0 or >0.
+            else
+                _experimentTimerExpired = true;
+
+            // Add second timer to check for the cancelation signal from the main MLContext
+            // to the active child MLContext. This timer will propagate the cancelation
+            // signal from the main to the child MLContexs if the main MLContext is
+            // canceled.
+            _mainContextCanceledTimer = new Timer(new TimerCallback(MainContextCanceledEvent), null, 1000, 1000);
+
+            // Pseudo random number generator to result in deterministic runs with the provided main MLContext's seed and to
+            // maintain variability between training iterations.
+            int? mainContextSeed = ((ISeededEnvironment)_context.Model.GetEnvironment()).Seed;
+            _newContextSeedGenerator = (mainContextSeed.HasValue) ? RandomUtils.Create(mainContextSeed.Value) : null;
 
             do
             {
-                var iterationStopwatch = Stopwatch.StartNew();
-
-                // get next pipeline
-                var getPipelineStopwatch = Stopwatch.StartNew();
-                var pipeline = PipelineSuggester.GetNextInferredPipeline(_context, _history, _datasetColumnInfo, _task,
-                    _optimizingMetricInfo.IsMaximizing, _experimentSettings.CacheBeforeTrainer, _logger, _trainerAllowList);
-
-                var pipelineInferenceTimeInSeconds = getPipelineStopwatch.Elapsed.TotalSeconds;
-
-                // break if no candidates returned, means no valid pipeline available
-                if (pipeline == null)
-                {
-                    break;
-                }
-
-                // evaluate pipeline
-                _logger.Trace($"Evaluating pipeline {pipeline.ToString()}");
-                (SuggestedPipelineRunDetail suggestedPipelineRunDetail, TRunDetail runDetail)
-                    = _runner.Run(pipeline, _modelDirectory, _history.Count + 1);
-
-                _history.Add(suggestedPipelineRunDetail);
-                WriteIterationLog(pipeline, suggestedPipelineRunDetail, iterationStopwatch);
-
-                runDetail.RuntimeInSeconds = iterationStopwatch.Elapsed.TotalSeconds;
-                runDetail.PipelineInferenceTimeInSeconds = getPipelineStopwatch.Elapsed.TotalSeconds;
-
-                ReportProgress(runDetail);
-                iterationResults.Add(runDetail);
-
-                // if model is perfect, break
-                if (_metricsAgent.IsModelPerfect(suggestedPipelineRunDetail.Score))
+                try
                 {
-                    break;
+                    var iterationStopwatch = Stopwatch.StartNew();
+
+                    // get next pipeline
+                    var getPipelineStopwatch = Stopwatch.StartNew();
+
+                    // A new MLContext is needed per model run. When max experiment time is reached, each used
+                    // context is canceled to stop further model training. The cancellation of the main MLContext
+                    // a user has instantiated is not desirable, thus additional MLContexts are used.
+                    _currentModelMLContext = _newContextSeedGenerator == null ? new MLContext() : new MLContext(_newContextSeedGenerator.Next());
+                    var pipeline = PipelineSuggester.GetNextInferredPipeline(_currentModelMLContext, _history, _datasetColumnInfo, _task,
+                        _optimizingMetricInfo.IsMaximizing, _experimentSettings.CacheBeforeTrainer, _logger, _trainerAllowList);
+                    // break if no candidates returned, means no valid pipeline available
+                    if (pipeline == null)
+                    {
+                        break;
+                    }
+
+                    // evaluate pipeline
+                    _logger.Trace($"Evaluating pipeline {pipeline.ToString()}");
+                    (SuggestedPipelineRunDetail suggestedPipelineRunDetail, TRunDetail runDetail)
+                        = _runner.Run(pipeline, _modelDirectory, _history.Count + 1);
+
+                    _history.Add(suggestedPipelineRunDetail);
+                    WriteIterationLog(pipeline, suggestedPipelineRunDetail, iterationStopwatch);
+
+                    runDetail.RuntimeInSeconds = iterationStopwatch.Elapsed.TotalSeconds;
+                    runDetail.PipelineInferenceTimeInSeconds = getPipelineStopwatch.Elapsed.TotalSeconds;
+
+                    ReportProgress(runDetail);
+                    iterationResults.Add(runDetail);
+
+                    // if model is perfect, break
+                    if (_metricsAgent.IsModelPerfect(suggestedPipelineRunDetail.Score))
+                    {
+                        break;
+                    }
+
+                    // If after third run, all runs have failed so far, throw exception
+                    if (_history.Count() == 3 && _history.All(r => !r.RunSucceeded))
+                    {
+                        throw new InvalidOperationException($"Training failed with the exception: {_history.Last().Exception}");
+                    }
                 }
-
-                // If after third run, all runs have failed so far, throw exception
-                if (_history.Count() == 3 && _history.All(r => !r.RunSucceeded))
+                catch (OperationCanceledException e)
                 {
-                    throw new InvalidOperationException($"Training failed with the exception: {_history.Last().Exception}");
+                    // This exception is thrown when the IHost/MLContext of the trainer is canceled due to
+                    // reaching maximum experiment time. Simply catch this exception and return finished
+                    // iteration results.
+                    _logger.Warning("OperationCanceledException has been caught after maximum experiment time" +
+                        "was reached, and the running MLContext was stopped. Details: {0}", e.Message);
+                    return iterationResults;
                 }
-
             } while (_history.Count < _experimentSettings.MaxModels &&
                     !_experimentSettings.CancellationToken.IsCancellationRequested &&
-                    stopwatch.Elapsed.TotalSeconds < _experimentSettings.MaxExperimentTimeInSeconds);
-
+                    !_experimentTimerExpired);
             return iterationResults;
         }
 
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs
@@ -58,7 +58,7 @@ public CrossValSummaryRunner(MLContext context,
             for (var i = 0; i < _trainDatasets.Length; i++)
             {
                 var modelFileInfo = RunnerUtil.GetModelFileInfo(modelDirectory, iterationNum, i + 1);
-                var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainDatasets[i], _validDatasets[i],
+                var trainResult = RunnerUtil.TrainAndScorePipeline(pipeline.GetContext(), pipeline, _trainDatasets[i], _validDatasets[i],
                     _groupIdColumn, _labelColumn, _metricsAgent, _preprocessorTransforms?.ElementAt(i), modelFileInfo, _modelInputSchema,
                     _logger);
                 trainResults.Add(trainResult);
diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs
@@ -52,6 +52,11 @@ public override int GetHashCode()
             return ToString().GetHashCode();
         }
 
+        public MLContext GetContext()
+        {
+            return _context;
+        }
+
         public Pipeline ToPipeline()
         {
             var pipelineElements = new List<PipelineNode>();
diff --git a/src/Microsoft.ML.Core/Data/IHostEnvironment.cs b/src/Microsoft.ML.Core/Data/IHostEnvironment.cs
@@ -72,7 +72,7 @@ public interface IHostEnvironment : IChannelProvider, IProgressChannelProvider
     internal interface ICancelable
     {
         /// <summary>
-        /// Signal to stop exection in all the hosts.
+        /// Signal to stop execution in all the hosts.
         /// </summary>
         void CancelExecution();
 
diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs

Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@ public CrossValSummaryRunner(MLContext context,`
`58`	`58`	`for (var i = 0; i < _trainDatasets.Length; i++)`
`59`	`59`	`{`
`60`	`60`	`var modelFileInfo = RunnerUtil.GetModelFileInfo(modelDirectory, iterationNum, i + 1);`
`61`		`- var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainDatasets[i], _validDatasets[i],`
	`61`	`+ var trainResult = RunnerUtil.TrainAndScorePipeline(pipeline.GetContext(), pipeline, _trainDatasets[i], _validDatasets[i],`
`62`	`62`	`_groupIdColumn, _labelColumn, _metricsAgent, _preprocessorTransforms?.ElementAt(i), modelFileInfo, _modelInputSchema,`
`63`	`63`	`_logger);`
`64`	`64`	`trainResults.Add(trainResult);`
Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,11 @@ public override int GetHashCode()`
`52`	`52`	`return ToString().GetHashCode();`
`53`	`53`	`}`
`54`	`54`
	`55`	`+ public MLContext GetContext()`
	`56`	`+ {`
	`57`	`+ return _context;`
	`58`	`+ }`
	`59`	`+`
`55`	`60`	`public Pipeline ToPipeline()`
`56`	`61`	`{`
`57`	`62`	`var pipelineElements = new List<PipelineNode>();`
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ public interface IHostEnvironment : IChannelProvider, IProgressChannelProvider`
`72`	`72`	`internal interface ICancelable`
`73`	`73`	`{`
`74`	`74`	`/// <summary>`
`75`		`- /// Signal to stop exection in all the hosts.`
	`75`	`+ /// Signal to stop execution in all the hosts.`
`76`	`76`	`/// </summary>`
`77`	`77`	`void CancelExecution();`
`78`	`78`