From e2ee90d1d687d6128fa6b915c75c135b2fd1cb0c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 10 Aug 2020 11:04:16 +0100 Subject: [PATCH] [ML] Fix progress on resume after final training has completed for classification and regression (#1443) --- docs/CHANGELOG.asciidoc | 7 +++++++ include/maths/CBoostedTreeImpl.h | 3 +++ lib/maths/CBoostedTreeImpl.cc | 13 ++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index e73d7e2af9..30ab5863a2 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -28,6 +28,13 @@ //=== Regressions +== {es} version 7.10.0 + +=== Bug Fixes + +* Fix progress on resume after final training has completed for classification and regression. + We previously showed progress stuck at zero for final training. (See {ml-pull}1443[#1443].) + == {es} version 7.9.0 === New Features diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 23c5f6ab7a..953eac33ea 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -315,6 +315,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Start monitoring the final model training. void startProgressMonitoringFinalTrain(); + //! Skip monitoring the final model training. + void skipProgressMonitoringFinalTrain(); + //! 
Record the training state using the \p recordTrainState callback function void recordState(const TTrainingStateCallback& recordTrainState) const; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index ff36cd6d33..6760958679 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -272,6 +272,8 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, core::CProgramCounters::counter(counter_t::E_DFTPMTrainedForestNumberTrees) = m_BestForest.size(); + } else { + this->skipProgressMonitoringFinalTrain(); } this->computeClassificationWeights(frame); @@ -346,9 +348,9 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, std::size_t dataTypeMemoryUsage{maximumNumberFeatures * sizeof(CDataFrameUtils::SDataType)}; std::size_t featureSampleProbabilities{maximumNumberFeatures * sizeof(double)}; // Assuming either many or few missing rows, we get good compression of the bit - // vector. Specifically, we'll assume the average run length is 256 for which - // we get a constant 4 * 8 / 256. - std::size_t missingFeatureMaskMemoryUsage{32 * numberColumns * numberRows / 256}; + // vector. Specifically, we'll assume the average run length is 64 for which + // we get a constant 8 / 64. + std::size_t missingFeatureMaskMemoryUsage{8 * numberColumns * numberRows / 64}; std::size_t trainTestMaskMemoryUsage{ 2 * static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(m_NumberFolds)))) * numberRows}; @@ -1347,6 +1349,7 @@ void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { } void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { + // The final model training uses more data so it's monitored separately. 
m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINAL_TRAINING); @@ -1354,6 +1357,10 @@ void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { m_MaximumNumberTrees, m_Instrumentation->progressCallback(), 1.0, 1024}; } +void CBoostedTreeImpl::skipProgressMonitoringFinalTrain() { + m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINAL_TRAINING); +} + namespace { const std::string VERSION_7_8_TAG{"7.8"}; const TStrVec SUPPORTED_VERSIONS{VERSION_7_8_TAG};