
Commit 09be0fa

[7.1][ML] Improve residual model selection (#474)
Backport #468.
1 parent a93c883 commit 09be0fa

11 files changed: +284, -253 lines

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions

@@ -40,6 +40,8 @@ to the model. (See {ml-pull}214[#214].)
 * Only select more complex trend models for forecasting if there is evidence that they are needed.
 (See {ml-pull}463[#463].)
 
+* Improve residual model selection. (See {ml-pull}468[#468].)
+
 === Bug Fixes
 
 * Handle NaNs when detrending seasonal components. {ml-pull}408[#408]

include/maths/COneOfNPrior.h

Lines changed: 4 additions & 1 deletion

@@ -324,7 +324,7 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
     using TDoubleSizePr5Vec = core::CSmallVector<TDoubleSizePr, 5>;
     using TWeightPriorPtrPr = std::pair<CModelWeight, TPriorPtr>;
     using TWeightPriorPtrPrVec = std::vector<TWeightPriorPtrPr>;
-    using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator;
+    using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
 
 private:
     //! Read parameters from \p traverser.
@@ -350,6 +350,9 @@ class MATHS_EXPORT COneOfNPrior : public CPrior {
 private:
     //! A collection of component models and their probabilities.
     TWeightPriorPtrPrVec m_Models;
+
+    //! The moments of the samples added.
+    TMeanVarAccumulator m_SampleMoments;
 };
 }
 }
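
The new m_SampleMoments member accumulates the mean and variance of the samples added to the prior. As a rough illustration of what a mean/variance accumulator of this kind computes, here is a minimal Welford-style sketch; the class name and interface are hypothetical stand-ins, not the actual CBasicStatistics::SSampleMeanVar implementation:

#include <cstddef>
#include <initializer_list>
#include <iostream>

// Hypothetical stand-in for CBasicStatistics::SSampleMeanVar<double>::TAccumulator:
// a Welford-style online accumulator for the sample mean and variance.
class CMeanVarAccumulator {
public:
    void add(double x) {
        ++m_Count;
        double delta{x - m_Mean};
        m_Mean += delta / static_cast<double>(m_Count);
        m_SumSquares += delta * (x - m_Mean);
    }
    double mean() const { return m_Mean; }
    double variance() const {
        // Unbiased sample variance; zero until we have two samples.
        return m_Count > 1 ? m_SumSquares / static_cast<double>(m_Count - 1) : 0.0;
    }

private:
    std::size_t m_Count{0};
    double m_Mean{0.0};
    double m_SumSquares{0.0};
};

int main() {
    CMeanVarAccumulator moments;
    for (double sample : {1.0, 2.0, 4.0, 8.0}) {
        moments.add(sample);
    }
    std::cout << "mean = " << moments.mean()           // 3.75
              << ", variance = " << moments.variance() // ~9.58
              << '\n';
}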

include/maths/Constants.h

Lines changed: 0 additions & 3 deletions

@@ -116,9 +116,6 @@ const double MINIMUM_CLUSTER_SPLIT_COUNT{24.0};
 
 //! The minimum count of a category in the sketch to cluster.
 const double MINIMUM_CATEGORY_COUNT{0.5};
-
-//! Get the maximum amount we'll penalize a model in addSamples.
-MATHS_EXPORT double maxModelPenalty(double numberSamples);
 }
 }

lib/api/unittest/CAnomalyJobLimitTest.cc

Lines changed: 9 additions & 9 deletions

@@ -349,12 +349,12 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
         std::size_t s_ExpectedByFields;
         std::size_t s_ExpectedOverFields;
         std::size_t s_ExpectedPartitionFields;
-        std::size_t s_ExpectedByLowerMemoryLimit;
-        std::size_t s_ExpectedPartitionLowerMemoryLimit;
-        std::size_t s_ExpectedOverLowerMemoryLimit;
-    } testParams[] = {{600, 550, 6000, 300, 33, 40, 40},
-                      {3600, 550, 5500, 300, 30, 25, 20},
-                      {172800, 150, 850, 120, 6, 6, 3}};
+        std::size_t s_ExpectedByMemoryUsageRelativeErrorDivisor;
+        std::size_t s_ExpectedPartitionUsageRelativeErrorDivisor;
+        std::size_t s_ExpectedOverUsageRelativeErrorDivisor;
+    } testParams[]{{600, 550, 6000, 300, 33, 40, 40},
+                   {3600, 550, 5500, 300, 27, 25, 20},
+                   {172800, 150, 850, 120, 6, 6, 3}};
 
     for (const auto& testParam : testParams) {
         TGeneratorVec generators{periodic, tradingDays, level, ramp, sparse};
@@ -410,7 +410,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
             CPPUNIT_ASSERT_EQUAL(std::size_t(2), used.s_PartitionFields);
             CPPUNIT_ASSERT_DOUBLES_EQUAL(
                 memoryLimit * 1024 * 1024 / 2, used.s_Usage,
-                memoryLimit * 1024 * 1024 / testParam.s_ExpectedByLowerMemoryLimit);
+                memoryLimit * 1024 * 1024 / testParam.s_ExpectedByMemoryUsageRelativeErrorDivisor);
         }
 
         LOG_DEBUG(<< "**** Test partition with bucketLength = " << testParam.s_BucketLength
@@ -461,7 +461,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
                 0.96 * static_cast<double>(used.s_PartitionFields));
             CPPUNIT_ASSERT_DOUBLES_EQUAL(
                 memoryLimit * 1024 * 1024 / 2, used.s_Usage,
-                memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionLowerMemoryLimit);
+                memoryLimit * 1024 * 1024 / testParam.s_ExpectedPartitionUsageRelativeErrorDivisor);
         }
 
         LOG_DEBUG(<< "**** Test over with bucketLength = " << testParam.s_BucketLength
@@ -508,7 +508,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
                              used.s_OverFields < 7000);
             CPPUNIT_ASSERT_DOUBLES_EQUAL(
                 memoryLimit * 1024 * 1024 / 2, used.s_Usage,
-                memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverLowerMemoryLimit);
+                memoryLimit * 1024 * 1024 / testParam.s_ExpectedOverUsageRelativeErrorDivisor);
         }
     }
 }
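
The renamed struct members make the intent of the last three columns in testParams explicit: each is a divisor fixing the permitted relative error between measured memory usage and half the configured limit. Below is a small sketch of the tolerance arithmetic behind CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, delta), which passes when |expected - actual| <= delta; the memoryLimit and usage values here are hypothetical:

#include <cassert>
#include <cmath>

int main() {
    // Hypothetical numbers: a 4 MB model memory limit and the divisor 33 from
    // the first testParams row (s_ExpectedByMemoryUsageRelativeErrorDivisor).
    double memoryLimit{4.0};
    double divisor{33.0};

    // The test asserts usage is close to half the limit...
    double expected{memoryLimit * 1024 * 1024 / 2};
    // ...to within limit / divisor bytes, so a larger divisor means a tighter check.
    double delta{memoryLimit * 1024 * 1024 / divisor};

    // Equivalent of CPPUNIT_ASSERT_DOUBLES_EQUAL(expected, actual, delta).
    double actual{2.05 * 1024 * 1024}; // an example measured usage in bytes
    assert(std::fabs(expected - actual) <= delta);
    return 0;
}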

lib/maths/CMultivariateOneOfNPrior.cc

Lines changed: 66 additions & 65 deletions

@@ -54,13 +54,6 @@ const std::string WEIGHT_TAG("c");
 const std::string PRIOR_TAG("d");
 const std::string DECAY_RATE_TAG("e");
 
-//! Add elements of \p x to \p y.
-void add(const TDouble10Vec& x, TDouble10Vec& y) {
-    for (std::size_t i = 0u; i < x.size(); ++i) {
-        y[i] += x[i];
-    }
-}
-
 //! Get the min of \p x and \p y.
 TDouble10Vec min(const TDouble10Vec& x, const TDouble10Vec& y) {
     TDouble10Vec result(x);
@@ -103,11 +96,6 @@ void updateMean(const TDouble10Vec10Vec& x, double nx, TDouble10Vec10Vec& mean,
     n += nx;
 }
 
-//! Get the largest element of \p x.
-double largest(const TDouble10Vec& x) {
-    return *std::max_element(x.begin(), x.end());
-}
-
 //! Add a model vector entry reading parameters from \p traverser.
 bool modelAcceptRestoreTraverser(const SDistributionRestoreParams& params,
                                  TWeightPriorPtrPrVec& models,
@@ -174,7 +162,7 @@ void modelAcceptPersistInserter(const CModelWeight& weight,
 const double DERATE = 0.99999;
 const double MINUS_INF = DERATE * boost::numeric::bounds<double>::lowest();
 const double INF = DERATE * boost::numeric::bounds<double>::highest();
-const double LOG_INITIAL_WEIGHT = std::log(1e-6);
+const double MAXIMUM_LOG_BAYES_FACTOR = std::log(1e6);
 const double MINIMUM_SIGNIFICANT_WEIGHT = 0.01;
 }
 
@@ -306,81 +294,94 @@ void CMultivariateOneOfNPrior::addSamples(const TDouble10Vec1Vec& samples,
 
     this->adjustOffset(samples, weights);
 
-    double penalty = CTools::fastLog(this->numberSamples());
+    double n{this->numberSamples()};
     this->CMultivariatePrior::addSamples(samples, weights);
-    penalty = (penalty - CTools::fastLog(this->numberSamples())) / 2.0;
+    n = this->numberSamples() - n;
 
     // See COneOfNPrior::addSamples for a discussion.
 
     CScopeCanonicalizeWeights<TPriorPtr> canonicalize(m_Models);
 
     // We need to check *before* adding samples to the constituent models.
-    bool isNonInformative = this->isNonInformative();
+    bool isNonInformative{this->isNonInformative()};
 
     // Compute the unnormalized posterior weights and update the component
    // priors. These weights are computed on the side since they are only
     // updated if all marginal likelihoods can be computed.
-    TDouble3Vec logLikelihoods;
-    TMaxAccumulator maxLogLikelihood;
-    TBool3Vec used, uses;
+    TDouble3Vec minusBics;
+    TBool3Vec used;
+    TBool3Vec uses;
+    bool failed{false};
+
+    // If none of the models are a good fit for the new data then restrict
+    // the maximum Bayes Factor since the data likelihood given the model
+    // is most useful if one of the models is (mostly) correct.
+    double m{std::max(n, 1.0)};
+    double maxLogBayesFactor{-m * MAXIMUM_LOG_BAYES_FACTOR};
+
     for (auto& model : m_Models) {
-        bool use = model.second->participatesInModelSelection();
-
-        // Update the weights with the marginal likelihoods.
-        double logLikelihood = 0.0;
-        maths_t::EFloatingPointErrorStatus status =
-            use ? model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood)
-                : maths_t::E_FpOverflowed;
-        if (status & maths_t::E_FpFailed) {
-            LOG_ERROR(<< "Failed to compute log-likelihood");
-            LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
-        } else {
-            if (!(status & maths_t::E_FpOverflowed)) {
-                logLikelihood += model.second->unmarginalizedParameters() * penalty;
-                logLikelihoods.push_back(logLikelihood);
-                maxLogLikelihood.add(logLikelihood);
-            } else {
-                logLikelihoods.push_back(MINUS_INF);
+
+        double minusBic{0.0};
+        maths_t::EFloatingPointErrorStatus status{maths_t::E_FpOverflowed};
+
+        used.push_back(model.second->participatesInModelSelection());
+        if (used.back()) {
+            double logLikelihood;
+            status = model.second->jointLogMarginalLikelihood(samples, weights, logLikelihood);
+
+            minusBic = 2.0 * logLikelihood -
+                       model.second->unmarginalizedParameters() * CTools::fastLog(n);
+
+            double modeLogLikelihood;
+            TDouble10Vec1Vec modes;
+            modes.reserve(weights.size());
+            for (const auto& weight : weights) {
+                modes.push_back(model.second->marginalLikelihoodMode(weight));
             }
+            model.second->jointLogMarginalLikelihood(modes, weights, modeLogLikelihood);
+            maxLogBayesFactor = std::max(
+                maxLogBayesFactor, logLikelihood - std::max(modeLogLikelihood, logLikelihood));
+        }
+
+        minusBics.push_back((status & maths_t::E_FpOverflowed) ? MINUS_INF : minusBic);
+        failed |= (status & maths_t::E_FpFailed);
 
-            // Update the component prior distribution.
-            model.second->addSamples(samples, weights);
+        // Update the component prior distribution.
+        model.second->addSamples(samples, weights);
 
-            used.push_back(use);
-            uses.push_back(model.second->participatesInModelSelection());
+        uses.push_back(model.second->participatesInModelSelection());
+        if (!uses.back()) {
+            model.first.logWeight(MINUS_INF);
         }
     }
 
-    TDouble10Vec n(m_Dimension, 0.0);
-    for (const auto& weight : weights) {
-        add(maths_t::count(weight), n);
+    if (failed) {
+        LOG_ERROR(<< "Failed to compute log-likelihood");
+        LOG_ERROR(<< "samples = " << core::CContainerPrinter::print(samples));
+        LOG_ERROR(<< "weights = " << core::CContainerPrinter::print(weights));
+        return;
+    }
+    if (isNonInformative) {
+        return;
     }
 
-    if (!isNonInformative && maxLogLikelihood.count() > 0) {
-        LOG_TRACE(<< "logLikelihoods = " << core::CContainerPrinter::print(logLikelihoods));
+    maxLogBayesFactor += m * MAXIMUM_LOG_BAYES_FACTOR;
 
-        // The idea here is to limit the amount which extreme samples
-        // affect model selection, particularly early on in the model
-        // life-cycle.
-        double l = largest(n);
-        double minLogLikelihood =
-            maxLogLikelihood[0] -
-            l * std::min(maxModelPenalty(this->numberSamples()), 100.0);
+    LOG_TRACE(<< "BICs = " << core::CContainerPrinter::print(minusBics));
+    LOG_TRACE(<< "max Bayes Factor = " << maxLogBayesFactor);
 
-        TMaxAccumulator maxLogWeight;
-        for (std::size_t i = 0; i < logLikelihoods.size(); ++i) {
-            CModelWeight& weight = m_Models[i].first;
-            if (!uses[i]) {
-                weight.logWeight(MINUS_INF);
-            } else if (used[i]) {
-                weight.addLogFactor(std::max(logLikelihoods[i], minLogLikelihood));
-                maxLogWeight.add(weight.logWeight());
-            }
+    double maxLogModelWeight{MINUS_INF + m * MAXIMUM_LOG_BAYES_FACTOR};
+    double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};
+    for (std::size_t i = 0; i < m_Models.size(); ++i) {
+        if (used[i] && uses[i]) {
+            m_Models[i].first.addLogFactor(
                std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor));
+            maxLogModelWeight = std::max(maxLogModelWeight, m_Models[i].first.logWeight());
         }
-        for (std::size_t i = 0u; i < m_Models.size(); ++i) {
-            if (!used[i] && uses[i]) {
-                m_Models[i].first.logWeight(maxLogWeight[0] + LOG_INITIAL_WEIGHT);
-            }
+    }
+    for (std::size_t i = 0; i < m_Models.size(); ++i) {
+        if (!used[i] && uses[i]) {
+            m_Models[i].first.logWeight(maxLogModelWeight - m * MAXIMUM_LOG_BAYES_FACTOR);
        }
     }
 
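
The substance of the change: instead of penalizing each model's log-likelihood by an ad hoc amount (the removed maxModelPenalty), the posterior weight update now uses half of "minus BIC", i.e. (2*logL - k*log n) / 2, and clamps how far any model can fall behind the best via the maximum log Bayes factor. The following is a self-contained sketch of that weight update under simplifying assumptions: plain structs stand in for CModelWeight and m_Models, and the cap is applied as the constant m * MAXIMUM_LOG_BAYES_FACTOR rather than the data-dependent value the real code derives from mode likelihoods:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const double MAXIMUM_LOG_BAYES_FACTOR{std::log(1e6)};

    // Hypothetical per-model summaries for one batch of n samples: the joint
    // log marginal likelihood and the number of unmarginalized parameters.
    struct SModel {
        double s_LogLikelihood;
        double s_Parameters;
        double s_LogWeight{0.0};
    };
    std::vector<SModel> models{{-105.0, 1.0}, {-98.0, 3.0}};
    double n{20.0};

    // "Minus BIC" per model: 2 * logL - k * log(n). Larger is better; the
    // k * log(n) term penalizes extra parameters more as more data arrive.
    std::vector<double> minusBics;
    for (const auto& model : models) {
        minusBics.push_back(2.0 * model.s_LogLikelihood -
                            model.s_Parameters * std::log(n));
    }

    // Weight update mirroring addLogFactor(std::max(minusBics[i] / 2.0,
    // maxMinusBic / 2.0 - maxLogBayesFactor)): no model's weight can fall
    // more than the cap behind the best in a single update.
    double m{std::max(n, 1.0)};
    double maxLogBayesFactor{m * MAXIMUM_LOG_BAYES_FACTOR}; // simplified cap
    double maxMinusBic{*std::max_element(minusBics.begin(), minusBics.end())};
    for (std::size_t i = 0; i < models.size(); ++i) {
        models[i].s_LogWeight +=
            std::max(minusBics[i] / 2.0, maxMinusBic / 2.0 - maxLogBayesFactor);
    }

    // The higher-likelihood model wins here despite its extra parameters:
    // its likelihood advantage outweighs the BIC penalty at n = 20.
    for (const auto& model : models) {
        std::cout << "log weight = " << model.s_LogWeight << '\n';
    }
}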
