diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 59de98112c..252a13e975 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -28,6 +28,7 @@ === Enhancements Improve and use periodic boundary condition for seasonal component modeling ({pull}84[#84]) +Improve the robustness of the detection and initialisation of seasonal components to outliers ({pull}90[#90]) === Bug Fixes diff --git a/include/maths/CPeriodicityHypothesisTests.h b/include/maths/CPeriodicityHypothesisTests.h index fe63c58a31..b32eb7a9f8 100644 --- a/include/maths/CPeriodicityHypothesisTests.h +++ b/include/maths/CPeriodicityHypothesisTests.h @@ -225,6 +225,8 @@ class MATHS_EXPORT CPeriodicityHypothesisTests { CPeriodicityHypothesisTestsResult s_H0; //! The variance estimate of H0. double s_V0; + //! The autocorrelation estimate of H0. + double s_R0; //! The degrees of freedom in the variance estimate of H0. double s_DF0; //! The trend for the null hypothesis. @@ -361,9 +363,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests { //! Condition \p buckets assuming the null hypothesis is true. //! //! This removes any trend associated with the null hypothesis. - void conditionOnHypothesis(const TTimeTimePr2Vec& windows, - const STestStats& stats, - TFloatMeanAccumulatorVec& buckets) const; + void conditionOnHypothesis(const STestStats& stats, TFloatMeanAccumulatorVec& buckets) const; //! Test to see if there is significant evidence for a component //! with period \p period. diff --git a/include/maths/CTimeSeriesDecompositionDetail.h b/include/maths/CTimeSeriesDecompositionDetail.h index 441b9464aa..8f1f3b81af 100644 --- a/include/maths/CTimeSeriesDecompositionDetail.h +++ b/include/maths/CTimeSeriesDecompositionDetail.h @@ -609,6 +609,20 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { maths_t::TCalendarComponentVec& components, TComponentErrorsVec& errors) const; + //! Reweight the outlier values in \p values. + //! + //! These are the values with the largest error w.r.t. \p predictor. + void reweightOutliers(core_t::TTime startTime, + core_t::TTime dt, + TPredictor predictor, + TFloatMeanAccumulatorVec& values) const; + + //! Fit the trend component \p trend to \p values. + void fit(core_t::TTime startTime, + core_t::TTime dt, + const TFloatMeanAccumulatorVec& values, + CTrendComponent& trend) const; + //! Clear all component error statistics. void clearComponentErrors(); diff --git a/include/maths/CTools.h b/include/maths/CTools.h index 6d951d2d1b..5b298d8632 100644 --- a/include/maths/CTools.h +++ b/include/maths/CTools.h @@ -665,19 +665,16 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { //! Sigmoid function of \p p. static double sigmoid(double p) { return 1.0 / (1.0 + 1.0 / p); } - //! A smooth Heaviside function centred at one. + //! The logistic function. //! - //! This is a smooth version of the Heaviside function implemented - //! as \f$sigmoid\left(\frac{sign (x - 1)}{wb}\right)\f$ normalized - //! to the range [0, 1], where \f$b\f$ is \p boundary and \f$w\f$ - //! is \p width. Note, if \p sign is one this is a step up and if - //! it is -1 it is a step down. + //! i.e. \f$sigmoid\left(e^{s (x - x_0) / w}\right)\f$, where \f$s\f$ is + //! the sign of \p sign, \f$w\f$ is \p width and \f$x_0\f$ is \p x0. //! //! \param[in] x The argument. //! \param[in] width The step width. + //! \param[in] x0 The centre of the step. //! \param[in] sign Determines whether it's a step up or down.
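+ //! \note Since \f$sigmoid(1) = 0.5\f$, this function equals 0.5 at \p x + //! equal to \p x0 whatever the \p width; the periodicity tests rely on + //! this when comparing products of logistic factors with thresholds of + //! the form \f$0.5^n\f$.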
- static double smoothHeaviside(double x, double width, double sign = 1.0) { - return sigmoid(std::exp(sign * (x - 1.0) / width)) / sigmoid(std::exp(1.0 / width)); + static double logisticFunction(double x, double width, double x0 = 0.0, double sign = 1.0) { + return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width)); } }; } diff --git a/include/maths/Constants.h b/include/maths/Constants.h index 5745de926d..99de4b29fa 100644 --- a/include/maths/Constants.h +++ b/include/maths/Constants.h @@ -12,6 +12,8 @@ #include #include +#include <cmath> + namespace ml { namespace maths { @@ -50,22 +52,39 @@ const double LOG_NORMAL_OFFSET_MARGIN{1.0}; //! reduce the prediction error variance and still be worthwhile //! modeling. We have different thresholds because we have inductive //! bias for particular types of components. -const double SIGNIFICANT_VARIANCE_REDUCTION[]{0.7, 0.5}; +const double COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[]{0.6, 0.4}; //! The minimum repeated amplitude of a seasonal component, as a //! multiple of error standard deviation, to be worthwhile modeling. //! We have different thresholds because we have inductive bias for //! particular types of components. -const double SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; +const double SEASONAL_SIGNIFICANT_AMPLITUDE[]{1.0, 2.0}; //! The minimum autocorrelation of a seasonal component to be //! worthwhile modeling. We have different thresholds because we //! have inductive bias for particular types of components. -const double SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.7}; +const double SEASONAL_SIGNIFICANT_AUTOCORRELATION[]{0.5, 0.6}; + +//! The fraction of values which are treated as outliers when testing +//! for and initializing a seasonal component. +const double SEASONAL_OUTLIER_FRACTION{0.1}; + +//! The minimum multiplier of the mean inlier fraction difference +//! (from a periodic pattern) to constitute an outlier when testing +//! for and initializing a seasonal component. +const double SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD{3.0}; -//! The maximum significance of a test statistic to choose to model +//! The weight to assign outliers when testing for and initializing +//! a seasonal component. +const double SEASONAL_OUTLIER_WEIGHT{0.1}; + +//! The significance of a test statistic to choose to model //! a trend decomposition component. -const double MAXIMUM_SIGNIFICANCE{0.001}; +const double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; + +//! The log of COMPONENT_STATISTICALLY_SIGNIFICANT. +const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ + std::log(COMPONENT_STATISTICALLY_SIGNIFICANT)}; //! The minimum variance scale for which the likelihood function //! can be accurately adjusted. For smaller scales errors are diff --git a/lib/maths/CPeriodicityHypothesisTests.cc b/lib/maths/CPeriodicityHypothesisTests.cc index 5ff686da0d..8cadc39e36 100644 --- a/lib/maths/CPeriodicityHypothesisTests.cc +++ b/lib/maths/CPeriodicityHypothesisTests.cc @@ -44,6 +44,7 @@ namespace maths { namespace { using TDoubleVec = std::vector<double>; +using TSizeVec = std::vector<std::size_t>; using TTimeVec = std::vector<core_t::TTime>; using TSizeSizePr = std::pair<std::size_t, std::size_t>; using TSizeSizePr2Vec = core::CSmallVector<TSizeSizePr, 2>; @@ -55,12 +56,24 @@ using TTimeTimePr = std::pair<core_t::TTime, core_t::TTime>; using TTimeTimePr2Vec = core::CSmallVector<TTimeTimePr, 2>; using TTimeTimePrMeanVarAccumulatorPr = std::pair<TTimeTimePr, TMeanVarAccumulator>; +//! The confidence interval used for test statistic values. +const double CONFIDENCE_INTERVAL{80.0}; +//! The soft minimum number of repeats which we'll use to test +//! for periodicity using the variance test.
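+//! This minimum is soft because the repeat count is only one factor in +//! a smooth decision function: fewer repeats can still be accepted when +//! the variance reduction and autocorrelation evidence is strong.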
+const double MINIMUM_REPEATS_TO_TEST_VARIANCE{3.0}; +//! The minimum number of repeats which we'll use to test for +//! periodicity using the amplitude test. +const std::size_t MINIMUM_REPEATS_TO_TEST_AMPLITUDE{4}; +//! A high priority for components we want to take precedence. +double HIGH_PRIORITY{2.0}; + //! \brief Accumulates the minimum amplitude. class CMinAmplitude { public: CMinAmplitude(std::size_t n, double level) - : m_Level(level), m_Count(0), m_Min(std::max(n, MINIMUM_COUNT_TO_TEST)), - m_Max(std::max(n, MINIMUM_COUNT_TO_TEST)) {} + : m_Level(level), m_Count(0), + m_Min(std::max(n, MINIMUM_REPEATS_TO_TEST_AMPLITUDE)), + m_Max(std::max(n, MINIMUM_REPEATS_TO_TEST_AMPLITUDE)) {} void add(double x, double n) { if (n > 0.0) { @@ -71,7 +84,7 @@ class CMinAmplitude { } double amplitude() const { - if (this->count() >= MINIMUM_COUNT_TO_TEST) { + if (this->count() >= MINIMUM_REPEATS_TO_TEST_AMPLITUDE) { return std::max(std::max(-m_Min.biggest(), 0.0), std::max(m_Max.biggest(), 0.0)); } @@ -79,15 +92,13 @@ class CMinAmplitude { } double significance(const boost::math::normal& normal) const { - if (this->count() < MINIMUM_COUNT_TO_TEST) { + if (this->count() < MINIMUM_REPEATS_TO_TEST_AMPLITUDE) { return 1.0; } - double F{2.0 * CTools::safeCdf(normal, -this->amplitude())}; if (F == 0.0) { return 0.0; } - double n{static_cast<double>(this->count())}; boost::math::binomial binomial(static_cast<double>(m_Count), F); return CTools::safeCdfComplement(binomial, n - 1.0); @@ -101,10 +112,6 @@ class CMinAmplitude { private: std::size_t count() const { return m_Min.count(); } -private: - //! The minimum number of repeats for which we'll test. - static const std::size_t MINIMUM_COUNT_TO_TEST; - private: //! The mean of the trend. double m_Level; @@ -116,18 +123,16 @@ class CMinAmplitude { TMaxAccumulator m_Max; }; -const std::size_t CMinAmplitude::MINIMUM_COUNT_TO_TEST{4}; - using TMinAmplitudeVec = std::vector<CMinAmplitude>; //! \brief Holds the relevant summary for choosing between alternative //! (non-nested) hypotheses. struct SHypothesisSummary { - SHypothesisSummary(double v, double DF, const CPeriodicityHypothesisTestsResult& H) - : s_V(v), s_DF(DF), s_H(H) {} - double s_V; + double s_R; double s_DF; + double s_Vt; + double s_Rt; CPeriodicityHypothesisTestsResult s_H; }; @@ -159,25 +164,21 @@ const std::string DIURNAL_COMPONENT_NAMES[] = { "weekend daily", "weekend weekly", "weekday daily", "weekday weekly", "daily", "weekly"}; -//! The confidence interval used for test statistic values. -const double CONFIDENCE_INTERVAL{80.0}; -//! A high priority for components we want to take precendence. -double HIGH_PRIORITY{2.0}; - //! Fit and remove a linear trend from \p values.
void removeLinearTrend(TFloatMeanAccumulatorVec& values) { using TRegression = CRegression::CLeastSquaresOnline<1, double>; - TRegression trend; - double time{0.0}; double dt{10.0 / static_cast<double>(values.size())}; + double time{dt / 2.0}; for (const auto& value : values) { trend.add(time, CBasicStatistics::mean(value), CBasicStatistics::count(value)); time += dt; } time = dt / 2.0; for (auto& value : values) { - CBasicStatistics::moment<0>(value) -= trend.predict(time); + if (CBasicStatistics::count(value) > 0.0) { + CBasicStatistics::moment<0>(value) -= trend.predict(time); + } time += dt; } } @@ -285,9 +286,9 @@ void project(const TFloatMeanAccumulatorVec& values, calculateIndexWindows(windows_, bucketLength, windows); result.reserve(length(windows)); std::size_t n{values.size()}; - for (std::size_t i = 0u; i < windows.size(); ++i) { - std::size_t a{windows[i].first}; - std::size_t b{windows[i].second}; + for (const auto& window : windows) { + std::size_t a{window.first}; + std::size_t b{window.second}; for (std::size_t j = a; j < b; ++j) { const TFloatMeanAccumulator& value{values[j % n]}; result.push_back(value); @@ -296,20 +297,113 @@ } } +//! Calculate the number of non-empty buckets at each bucket offset in +//! the period for the \p values in \p windows. +TSizeVec calculateRepeats(const TSizeSizePr2Vec& windows, + std::size_t period, + TFloatMeanAccumulatorVec& values) { + TSizeVec result(std::min(period, length(windows[0])), 0); + std::size_t n{values.size()}; + for (const auto& window : windows) { + std::size_t a{window.first}; + std::size_t b{window.second}; + for (std::size_t i = a; i < b; ++i) { + if (CBasicStatistics::count(values[i % n]) > 0.0) { + ++result[(i - a) % period]; + } + } + } + return result; +} + +//! Calculate the number of non-empty buckets at each bucket offset in +//! the period for the \p values in \p windows. +TSizeVec calculateRepeats(const TTimeTimePr2Vec& windows_, + core_t::TTime period, + core_t::TTime bucketLength, + TFloatMeanAccumulatorVec& values) { + TSizeSizePr2Vec windows; + calculateIndexWindows(windows_, bucketLength, windows); + return calculateRepeats(windows, period / bucketLength, values); +} + +//! Reweight outliers from \p values. +//! +//! These are defined as some fraction of the values which are most +//! different from the periodic trend on the time windows \p windows_. +template<typename T> +void reweightOutliers(const std::vector<T>& trend, + const TTimeTimePr2Vec& windows_, + core_t::TTime bucketLength, + TFloatMeanAccumulatorVec& values) { + using TDoubleSizePr = std::pair<double, std::size_t>; + using TMaxAccumulator = + CBasicStatistics::COrderStatisticsHeap<TDoubleSizePr, std::greater<TDoubleSizePr>>; + + if (!values.empty()) { + TSizeSizePr2Vec windows; + calculateIndexWindows(windows_, bucketLength, windows); + std::size_t period{trend.size()}; + std::size_t n{values.size()}; + + TSizeVec repeats{calculateRepeats(windows, period, values)}; + double excess{std::accumulate( + repeats.begin(), repeats.end(), 0.0, [](double excess_, std::size_t repeat) { + return excess_ + static_cast<double>(repeat > 1 ?
repeat - 1 : 0); + })}; + std::size_t numberOutliers{static_cast<std::size_t>(SEASONAL_OUTLIER_FRACTION * excess)}; + LOG_TRACE(<< "Number outliers = " << numberOutliers); + + if (numberOutliers > 0) { + TMaxAccumulator outliers{numberOutliers}; + TMeanAccumulator meanDifference; + for (const auto& window : windows) { + std::size_t a{window.first}; + std::size_t b{window.second}; + for (std::size_t j = a; j < b; ++j) { + const TFloatMeanAccumulator& value{values[j % n]}; + if (CBasicStatistics::count(value) > 0.0) { + std::size_t offset{(j - a) % period}; + double difference{std::fabs(CBasicStatistics::mean(value) - + CBasicStatistics::mean(trend[offset]))}; + outliers.add({difference, j}); + meanDifference.add(difference); + } + } + } + TMeanAccumulator meanDifferenceOfOutliers; + for (const auto& outlier : outliers) { + meanDifferenceOfOutliers.add(outlier.first); + } + meanDifference -= meanDifferenceOfOutliers; + LOG_TRACE(<< "mean difference = " << CBasicStatistics::mean(meanDifference)); + LOG_TRACE(<< "outliers = " << core::CContainerPrinter::print(outliers)); + + for (const auto& outlier : outliers) { + if (outlier.first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD * + CBasicStatistics::mean(meanDifference)) { + CBasicStatistics::count(values[outlier.second % n]) *= SEASONAL_OUTLIER_WEIGHT; + } + } + LOG_TRACE(<< "Values - outliers = " << core::CContainerPrinter::print(values)); + } + } +} + //! Compute the periodic trend from \p values falling in \p windows. -template<typename U, typename V> -void periodicTrend(const U& values, +template<typename T, typename CONTAINER> +void periodicTrend(const std::vector<T>& values, const TSizeSizePr2Vec& windows_, core_t::TTime bucketLength, - V& trend) { + CONTAINER& trend) { if (!trend.empty()) { TSizeSizePr2Vec windows; calculateIndexWindows(windows_, bucketLength, windows); std::size_t period{trend.size()}; std::size_t n{values.size()}; - for (std::size_t i = 0u; i < windows.size(); ++i) { - std::size_t a{windows[i].first}; - std::size_t b{windows[i].second}; + for (const auto& window : windows) { + std::size_t a{window.first}; + std::size_t b{window.second}; for (std::size_t j = a; j < b; ++j) { const TFloatMeanAccumulator& value{values[j % n]}; trend[(j - a) % period].add(CBasicStatistics::mean(value), @@ -319,6 +413,18 @@ void periodicTrend(const U& values, } } +//! Compute the periodic trend on \p values minus outliers. +template<typename T, typename CONTAINER> +void periodicTrendMinusOutliers(std::vector<T>& values, + const TSizeSizePr2Vec& windows, + core_t::TTime bucketLength, + CONTAINER& trend) { + periodicTrend(values, windows, bucketLength, trend); + reweightOutliers(trend, windows, bucketLength, values); + trend.assign(trend.size(), TMeanVarAccumulator{}); + periodicTrend(values, windows, bucketLength, trend); +} + //! Compute the average of the values at \p times. void averageValue(const TFloatMeanAccumulatorVec& values, const TTimeVec& times, @@ -364,8 +470,8 @@ struct SResidualVarianceImpl { }; //! Compute the residual variance of the trend \p trend. -template<typename R, typename T> -R residualVariance(const T& trend, double scale) { +template<typename R, typename CONTAINER> +R residualVariance(const CONTAINER& trend, double scale) { TMeanAccumulator result; for (const auto& bucket : trend) { result.add(CBasicStatistics::maximumLikelihoodVariance(bucket), @@ -881,9 +987,14 @@ void CPeriodicityHypothesisTests::hypothesesForPeriod(const TTimeTimePr2Vec& win CPeriodicityHypothesisTestsResult CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const { - // Note if there isn't a clear cut best hypothesis for variance - // reduction we choose the simplest hypothesis, i.e.
with maximum - // degrees-of-freedom. + // We are comparing different accepted hypotheses here. In particular, + // diurnal and the best non-diurnal components with and without fitting + // a linear ramp to the values. We use a smooth decision function to + // select between them preferring: + // 1) The hypothesis which explains the most variance, + // 2) The simplest hypothesis (fewest parameters), + // 3) The hypothesis which most exceeds the minimum autocorrelation + // to accept. using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; @@ -898,24 +1009,42 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const STestStats stats; CPeriodicityHypothesisTestsResult resultForHypothesis{hypothesis.test(stats)}; if (stats.s_B > stats.s_DF0) { - summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0, - std::move(resultForHypothesis)); + if (!resultForHypothesis.periodic()) { + stats.setThresholds( + COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[E_HighThreshold], + SEASONAL_SIGNIFICANT_AMPLITUDE[E_HighThreshold], + SEASONAL_SIGNIFICANT_AUTOCORRELATION[E_HighThreshold]); + stats.s_R0 = stats.s_Rt; + } + LOG_TRACE(<< resultForHypothesis.print()); + summaries.push_back(SHypothesisSummary{ + stats.s_V0, stats.s_R0, stats.s_B - stats.s_DF0, stats.s_Vt, + stats.s_Rt, std::move(resultForHypothesis)}); } } - TMinAccumulator vCutoff; - for (const auto& summary : summaries) { - vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF, - 50.0 + CONFIDENCE_INTERVAL / 2.0)); - } - if (vCutoff.count() > 0) { - LOG_TRACE(<< "variance cutoff = " << vCutoff[0]); + if (!summaries.empty()) { + TMinAccumulator vmin; + TMinAccumulator DFmin; + for (const auto& summary : summaries) { + vmin.add(varianceAtPercentile(summary.s_V, summary.s_DF, + 50.0 + CONFIDENCE_INTERVAL / 2.0) / + summary.s_Vt); + DFmin.add(summary.s_DF); + } - TMinAccumulator df; + TMinAccumulator pmin; for (const auto& summary : summaries) { double v{varianceAtPercentile(summary.s_V, summary.s_DF, - 50.0 - CONFIDENCE_INTERVAL / 2.0)}; - if (v <= vCutoff[0] && df.add(-summary.s_DF)) { + 50.0 - CONFIDENCE_INTERVAL / 2.0) / + summary.s_Vt / vmin[0]}; + double R{summary.s_R / summary.s_Rt}; + double DF{summary.s_DF / DFmin[0]}; + double p{CTools::logisticFunction(v, 0.2, 1.0, -1.0) * + CTools::logisticFunction(R, 0.2, 1.0, +1.0) * + CTools::logisticFunction(DF, 0.2, 1.0, +1.0)}; + LOG_TRACE(<< "p = " << p); + if (pmin.add(-p)) { result = summary.s_H; } } @@ -941,9 +1070,9 @@ CPeriodicityHypothesisTests::testForDaily(const TTimeTimePr2Vec& windows, CPeriodicityHypothesisTestsResult result{stats.s_H0}; stats.s_HasPeriod = m_Config.hasDaily(); - stats.setThresholds(SIGNIFICANT_VARIANCE_REDUCTION[E_LowThreshold], - SIGNIFICANT_AMPLITUDE[E_LowThreshold], - SIGNIFICANT_AUTOCORRELATION[E_LowThreshold]); + stats.setThresholds(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[E_LowThreshold], + SEASONAL_SIGNIFICANT_AMPLITUDE[E_LowThreshold], + SEASONAL_SIGNIFICANT_AUTOCORRELATION[E_LowThreshold]); if (m_Config.testForDiurnal() && m_BucketLength <= DAY / 4 && this->seenSufficientDataToTest(DAY, buckets) && @@ -966,9 +1095,9 @@ CPeriodicityHypothesisTests::testForWeekly(const TTimeTimePr2Vec& windows, CPeriodicityHypothesisTestsResult result{stats.s_H0}; stats.s_HasPeriod = m_Config.hasWeekly(); - stats.setThresholds(SIGNIFICANT_VARIANCE_REDUCTION[E_LowThreshold], - SIGNIFICANT_AMPLITUDE[E_LowThreshold], - SIGNIFICANT_AUTOCORRELATION[E_LowThreshold]); + stats.setThresholds(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[E_LowThreshold], + 
SEASONAL_SIGNIFICANT_AMPLITUDE[E_LowThreshold], + SEASONAL_SIGNIFICANT_AUTOCORRELATION[E_LowThreshold]); if (m_Config.testForDiurnal() && m_BucketLength <= WEEK / 4 && this->seenSufficientDataToTest(WEEK, buckets) && @@ -993,9 +1122,9 @@ CPeriodicityHypothesisTests::testForDailyWithWeekend(const TFloatMeanAccumulator stats.s_HasPartition = m_Config.hasWeekend(); stats.s_StartOfPartition = m_Config.hasWeekend() ? m_Config.startOfWeek() : 0; - stats.setThresholds(SIGNIFICANT_VARIANCE_REDUCTION[E_HighThreshold], - SIGNIFICANT_AMPLITUDE[E_HighThreshold], - SIGNIFICANT_AUTOCORRELATION[E_HighThreshold]); + stats.setThresholds(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[E_HighThreshold], + SEASONAL_SIGNIFICANT_AMPLITUDE[E_HighThreshold], + SEASONAL_SIGNIFICANT_AUTOCORRELATION[E_HighThreshold]); TTimeTimePr2Vec partition{{0, WEEKEND}, {WEEKEND, WEEK}}; std::size_t bucketsPerWeek(WEEK / m_BucketLength); @@ -1092,9 +1221,9 @@ CPeriodicityHypothesisTests::testForPeriod(const TTimeTimePr2Vec& windows, this->seenSufficientDataToTest(m_Period, buckets)) { stats.s_HasPeriod = false; EThreshold index{m_Period % DAY == 0 ? E_LowThreshold : E_HighThreshold}; - stats.setThresholds(SIGNIFICANT_VARIANCE_REDUCTION[index], - SIGNIFICANT_AMPLITUDE[index], - SIGNIFICANT_AUTOCORRELATION[index]); + stats.setThresholds(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[index], + SEASONAL_SIGNIFICANT_AMPLITUDE[index], + SEASONAL_SIGNIFICANT_AUTOCORRELATION[index]); if (this->testPeriod(windows, buckets, m_Period, stats)) { stats.s_StartOfPartition = 0; stats.s_Partition.assign(1, {0, length(buckets, m_BucketLength)}); @@ -1168,14 +1297,15 @@ void CPeriodicityHypothesisTests::nullHypothesis(const TTimeTimePr2Vec& window, STestStats& stats) const { if (this->testStatisticsFor(buckets, stats)) { TMeanVarAccumulatorVec trend(1); - periodicTrend(buckets, window, m_BucketLength, trend); + TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); + periodicTrendMinusOutliers(values, window, m_BucketLength, trend); double mean{CBasicStatistics::mean(trend[0])}; double v0{CBasicStatistics::variance(trend[0])}; LOG_TRACE(<< "mean = " << mean); LOG_TRACE(<< "variance = " << v0); stats.s_DF0 = 1.0; stats.s_V0 = v0; - stats.s_T0.assign(1, TDoubleVec{mean}); + stats.s_T0.assign(1, {mean}); stats.s_Partition = window; } } @@ -1188,29 +1318,27 @@ void CPeriodicityHypothesisTests::hypothesis(const TTime2Vec& periods, stats.s_DF0 = 0.0; stats.s_T0 = TDoubleVec2Vec(stats.s_Partition.size()); for (std::size_t i = 0u; i < stats.s_Partition.size(); ++i) { - core_t::TTime period_{std::min(periods[i], length(stats.s_Partition[i])) / m_BucketLength}; + core_t::TTime period{std::min(periods[i], length(stats.s_Partition[i])) / m_BucketLength}; TTimeTimePr2Vec windows(calculateWindows( stats.s_StartOfPartition, length(buckets, m_BucketLength), length(stats.s_Partition), stats.s_Partition[i])); + TMeanVarAccumulatorVec trend(periods[i] / m_BucketLength); - periodicTrend(buckets, windows, m_BucketLength, trend); + TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); + periodicTrendMinusOutliers(values, windows, m_BucketLength, trend); + stats.s_V0 += residualVariance(trend, 1.0 / stats.s_M); - stats.s_DF0 += static_cast(std::count_if( - trend.begin(), trend.end(), [](const TMeanVarAccumulator& value) { - return CBasicStatistics::count(value) > 0.0; - })); - stats.s_T0[i].reserve(period_); - std::for_each(trend.begin(), trend.end(), - [&stats, i](const TMeanVarAccumulator& value) { - stats.s_T0[i].push_back(CBasicStatistics::mean(value)); - 
}); + stats.s_T0[i].reserve(period); + std::for_each(trend.begin(), trend.end(), [&stats, i](const TMeanVarAccumulator& value) { + stats.s_T0[i].push_back(CBasicStatistics::mean(value)); + stats.s_DF0 += (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); + }); } stats.s_V0 /= static_cast(periods.size()); } } -void CPeriodicityHypothesisTests::conditionOnHypothesis(const TTimeTimePr2Vec& windows, - const STestStats& stats, +void CPeriodicityHypothesisTests::conditionOnHypothesis(const STestStats& stats, TFloatMeanAccumulatorVec& buckets) const { std::size_t n{buckets.size()}; core_t::TTime windowLength{static_cast(n) * m_BucketLength}; @@ -1233,14 +1361,6 @@ void CPeriodicityHypothesisTests::conditionOnHypothesis(const TTimeTimePr2Vec& w } } } - - if (length(windows) < windowLength) { - LOG_TRACE(<< "Projecting onto " << core::CContainerPrinter::print(windows)); - TFloatMeanAccumulatorVec projection; - project(buckets, windows, m_BucketLength, projection); - buckets = std::move(projection); - LOG_TRACE(<< "# values = " << buckets.size()); - } } bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows, @@ -1259,6 +1379,10 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows, if (!this->testStatisticsFor(buckets, stats) || stats.nullHypothesisGoodEnough()) { return false; } + if (stats.s_HasPeriod) { + stats.s_R0 = stats.s_Rt; + return true; + } period_ = std::min(period_, length(windows[0])); std::size_t period{static_cast(period_ / m_BucketLength)}; @@ -1268,89 +1392,152 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows, if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period)) { return false; } - if (stats.s_HasPeriod) { - return true; + + core_t::TTime windowLength{length(windows)}; + TTimeTimePr2Vec window{{0, windowLength}}; + TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); + this->conditionOnHypothesis(stats, values); + + if (windowLength < length(buckets, m_BucketLength)) { + LOG_TRACE(<< "Projecting onto " << core::CContainerPrinter::print(windows)); + TFloatMeanAccumulatorVec projection; + project(values, windows, m_BucketLength, projection); + values = std::move(projection); } - TTimeTimePr2Vec window{{0, length(windows)}}; - double M{stats.s_M}; - double B{stats.s_B}; - double scale{1.0 / M}; + double B{static_cast(std::count_if( + values.begin(), values.end(), [](const TFloatMeanAccumulator& value) { + return CBasicStatistics::count(value) > 0.0; + }))}; double df0{B - stats.s_DF0}; + + // We need fewer degrees of freedom in the null hypothesis trend model + // we're fitting than non-empty buckets. if (df0 <= 0.0) { return false; } - double v0{varianceAtPercentile(stats.s_V0, df0, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; - double vt{stats.s_Vt * v0}; - double at{stats.s_At * std::sqrt(v0 / scale)}; - LOG_TRACE(<< " M = " << M); - - TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); - this->conditionOnHypothesis(window, stats, values); - - // The variance test. 
TMeanVarAccumulatorVec trend(period); periodicTrend(values, window, m_BucketLength, trend); + double b{static_cast(std::count_if( trend.begin(), trend.end(), [](const TMeanVarAccumulator& value) { return CBasicStatistics::count(value) > 0.0; }))}; + double df1{B - b}; LOG_TRACE(<< " populated = " << b); - double df1{B - b}; - if (df1 > 0.0) { - double v1{varianceAtPercentile(residualVariance(trend, scale), - df1, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; - LOG_TRACE(<< " variance = " << v1); - LOG_TRACE(<< " varianceThreshold = " << vt); - LOG_TRACE(<< " significance = " - << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); - - double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; - if (v1 < vt && B > 1.0 && - CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) { - double R{CSignal::autocorrelation(period, values)}; - R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0); - LOG_TRACE(<< " autocorrelation = " << R); - LOG_TRACE(<< " autocorrelationThreshold = " << Rt); - if (R > Rt) { - return true; - } - } + // We need fewer degrees of freedom in the trend model we're fitting + // than non-empty buckets. + if (df1 <= 0.0) { + return false; + } - // The amplitude test. + double scale{1.0 / stats.s_M}; + LOG_TRACE(<< "scale = " << scale); - double F1{1.0}; - if (v1 > 0.0) { - try { - std::size_t n{static_cast( - std::ceil(Rt * static_cast(length(window) / period_)))}; - TMeanAccumulator level; - for (const auto& value : values) { - if (CBasicStatistics::count(value) > 0.0) { - level.add(CBasicStatistics::mean(value)); - } + double v{residualVariance(trend, scale)}; + v = varianceAtPercentile(v, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0); + reweightOutliers(trend, window, m_BucketLength, values); + trend.assign(period, TMeanVarAccumulator{}); + periodicTrend(values, window, m_BucketLength, trend); + + // The Variance Test + + double v0{varianceAtPercentile(stats.s_V0, df0, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; + double vt{stats.s_Vt * v0}; + double v1{varianceAtPercentile(residualVariance(trend, scale), df1, + 50.0 + CONFIDENCE_INTERVAL / 2.0)}; + LOG_TRACE(<< " variance = " << v1); + LOG_TRACE(<< " varianceThreshold = " << vt); + LOG_TRACE(<< " significance = " + << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); + + double R{CSignal::autocorrelation(period, values)}; + R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0); + double Rt{stats.s_Rt}; + LOG_TRACE(<< " autocorrelation = " << R); + LOG_TRACE(<< " autocorrelationThreshold = " << Rt); + + TSizeVec repeats{calculateRepeats(window, period_, m_BucketLength, values)}; + double meanRepeats{CBasicStatistics::mean( + std::accumulate(repeats.begin(), repeats.end(), TMeanAccumulator{}, + [](TMeanAccumulator mean, std::size_t repeat) { + mean.add(static_cast(repeat)); + return mean; + }))}; + LOG_TRACE(<< " relative mean repeats = " << meanRepeats); + + // We're trading off: + // 1) The significance of the variance reduction, + // 2) The cyclic autocorrelation of the periodic component, + // 3) The amount of variance reduction and + // 4) The number of repeats we've observed. + // + // Specifically, the period will just be accepted if the p-value is equal + // to the threshold to be statistically significant, the autocorrelation + // is equal to the threshold, the variance reduction is equal to the + // threshold and we've observed three periods on average. 
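+ // At this operating point each of the four logistic factors below + // evaluates to sigmoid(1) = 0.5, so the acceptance boundary for + // p(variance) is 0.5^4 = 0.0625.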
+ + double relativeLogSignificance{ + CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) / + LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE}; + double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE}; + double pVariance{CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) * + CTools::logisticFunction(R / Rt, 0.15, 1.0) * + (vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0) + : CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) * + CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0)}; + LOG_TRACE(<< " p(variance) = " << pVariance); + + if (pVariance >= 0.0625) { + stats.s_R0 = R; + return true; + } + + // The Amplitude Test + + double F1{1.0}; + if (v > 0.0) { + try { + std::size_t n{static_cast( + std::ceil(Rt * static_cast(windowLength / period_)))}; + double at{stats.s_At * std::sqrt(v0 / scale)}; + LOG_TRACE(<< " n = " << n << ", at = " << at << ", v = " << v); + TMeanAccumulator level; + for (const auto& value : values) { + if (CBasicStatistics::count(value) > 0.0) { + level.add(CBasicStatistics::mean(value)); } - TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)}); - periodicTrend(values, window, m_BucketLength, amplitudes); - boost::math::normal normal(0.0, std::sqrt(v1)); - std::for_each(amplitudes.begin(), amplitudes.end(), - [&F1, &normal, at](CMinAmplitude& x) { - if (x.amplitude() >= at) { - F1 = std::min(F1, x.significance(normal)); - } - }); - } catch (const std::exception& e) { - LOG_ERROR(<< "Unable to compute significance of amplitude: " << e.what()); } - } - LOG_TRACE(<< " F(amplitude) = " << F1); - - if (1.0 - std::pow(1.0 - F1, b) <= MAXIMUM_SIGNIFICANCE) { - return true; + TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)}); + periodicTrend(values, window, m_BucketLength, amplitudes); + boost::math::normal normal(0.0, std::sqrt(v)); + std::for_each(amplitudes.begin(), amplitudes.end(), + [&F1, &normal, at](CMinAmplitude& x) { + if (x.amplitude() >= at) { + F1 = std::min(F1, x.significance(normal)); + } + }); + } catch (const std::exception& e) { + LOG_ERROR(<< "Unable to compute significance of amplitude: " << e.what()); } } + LOG_TRACE(<< " F(amplitude) = " << F1); + + // Trade off the test significance and the mean number of repeats + // we've observed. + relativeLogSignificance = CTools::fastLog(1.0 - std::pow(1.0 - F1, b)) / + LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE; + relativeMeanRepeats = meanRepeats / static_cast(2 * MINIMUM_REPEATS_TO_TEST_AMPLITUDE); + double pAmplitude{CTools::logisticFunction(relativeLogSignificance, 0.2, 1.0) * + CTools::logisticFunction(relativeMeanRepeats, 0.5, 1.0)}; + LOG_TRACE(<< " p(amplitude) = " << pAmplitude); + + if (pAmplitude >= 0.25) { + stats.s_R0 = R; + return true; + } return false; } @@ -1359,9 +1546,16 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition core_t::TTime period_, double correction, STestStats& stats) const { + // We check to see if partitioning the data as well as fitting a periodic + // component with period "period_" explains a non-negligible absolute and + // statistically significant amount of variance and the cyclic correlation + // at the repeat of at least one of the partition's periodic components + // is high enough. In order to do this we search over all permitted offsets + // for the start of the split. 
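+ // As in testPeriod, acceptance uses a smooth decision function: a + // product of logistic factors, each equal to 0.5 at its threshold, + // which must reach 0.5^4 = 0.0625.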
+ using TDoubleTimePr = std::pair; using TDoubleTimePrVec = std::vector; - using TMinAccumulator = CBasicStatistics::COrderStatisticsStack; + using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; using TMeanVarAccumulatorBuffer = boost::circular_buffer; LOG_TRACE(<< "Testing partition " << core::CContainerPrinter::print(partition) @@ -1370,39 +1564,50 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition if (!this->testStatisticsFor(buckets, stats) || stats.nullHypothesisGoodEnough()) { return false; } + if (stats.s_HasPartition) { + stats.s_R0 = stats.s_Rt; + return true; + } std::size_t period{static_cast(period_ / m_BucketLength)}; + core_t::TTime windowLength{length(buckets, m_BucketLength)}; + core_t::TTime repeat{length(partition)}; + double scale{1.0 / stats.s_M}; + LOG_TRACE(<< "scale = " << scale); // We need to observe a minimum number of repeated values to test with // an acceptable false positive rate. if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period)) { return false; } - if (stats.s_HasPartition) { - return true; - } // Find the partition of the data such that the residual variance // w.r.t. the period is minimized and check if there is significant // evidence that it reduces the residual variance and repeats. - core_t::TTime windowLength{length(buckets, m_BucketLength)}; - core_t::TTime repeat{length(partition)}; - core_t::TTime startOfPartition{stats.s_StartOfPartition}; double B{stats.s_B}; - double scale{1.0 / stats.s_M}; double df0{B - stats.s_DF0}; + + // We need fewer degrees of freedom in the null hypothesis trend model + // we're fitting than non-empty buckets. if (df0 <= 0.0) { return false; } + + TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); + this->conditionOnHypothesis(stats, values); + { + TTimeTimePr2Vec window{{0, windowLength}}; + TMeanAccumulatorVec trend(period); + periodicTrend(values, window, m_BucketLength, trend); + reweightOutliers(trend, window, m_BucketLength, values); + } + double v0{varianceAtPercentile(stats.s_V0, df0, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; double vt{stats.s_Vt * v0}; LOG_TRACE(<< "period = " << period); - LOG_TRACE(<< "scale = " << scale); - - TFloatMeanAccumulatorVec values(buckets.begin(), buckets.end()); - this->conditionOnHypothesis({{0, windowLength}}, stats, values); + core_t::TTime startOfPartition{stats.s_StartOfPartition}; TTimeTimePr2Vec windows[]{ calculateWindows(startOfPartition, windowLength, repeat, partition[0]), calculateWindows(startOfPartition, windowLength, repeat, partition[1])}; @@ -1411,12 +1616,12 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition TTimeVec deltas[2]; deltas[0].reserve((length(partition[0]) * windowLength) / (period_ * repeat)); deltas[1].reserve((length(partition[1]) * windowLength) / (period_ * repeat)); - for (std::size_t i = 0u; i < 2; ++i) { - for (const auto& window : windows[i]) { + for (std::size_t j = 0u; j < 2; ++j) { + for (const auto& window : windows[j]) { core_t::TTime a_{window.first}; core_t::TTime b_{window.second}; for (core_t::TTime t = a_ + period_; t <= b_; t += period_) { - deltas[i].push_back(t - m_BucketLength); + deltas[j].push_back(t - m_BucketLength); } } } @@ -1438,18 +1643,18 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition TDoubleTimePrVec candidates; candidates.reserve(period); for (core_t::TTime time = m_BucketLength; time < repeat; time += m_BucketLength) { - for (std::size_t i = 0u; i < 2; ++i) { - for (auto& 
delta : deltas[i]) { + for (std::size_t j = 0u; j < 2; ++j) { + for (auto& delta : deltas[j]) { delta = (delta + m_BucketLength) % windowLength; } - TMeanVarAccumulator oldBucket{trends[i].front()}; + TMeanVarAccumulator oldBucket{trends[j].front()}; TMeanVarAccumulator newBucket; - averageValue(values, deltas[i], m_BucketLength, newBucket); + averageValue(values, deltas[j], m_BucketLength, newBucket); - trends[i].pop_front(); - trends[i].push_back(newBucket); - variances[i] -= residualVariance(oldBucket, scale); - variances[i] += residualVariance(newBucket, scale); + trends[j].pop_front(); + trends[j].push_back(newBucket); + variances[j] -= residualVariance(oldBucket, scale); + variances[j] += residualVariance(newBucket, scale); } double variance{ (residualVariance(variances[0]) + residualVariance(variances[1])) / 2.0}; @@ -1462,29 +1667,25 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition double b{0.0}; TMinAccumulator best; - TTimeTimePr2Vec candidateWindows; for (const auto& candidate : candidates) { if (candidate.first <= 1.05 * minimum[0].first) { - core_t::TTime candidateStartOfPartition{candidate.second}; - candidateWindows = calculateWindows(candidateStartOfPartition, - windowLength, repeat, partition[0]); + startOfPartition = candidate.second; TMeanAccumulator cost; - for (const auto& window : candidateWindows) { + for (const auto& window : calculateWindows(startOfPartition, windowLength, + repeat, partition[0])) { core_t::TTime a_{window.first / m_BucketLength}; core_t::TTime b_{window.second / m_BucketLength - 1}; double va{CBasicStatistics::mean(values[a_ % values.size()])}; double vb{CBasicStatistics::mean(values[b_ % values.size()])}; cost.add(std::fabs(va) + std::fabs(vb) + std::fabs(vb - va)); } - if (best.add({CBasicStatistics::mean(cost), candidateStartOfPartition})) { + if (best.add({CBasicStatistics::mean(cost), startOfPartition})) { b = 0.0; - for (std::size_t i = 0u; i < 2; ++i) { - candidateWindows = calculateWindows(candidateStartOfPartition, windowLength, - repeat, partition[i]); - + for (std::size_t i = 0u; i < partition.size(); ++i) { + windows[i] = calculateWindows(startOfPartition, windowLength, + repeat, partition[i]); TMeanVarAccumulatorVec trend(period); - periodicTrend(values, candidateWindows, m_BucketLength, trend); - + periodicTrend(values, windows[i], m_BucketLength, trend); b += static_cast(std::count_if( trend.begin(), trend.end(), [](const TMeanVarAccumulator& value) { return CBasicStatistics::count(value) > 0.0; @@ -1495,46 +1696,87 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition } double df1{B - b}; - if (df1 > 0.0) { - double variance{correction * minimum[0].first}; - double v1{varianceAtPercentile(variance, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; - LOG_TRACE(<< " variance = " << v1); - LOG_TRACE(<< " varianceThreshold = " << vt); - LOG_TRACE(<< " significance = " - << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); - - if (v1 <= vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) { - double R{-1.0}; - double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; - - startOfPartition = best[0].second; - windows[0] = calculateWindows(startOfPartition, windowLength, - repeat, partition[0]); - windows[1] = calculateWindows(startOfPartition, windowLength, - repeat, partition[1]); - for (const auto& windows_ : windows) { - TFloatMeanAccumulatorVec partitionValues; - project(values, windows_, m_BucketLength, partitionValues); - 
std::size_t windowLength_(length(windows_[0]) / m_BucketLength); - double BW{std::accumulate( - partitionValues.begin(), partitionValues.end(), 0.0, - [](double n, const TFloatMeanAccumulator& value) { - return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); - })}; - if (BW > 1.0) { - double RW{CSignal::autocorrelation(windowLength_ + period, partitionValues)}; - R = std::max(R, autocorrelationAtPercentile( - RW, BW, 50.0 - CONFIDENCE_INTERVAL / 2.0)); - LOG_TRACE(<< " autocorrelation = " << R); - LOG_TRACE(<< " autocorrelationThreshold = " << Rt); - } - } - if (R > Rt) { - stats.s_StartOfPartition = startOfPartition; - return true; - } + // We need fewer degrees of freedom in the trend model we're fitting + // than non-empty buckets. + if (df1 <= 0.0) { + return false; + } + + startOfPartition = best[0].second; + double v1{varianceAtPercentile(correction * minimum[0].first, df1, + 50.0 + CONFIDENCE_INTERVAL / 2.0)}; + LOG_TRACE(<< " start of partition = " << startOfPartition); + LOG_TRACE(<< " variance = " << v1); + LOG_TRACE(<< " varianceThreshold = " << vt); + LOG_TRACE(<< " significance = " + << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); + + values.assign(buckets.begin(), buckets.end()); + TMeanAccumulatorVec trend; + for (const auto& window : windows) { + trend.assign(period, TMeanAccumulator{}); + periodicTrend(values, window, m_BucketLength, trend); + reweightOutliers(trend, window, m_BucketLength, values); + } + + // In the following we're trading off: + // 1) The cyclic autocorrelation of each periodic component in the + // partition, + // 2) The number of repeats we've observed of each periodic component + // in the partition, + // 3) The significance of the variance reduction, and + // 4) The amount of variance reduction. + + double p{0.0}; + double R{-1.0}; + double Rt{stats.s_Rt}; + + TFloatMeanAccumulatorVec partitionValues; + for (const auto& window : windows) { + project(values, window, m_BucketLength, partitionValues); + + double RW{-1.0}; + double BW{std::accumulate( + partitionValues.begin(), partitionValues.end(), 0.0, + [](double n, const TFloatMeanAccumulator& value) { + return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); + })}; + if (BW > 1.0) { + RW = CSignal::autocorrelation(length(window[0]) / m_BucketLength + period, + partitionValues); + RW = autocorrelationAtPercentile(RW, BW, 50.0 - CONFIDENCE_INTERVAL / 2.0); + LOG_TRACE(<< " autocorrelation = " << RW); + LOG_TRACE(<< " autocorrelationThreshold = " << Rt); } + + TSizeVec repeats{calculateRepeats(window, period_, m_BucketLength, values)}; + double relativeMeanRepeats{CBasicStatistics::mean(std::accumulate( + repeats.begin(), repeats.end(), TMeanAccumulator{}, + [](TMeanAccumulator mean, std::size_t repeat_) { + mean.add(static_cast(repeat_)); + return mean; + })) / + MINIMUM_REPEATS_TO_TEST_VARIANCE}; + LOG_TRACE(<< " relative mean repeats = " << relativeMeanRepeats); + + p = std::max(p, CTools::logisticFunction(RW / Rt, 0.15, 1.0) * + CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0)); + R = std::max(R, RW); + } + + double relativeLogSignificance{ + CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) / + LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE}; + p *= CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) * + (vt > v1 ? 
CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0) + : CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)); + LOG_TRACE(<< " p(partition) = " << p); + + if (p >= 0.0625) { + stats.s_StartOfPartition = startOfPartition; + stats.s_R0 = R; + return true; } return false; } @@ -1543,8 +1785,11 @@ const double CPeriodicityHypothesisTests::ACCURATE_TEST_POPULATED_FRACTION{0.9}; const double CPeriodicityHypothesisTests::MINIMUM_COEFFICIENT_OF_VARIATION{1e-4}; CPeriodicityHypothesisTests::STestStats::STestStats() - : s_HasPeriod(false), s_HasPartition(false), s_Vt(0.0), s_At(0.0), s_Rt(0.0), - s_Range(0.0), s_B(0.0), s_M(0.0), s_V0(0.0), s_DF0(0.0), s_StartOfPartition(0) { + : s_HasPeriod(false), s_HasPartition(false), + s_Vt(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION[E_HighThreshold]), + s_At(SEASONAL_SIGNIFICANT_AMPLITUDE[E_HighThreshold]), + s_Rt(SEASONAL_SIGNIFICANT_AUTOCORRELATION[E_HighThreshold]), s_Range(0.0), + s_B(0.0), s_M(0.0), s_V0(0.0), s_R0(0.0), s_DF0(0.0), s_StartOfPartition(0) { } void CPeriodicityHypothesisTests::STestStats::setThresholds(double vt, double at, double Rt) { diff --git a/lib/maths/CSeasonalComponent.cc b/lib/maths/CSeasonalComponent.cc index 703c081dc4..c53d902e70 100644 --- a/lib/maths/CSeasonalComponent.cc +++ b/lib/maths/CSeasonalComponent.cc @@ -229,7 +229,7 @@ double CSeasonalComponent::delta(core_t::TTime time, // a delta for the case that the difference from the mean // is 1/3 of the range. We force the delta to zero for values // significantly smaller than this. - double scale{CTools::smoothHeaviside(3.0 * min[0] / minmax.range(), 1.0 / 12.0)}; + double scale{CTools::logisticFunction(3.0 * min[0] / minmax.range(), 0.1, 1.0)}; scale = CTools::truncate(1.002 * scale - 0.001, 0.0, 1.0); return -scale * min[0] * CTools::sign(shortPeriodValue); diff --git a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc index 770db1a01b..4d5902ee86 100644 --- a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc +++ b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc @@ -581,7 +581,7 @@ double CSeasonalComponentAdaptiveBucketing::predict(std::size_t bucket, // We mean revert our predictions if trying to predict much further // ahead than the observed interval for the data. 
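+ // logisticFunction(x, 0.1, 1.0, -1.0) is a smooth step down centred at + // one: alpha is close to one when we extrapolate much less than the + // observed interval, 0.5 at parity and close to zero far beyond it, at + // which point the prediction falls back to the regression mean.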
- double alpha{CTools::smoothHeaviside(extrapolateInterval / interval, 1.0 / 12.0, -1.0)}; + double alpha{CTools::logisticFunction(extrapolateInterval / interval, 0.1, 1.0, -1.0)}; double beta{1.0 - alpha}; return alpha * regression.predict(t) + beta * regression.mean(); } diff --git a/lib/maths/CSignal.cc b/lib/maths/CSignal.cc index a39bd97fe7..74e415064a 100644 --- a/lib/maths/CSignal.cc +++ b/lib/maths/CSignal.cc @@ -141,7 +141,7 @@ double CSignal::autocorrelation(std::size_t offset, TFloatMeanAccumulatorCRng va TMeanVarAccumulator moments; for (const auto& value : values) { if (CBasicStatistics::count(value) > 0.0) { - moments.add(CBasicStatistics::mean(value)); + moments.add(CBasicStatistics::mean(value), CBasicStatistics::count(value)); } } @@ -153,8 +153,10 @@ double CSignal::autocorrelation(std::size_t offset, TFloatMeanAccumulatorCRng va double ni = CBasicStatistics::count(values[i]); double nj = CBasicStatistics::count(values[j]); if (ni > 0.0 && nj > 0.0) { + double weight = std::sqrt(ni * nj); autocorrelation.add((CBasicStatistics::mean(values[i]) - mean) * - (CBasicStatistics::mean(values[j]) - mean)); + (CBasicStatistics::mean(values[j]) - mean), + weight); } } diff --git a/lib/maths/CTimeSeriesDecompositionDetail.cc b/lib/maths/CTimeSeriesDecompositionDetail.cc index 414e065450..9d83523d58 100644 --- a/lib/maths/CTimeSeriesDecompositionDetail.cc +++ b/lib/maths/CTimeSeriesDecompositionDetail.cc @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -200,18 +201,6 @@ void stepwisePropagateForwards(core_t::TTime step, } } -//! Apply the common shift to the slope of \p trend. -void shiftSlope(const TTimeTimePrDoubleFMap& slopes, double decayRate, CTrendComponent& trend) { - CBasicStatistics::CMinMax minmax; - for (const auto& slope : slopes) { - minmax.add(slope.second); - } - double shift{minmax.signMargin()}; - if (shift != 0.0) { - trend.shiftSlope(decayRate, shift); - } -} - // Periodicity Test State Machine // States @@ -1122,10 +1111,20 @@ void CTimeSeriesDecompositionDetail::CComponents::handle(const SAddValue& messag double v1{CBasicStatistics::variance(m_MomentsMinusTrend)}; double df0{CBasicStatistics::count(m_Moments) - 1.0}; double df1{CBasicStatistics::count(m_MomentsMinusTrend) - m_Trend.parameters()}; - m_UsingTrendForPrediction = - v1 < SIGNIFICANT_VARIANCE_REDUCTION[0] * v0 && df0 > 0.0 && df1 > 0.0 && - CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE; - *m_Watcher = m_UsingTrendForPrediction; + if (df0 > 0.0 && df1 > 0.0 && v0 > 0.0) { + double relativeLogSignificance{ + CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) / + LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE}; + double vt{*std::max_element( + boost::begin(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION), + boost::end(COMPONENT_SIGNIFICANT_VARIANCE_REDUCTION)) * + v0}; + double p{CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) * + (vt > v1 ? 
CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0) + : CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0))}; + m_UsingTrendForPrediction = (p >= 0.25); + *m_Watcher = m_UsingTrendForPrediction; + } } } break; case SC_DISABLED: @@ -1419,34 +1418,57 @@ bool CTimeSeriesDecompositionDetail::CComponents::addSeasonalComponents( std::sort(newSeasonalTimes.begin(), newSeasonalTimes.end(), maths::COrderings::SLess()); + core_t::TTime startTime{window.startTime()}; + core_t::TTime endTime{window.endTime()}; + core_t::TTime dt{window.bucketLength()}; + TFloatMeanAccumulatorVec values; for (const auto& seasonalTime : newSeasonalTimes) { values = window.valuesMinusPrediction(predictor); + if (seasonalTime->windowed()) { + core_t::TTime time{startTime + dt / 2}; + for (auto& value : values) { + if (!seasonalTime->inWindow(time)) { + value = TFloatMeanAccumulator{}; + } + time += dt; + } + } + double bucketLength{static_cast(m_BucketLength)}; core_t::TTime period{seasonalTime->period()}; + auto boundaryCondition = period > seasonalTime->windowLength() + ? CSplineTypes::E_Natural + : CSplineTypes::E_Periodic; + + CSeasonalComponent candidate{*seasonalTime, m_SeasonalComponentSize, + m_DecayRate, bucketLength, boundaryCondition}; + candidate.initialize(startTime, endTime, values); + candidate.interpolate(CIntegerTools::floor(endTime, period)); + this->reweightOutliers( + startTime, dt, + [&candidate](core_t::TTime time) { + return CBasicStatistics::mean(candidate.value(time, 0.0)); + }, + values); + components.emplace_back(*seasonalTime, m_SeasonalComponentSize, - m_DecayRate, static_cast(m_BucketLength), - period > seasonalTime->windowLength() - ? CSplineTypes::E_Natural - : CSplineTypes::E_Periodic); - components.back().initialize(window.startTime(), window.endTime(), values); - components.back().interpolate(CIntegerTools::floor(window.endTime(), period)); + m_DecayRate, bucketLength, boundaryCondition); + components.back().initialize(startTime, endTime, values); + components.back().interpolate(CIntegerTools::floor(endTime, period)); } - CTrendComponent windowTrend{trend.defaultDecayRate()}; + CTrendComponent candidate{trend.defaultDecayRate()}; values = window.valuesMinusPrediction(predictor); - core_t::TTime time{window.startTime() + window.bucketLength() / 2}; - for (const auto& value : values) { - // Because we now test before the window is fully compressed - // we can get a run of unset values at the end of the window, - // we should just ignore these. 
- if (CBasicStatistics::count(value) > 0.0) { - windowTrend.add(time, CBasicStatistics::mean(value), - CBasicStatistics::count(value)); - windowTrend.propagateForwardsByTime(window.bucketLength()); - } - time += window.bucketLength(); - } - trend.swap(windowTrend); + this->fit(startTime, dt, values, candidate); + this->reweightOutliers(startTime, dt, + [&candidate](core_t::TTime time) { + return CBasicStatistics::mean(candidate.value(time, 0.0)); + }, + values); + + CTrendComponent trend_{trend.defaultDecayRate()}; + this->fit(startTime, dt, values, trend_); + trend.swap(trend_); errors.resize(components.size()); COrderings::simultaneousSort( @@ -1473,6 +1495,66 @@ bool CTimeSeriesDecompositionDetail::CComponents::addCalendarComponent( return true; } +void CTimeSeriesDecompositionDetail::CComponents::reweightOutliers( + core_t::TTime startTime, + core_t::TTime dt, + TPredictor predictor, + TFloatMeanAccumulatorVec& values) const { + using TMinAccumulator = + CBasicStatistics::COrderStatisticsHeap>; + using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; + + double numberValues{std::accumulate( + values.begin(), values.end(), 0.0, [](double n, const TFloatMeanAccumulator& value) { + return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); + })}; + double numberOutliers{SEASONAL_OUTLIER_FRACTION * numberValues}; + + TMinAccumulator outliers{static_cast(2.0 * numberOutliers)}; + TMeanAccumulator meanDifference; + core_t::TTime time = startTime + dt / 2; + for (std::size_t i = 0; i < values.size(); ++i, time += dt) { + if (CBasicStatistics::count(values[i]) > 0.0) { + double difference{std::fabs(CBasicStatistics::mean(values[i]) - predictor(time))}; + outliers.add({-difference, i}); + meanDifference.add(difference); + } + } + outliers.sort(); + TMeanAccumulator meanDifferenceOfOutliers; + for (std::size_t i = 0u; i < static_cast(numberOutliers); ++i) { + meanDifferenceOfOutliers.add(-outliers[i].first); + } + meanDifference -= meanDifferenceOfOutliers; + for (std::size_t i = 0; i < outliers.count(); ++i) { + if (-outliers[i].first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD * + CBasicStatistics::mean(meanDifference)) { + double weight{SEASONAL_OUTLIER_WEIGHT + + (1.0 - SEASONAL_OUTLIER_WEIGHT) * + CTools::logisticFunction(static_cast(i) / numberOutliers, + 0.1, 1.0)}; + CBasicStatistics::count(values[outliers[i].second]) *= weight; + } + } +} + +void CTimeSeriesDecompositionDetail::CComponents::fit(core_t::TTime startTime, + core_t::TTime dt, + const TFloatMeanAccumulatorVec& values, + CTrendComponent& trend) const { + core_t::TTime time{startTime + dt / 2}; + for (const auto& value : values) { + // Because we test before the window is fully compressed we can + // get a run of unset values at the end of the window, we should + // just ignore these. 
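+ // (An unset bucket has zero count; skipping it also means the trend + // is not aged across buckets which carry no information.)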
+ if (CBasicStatistics::count(value) > 0.0) { + trend.add(time, CBasicStatistics::mean(value), CBasicStatistics::count(value)); + trend.propagateForwardsByTime(dt); + } + time += dt; + } +}; + void CTimeSeriesDecompositionDetail::CComponents::clearComponentErrors() { if (m_Seasonal) { for (auto& errors : m_Seasonal->s_PredictionErrors) { @@ -1536,6 +1618,8 @@ void CTimeSeriesDecompositionDetail::CComponents::shiftOrigin(core_t::TTime time } void CTimeSeriesDecompositionDetail::CComponents::canonicalize(core_t::TTime time) { + using TMinMaxAccumulator = CBasicStatistics::CMinMax; + this->shiftOrigin(time); if (m_Seasonal && m_Seasonal->prune(time, m_BucketLength)) { @@ -1548,20 +1632,37 @@ void CTimeSeriesDecompositionDetail::CComponents::canonicalize(core_t::TTime tim if (m_Seasonal) { TSeasonalComponentVec& seasonal{m_Seasonal->s_Components}; - TTimeTimePrDoubleFMap slope; - slope.reserve(seasonal.size()); + double slope{0.0}; + TTimeTimePrDoubleFMap windowSlopes; + windowSlopes.reserve(seasonal.size()); for (auto& component : seasonal) { if (component.slopeAccurate(time)) { - const CSeasonalTime& time_{component.time()}; double si{component.slope()}; - component.shiftSlope(-si); - slope[time_.window()] += si; + if (component.time().windowed()) { + windowSlopes[component.time().window()] += si; + } else { + slope += si; + component.shiftSlope(-si); + } + } + } + TMinMaxAccumulator windowedSlope; + for (const auto& windowSlope : windowSlopes) { + windowedSlope.add(windowSlope.second); + } + slope += windowedSlope.signMargin(); + LOG_TRACE(<< "slope = " << slope); + + for (auto& component : seasonal) { + if (component.slopeAccurate(time) && component.time().windowed()) { + component.shiftSlope(-windowedSlope.signMargin()); } } - LOG_TRACE(<< "slope = " << core::CContainerPrinter::print(slope)); - shiftSlope(slope, m_DecayRate, m_Trend); + if (slope != 0.0) { + m_Trend.shiftSlope(m_DecayRate, slope); + } } } @@ -1584,7 +1685,6 @@ bool CTimeSeriesDecompositionDetail::CComponents::CScopeNotifyOnStateChange::cha bool CTimeSeriesDecompositionDetail::CComponents::CComponentErrors::fromDelimited(const std::string& str) { TFloatMeanAccumulator* state[] = {&m_MeanErrorWithComponent, &m_MeanErrorWithoutComponent}; - std::string suffix = str; for (std::size_t i = 0u, n = 0; i < 2; ++i, suffix = suffix.substr(n + 1)) { n = suffix.find(CBasicStatistics::EXTERNAL_DELIMITER); @@ -1593,7 +1693,6 @@ bool CTimeSeriesDecompositionDetail::CComponents::CComponentErrors::fromDelimite return false; } } - return true; } diff --git a/lib/maths/CTrendComponent.cc b/lib/maths/CTrendComponent.cc index f96af92bf3..d26510db29 100644 --- a/lib/maths/CTrendComponent.cc +++ b/lib/maths/CTrendComponent.cc @@ -445,7 +445,7 @@ double CTrendComponent::weightOfPrediction(core_t::TTime time) const { return 1.0; } - return CTools::smoothHeaviside(extrapolateInterval / interval, 1.0 / 12.0, -1.0); + return CTools::logisticFunction(extrapolateInterval / interval, 0.1, 1.0, -1.0); } CTrendComponent::SModel::SModel(double weight) { diff --git a/lib/maths/unittest/CForecastTest.cc b/lib/maths/unittest/CForecastTest.cc index b7eed1f28f..58c6357d13 100644 --- a/lib/maths/unittest/CForecastTest.cc +++ b/lib/maths/unittest/CForecastTest.cc @@ -90,7 +90,7 @@ void CForecastTest::testDailyNoLongTermTrend() { return 40.0 + alpha * y[i / 6] + beta * y[(i / 6 + 1) % y.size()] + noise; }; - this->test(trend, bucketLength, 63, 64.0, 5.0, 0.14); + this->test(trend, bucketLength, 63, 64.0, 5.0, 0.13); } void 
diff --git a/lib/maths/unittest/CForecastTest.cc b/lib/maths/unittest/CForecastTest.cc
index b7eed1f28f..58c6357d13 100644
--- a/lib/maths/unittest/CForecastTest.cc
+++ b/lib/maths/unittest/CForecastTest.cc
@@ -90,7 +90,7 @@ void CForecastTest::testDailyNoLongTermTrend() {
         return 40.0 + alpha * y[i / 6] + beta * y[(i / 6 + 1) % y.size()] + noise;
     };
 
-    this->test(trend, bucketLength, 63, 64.0, 5.0, 0.14);
+    this->test(trend, bucketLength, 63, 64.0, 5.0, 0.13);
 }
 
 void CForecastTest::testDailyConstantLongTermTrend() {
@@ -130,7 +130,7 @@ void CForecastTest::testDailyVaryingLongTermTrend() {
                8.0 * std::sin(boost::math::double_constants::two_pi * time_ / 43200.0) + noise;
     };
 
-    this->test(trend, bucketLength, 98, 9.0, 14.0, 0.042);
+    this->test(trend, bucketLength, 98, 9.0, 24.0, 0.042);
 }
 
 void CForecastTest::testComplexNoLongTermTrend() {
@@ -146,7 +146,7 @@
         return scale[d] * (20.0 + y[h] + noise);
     };
 
-    this->test(trend, bucketLength, 63, 24.0, 8.0, 0.13);
+    this->test(trend, bucketLength, 63, 24.0, 9.0, 0.13);
 }
 
 void CForecastTest::testComplexConstantLongTermTrend() {
@@ -163,7 +163,7 @@
                scale[d] * (20.0 + y[h] + noise);
     };
 
-    this->test(trend, bucketLength, 63, 24.0, 8.0, 0.01);
+    this->test(trend, bucketLength, 63, 24.0, 11.0, 0.01);
 }
 
 void CForecastTest::testComplexVaryingLongTermTrend() {
@@ -193,7 +193,7 @@
         return trend_.value(time_) + scale[d] * (20.0 + y[h] + noise);
     };
 
-    this->test(trend, bucketLength, 63, 4.0, 42.0, 0.06);
+    this->test(trend, bucketLength, 63, 4.0, 44.0, 0.06);
 }
 
 void CForecastTest::testNonNegative() {
diff --git a/lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc b/lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc
index ef64bfefcc..5170818cb8 100644
--- a/lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc
+++ b/lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc
@@ -209,7 +209,7 @@ void CPeriodicityHypothesisTestsTest::testDiurnal() {
     }
 
     LOG_DEBUG(<< "");
-    LOG_DEBUG(<< "*** Diurnal: daily and weekends ***");
+    LOG_DEBUG(<< "*** Diurnal: daily, weekly and weekends + outliers ***");
     {
         TTimeDoublePrVec timeseries;
         core_t::TTime startTime;
@@ -234,8 +234,9 @@
             core_t::TTime time{timeseries[i].first};
             if (time > lastTest + window) {
                 maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
-                CPPUNIT_ASSERT_EQUAL(std::string("{ 'weekend daily' 'weekday daily' }"),
-                                     result.print());
+                LOG_DEBUG(<< "result = " << result.print());
+                CPPUNIT_ASSERT(result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
+                               result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }");
                 hypotheses = maths::CPeriodicityHypothesisTests();
                 hypotheses.initialize(HOUR, window, DAY);
                 lastTest += window;
@@ -307,7 +308,8 @@
             core_t::TTime time{timeseries[i].first};
             if (time > lastTest + window) {
                 maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
-                CPPUNIT_ASSERT(result.print() == "{ 'weekend daily' 'weekday daily' }" ||
+                LOG_DEBUG(<< "result = " << result.print());
+                CPPUNIT_ASSERT(result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
                                result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }");
                 hypotheses = maths::CPeriodicityHypothesisTests();
                 hypotheses.initialize(HOUR, window, DAY);
@@ -397,9 +399,13 @@
 }
 
 void CPeriodicityHypothesisTestsTest::testWithSparseData() {
+    // Test that we correctly identify periodicity when there are
+    // periodically missing data.
+
     test::CRandomNumbers rng;
 
-    LOG_DEBUG(<< "Daily Periodic") {
+    LOG_DEBUG(<< "Daily Periodic");
+    {
         maths::CPeriodicityHypothesisTests hypotheses;
         hypotheses.initialize(HALF_HOUR, WEEK, DAY);
 
@@ -424,7 +430,8 @@ void CPeriodicityHypothesisTestsTest::testWithSparseData() {
         }
     }
 
-    LOG_DEBUG(<< "Daily Not Periodic") {
+    LOG_DEBUG(<< "Daily Not Periodic");
+    {
         maths::CPeriodicityHypothesisTests hypotheses;
         hypotheses.initialize(HALF_HOUR, WEEK, DAY);
 
@@ -450,7 +457,8 @@ void CPeriodicityHypothesisTestsTest::testWithSparseData() {
         }
     }
 
-    LOG_DEBUG(<< "Weekly") {
+    LOG_DEBUG(<< "Weekly");
+    {
         maths::CPeriodicityHypothesisTests hypotheses;
         hypotheses.initialize(HOUR, 2 * WEEK, WEEK);
 
@@ -488,7 +496,8 @@ void CPeriodicityHypothesisTestsTest::testWithSparseData() {
         }
     }
 
-    LOG_DEBUG(<< "Weekly Not Periodic") {
+    LOG_DEBUG(<< "Weekly Not Periodic");
+    {
         maths::CPeriodicityHypothesisTests hypotheses;
         hypotheses.initialize(HOUR, 4 * WEEK, WEEK);
 
@@ -527,6 +536,117 @@ void CPeriodicityHypothesisTestsTest::testWithSparseData() {
     }
 }
 
+void CPeriodicityHypothesisTestsTest::testWithOutliers() {
+    // Test that we can robustly detect the correct underlying periodic
+    // components.
+
+    TTimeVec windows{WEEK, 2 * WEEK, 16 * DAY, 4 * WEEK};
+    TTimeVec bucketLengths{TEN_MINS, HALF_HOUR};
+    TTimeVec periods{DAY, WEEK};
+    TDoubleVec modulation{0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0};
+    core_t::TTime startTime{10000};
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec noise;
+    TSizeVec outliers;
+    TDoubleVec spikeOrTroughSelector;
+
+    LOG_DEBUG(<< "Daily + Weekly");
+    for (const auto& period : periods) {
+        for (const auto& window : windows) {
+            if (window < 2 * period) {
+                continue;
+            }
+            for (const auto& bucketLength : bucketLengths) {
+                core_t::TTime buckets{window / bucketLength};
+                std::size_t numberOutliers{static_cast<std::size_t>(0.12 * buckets)};
+                rng.generateUniformSamples(0, buckets, numberOutliers, outliers);
+                rng.generateUniformSamples(0, 1.0, numberOutliers, spikeOrTroughSelector);
+                rng.generateNormalSamples(0.0, 9.0, buckets, noise);
+                std::sort(outliers.begin(), outliers.end());
+
+                //std::ofstream file;
+                //file.open("results.m");
+                //TDoubleVec f;
+
+                maths::TFloatMeanAccumulatorVec values(buckets);
+                for (core_t::TTime time = startTime; time < startTime + window;
+                     time += bucketLength) {
+                    std::size_t bucket((time - startTime) / bucketLength);
+                    auto outlier = std::lower_bound(outliers.begin(), outliers.end(), bucket);
+                    if (outlier != outliers.end() && *outlier == bucket) {
+                        values[bucket].add(
+                            spikeOrTroughSelector[outlier - outliers.begin()] > 0.2 ? 0.0 : 100.0);
+                    } else {
+                        values[bucket].add(
+                            20.0 + 20.0 * std::sin(boost::math::double_constants::two_pi *
+                                                   static_cast<double>(time) /
+                                                   static_cast<double>(period)));
+                    }
+                    //f.push_back(maths::CBasicStatistics::mean(values[bucket]));
+                }
+                //file << "f = " << core::CContainerPrinter::print(f) << ";";
+
+                maths::CPeriodicityHypothesisTestsConfig config;
+                maths::CPeriodicityHypothesisTestsResult result{
+                    maths::testForPeriods(config, startTime, bucketLength, values)};
+                LOG_DEBUG(<< "result = " << result.print());
+                if (period == DAY) {
+                    CPPUNIT_ASSERT_EQUAL(std::string{"{ 'daily' }"}, result.print());
+                } else if (period == WEEK) {
+                    CPPUNIT_ASSERT_EQUAL(std::string{"{ 'weekly' }"}, result.print());
+                }
+            }
+        }
+    }
+
+    LOG_DEBUG(<< "Weekday / Weekend");
+    for (const auto& window : windows) {
+        if (window < 2 * WEEK) {
+            continue;
+        }
+        for (const auto& bucketLength : bucketLengths) {
+            core_t::TTime buckets{window / bucketLength};
+            std::size_t numberOutliers{static_cast<std::size_t>(0.12 * buckets)};
+            rng.generateUniformSamples(0, buckets, numberOutliers, outliers);
+            rng.generateUniformSamples(0, 1.0, numberOutliers, spikeOrTroughSelector);
+            rng.generateNormalSamples(0.0, 9.0, buckets, noise);
+            std::sort(outliers.begin(), outliers.end());
+
+            //std::ofstream file;
+            //file.open("results.m");
+            //TDoubleVec f;
+
+            maths::TFloatMeanAccumulatorVec values(buckets);
+            for (core_t::TTime time = startTime; time < startTime + window; time += bucketLength) {
+                std::size_t bucket((time - startTime) / bucketLength);
+                auto outlier = std::lower_bound(outliers.begin(), outliers.end(), bucket);
+                if (outlier != outliers.end() && *outlier == bucket) {
+                    values[bucket].add(
+                        spikeOrTroughSelector[outlier - outliers.begin()] > 0.2 ? 0.0 : 100.0);
+                } else {
+                    values[bucket].add(
+                        modulation[((time - startTime) / DAY) % 7] *
+                        (20.0 + 20.0 * std::sin(boost::math::double_constants::two_pi *
+                                                static_cast<double>(time) /
+                                                static_cast<double>(DAY))));
+                }
+                //f.push_back(maths::CBasicStatistics::mean(values[bucket]));
+            }
+            //file << "f = " << core::CContainerPrinter::print(f) << ";";
+
+            maths::CPeriodicityHypothesisTestsConfig config;
+            maths::CPeriodicityHypothesisTestsResult result{
+                maths::testForPeriods(config, startTime, bucketLength, values)};
+            LOG_DEBUG(<< "result = " << result.print());
+            CPPUNIT_ASSERT(
+                result.print() == std::string("{ 'weekend daily' 'weekday daily' }") ||
                result.print() == std::string("{ 'weekend daily' 'weekday daily' 'weekend weekly' }"));
+        }
+    }
+}
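Why the reweighting matters for these assertions: a handful of large spikes inflates the variance and dilutes the autocorrelation the hypothesis tests measure, so an otherwise obvious period can fall below the significance thresholds. A self-contained illustration of the effect on a plain lag-p autocorrelation estimate (not library code, and independent of the test fixtures above):

#include <cmath>
#include <cstddef>
#include <vector>

// Lag-p sample autocorrelation.
double autocorrelation(const std::vector<double>& x, std::size_t p) {
    double mean{0.0};
    for (double xi : x) {
        mean += xi;
    }
    mean /= static_cast<double>(x.size());
    double numerator{0.0};
    double denominator{0.0};
    for (std::size_t i = 0; i + p < x.size(); ++i) {
        numerator += (x[i] - mean) * (x[i + p] - mean);
    }
    for (double xi : x) {
        denominator += (xi - mean) * (xi - mean);
    }
    return numerator / denominator;
}

int main() {
    const double pi{std::acos(-1.0)};

    // A daily sinusoid sampled hourly for two weeks...
    std::vector<double> signal(336);
    for (std::size_t i = 0; i < signal.size(); ++i) {
        signal[i] = 20.0 + 20.0 * std::sin(2.0 * pi * static_cast<double>(i) / 24.0);
    }
    double clean{autocorrelation(signal, 24)};

    // ...then polluted with seven large spikes.
    for (std::size_t i = 10; i < signal.size(); i += 48) {
        signal[i] = 200.0;
    }
    double polluted{autocorrelation(signal, 24)};

    return clean > polluted ? 0 : 1;
}

Here clean comes out around 0.9 while polluted typically lands well below 0.5, the region the significance thresholds guard; down-weighting the largest differences before computing the statistic recovers most of that gap.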
+
 void CPeriodicityHypothesisTestsTest::testTestForPeriods() {
     // Test the ability to correctly find and test for periodic
     // signals without being told the periods to test a-priori.
@@ -550,15 +670,13 @@
         if (test % 10 == 0) {
             LOG_DEBUG(<< "test " << test << " / 100");
         }
-        for (std::size_t i = 0u; i < windows.size(); ++i) {
-            core_t::TTime window{windows[i]};
-
+        for (const auto& window : windows) {
             TDoubleVec scaling_;
             rng.generateUniformSamples(1.0, 5.0, 1, scaling_);
             double scaling{test % 2 == 0 ? scaling_[0] : 1.0 / scaling_[0]};
 
-            for (std::size_t j = 0u; j < bucketLengths.size(); ++j) {
-                core_t::TTime bucketLength{bucketLengths[j]};
+            for (std::size_t i = 0u; i < bucketLengths.size(); ++i) {
+                core_t::TTime bucketLength{bucketLengths[i]};
                 core_t::TTime period{maths::CIntegerTools::floor(
                     static_cast<core_t::TTime>(static_cast<double>(DAY) / scaling), bucketLength)};
                 scaling = static_cast<double>(DAY) / static_cast<double>(period);
@@ -581,7 +699,7 @@
                     rng.generateLogNormalSamples(0.2, 0.3, window / bucketLength, noise);
                     break;
                 }
-                rng.generateUniformSamples(0, permittedGenerators[j], 1, index);
+                rng.generateUniformSamples(0, permittedGenerators[i], 1, index);
                 rng.generateUniformSamples(3, 20, 1, repeats);
 
                 maths::CPeriodicityHypothesisTests hypotheses;
@@ -646,6 +764,9 @@ CppUnit::Test* CPeriodicityHypothesisTestsTest::suite() {
     suiteOfTests->addTest(new CppUnit::TestCaller<CPeriodicityHypothesisTestsTest>(
         "CPeriodicityHypothesisTestsTest::testWithSparseData",
         &CPeriodicityHypothesisTestsTest::testWithSparseData));
+    suiteOfTests->addTest(new CppUnit::TestCaller<CPeriodicityHypothesisTestsTest>(
+        "CPeriodicityHypothesisTestsTest::testWithOutliers",
+        &CPeriodicityHypothesisTestsTest::testWithOutliers));
     suiteOfTests->addTest(new CppUnit::TestCaller<CPeriodicityHypothesisTestsTest>(
         "CPeriodicityHypothesisTestsTest::testTestForPeriods",
         &CPeriodicityHypothesisTestsTest::testTestForPeriods));
diff --git a/lib/maths/unittest/CPeriodicityHypothesisTestsTest.h b/lib/maths/unittest/CPeriodicityHypothesisTestsTest.h
index b1a2215fc5..794ad8f095 100644
--- a/lib/maths/unittest/CPeriodicityHypothesisTestsTest.h
+++ b/lib/maths/unittest/CPeriodicityHypothesisTestsTest.h
@@ -15,6 +15,7 @@ class CPeriodicityHypothesisTestsTest : public CppUnit::TestFixture {
     void testDiurnal();
     void testNonDiurnal();
     void testWithSparseData();
+    void testWithOutliers();
     void testTestForPeriods();
 
     static CppUnit::Test* suite();
diff --git a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc
index 1043699368..0e697589e8 100644
--- a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc
+++ b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc
@@ -126,8 +126,8 @@ void CTimeSeriesDecompositionTest::testSuperpositionOfSines() {
         LOG_DEBUG(<< "70% error = " << percentileError / sumValue);
 
         if (time >= 2 * WEEK) {
-            CPPUNIT_ASSERT(sumResidual < 0.039 * sumValue);
-            CPPUNIT_ASSERT(maxResidual < 0.041 * maxValue);
+            CPPUNIT_ASSERT(sumResidual < 0.052 * sumValue);
+            CPPUNIT_ASSERT(maxResidual < 0.10 * maxValue);
             CPPUNIT_ASSERT(percentileError < 0.02 * sumValue);
             totalSumResidual += sumResidual;
             totalMaxResidual += maxResidual;
@@ -149,8 +149,8 @@
     //file << "plot(t(1:length(fe)), fe, 'r');\n";
     //file << "plot(t(1:length(r)), r, 'k');\n";
 
-    CPPUNIT_ASSERT(totalSumResidual < 0.020 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.021 * totalMaxValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.019 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.020 * totalMaxValue);
     CPPUNIT_ASSERT(totalPercentileError < 0.01 * totalSumValue);
 }
 
@@ -308,9 +308,9 @@ void CTimeSeriesDecompositionTest::testDistortedPeriodic() {
         LOG_DEBUG(<< "70% error = " << percentileError / sumValue);
 
         if (time >= 2 * WEEK) {
-            CPPUNIT_ASSERT(sumResidual < 0.30 * sumValue);
+            CPPUNIT_ASSERT(sumResidual < 0.27 * sumValue);
             CPPUNIT_ASSERT(maxResidual < 0.56 * maxValue);
-            CPPUNIT_ASSERT(percentileError < 0.21 * sumValue);
+            CPPUNIT_ASSERT(percentileError < 0.16 * sumValue);
             totalSumResidual += sumResidual;
             totalMaxResidual += maxResidual;
@@ -334,8 +334,8 @@
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
 
     CPPUNIT_ASSERT(totalSumResidual < 0.18 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.22 * totalMaxValue);
-    CPPUNIT_ASSERT(totalPercentileError < 0.03 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.28 * totalMaxValue);
+    CPPUNIT_ASSERT(totalPercentileError < 0.09 * totalSumValue);
 }
 
 void CTimeSeriesDecompositionTest::testMinimizeLongComponents() {
@@ -554,9 +554,9 @@ void CTimeSeriesDecompositionTest::testWeekend() {
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
 
-    CPPUNIT_ASSERT(totalSumResidual < 0.026 * totalSumValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.024 * totalSumValue);
     CPPUNIT_ASSERT(totalMaxResidual < 0.13 * totalMaxValue);
-    CPPUNIT_ASSERT(totalPercentileError < 0.012 * totalSumValue);
+    CPPUNIT_ASSERT(totalPercentileError < 0.01 * totalSumValue);
 }
 
 void CTimeSeriesDecompositionTest::testSinglePeriodicity() {
@@ -656,8 +656,8 @@ void CTimeSeriesDecompositionTest::testSinglePeriodicity() {
     LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue);
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
-    CPPUNIT_ASSERT(totalSumResidual < 0.015 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.024 * totalMaxValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.013 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.023 * totalMaxValue);
     CPPUNIT_ASSERT(totalPercentileError < 0.01 * totalSumValue);
 
     // Check that only the daily component has been initialized.
@@ -781,8 +781,8 @@ void CTimeSeriesDecompositionTest::testSeasonalOnset() {
     LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue);
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
-    CPPUNIT_ASSERT(totalSumResidual < 0.07 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.09 * totalMaxValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.062 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.077 * totalMaxValue);
     CPPUNIT_ASSERT(totalPercentileError < 0.03 * totalSumValue);
 }
 
@@ -836,7 +836,7 @@ void CTimeSeriesDecompositionTest::testVarianceScale() {
     LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(error));
     LOG_DEBUG(<< "mean 70% error = " << maths::CBasicStatistics::mean(percentileError));
     LOG_DEBUG(<< "mean scale = " << maths::CBasicStatistics::mean(meanScale));
-    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.23);
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.24);
     CPPUNIT_ASSERT(maths::CBasicStatistics::mean(percentileError) < 0.05);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(1.0, maths::CBasicStatistics::mean(meanScale), 0.04);
 }
@@ -1016,9 +1016,9 @@ void CTimeSeriesDecompositionTest::testSpikeyDataProblemCase() {
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
 
-    CPPUNIT_ASSERT(totalSumResidual < 0.20 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.33 * totalMaxValue);
-    CPPUNIT_ASSERT(totalPercentileError < 0.14 * totalSumValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.19 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.31 * totalMaxValue);
+    CPPUNIT_ASSERT(totalPercentileError < 0.13 * totalSumValue);
 
     //std::ofstream file;
     //file.open("results.m");
@@ -1161,9 +1161,9 @@ void CTimeSeriesDecompositionTest::testVeryLargeValuesProblemCase() {
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
 
-    CPPUNIT_ASSERT(totalSumResidual < 0.31 * totalSumValue);
+    CPPUNIT_ASSERT(totalSumResidual < 0.26 * totalSumValue);
     CPPUNIT_ASSERT(totalMaxResidual < 0.72 * totalMaxValue);
-    CPPUNIT_ASSERT(totalPercentileError < 0.21 * totalSumValue);
+    CPPUNIT_ASSERT(totalPercentileError < 0.14 * totalSumValue);
 
     //file << "hold on;\n";
     //file << "t = " << core::CContainerPrinter::print(times) << ";\n";
@@ -1277,10 +1277,6 @@ void CTimeSeriesDecompositionTest::testMixedSmoothAndSpikeyDataProblemCase() {
     LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue);
     LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue);
 
-    CPPUNIT_ASSERT(totalSumResidual < 0.17 * totalSumValue);
-    CPPUNIT_ASSERT(totalMaxResidual < 0.40 * totalMaxValue);
-    CPPUNIT_ASSERT(totalPercentileError < 0.07 * totalSumValue);
-
     //file << "hold on;\n";
     //file << "t = " << core::CContainerPrinter::print(times) << ";\n";
     //file << "f = " << core::CContainerPrinter::print(values) << ";\n";
@@ -1289,6 +1285,10 @@ void CTimeSeriesDecompositionTest::testMixedSmoothAndSpikeyDataProblemCase() {
     //file << "r = " << core::CContainerPrinter::print(r) << ";\n";
     //file << "plot(t(1:length(fe)), fe);\n";
     //file << "plot(t(1:length(r)), r, 'k');\n";
+
+    CPPUNIT_ASSERT(totalSumResidual < 0.14 * totalSumValue);
+    CPPUNIT_ASSERT(totalMaxResidual < 0.38 * totalMaxValue);
+    CPPUNIT_ASSERT(totalPercentileError < 0.04 * totalSumValue);
 }
 
 void CTimeSeriesDecompositionTest::testDiurnalPeriodicityWithMissingValues() {
@@ -1331,7 +1331,7 @@ void CTimeSeriesDecompositionTest::testDiurnalPeriodicityWithMissingValues() {
     }
 
     LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(error));
-    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.1);
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.09);
 
     //file << "hold on;\n";
     //file << "t = " << core::CContainerPrinter::print(times) << ";\n";
@@ -1565,7 +1565,7 @@ void CTimeSeriesDecompositionTest::testLongTermTrend() {
         //file << "plot(t, fe);\n";
 
         CPPUNIT_ASSERT(totalSumResidual / totalSumValue < 0.38);
-        CPPUNIT_ASSERT(totalMaxResidual / totalMaxValue < 0.42);
+        CPPUNIT_ASSERT(totalMaxResidual / totalMaxValue < 0.41);
     }
 }
 
@@ -1637,8 +1637,8 @@ void CTimeSeriesDecompositionTest::testLongTermTrendAndPeriodicity() {
                 totalSumValue += sumValue;
                 totalMaxValue += maxValue;
 
-                CPPUNIT_ASSERT(sumResidual / sumValue < 0.4);
-                CPPUNIT_ASSERT(maxResidual / maxValue < 0.4);
+                CPPUNIT_ASSERT(sumResidual / sumValue < 0.34);
+                CPPUNIT_ASSERT(maxResidual / maxValue < 0.39);
             }
             lastDay += DAY;
         }
@@ -1681,7 +1681,7 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() {
     rng.generateNormalSamples(0.0, 1.0, trends[1].size(), noise);
 
     core_t::TTime startTesting[]{3 * HOUR, 16 * DAY};
-    TDoubleVec thresholds[]{TDoubleVec{0.09, 0.07}, TDoubleVec{0.19, 0.16}};
+    TDoubleVec thresholds[]{TDoubleVec{0.08, 0.07}, TDoubleVec{0.18, 0.15}};
 
     for (std::size_t t = 0u; t < 2; ++t) {
         //std::ofstream file;
@@ -1841,7 +1841,7 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() {
         //file << "plot(t, fe);\n";
 
         CPPUNIT_ASSERT(totalSumResidual / totalSumValue < 0.1);
-        CPPUNIT_ASSERT(totalMaxResidual / totalMaxValue < 0.17);
+        CPPUNIT_ASSERT(totalMaxResidual / totalMaxValue < 0.18);
     }
 }
 
@@ -1907,7 +1907,72 @@ void CTimeSeriesDecompositionTest::testYearly() {
     //file << "plot(t, fe);\n";
 
     LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(meanError));
-    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanError) < 0.01);
+
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanError) < 0.011);
+}
+
+void CTimeSeriesDecompositionTest::testWithOutliers() {
+    // Test smooth periodic signal polluted with outliers.
+
+    using TSizeVec = std::vector<std::size_t>;
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec noise;
+    TSizeVec outliers;
+    TDoubleVec spikeOrTroughSelector;
+
+    core_t::TTime buckets{WEEK / TEN_MINS};
+    std::size_t numberOutliers{static_cast<std::size_t>(0.1 * buckets)};
+    rng.generateUniformSamples(0, buckets, numberOutliers, outliers);
+    rng.generateUniformSamples(0, 1.0, numberOutliers, spikeOrTroughSelector);
+    rng.generateNormalSamples(0.0, 9.0, buckets, noise);
+    std::sort(outliers.begin(), outliers.end());
+
+    //std::ofstream file;
+    //file.open("results.m");
+    //TTimeVec times;
+    //TDoubleVec values;
+    //TDoubleVec f;
+
+    auto trend = [](core_t::TTime time) {
+        return 25.0 + 20.0 * std::sin(boost::math::double_constants::two_pi *
+                                      static_cast<double>(time) /
+                                      static_cast<double>(DAY));
+    };
+
+    maths::CTimeSeriesDecomposition decomposition(0.01, TEN_MINS);
+
+    for (core_t::TTime time = 0; time < WEEK; time += TEN_MINS) {
+        std::size_t bucket(time / TEN_MINS);
+        auto outlier = std::lower_bound(outliers.begin(), outliers.end(), bucket);
+        double value =
+            outlier != outliers.end() && *outlier == bucket
+                ? (spikeOrTroughSelector[outlier - outliers.begin()] > 0.5 ? 0.0 : 50.0)
+                : trend(time);
+
+        if (decomposition.addPoint(time, value)) {
+            TMeanAccumulator error;
+            for (core_t::TTime endTime = time + DAY; time < endTime; time += TEN_MINS) {
+                double prediction =
+                    maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0));
+                error.add(std::fabs(prediction - trend(time)) / trend(time));
+                //times.push_back(time);
+                //values.push_back(trend(time));
+                //f.push_back(prediction);
+            }
+
+            //file << "t = " << core::CContainerPrinter::print(times) << ";\n";
+            //file << "f = " << core::CContainerPrinter::print(values) << ";\n";
+            //file << "fe = " << core::CContainerPrinter::print(f) << ";\n";
+            //file << "plot(t, f, 'r');\n";
+            //file << "plot(t, fe);\n";
+
+            LOG_DEBUG(<< "error = " << maths::CBasicStatistics::mean(error));
+            CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.05);
+            break;
+        }
+    }
+}
 
 void CTimeSeriesDecompositionTest::testCalendar() {
@@ -2000,8 +2065,8 @@ void CTimeSeriesDecompositionTest::testConditionOfTrend() {
     maths::CTimeSeriesDecomposition decomposition(0.0005, bucketLength);
 
     TDoubleVec noise;
-    for (core_t::TTime time = 0; time < 10 * YEAR; time += 6 * HOUR) {
-        rng.generateNormalSamples(0.0, 3.0, 1, noise);
+    for (core_t::TTime time = 0; time < 9 * YEAR; time += 6 * HOUR) {
+        rng.generateNormalSamples(0.0, 4.0, 1, noise);
         decomposition.addPoint(time, trend(time) + noise[0]);
         if (time > 10 * WEEK) {
             CPPUNIT_ASSERT(std::fabs(decomposition.detrend(time, trend(time), 0.0)) < 3.0);
@@ -2303,6 +2368,9 @@ CppUnit::Test* CTimeSeriesDecompositionTest::suite() {
         &CTimeSeriesDecompositionTest::testNonDiurnal));
     suiteOfTests->addTest(new CppUnit::TestCaller<CTimeSeriesDecompositionTest>(
         "CTimeSeriesDecompositionTest::testYearly", &CTimeSeriesDecompositionTest::testYearly));
+    suiteOfTests->addTest(new CppUnit::TestCaller<CTimeSeriesDecompositionTest>(
+        "CTimeSeriesDecompositionTest::testWithOutliers",
+        &CTimeSeriesDecompositionTest::testWithOutliers));
     suiteOfTests->addTest(new CppUnit::TestCaller<CTimeSeriesDecompositionTest>(
         "CTimeSeriesDecompositionTest::testCalendar", &CTimeSeriesDecompositionTest::testCalendar));
diff --git a/lib/maths/unittest/CTimeSeriesDecompositionTest.h b/lib/maths/unittest/CTimeSeriesDecompositionTest.h
index cc7cfe964a..d7df6781f4 100644
--- a/lib/maths/unittest/CTimeSeriesDecompositionTest.h
+++ b/lib/maths/unittest/CTimeSeriesDecompositionTest.h
@@ -26,6 +26,7 @@ class CTimeSeriesDecompositionTest : public CppUnit::TestFixture {
     void testLongTermTrendAndPeriodicity();
     void testNonDiurnal();
    void testYearly();
+    void testWithOutliers();
     void testCalendar();
     void testConditionOfTrend();
     void testSwap();
diff --git a/lib/maths/unittest/CTimeSeriesModelTest.cc b/lib/maths/unittest/CTimeSeriesModelTest.cc
index 1ce94da2a9..6a2ac96003 100644
--- a/lib/maths/unittest/CTimeSeriesModelTest.cc
+++ b/lib/maths/unittest/CTimeSeriesModelTest.cc
@@ -1394,7 +1394,7 @@ void CTimeSeriesModelTest::testWeights() {
         error.add(std::fabs(scale - dataScale) / dataScale);
     }
     LOG_DEBUG(<< "error = " << maths::CBasicStatistics::mean(error));
-    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.2);
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.21);
 
     LOG_DEBUG(<< "Winsorisation");
     TDouble2Vec prediction(model.predict(time));