diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 252a13e975..ca132857ba 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -29,6 +29,7 @@ Improve and use periodic boundary condition for seasonal component modeling ({pull}84[#84]) Improve robustness w.r.t. outliers of detection and initialisation of seasonal components ({pull}90[#90]) +Explicit change point detection and modelling ({pull}92[#92]) === Bug Fixes diff --git a/include/core/CContainerPrinter.h b/include/core/CContainerPrinter.h index fb8331c59a..9b533bd346 100644 --- a/include/core/CContainerPrinter.h +++ b/include/core/CContainerPrinter.h @@ -274,7 +274,7 @@ class CORE_EXPORT CContainerPrinter : private CNonInstantiatable { return *value; } - //! Print a boost::shared_pointer. + //! Print a std::shared_pointer. template static std::string printElement(const std::shared_ptr& value) { if (value == std::shared_ptr()) { diff --git a/include/core/CRapidJsonWriterBase.h b/include/core/CRapidJsonWriterBase.h index 4a64dfaef9..b10c14df57 100644 --- a/include/core/CRapidJsonWriterBase.h +++ b/include/core/CRapidJsonWriterBase.h @@ -74,7 +74,6 @@ class CRapidJsonWriterBase using TValue = rapidjson::Value; using TDocumentWeakPtr = std::weak_ptr; using TValuePtr = std::shared_ptr; - using TPoolAllocatorPtr = std::shared_ptr; using TPoolAllocatorPtrStack = std::stack; using TStrPoolAllocatorPtrMap = boost::unordered_map; diff --git a/include/core/Constants.h b/include/core/Constants.h index 95114b3cfc..208939f920 100644 --- a/include/core/Constants.h +++ b/include/core/Constants.h @@ -16,38 +16,41 @@ namespace ml { namespace core { namespace constants { +//! A minute in seconds. +const core_t::TTime MINUTE{60}; + //! An hour in seconds. -const core_t::TTime HOUR = 3600; +const core_t::TTime HOUR{3600}; //! A day in seconds. -const core_t::TTime DAY = 86400; +const core_t::TTime DAY{86400}; //! A (two day) weekend in seconds. -const core_t::TTime WEEKEND = 172800; +const core_t::TTime WEEKEND{172800}; //! Five weekdays in seconds. -const core_t::TTime WEEKDAYS = 432000; +const core_t::TTime WEEKDAYS{432000}; //! A week in seconds. -const core_t::TTime WEEK = 604800; +const core_t::TTime WEEK{604800}; //! A (364 day) year in seconds. -const core_t::TTime YEAR = 31449600; +const core_t::TTime YEAR{31449600}; //! Log of min double. -const double LOG_MIN_DOUBLE = std::log(std::numeric_limits::min()); +const double LOG_MIN_DOUBLE{std::log(std::numeric_limits::min())}; //! Log of max double. -const double LOG_MAX_DOUBLE = std::log(std::numeric_limits::max()); +const double LOG_MAX_DOUBLE{std::log(std::numeric_limits::max())}; //! Log of double epsilon. -const double LOG_DOUBLE_EPSILON = std::log(std::numeric_limits::epsilon()); +const double LOG_DOUBLE_EPSILON{std::log(std::numeric_limits::epsilon())}; //! Log of two. -const double LOG_TWO = 0.693147180559945; +const double LOG_TWO{0.693147180559945}; //! Log of two pi. -const double LOG_TWO_PI = 1.83787706640935; +const double LOG_TWO_PI{1.83787706640935}; #ifdef Windows const char PATH_SEPARATOR = '\\'; diff --git a/include/maths/CBasicStatistics.h b/include/maths/CBasicStatistics.h index cb3640a2ad..11d6ae0676 100644 --- a/include/maths/CBasicStatistics.h +++ b/include/maths/CBasicStatistics.h @@ -201,6 +201,14 @@ class MATHS_EXPORT CBasicStatistics { } } + //! Update the moments with the collection \p x. + template + void add(const core::CSmallVector& x) { + for (const auto& xi : x) { + this->add(xi); + } + } + //! 
Update the moments with the collection \p x. template void add(const std::vector>& x) { diff --git a/include/maths/CCalendarComponent.h b/include/maths/CCalendarComponent.h index 59cdd73ee8..d5ec0b5fc1 100644 --- a/include/maths/CCalendarComponent.h +++ b/include/maths/CCalendarComponent.h @@ -82,6 +82,9 @@ class MATHS_EXPORT CCalendarComponent : private CDecompositionComponent { //! Clear all data. void clear(); + //! Linearly scale the component's by \p scale. + void linearScale(core_t::TTime time, double scale); + //! Adds a value \f$(t, f(t))\f$ to this component. //! //! \param[in] time The time of the point. diff --git a/include/maths/CCalendarComponentAdaptiveBucketing.h b/include/maths/CCalendarComponentAdaptiveBucketing.h index 2a303792f9..dd389e9c0d 100644 --- a/include/maths/CCalendarComponentAdaptiveBucketing.h +++ b/include/maths/CCalendarComponentAdaptiveBucketing.h @@ -67,6 +67,9 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : private CAdaptiveBucket //! allocated memory. void clear(); + //! Linearly scale the bucket values by \p scale. + void linearScale(double scale); + //! Add the function value at \p time. //! //! \param[in] time The time of \p value. diff --git a/include/maths/CModel.h b/include/maths/CModel.h index 90d6a3c6ec..46cc4a243d 100644 --- a/include/maths/CModel.h +++ b/include/maths/CModel.h @@ -44,9 +44,11 @@ using TForecastPushDatapointFunc = std::function; class MATHS_EXPORT CModelParams { public: CModelParams(core_t::TTime bucketLength, - const double& learnRate, - const double& decayRate, - double minimumSeasonalVarianceScale); + double learnRate, + double decayRate, + double minimumSeasonalVarianceScale, + core_t::TTime minimumTimeToDetectChange, + core_t::TTime maximumTimeToTestForChange); //! Get the bucket length. core_t::TTime bucketLength() const; @@ -63,6 +65,15 @@ class MATHS_EXPORT CModelParams { //! Get the minimum seasonal variance scale. double minimumSeasonalVarianceScale() const; + //! Check if we should start testing for a change point in the model. + bool testForChange(core_t::TTime changeInterval) const; + + //! Get the minimum time to detect a change point in the model. + core_t::TTime minimumTimeToDetectChange(void) const; + + //! Get the maximum time to test for a change point in the model. + core_t::TTime maximumTimeToTestForChange(void) const; + //! Set the probability that the bucket will be empty for the model. void probabilityBucketEmpty(double probability); @@ -78,6 +89,10 @@ class MATHS_EXPORT CModelParams { double m_DecayRate; //! The minimum seasonal variance scale. double m_MinimumSeasonalVarianceScale; + //! The minimum time permitted to detect a change in the model. + core_t::TTime m_MinimumTimeToDetectChange; + //! The maximum time permitted to test for a change in the model. + core_t::TTime m_MaximumTimeToTestForChange; //! The probability that a bucket will be empty for the model. double m_ProbabilityBucketEmpty; }; @@ -90,8 +105,6 @@ class MATHS_EXPORT CModelAddSamplesParams { public: CModelAddSamplesParams(); - CModelAddSamplesParams(const CModelAddSamplesParams&) = delete; - const CModelAddSamplesParams& operator=(const CModelAddSamplesParams&) = delete; //! Set whether or not the data are integer valued. CModelAddSamplesParams& integer(bool integer); @@ -145,8 +158,6 @@ class MATHS_EXPORT CModelProbabilityParams { public: CModelProbabilityParams(); - CModelProbabilityParams(const CModelAddSamplesParams&) = delete; - const CModelProbabilityParams& operator=(const CModelAddSamplesParams&) = delete; //! 
Set the tag for the entity for which to compute the probability. CModelProbabilityParams& tag(std::size_t tag); @@ -254,6 +265,9 @@ class MATHS_EXPORT CModel { E_Reset //!< Model reset. }; + //! Combine the results \p lhs and \p rhs. + static EUpdateResult combine(EUpdateResult lhs, EUpdateResult rhs); + public: CModel(const CModelParams& params); virtual ~CModel() = default; diff --git a/include/maths/CNaiveBayes.h b/include/maths/CNaiveBayes.h new file mode 100644 index 0000000000..c5a7cfb5a0 --- /dev/null +++ b/include/maths/CNaiveBayes.h @@ -0,0 +1,278 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CNaiveBayes_h +#define INCLUDED_ml_maths_CNaiveBayes_h + +#include + +#include + +#include +#include + +#include +#include +#include +#include + +namespace ml { +namespace core { +class CStatePersistInserter; +class CStateRestoreTraverser; +} +namespace maths { +struct SDistributionRestoreParams; + +//! \brief The interface expected by CNaiveBayes for implementations +//! of the class conditional density functions. +class MATHS_EXPORT CNaiveBayesFeatureDensity { +public: + using TDouble1Vec = core::CSmallVector; + +public: + virtual ~CNaiveBayesFeatureDensity() = default; + + //! Create and return a clone. + //! + //! \note The caller owns this. + virtual CNaiveBayesFeatureDensity* clone() const = 0; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) = 0; + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const = 0; + + //! Set the data type. + virtual void dataType(maths_t::EDataType dataType) = 0; + + //! Add the value \p x. + virtual void add(const TDouble1Vec& x) = 0; + + //! Compute the log value of the density function at \p x. + virtual double logValue(const TDouble1Vec& x) const = 0; + + //! Compute the density at the mode. + virtual double logMaximumValue() const = 0; + + //! Age out old values density to account for \p time passing. + virtual void propagateForwardsByTime(double time) = 0; + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const = 0; + + //! Get the static size of this object. + virtual std::size_t staticSize() const = 0; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const = 0; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const = 0; + + //! Get a human readable description of the class density function. + virtual std::string print() const = 0; +}; + +//! \brief An implementation of the class conditional density function +//! based on the CPrior hierarchy. +class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayesFeatureDensity { +public: + CNaiveBayesFeatureDensityFromPrior() = default; + CNaiveBayesFeatureDensityFromPrior(const CPrior& prior); + + //! Create and return a clone. + //! + //! \note The caller owns this. + virtual CNaiveBayesFeatureDensityFromPrior* clone() const; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); + + //! 
Persist state by passing information to \p inserter.
+ virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+
+ //! Add the value \p x.
+ virtual void add(const TDouble1Vec& x);
+
+ //! Compute the log value of the density function at \p x.
+ virtual double logValue(const TDouble1Vec& x) const;
+
+ //! Compute the density at the mode.
+ virtual double logMaximumValue() const;
+
+ //! Set the data type.
+ virtual void dataType(maths_t::EDataType dataType);
+
+ //! Age out old values from the density to account for \p time passing.
+ virtual void propagateForwardsByTime(double time);
+
+ //! Debug the memory used by this object.
+ virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+ //! Get the static size of this object.
+ virtual std::size_t staticSize() const;
+
+ //! Get the memory used by this object.
+ virtual std::size_t memoryUsage() const;
+
+ //! Get a checksum for this object.
+ virtual uint64_t checksum(uint64_t seed) const;
+
+ //! Get a human readable description of the class density function.
+ virtual std::string print() const;
+
+private:
+ using TPriorPtr = std::shared_ptr;
+
+private:
+ //! The density model.
+ TPriorPtr m_Prior;
+};
+
+//! \brief Implements a Naive Bayes classifier.
+class MATHS_EXPORT CNaiveBayes {
+public:
+ using TDoubleSizePr = std::pair;
+ using TDoubleSizePrVec = std::vector;
+ using TDouble1Vec = core::CSmallVector;
+ using TDouble1VecVec = std::vector;
+ using TOptionalDouble = boost::optional;
+
+public:
+ explicit CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar,
+ double decayRate = 0.0,
+ TOptionalDouble minMaxLogLikelihoodToUseFeature = TOptionalDouble());
+ CNaiveBayes(const SDistributionRestoreParams& params,
+ core::CStateRestoreTraverser& traverser);
+
+ //! Persist state by passing information to \p inserter.
+ void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+
+ //! Efficiently swap the contents of this and \p other.
+ void swap(CNaiveBayes& other);
+
+ //! Check if the classifier is initialized, i.e. if any training data has been added.
+ bool initialized() const;
+
+ //! This can be used to optionally seed the class counts
+ //! with \p counts. These are added on to data class counts
+ //! to compute the class posterior probabilities.
+ void initialClassCounts(const TDoubleSizePrVec& counts);
+
+ //! Add a training data point comprising the pair \f$(x,l)\f$
+ //! for feature vector \f$x\f$ and class label \f$l\f$.
+ //!
+ //! \param[in] label The class label for \p x.
+ //! \param[in] x The feature values.
+ //! \note \p x size should be equal to the number of features.
+ //! A missing feature is indicated by passing an empty vector
+ //! for that feature.
+ void addTrainingDataPoint(std::size_t label, const TDouble1VecVec& x);
+
+ //! Set the data type.
+ void dataType(maths_t::EDataType dataType);
+
+ //! Age out old values from the class conditional densities
+ //! to account for \p time passing.
+ void propagateForwardsByTime(double time);
+
+ //! Get the top \p n class probabilities for \p x.
+ //!
+ //! \param[in] n The number of class probabilities to estimate.
+ //! \param[in] x The feature values.
+ //! \note \p x size should be equal to the number of features.
+ //! A missing feature is indicated by passing an empty vector
+ //! for that feature.
+ TDoubleSizePrVec highestClassProbabilities(std::size_t n, const TDouble1VecVec& x) const;
+
+ //! Get the probability of the class labeled \p label for \p x.
+ //!
+ //! \param[in] label The label of the class of interest.
+ //! \param[in] x The feature values. + //! \note \p x size should be equal to the number of features. + //! A feature is missing is indicated by passing an empty vector + //! for that feature. + double classProbability(std::size_t label, const TDouble1VecVec& x) const; + + //! Get the probabilities of all the classes for \p x. + //! + //! \param[in] x The feature values. + //! \note \p x size should be equal to the number of features. + //! A feature is missing is indicated by passing an empty vector + //! for that feature. + TDoubleSizePrVec classProbabilities(const TDouble1VecVec& x) const; + + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the memory used by this object. + std::size_t memoryUsage() const; + + //! Get a checksum for this object. + uint64_t checksum(uint64_t seed = 0) const; + + //! Get a human readable description of the classifier. + std::string print() const; + +private: + using TFeatureDensityPtr = std::shared_ptr; + using TFeatureDensityPtrVec = std::vector; + + //! \brief The data associated with a class. + struct SClass { + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); + //! Persist state by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + //! Get the memory used by this object. + std::size_t memoryUsage() const; + //! Get a checksum for this object. + uint64_t checksum(uint64_t seed = 0) const; + + //! The number of examples in this class. + double s_Count = 0.0; + //! The feature conditional densities for this class. + TFeatureDensityPtrVec s_ConditionalDensities; + }; + + using TSizeClassUMap = boost::unordered_map; + +private: + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); + + //! Validate \p x. + bool validate(const TDouble1VecVec& x) const; + +private: + //! It is not always appropriate to use features with very low + //! probability in all classes to discriminate: the class choice + //! will be very sensitive to the underlying conditional density + //! model. This is a cutoff (for the minimum maximum class log + //! likelihood) in order to use a feature. + TOptionalDouble m_MinMaxLogLikelihoodToUseFeature; + + //! Controls the rate at which data are aged out. + double m_DecayRate; + + //! An exemplar for creating conditional densities. + TFeatureDensityPtr m_Exemplar; + + //! The class conditional density estimates and weights. + TSizeClassUMap m_ClassConditionalDensities; +}; +} +} + +#endif // INCLUDED_ml_maths_CNaiveBayes_h diff --git a/include/maths/CPriorDetail.h b/include/maths/CPriorDetail.h index 5c2a982cd7..03087a6e05 100644 --- a/include/maths/CPriorDetail.h +++ b/include/maths/CPriorDetail.h @@ -11,21 +11,6 @@ namespace ml { namespace maths { -//! Compute the expectation of the specified function w.r.t. to the marginal -//! likelihood. -//! -//! This computes the expectation using order three Gauss-Legendre quadrature -//! in \p numberIntervals subdivisions of a high confidence interval for the -//! marginal likelihood. -//! -//! \param f The function to integrate. -//! \param numberIntervals The number intervals to use for integration. -//! 
\param result Filled in with the result if the expectation could be calculated. -//! -//! \tparam F This must conform to the function type expected by -//! CIntegration::gaussLegendre. -//! \tparam T The return type of the function F which must conform to the type -//! expected by CIntegration::gaussLegendre. template bool CPrior::expectation(const F& f, std::size_t numberIntervals, @@ -39,13 +24,13 @@ bool CPrior::expectation(const F& f, result = T(); - double n = static_cast(numberIntervals); - TDoubleDoublePr interval = - this->marginalLikelihoodConfidenceInterval(100.0 - 1.0 / (100.0 * n), weight); - double x = interval.first; - double dx = (interval.second - interval.first) / n; + double n{static_cast(numberIntervals)}; + TDoubleDoublePr interval{this->marginalLikelihoodConfidenceInterval( + 100.0 - 1.0 / (100.0 * n), weight)}; + double x{interval.first}; + double dx{(interval.second - interval.first) / n}; - double normalizationFactor = 0.0; + double Z{0.0}; TDoubleWeightsAry1Vec weights{weight}; CPrior::CLogMarginalLikelihood logLikelihood(*this, weights); CCompositeFunctions::CExp likelihood(logLikelihood); @@ -59,9 +44,9 @@ bool CPrior::expectation(const F& f, return false; } result += productIntegral; - normalizationFactor += likelihoodIntegral; + Z += likelihoodIntegral; } - result /= normalizationFactor; + result /= Z; return true; } diff --git a/include/maths/CRegression.h b/include/maths/CRegression.h index a4126dae1b..ae16838aa1 100644 --- a/include/maths/CRegression.h +++ b/include/maths/CRegression.h @@ -215,6 +215,20 @@ class MATHS_EXPORT CRegression { } } + //! Linearly scale the regression model. + //! + //! i.e. apply a transform such that each regression parameter maps + //! to \p scale times its current value. + //! + //! \param[in] scale The scale to apply to the regression parameters. + void linearScale(double scale) { + if (CBasicStatistics::count(m_S) > 0.0) { + for (std::size_t i = 0u; i < N; ++i) { + CBasicStatistics::moment<0>(m_S)(i + 2 * N - 1) *= scale; + } + } + } + //! Multiply the statistics' count by \p scale. CLeastSquaresOnline scaled(double scale) const { CLeastSquaresOnline result(*this); @@ -251,12 +265,11 @@ class MATHS_EXPORT CRegression { TArray params; if (this->parameters(params, maxCondition)) { std::ptrdiff_t n = static_cast(params.size()); - double xi = x; for (std::ptrdiff_t i = n - 1; i >= 0; --i) { result[i] = params[i]; for (std::ptrdiff_t j = i + 1; j < n; ++j) { params[j] *= static_cast(i + 1) / - static_cast(j - i) * xi; + static_cast(j - i) * x; result[i] += params[j]; } } diff --git a/include/maths/CRestoreParams.h b/include/maths/CRestoreParams.h index 045e4ee52d..dda3bdcf0a 100644 --- a/include/maths/CRestoreParams.h +++ b/include/maths/CRestoreParams.h @@ -9,6 +9,7 @@ #include +#include #include #include @@ -18,31 +19,14 @@ namespace ml { namespace maths { class CModelParams; -//! \brief Gatherers up extra parameters supplied when restoring -//! time series decompositions. -struct MATHS_EXPORT STimeSeriesDecompositionRestoreParams { - STimeSeriesDecompositionRestoreParams(double decayRate, - core_t::TTime minimumBucketLength, - std::size_t componentSize); - - //! The rate at which decomposition loses information. - double s_DecayRate; - - //! The data bucket length. - core_t::TTime s_MinimumBucketLength; - - //! The decomposition seasonal component size. - std::size_t s_ComponentSize; -}; - //! \brief Gatherers up extra parameters supplied when restoring //! distribution models. 
struct MATHS_EXPORT SDistributionRestoreParams { SDistributionRestoreParams(maths_t::EDataType dataType, double decayRate, - double minimumClusterFraction, - double minimumClusterCount, - double minimumCategoryCount); + double minimumClusterFraction = MINIMUM_CLUSTER_SPLIT_FRACTION, + double minimumClusterCount = MINIMUM_CLUSTER_SPLIT_COUNT, + double minimumCategoryCount = MINIMUM_CATEGORY_COUNT); //! The type of data being clustered. maths_t::EDataType s_DataType; @@ -62,6 +46,30 @@ struct MATHS_EXPORT SDistributionRestoreParams { //! \brief Gatherers up extra parameters supplied when restoring //! time series decompositions. +struct MATHS_EXPORT STimeSeriesDecompositionRestoreParams { + STimeSeriesDecompositionRestoreParams(double decayRate, + core_t::TTime minimumBucketLength, + std::size_t componentSize, + const SDistributionRestoreParams& changeModelParams); + STimeSeriesDecompositionRestoreParams(double decayRate, + core_t::TTime minimumBucketLength, + const SDistributionRestoreParams& changeModelParams); + + //! The rate at which decomposition loses information. + double s_DecayRate; + + //! The data bucket length. + core_t::TTime s_MinimumBucketLength; + + //! The decomposition seasonal component size. + std::size_t s_ComponentSize; + + //! The change model distributions' restore parameters. + SDistributionRestoreParams s_ChangeModelParams; +}; + +//! \brief Gatherers up extra parameters supplied when restoring +//! time series models. struct MATHS_EXPORT SModelRestoreParams { using TModelParamsCRef = boost::reference_wrapper; @@ -75,7 +83,7 @@ struct MATHS_EXPORT SModelRestoreParams { //! The time series decomposition restore parameters. STimeSeriesDecompositionRestoreParams s_DecompositionParams; - //! The time series decomposition restore parameters. + //! The time series residual distribution restore parameters. SDistributionRestoreParams s_DistributionParams; }; } diff --git a/include/maths/CSeasonalComponent.h b/include/maths/CSeasonalComponent.h index ac1fa4abc8..8c84207b5a 100644 --- a/include/maths/CSeasonalComponent.h +++ b/include/maths/CSeasonalComponent.h @@ -100,6 +100,9 @@ class MATHS_EXPORT CSeasonalComponent : private CDecompositionComponent { //! Shift the component's slope by \p shift. void shiftSlope(double shift); + //! Linearly scale the component's by \p scale. + void linearScale(core_t::TTime time, double scale); + //! Adds a value \f$(t, f(t))\f$ to this component. //! //! \param[in] time The time of the point. diff --git a/include/maths/CSeasonalComponentAdaptiveBucketing.h b/include/maths/CSeasonalComponentAdaptiveBucketing.h index c568119203..cfc4ddaf84 100644 --- a/include/maths/CSeasonalComponentAdaptiveBucketing.h +++ b/include/maths/CSeasonalComponentAdaptiveBucketing.h @@ -95,6 +95,9 @@ class MATHS_EXPORT CSeasonalComponentAdaptiveBucketing : private CAdaptiveBucket //! Shift the regressions' gradients by \p shift. void shiftSlope(double shift); + //! Linearly scale the regressions by \p scale. + void linearScale(double scale); + //! Add the function value at \p time. //! //! \param[in] time The time of \p value. diff --git a/include/maths/CTimeSeriesChangeDetector.h b/include/maths/CTimeSeriesChangeDetector.h new file mode 100644 index 0000000000..c249636d83 --- /dev/null +++ b/include/maths/CTimeSeriesChangeDetector.h @@ -0,0 +1,409 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+#ifndef INCLUDED_ml_maths_CTimeSeriesChangeDetector_h
+#define INCLUDED_ml_maths_CTimeSeriesChangeDetector_h
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace ml { namespace core { class CStatePersistInserter; class CStateRestoreTraverser; } namespace maths { class CModelAddSamplesParams; class CPrior; class CTimeSeriesDecompositionInterface; struct SDistributionRestoreParams; struct SModelRestoreParams;
+
+namespace time_series_change_detector_detail { class CUnivariateChangeModel; }
+
+//! \brief A description of a time series change.
+struct MATHS_EXPORT SChangeDescription {
+ using TDouble2Vec = core::CSmallVector;
+ using TPriorPtr = std::shared_ptr;
+
+ //! The types of change we can detect.
+ enum EDescription { E_LevelShift, E_LinearScale, E_TimeShift };
+
+ SChangeDescription(EDescription description, double value, const TPriorPtr& residualModel);
+
+ //! Get a description of this change.
+ std::string print() const;
+
+ //! The type of change.
+ EDescription s_Description;
+
+ //! The change value.
+ TDouble2Vec s_Value;
+
+ //! The residual model to use after the change.
+ TPriorPtr s_ResidualModel;
+};
+
+//! \brief Tests a variety of possible changes which might have
+//! occurred in a time series and selects one if it provides a
+//! good explanation of the recent behaviour.
+class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
+public:
+ using TTimeDoublePr = std::pair;
+ using TTimeDoublePr1Vec = core::CSmallVector;
+ using TDoubleWeightsAry1Vec = maths_t::TDoubleWeightsAry1Vec;
+ using TDecompositionPtr = std::shared_ptr;
+ using TPriorPtr = std::shared_ptr;
+ using TOptionalChangeDescription = boost::optional;
+
+public:
+ CUnivariateTimeSeriesChangeDetector(const TDecompositionPtr& trendModel,
+ const TPriorPtr& residualModel,
+ core_t::TTime minimumTimeToDetect = 12 * core::constants::HOUR,
+ core_t::TTime maximumTimeToDetect = core::constants::DAY,
+ double minimumDeltaBicToDetect = 14.0);
+
+ //! Initialize by reading state from \p traverser.
+ bool acceptRestoreTraverser(const SModelRestoreParams& params,
+ core::CStateRestoreTraverser& traverser);
+
+ //! Persist state by passing information to \p inserter.
+ void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+
+ //! Check if there has been a change and get a description
+ //! if there has been.
+ TOptionalChangeDescription change();
+
+ //! The function used to decide whether to accept a change.
+ //! A change is accepted when this function reaches a value of 1.0.
+ //!
+ //! \param[out] change Filled in with the index of the most likely change.
+ double decisionFunction(std::size_t& change) const;
+
+ //! Add \p samples to the change detector.
+ void addSamples(const TTimeDoublePr1Vec& samples, const TDoubleWeightsAry1Vec& weights);
+
+ //! Check if we should stop testing.
+ bool stopTesting() const;
+
+ //! Debug the memory used by this object.
+ void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+ //! Get the memory used by this object.
+ std::size_t memoryUsage() const;
+
+ //! Get a checksum for this object.
+ uint64_t checksum(uint64_t seed = 0) const; + +private: + using TChangeModel = time_series_change_detector_detail::CUnivariateChangeModel; + using TChangeModelPtr = std::shared_ptr; + using TChangeModelPtr5Vec = core::CSmallVector; + using TMinMaxAccumulator = CBasicStatistics::CMinMax; + +private: + //! The minimum amount of time we need to observe before + //! selecting a change model. + core_t::TTime m_MinimumTimeToDetect; + + //! The maximum amount of time to try to detect a change. + core_t::TTime m_MaximumTimeToDetect; + + //! The minimum increase in BIC select a change model. + double m_MinimumDeltaBicToDetect; + + //! The start and end of the change model. + TMinMaxAccumulator m_TimeRange; + + //! The count of samples added to the change models. + std::size_t m_SampleCount; + + //! The current evidence of a change. + double m_CurrentEvidenceOfChange; + + //! The change models. + TChangeModelPtr5Vec m_ChangeModels; +}; + +namespace time_series_change_detector_detail { + +//! \brief Helper interface for change detection. Implementations of +//! this are used to model specific types of changes which can occur. +class MATHS_EXPORT CUnivariateChangeModel : private core::CNonCopyable { +public: + using TDouble1Vec = core::CSmallVector; + using TTimeDoublePr = std::pair; + using TTimeDoublePr1Vec = core::CSmallVector; + using TDoubleWeightsAry1Vec = maths_t::TDoubleWeightsAry1Vec; + using TDecompositionPtr = std::shared_ptr; + using TPriorPtr = std::shared_ptr; + using TOptionalChangeDescription = boost::optional; + +public: + CUnivariateChangeModel(const TDecompositionPtr& trendModel, const TPriorPtr& residualModel); + virtual ~CUnivariateChangeModel() = default; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) = 0; + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const = 0; + + //! The BIC of applying the change. + virtual double bic() const = 0; + + //! The expected BIC of applying the change. + virtual double expectedBic() const = 0; + + //! Get a description of the change. + virtual TOptionalChangeDescription change() const = 0; + + //! Update the change model with \p samples. + virtual void addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples, + TDoubleWeightsAry1Vec weights) = 0; + + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the memory used by this object. + std::size_t memoryUsage() const; + + //! Get the static size of this object. + virtual std::size_t staticSize() const = 0; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const = 0; + +protected: + //! Restore the residual model reading state from \p traverser. + bool restoreResidualModel(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); + + //! Get the log-likelihood. + double logLikelihood() const; + + //! Get the expected log-likelihood. + double expectedLogLikelihood() const; + + //! Update the log-likelihood with \p samples. + void updateLogLikelihood(const TDouble1Vec& samples, const TDoubleWeightsAry1Vec& weights); + + //! Update the expected log-likelihoods. + void updateExpectedLogLikelihood(const TDoubleWeightsAry1Vec& weights); + + //! Get the time series trend model. + const CTimeSeriesDecompositionInterface& trendModel() const; + + //! 
Get the time series residual model. + const CPrior& residualModel() const; + //! Get the time series residual model. + CPrior& residualModel(); + //! Get the time series residual model member variable. + TPriorPtr residualModelPtr() const; + +private: + //! The likelihood of the data under this model. + double m_LogLikelihood; + + //! The expected log-likelihood of the data under this model. + double m_ExpectedLogLikelihood; + + //! A model decomposing the time series trend. + TDecompositionPtr m_TrendModel; + + //! A reference to the underlying prior. + TPriorPtr m_ResidualModel; +}; + +//! \brief Used to capture the likelihood of the data given no change. +class MATHS_EXPORT CUnivariateNoChangeModel final : public CUnivariateChangeModel { +public: + CUnivariateNoChangeModel(const TDecompositionPtr& trendModel, const TPriorPtr& residualModel); + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + + //! Returns the no change BIC. + virtual double bic() const; + + //! The expected BIC of applying the change. + virtual double expectedBic() const; + + //! Returns a null object. + virtual TOptionalChangeDescription change() const; + + //! Get the log likelihood of \p samples. + virtual void addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples, + TDoubleWeightsAry1Vec weights); + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const; +}; + +//! \brief Captures the likelihood of the data given an arbitrary +//! level shift. +class MATHS_EXPORT CUnivariateLevelShiftModel final : public CUnivariateChangeModel { +public: + CUnivariateLevelShiftModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel); + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + + //! The BIC of applying the level shift. + virtual double bic() const; + + //! The expected BIC of applying the change. + virtual double expectedBic() const; + + //! Get a description of the level shift. + virtual TOptionalChangeDescription change() const; + + //! Update with \p samples. + virtual void addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples, + TDoubleWeightsAry1Vec weights); + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const; + +private: + using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; + +private: + //! The optimal shift. + TMeanAccumulator m_Shift; + + //! The mode of the initial residual distribution model. + double m_ResidualModelMode; + + //! The number of samples added so far. + double m_SampleCount; +}; + +//! \brief Captures the likelihood of the data given an arbitrary +//! linear scaling. +class MATHS_EXPORT CUnivariateLinearScaleModel final : public CUnivariateChangeModel { +public: + CUnivariateLinearScaleModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel); + + //! 
Initialize by reading state from \p traverser.
+ virtual bool acceptRestoreTraverser(const SModelRestoreParams& params,
+ core::CStateRestoreTraverser& traverser);
+
+ //! Persist state by passing information to \p inserter.
+ virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+
+ //! The BIC of applying the linear scale.
+ virtual double bic() const;
+
+ //! The expected BIC of applying the change.
+ virtual double expectedBic() const;
+
+ //! Get a description of the linear scale.
+ virtual TOptionalChangeDescription change() const;
+
+ //! Update with \p samples.
+ virtual void addSamples(const std::size_t count,
+ const TTimeDoublePr1Vec& samples,
+ TDoubleWeightsAry1Vec weights);
+
+ //! Get the static size of this object.
+ virtual std::size_t staticSize() const;
+
+ //! Get a checksum for this object.
+ virtual uint64_t checksum(uint64_t seed) const;
+
+private:
+ using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator;
+
+private:
+ //! The optimal scale.
+ TMeanAccumulator m_Scale;
+
+ //! The mode of the initial residual distribution model.
+ double m_ResidualModelMode;
+
+ //! The number of samples added so far.
+ double m_SampleCount;
+};
+
+//! \brief Captures the likelihood of the data given a specified
+//! time shift.
+class MATHS_EXPORT CUnivariateTimeShiftModel final : public CUnivariateChangeModel {
+public:
+ CUnivariateTimeShiftModel(const TDecompositionPtr& trendModel,
+ const TPriorPtr& residualModel,
+ core_t::TTime shift);
+
+ //! Initialize by reading state from \p traverser.
+ virtual bool acceptRestoreTraverser(const SModelRestoreParams& params,
+ core::CStateRestoreTraverser& traverser);
+
+ //! Persist state by passing information to \p inserter.
+ virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+
+ //! The BIC of applying the time shift.
+ virtual double bic() const;
+
+ //! The expected BIC of applying the change.
+ virtual double expectedBic() const;
+
+ //! Get a description of the time shift.
+ virtual TOptionalChangeDescription change() const;
+
+ //! Update with \p samples.
+ virtual void addSamples(const std::size_t count,
+ const TTimeDoublePr1Vec& samples,
+ TDoubleWeightsAry1Vec weights);
+
+ //! Get the static size of this object.
+ virtual std::size_t staticSize() const;
+
+ //! Get a checksum for this object.
+ virtual uint64_t checksum(uint64_t seed) const;
+
+private:
+ //! The shift in time of the time series trend model.
+ core_t::TTime m_Shift;
+};
+}
+}
+}
+
+#endif // INCLUDED_ml_maths_CTimeSeriesChangeDetector_h
diff --git a/include/maths/CTimeSeriesDecomposition.h b/include/maths/CTimeSeriesDecomposition.h
index b94f26c08a..38c72c8bb0 100644
--- a/include/maths/CTimeSeriesDecomposition.h
+++ b/include/maths/CTimeSeriesDecomposition.h
@@ -9,6 +9,7 @@
 #include #include
+#include
 #include #include
@@ -22,6 +23,7 @@ class CStateRestoreTraverser; } namespace maths { class CPrior;
+struct STimeSeriesDecompositionRestoreParams;
 //! \brief Decomposes a time series into a linear combination //! of periodic functions and a stationary random process.
@@ -51,10 +53,6 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt public: using TSizeVec = std::vector;
-public:
- //! The default size to use for the seasonal components.
- static const std::size_t DEFAULT_COMPONENT_SIZE;
-
 public: //! \param[in] decayRate The rate at which information is lost. //! \param[in] bucketLength The data bucketing length.
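To make the intent of the new change detection plumbing concrete, the following is a minimal, hypothetical driver and is not part of this patch: a detector shadows the trend and residual models of a univariate series, is fed the same (time, value) samples, and any accepted change is applied back to the decomposition via applyChange(). The helper name monitorForChange, the samples container and the use of unit weights are illustrative assumptions.

    #include <maths/CTimeSeriesChangeDetector.h>
    #include <maths/CTimeSeriesDecompositionInterface.h>
    #include <maths/MathsTypes.h>

    #include <vector>

    using TDetector = ml::maths::CUnivariateTimeSeriesChangeDetector;

    // Hypothetical helper: test a window of samples for a change and, if one
    // is accepted, apply it to the trend decomposition.
    void monitorForChange(const TDetector::TDecompositionPtr& trendModel,
                          const TDetector::TPriorPtr& residualModel,
                          const std::vector<TDetector::TTimeDoublePr>& samples) {
        TDetector detector{trendModel, residualModel};
        for (const auto& sample : samples) {
            // Unit weights are assumed here for simplicity.
            detector.addSamples({sample}, {ml::maths_t::CUnitWeights::UNIT});
            if (auto change = detector.change()) {
                // The detector accepted a change: apply it at the change point.
                trendModel->applyChange(sample.first, sample.second, *change);
                break;
            }
            if (detector.stopTesting()) {
                break; // No convincing change within the maximum test window.
            }
        }
    }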
@@ -62,16 +60,15 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt //! use estimate a seasonal component. explicit CTimeSeriesDecomposition(double decayRate = 0.0, core_t::TTime bucketLength = 0, - std::size_t seasonalComponentSize = DEFAULT_COMPONENT_SIZE); + std::size_t seasonalComponentSize = COMPONENT_SIZE); //! Construct from part of a state document. - CTimeSeriesDecomposition(double decayRate, - core_t::TTime bucketLength, - std::size_t seasonalComponentSize, + CTimeSeriesDecomposition(const STimeSeriesDecompositionRestoreParams& params, core::CStateRestoreTraverser& traverser); //! Deep copy. - CTimeSeriesDecomposition(const CTimeSeriesDecomposition& other); + CTimeSeriesDecomposition(const CTimeSeriesDecomposition& other, + bool isForForecast = false); //! An efficient swap of the state of this and \p other. void swap(CTimeSeriesDecomposition& other); @@ -83,7 +80,10 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt void acceptPersistInserter(core::CStatePersistInserter& inserter) const; //! Clone this decomposition. - virtual CTimeSeriesDecomposition* clone() const; + virtual CTimeSeriesDecomposition* clone(bool isForForecast = false) const; + + //! Set the data type. + virtual void dataType(maths_t::EDataType dataType); //! Set the decay rate. virtual void decayRate(double decayRate); @@ -107,22 +107,31 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt double value, const maths_t::TDoubleWeightsAry& weights = TWeights::UNIT); + //! Apply \p change at \p time. + //! + //! \param[in] time The time of the change point. + //! \param[in] value The value immediately before the change + //! point. + //! \param[in] change A description of the change to apply. + //! \return True if a new component was detected. + virtual bool applyChange(core_t::TTime time, double value, const SChangeDescription& change); + //! Propagate the decomposition forwards to \p time. - void propagateForwardsTo(core_t::TTime time); + virtual void propagateForwardsTo(core_t::TTime time); - //! Get the mean value of the baseline in the vicinity of \p time. - virtual double mean(core_t::TTime time) const; + //! Get the mean value of the time series in the vicinity of \p time. + virtual double meanValue(core_t::TTime time) const; - //! Get the value of the time series baseline at \p time. + //! Get the value of the time series at \p time. //! //! \param[in] time The time of interest. //! \param[in] confidence The symmetric confidence interval for the prediction //! the baseline as a percentage. //! \param[in] components The components to include in the baseline. - virtual maths_t::TDoubleDoublePr baseline(core_t::TTime time, - double confidence = 0.0, - int components = E_All, - bool smooth = true) const; + virtual maths_t::TDoubleDoublePr value(core_t::TTime time, + double confidence = 0.0, + int components = E_All, + bool smooth = true) const; //! Forecast from \p start to \p end at \p dt intervals. //! @@ -131,18 +140,18 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt //! \param[in] step The time increment. //! \param[in] confidence The forecast confidence interval. //! \param[in] minimumScale The minimum permitted seasonal scale. - //! \param[in] result Filled in with the forecast lower bound, prediction - //! and upper bound. + //! \param[in] writer Forecast results are passed to this callback. 
virtual void forecast(core_t::TTime startTime, core_t::TTime endTime, core_t::TTime step, double confidence, double minimumScale, - TDouble3VecVec& result); + const TWriteForecastResult& writer); //! Detrend \p value from the time series being modeled by removing //! any trend and periodic component at \p time. - virtual double detrend(core_t::TTime time, double value, double confidence) const; + virtual double + detrend(core_t::TTime time, double value, double confidence, int components = E_All) const; //! Get the mean variance of the baseline. virtual double meanVariance() const; @@ -172,10 +181,14 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt //! Get the static size of this object. virtual std::size_t staticSize() const; + //! Get the time shift which is being applied. + virtual core_t::TTime timeShift(void) const; + //! Get the seasonal components. virtual const maths_t::TSeasonalComponentVec& seasonalComponents() const; - //! This is the latest time of any point added to this object or the time skipped to. + //! This is the latest time of any point added to this object or + //! the time skipped to. virtual core_t::TTime lastValueTime() const; private: @@ -186,7 +199,8 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt void initializeMediator(); //! Create from part of a state document. - bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); //! The correction to produce a smooth join between periodic //! repeats and partitions. @@ -205,6 +219,9 @@ class MATHS_EXPORT CTimeSeriesDecomposition : public CTimeSeriesDecompositionInt static const core_t::TTime SMOOTHING_INTERVAL; private: + //! Any time shift to supplied times. + core_t::TTime m_TimeShift; + //! The time of the latest value added. core_t::TTime m_LastValueTime; diff --git a/include/maths/CTimeSeriesDecompositionDetail.h b/include/maths/CTimeSeriesDecompositionDetail.h index 8f1f3b81af..7b99bd6711 100644 --- a/include/maths/CTimeSeriesDecompositionDetail.h +++ b/include/maths/CTimeSeriesDecompositionDetail.h @@ -176,9 +176,13 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { //! \brief Scans through increasingly low frequencies looking for custom //! diurnal and any other large amplitude seasonal components. class MATHS_EXPORT CPeriodicityTest : public CHandler { + public: + //! Test types (categorised as short and long period tests). + enum ETest { E_Short, E_Long }; + public: CPeriodicityTest(double decayRate, core_t::TTime bucketLength); - CPeriodicityTest(const CPeriodicityTest& other); + CPeriodicityTest(const CPeriodicityTest& other, bool isForForecast = false); //! Initialize by reading state from \p traverser. bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); @@ -198,6 +202,9 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { //! Test to see whether any seasonal components are present. void test(const SAddValue& message); + //! Clear the test identified by \p test. + void clear(ETest test, core_t::TTime time); + //! Age the test to account for the interval \p end - \p start //! elapsed time. void propagateForwards(core_t::TTime start, core_t::TTime end); @@ -216,9 +223,6 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { using TExpandingWindowPtr = std::shared_ptr; using TExpandingWindowPtrAry = boost::array; - //! Test types (categorised as short and long period tests). 
- enum ETest { E_Short, E_Long }; - private: //! The bucket lengths to use to test for short period components. static const TTimeVec SHORT_BUCKET_LENGTHS; @@ -259,7 +263,7 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { class MATHS_EXPORT CCalendarTest : public CHandler { public: CCalendarTest(double decayRate, core_t::TTime bucketLength); - CCalendarTest(const CCalendarTest& other); + CCalendarTest(const CCalendarTest& other, bool isForForecast = false); //! Initialize by reading state from \p traverser. bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); @@ -347,7 +351,8 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { }; //! Initialize by reading state from \p traverser. - bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); //! Persist state by passing information to \p inserter. void acceptPersistInserter(core::CStatePersistInserter& inserter) const; @@ -364,8 +369,23 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { //! Create a new calendar component. virtual void handle(const SDetectedCalendar& message); + //! Start using the trend for prediction. + void useTrendForPrediction(void); + + //! Apply \p shift to the level at \p time and \p value. + void shiftLevel(core_t::TTime time, double value, double shift); + + //! Apply a linear scale of \p scale. + void linearScale(core_t::TTime time, double scale); + + //! Maybe re-interpolate the components. + void interpolate(const SMessage& message); + //! Maybe re-interpolate the components. - void interpolate(const SMessage& message, bool refine = true); + void interpolateForForecast(core_t::TTime time); + + //! Set the data type. + void dataType(maths_t::EDataType dataType); //! Set the decay rate. void decayRate(double decayRate); @@ -512,6 +532,9 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { //! Shift the components' time origin to \p time. void shiftOrigin(core_t::TTime time); + //! Linearly scale the components' by \p scale. + void linearScale(core_t::TTime time, double scale); + //! Get a checksum for this object. uint64_t checksum(uint64_t seed = 0) const; @@ -570,6 +593,9 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail { //! Remove low value components. bool prune(core_t::TTime time, core_t::TTime bucketLength); + //! Linearly scale the components' by \p scale. + void linearScale(core_t::TTime time, double scale); + //! Get a checksum for this object. uint64_t checksum(uint64_t seed = 0) const; diff --git a/include/maths/CTimeSeriesDecompositionInterface.h b/include/maths/CTimeSeriesDecompositionInterface.h index 24dad1d9c4..109756f594 100644 --- a/include/maths/CTimeSeriesDecompositionInterface.h +++ b/include/maths/CTimeSeriesDecompositionInterface.h @@ -27,6 +27,7 @@ namespace maths { class CMultivariatePrior; class CPrior; class CSeasonalComponent; +struct SChangeDescription; //! \brief The interface for decomposing times series into periodic, //! calendar periodic and trend components. @@ -35,6 +36,7 @@ class MATHS_EXPORT CTimeSeriesDecompositionInterface { using TDouble3Vec = core::CSmallVector; using TDouble3VecVec = std::vector; using TWeights = maths_t::CUnitWeights; + using TWriteForecastResult = std::function; //! The components of the decomposition. enum EComponents { @@ -53,7 +55,10 @@ class MATHS_EXPORT CTimeSeriesDecompositionInterface { virtual ~CTimeSeriesDecompositionInterface() = default; //! Clone this decomposition. 
- virtual CTimeSeriesDecompositionInterface* clone() const = 0; + virtual CTimeSeriesDecompositionInterface* clone(bool isForForecast = false) const = 0; + + //! Set the data type. + virtual void dataType(maths_t::EDataType dataType) = 0; //! Set the decay rate. virtual void decayRate(double decayRate) = 0; @@ -77,22 +82,32 @@ class MATHS_EXPORT CTimeSeriesDecompositionInterface { double value, const maths_t::TDoubleWeightsAry& weights = TWeights::UNIT) = 0; + //! Apply \p change at \p time. + //! + //! \param[in] time The time of the change point. + //! \param[in] value The value immediately before the change + //! point. + //! \param[in] change A description of the change to apply. + //! \return True if a new component was detected. + virtual bool + applyChange(core_t::TTime time, double value, const SChangeDescription& change) = 0; + //! Propagate the decomposition forwards to \p time. virtual void propagateForwardsTo(core_t::TTime time) = 0; - //! Get the mean value of the baseline in the vicinity of \p time. - virtual double mean(core_t::TTime time) const = 0; + //! Get the mean value of the time series in the vicinity of \p time. + virtual double meanValue(core_t::TTime time) const = 0; - //! Get the value of the time series baseline at \p time. + //! Get the value of the time series at \p time. //! //! \param[in] time The time of interest. //! \param[in] confidence The symmetric confidence interval for the prediction //! the baseline as a percentage. //! \param[in] components The components to include in the baseline. - virtual maths_t::TDoubleDoublePr baseline(core_t::TTime time, - double confidence = 0.0, - int components = E_All, - bool smooth = true) const = 0; + virtual maths_t::TDoubleDoublePr value(core_t::TTime time, + double confidence = 0.0, + int components = E_All, + bool smooth = true) const = 0; //! Forecast from \p start to \p end at \p dt intervals. //! @@ -101,20 +116,22 @@ class MATHS_EXPORT CTimeSeriesDecompositionInterface { //! \param[in] step The time increment. //! \param[in] confidence The forecast confidence interval. //! \param[in] minimumScale The minimum permitted seasonal scale. - //! \param[in] result Filled in with the forecast lower bound, prediction - //! and upper bound. + //! \param[in] writer Forecast results are passed to this callback. virtual void forecast(core_t::TTime startTime, core_t::TTime endTime, core_t::TTime step, double confidence, double minimumScale, - TDouble3VecVec& result) = 0; + const TWriteForecastResult& writer) = 0; //! Detrend \p value from the time series being modeled by removing //! any periodic component at \p time. //! //! \note That detrending preserves the time series mean. - virtual double detrend(core_t::TTime time, double value, double confidence) const = 0; + virtual double detrend(core_t::TTime time, + double value, + double confidence, + int components = E_All) const = 0; //! Get the mean variance of the baseline. virtual double meanVariance() const = 0; @@ -143,10 +160,14 @@ class MATHS_EXPORT CTimeSeriesDecompositionInterface { //! Get the static size of this object. virtual std::size_t staticSize() const = 0; + //! Get the time shift which is being applied. + virtual core_t::TTime timeShift(void) const = 0; + //! Get the seasonal components. virtual const maths_t::TSeasonalComponentVec& seasonalComponents() const = 0; - //! This is the latest time of any point added to this object or the time skipped to. + //! This is the latest time of any point added to this object or + //! the time skipped to. 
virtual core_t::TTime lastValueTime() const = 0; }; } diff --git a/include/maths/CTimeSeriesDecompositionStub.h b/include/maths/CTimeSeriesDecompositionStub.h index 7d81e5e3ac..0ed47c2e6e 100644 --- a/include/maths/CTimeSeriesDecompositionStub.h +++ b/include/maths/CTimeSeriesDecompositionStub.h @@ -23,7 +23,10 @@ namespace maths { class MATHS_EXPORT CTimeSeriesDecompositionStub : public CTimeSeriesDecompositionInterface { public: //! Clone this decomposition. - virtual CTimeSeriesDecompositionStub* clone() const; + virtual CTimeSeriesDecompositionStub* clone(bool isForForecast = false) const; + + //! No-op. + virtual void dataType(maths_t::EDataType dataType); //! No-op. virtual void decayRate(double decayRate); @@ -39,28 +42,32 @@ class MATHS_EXPORT CTimeSeriesDecompositionStub : public CTimeSeriesDecompositio double value, const maths_t::TDoubleWeightsAry& weights = TWeights::UNIT); + //! No-op returning false. + virtual bool applyChange(core_t::TTime time, double value, const SChangeDescription& change); + //! No-op. virtual void propagateForwardsTo(core_t::TTime time); //! Returns 0. - virtual double mean(core_t::TTime time) const; + virtual double meanValue(core_t::TTime time) const; //! Returns (0.0, 0.0). - virtual maths_t::TDoubleDoublePr baseline(core_t::TTime time, - double confidence = 0.0, - int components = E_All, - bool smooth = true) const; + virtual maths_t::TDoubleDoublePr value(core_t::TTime time, + double confidence = 0.0, + int components = E_All, + bool smooth = true) const; - //! Clears \p result. + //! No-op. virtual void forecast(core_t::TTime startTime, core_t::TTime endTime, core_t::TTime step, double confidence, double minimumScale, - TDouble3VecVec& result); + const TWriteForecastResult& writer); //! Returns \p value. - virtual double detrend(core_t::TTime time, double value, double confidence) const; + virtual double + detrend(core_t::TTime time, double value, double confidence, int components = E_All) const; //! Returns 0.0. virtual double meanVariance() const; @@ -84,7 +91,10 @@ class MATHS_EXPORT CTimeSeriesDecompositionStub : public CTimeSeriesDecompositio //! Get the static size of this object. virtual std::size_t staticSize() const; - //! Get the seasonal components. + //! Returns zero. + virtual core_t::TTime timeShift() const; + + //! Returns an empty vector. virtual const maths_t::TSeasonalComponentVec& seasonalComponents() const; //! Returns 0. diff --git a/include/maths/CTimeSeriesModel.h b/include/maths/CTimeSeriesModel.h index c569c6ec8e..8574c1df5e 100644 --- a/include/maths/CTimeSeriesModel.h +++ b/include/maths/CTimeSeriesModel.h @@ -25,29 +25,49 @@ class CMultivariatePrior; class CPrior; class CTimeSeriesDecompositionInterface; class CTimeSeriesAnomalyModel; +class CUnivariateTimeSeriesChangeDetector; +struct SChangeDescription; struct SDistributionRestoreParams; struct SModelRestoreParams; +namespace winsorisation { +//! Computes a Winsorisation weight for \p value based on its +//! one tail p-value. +MATHS_EXPORT +double tailWeight(const CPrior& prior, double derate, double scale, double value); + +//! Computes a Winsorisation weight for \p value based on its +//! marginal for \p dimension one tail p-value. +MATHS_EXPORT +double tailWeight(const CMultivariatePrior& prior, + std::size_t dimension, + double derate, + double scale, + const core::CSmallVector& value); +} + //! \brief A CModel implementation for modeling a univariate time series. 
class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { public: using TTimeDoublePr = std::pair; using TTimeDoublePrCBuf = boost::circular_buffer; + using TDoubleWeightsAry = maths_t::TDoubleWeightsAry; + using TDecompositionPtr = std::shared_ptr; using TDecayRateController2Ary = boost::array; public: //! \param[in] params The model parameters. //! \param[in] id The *unique* identifier for this time series. - //! \param[in] trend The time series trend decomposition. - //! \param[in] prior The time series residuals' prior. + //! \param[in] trendModel The time series trend decomposition. + //! \param[in] residualModel The prior for the time series residual model. //! \param[in] controllers Optional decay rate controllers for the trend - //! and prior. + //! and residual model. //! \param[in] modelAnomalies If true we use a separate model to capture //! the characteristics of anomalous time periods. CUnivariateTimeSeriesModel(const CModelParams& params, std::size_t id, - const CTimeSeriesDecompositionInterface& trend, - const CPrior& prior, + const CTimeSeriesDecompositionInterface& trendModel, + const CPrior& residualModel, const TDecayRateController2Ary* controllers = nullptr, bool modelAnomalies = true); CUnivariateTimeSeriesModel(const SModelRestoreParams& params, @@ -60,7 +80,8 @@ class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { //! Create a copy of this model passing ownership to the caller. virtual CUnivariateTimeSeriesModel* clone(std::size_t id) const; - //! Create a copy of the state we need to persist passing ownership to the caller. + //! Create a copy of the state we need to persist passing ownership + //! to the caller. virtual CUnivariateTimeSeriesModel* cloneForPersistence() const; //! Create a copy of the state we need to run forecasting. @@ -88,8 +109,8 @@ class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { //! Get the most likely value for the time series at \p time. virtual TDouble2Vec mode(core_t::TTime time, const TDouble2VecWeightsAry& weights) const; - //! Get the most likely value for each correlate time series at - //! \p time, if there are any. + //! Get the most likely value for each correlate time series + //! at \p time, if there are any. virtual TDouble2Vec1Vec correlateModes(core_t::TTime time, const TDouble2VecWeightsAry1Vec& weights) const; @@ -158,35 +179,59 @@ class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { //! Get the type of data being modeled. virtual maths_t::EDataType dataType() const; + //! \name Helpers + //@{ + //! Unpack the weights in \p weights. + static TDoubleWeightsAry unpack(const TDouble2VecWeightsAry& weights); + + //! Reinitialize \p residualModel using the detrended values + //! from \p slidingWindow. + static void reinitializeResidualModel(double learnRate, + const TDecompositionPtr& trend, + const TTimeDoublePrCBuf& slidingWindow, + CPrior& residualModel); + //@} + //! \name Test Functions //@{ //! Get the sliding window of recent values. const TTimeDoublePrCBuf& slidingWindow() const; //! Get the trend. - const CTimeSeriesDecompositionInterface& trend() const; + const CTimeSeriesDecompositionInterface& trendModel() const; - //! Get the prior. - const CPrior& prior() const; + //! Get the residual model. 
+ const CPrior& residualModel(void) const; //@} private: + using TSizeVec = std::vector; using TDouble1Vec = core::CSmallVector; using TDouble1VecVec = std::vector; using TDouble2VecWeightsAryVec = std::vector; using TVector = CVectorNx1; using TVectorMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TDecayRateController2AryPtr = std::shared_ptr; - using TDecompositionPtr = std::shared_ptr; using TPriorPtr = std::shared_ptr; using TAnomalyModelPtr = std::shared_ptr; using TMultivariatePriorCPtrSizePr = std::pair; using TMultivariatePriorCPtrSizePr1Vec = core::CSmallVector; using TModelCPtr1Vec = core::CSmallVector; + using TChangeDetectorPtr = std::shared_ptr; private: - CUnivariateTimeSeriesModel(const CUnivariateTimeSeriesModel& other, std::size_t id); + CUnivariateTimeSeriesModel(const CUnivariateTimeSeriesModel& other, + std::size_t id, + bool isForForecast = false); + + //! Test for and apply any change we find. + EUpdateResult testAndApplyChange(const CModelAddSamplesParams& params, + const TSizeVec& order, + const TTimeDouble2VecSizeTrVec& samples); + + //! Apply \p change to this model. + EUpdateResult applyChange(const SChangeDescription& change); //! Update the trend with \p samples. EUpdateResult updateTrend(const TTimeDouble2VecSizeTrVec& samples, @@ -195,6 +240,10 @@ class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { //! Compute the prediction errors for \p sample. void appendPredictionErrors(double interval, double sample, TDouble1VecVec (&result)[2]); + //! Reinitialize state after detecting a new component of the trend + //! decomposition. + void reinitializeStateGivenNewComponent(void); + //! Get the models for the correlations and the models of the correlated //! time series. bool correlationModels(TSize1Vec& correlated, @@ -215,20 +264,30 @@ class MATHS_EXPORT CUnivariateTimeSeriesModel : public CModel { //! A random number generator for sampling the sliding window. CPRNG::CXorOShiro128Plus m_Rng; - //! These control the trend and prior decay rates (see CDecayRateController - //! for more details). + //! These control the trend and residual model decay rates (see + //! CDecayRateController for more details). TDecayRateController2AryPtr m_Controllers; //! The time series trend decomposition. - TDecompositionPtr m_Trend; + TDecompositionPtr m_TrendModel; - //! The prior for the time series' residual model. - TPriorPtr m_Prior; + //! The time series' residual model. + TPriorPtr m_ResidualModel; - //! A model for time periods when the basic model can't predict the value - //! of the time series. + //! A model for time periods when the basic model can't predict the + //! value of the time series. TAnomalyModelPtr m_AnomalyModel; + //! The last "normal" time and median value. + TTimeDoublePr m_CandidateChangePoint; + + //! If the time series appears to be undergoing change, the contiguous + //! interval of unpredictable values. + core_t::TTime m_CurrentChangeInterval; + + //! Used to test for changes in the time series. + TChangeDetectorPtr m_ChangeDetector; + //! A sliding window of the recent samples (used to reinitialize the //! residual model when a new trend component is detected). TTimeDoublePrCBuf m_SlidingWindow; @@ -307,9 +366,9 @@ class MATHS_EXPORT CTimeSeriesCorrelations { TSize1Vec s_Tags; //! The sample weights. TDoubleWeightsAry1Vec s_Weights; - //! The interval by which to age the prior. + //! The interval by which to age the correlation model. double s_Interval; - //! The prior decay rate multiplier. + //! 
The decay rate multiplier. double s_Multiplier; }; @@ -322,7 +381,8 @@ class MATHS_EXPORT CTimeSeriesCorrelations { //! Create a copy of this model passing ownership to the caller. CTimeSeriesCorrelations* clone() const; - //! Create a copy of the state we need to persist passing ownership to the caller. + //! Create a copy of the state we need to persist passing ownership + //! to the caller. CTimeSeriesCorrelations* cloneForPersistence() const; //! Process all samples added from individual time series models. @@ -339,7 +399,7 @@ class MATHS_EXPORT CTimeSeriesCorrelations { void refresh(const CTimeSeriesCorrelateModelAllocator& allocator); //! Get the correlation joint distribution models. - const TSizeSizePrMultivariatePriorPtrDoublePrUMap& correlatePriors() const; + const TSizeSizePrMultivariatePriorPtrDoublePrUMap& correlationModels() const; //! Debug the memory used by this object. void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; @@ -366,20 +426,22 @@ class MATHS_EXPORT CTimeSeriesCorrelations { CTimeSeriesCorrelations(const CTimeSeriesCorrelations& other, bool isForPersistence = false); - //! Restore the correlate priors reading state from \p traverser. - bool restoreCorrelatePriors(const SDistributionRestoreParams& params, - core::CStateRestoreTraverser& traverser); + //! Restore the correlation distribution models reading state from + //! \p traverser. + bool restoreCorrelationModels(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); - //! Persist the correlate priors passing information to \p inserter. - void persistCorrelatePriors(core::CStatePersistInserter& inserter) const; + //! Persist the correlation distribution models passing information + //! to \p inserter. + void persistCorrelationModels(core::CStatePersistInserter& inserter) const; - //! Restore the correlate priors reading state from \p traverser. + //! Restore the \p model reading state from \p traverser. static bool restore(const SDistributionRestoreParams& params, - TSizeSizePrMultivariatePriorPtrDoublePrPr& prior, + TSizeSizePrMultivariatePriorPtrDoublePrPr& model, core::CStateRestoreTraverser& traverser); - //! Persist the correlate priors passing information to \p inserter. - static void persist(const TSizeSizePrMultivariatePriorPtrDoublePrPr& prior, + //! Persist the \p model passing information to \p inserter. + static void persist(const TSizeSizePrMultivariatePriorPtrDoublePrPr& model, core::CStatePersistInserter& inserter); //! Add the time series identified by \p id. @@ -390,10 +452,8 @@ class MATHS_EXPORT CTimeSeriesCorrelations { //! Add a sample for the time series identified by \p id. void addSamples(std::size_t id, - maths_t::EDataType type, + const CModelAddSamplesParams& params, const TTimeDouble2VecSizeTrVec& samples, - const TDoubleWeightsAry1Vec& weights, - double interval, double multiplier); //! Get the ids of the time series correlated with \p id. @@ -439,23 +499,25 @@ class MATHS_EXPORT CTimeSeriesCorrelations { //! \brief A CModel implementation for modeling a multivariate time series. class MATHS_EXPORT CMultivariateTimeSeriesModel : public CModel { public: + using TDouble10Vec = core::CSmallVector; using TTimeDouble2VecPr = std::pair; using TTimeDouble2VecPrCBuf = boost::circular_buffer; + using TDouble10VecWeightsAry = maths_t::TDouble10VecWeightsAry; using TDecompositionPtr = std::shared_ptr; using TDecompositionPtr10Vec = core::CSmallVector; using TDecayRateController2Ary = boost::array; public: //! 
\param[in] params The model parameters. - //! \param[in] trend The time series trend decomposition. - //! \param[in] prior The time series residuals' prior. + //! \param[in] trendModel The time series trend decomposition. + //! \param[in] residualModel The prior for the time series residual model. //! \param[in] controllers Optional decay rate controllers for the trend - //! and prior. + //! and residual model. //! \param[in] modelAnomalies If true we use a separate model to capture //! the characteristics of anomalous time periods. CMultivariateTimeSeriesModel(const CModelParams& params, - const CTimeSeriesDecompositionInterface& trend, - const CMultivariatePrior& prior, + const CTimeSeriesDecompositionInterface& trendModel, + const CMultivariatePrior& residualModel, const TDecayRateController2Ary* controllers = nullptr, bool modelAnomalies = true); CMultivariateTimeSeriesModel(const CMultivariateTimeSeriesModel& other); @@ -468,7 +530,8 @@ class MATHS_EXPORT CMultivariateTimeSeriesModel : public CModel { //! Create a copy of this model passing ownership to the caller. virtual CMultivariateTimeSeriesModel* clone(std::size_t id) const; - //! Create a copy of the state we need to persist passing ownership to the caller. + //! Create a copy of the state we need to persist passing ownership + //! to the caller. virtual CMultivariateTimeSeriesModel* cloneForPersistence() const; //! Create a copy of the state we need to run forecasting. @@ -563,16 +626,29 @@ class MATHS_EXPORT CMultivariateTimeSeriesModel : public CModel { //! Get the type of data being modeled. virtual maths_t::EDataType dataType() const; + //! \name Helpers + //@{ + //! Unpack the weights in \p weights. + static TDouble10VecWeightsAry unpack(const TDouble2VecWeightsAry& weights); + + //! Reinitialize \p residualModel using the detrended values + //! from \p slidingWindow. + static void reinitializeResidualModel(double learnRate, + const TDecompositionPtr10Vec& trend, + const TTimeDouble2VecPrCBuf& slidingWindow, + CMultivariatePrior& residualModel); + //@} + //! \name Test Functions //@{ //! Get the sliding window of recent values. const TTimeDouble2VecPrCBuf& slidingWindow() const; //! Get the trend. - const TDecompositionPtr10Vec& trend() const; + const TDecompositionPtr10Vec& trendModel() const; - //! Get the prior. - const CMultivariatePrior& prior() const; + //! Get the residual model. + const CMultivariatePrior& residualModel() const; //@} private: @@ -595,6 +671,10 @@ class MATHS_EXPORT CMultivariateTimeSeriesModel : public CModel { const TDouble2Vec& sample, TDouble1VecVec (&result)[2]); + //! Reinitialize state after detecting a new component of the trend + //! decomposition. + void reinitializeStateGivenNewComponent(void); + //! Get the model dimension. std::size_t dimension() const; @@ -605,18 +685,18 @@ class MATHS_EXPORT CMultivariateTimeSeriesModel : public CModel { //! A random number generator for sampling the sliding window. CPRNG::CXorOShiro128Plus m_Rng; - //! These control the trend and prior decay rates (see CDecayRateController - //! for more details). + //! These control the trend and residual model decay rates (see + //! CDecayRateController for more details). TDecayRateController2AryPtr m_Controllers; //! The time series trend decomposition. - TDecompositionPtr10Vec m_Trend; + TDecompositionPtr10Vec m_TrendModel; - //! The prior for the time series' residual model. - TMultivariatePriorPtr m_Prior; + //! The time series residual model. + TMultivariatePriorPtr m_ResidualModel; - //! 
A model for time periods when the basic model can't predict the value - //! of the time series. + //! A model for time periods when the basic model can't predict the + //! value of the time series. TAnomalyModelPtr m_AnomalyModel; //! A sliding window of the recent samples (used to reinitialize the diff --git a/include/maths/CTools.h b/include/maths/CTools.h index 5b298d8632..a13c6466b7 100644 --- a/include/maths/CTools.h +++ b/include/maths/CTools.h @@ -350,9 +350,9 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { static double safeCdfComplement(const chi_squared& chi2, double x); //@} - //! Compute the deviation from the probability of seeing a more - //! extreme event for a distribution, i.e. for a sample \f$x\f$ - //! from a R.V. the probability \f$P(R)\f$ of the set: + //! Compute the anomalousness from the probability of seeing a + //! more extreme event for a distribution, i.e. for a sample + //! \f$x\f$ from a R.V. the probability \f$P(R)\f$ of the set: //!
     //!   \f$ R = \{y\ |\ f(y) \leq f(x)\} \f$
     //! 
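
The deviation function renamed to anomalyScore above maps the probability of seeing a more extreme event to a score that is zero for probabilities near one and saturates at 100 as the probability tends to zero. The toy mapping below only illustrates those properties; the library's actual formula is likely different.

#include <algorithm>
#include <cassert>
#include <cmath>

// Illustrative only: a monotonically decreasing map from a probability to a
// score in [0, 100] which is ~0 near p = 1 and caps at 100 for very small p.
double toyAnomalyScore(double p) {
    p = std::max(p, 1e-300); // guard against log(0)
    return std::min(100.0, 10.0 * -std::log(p));
}

int main() {
    assert(toyAnomalyScore(1.0) == 0.0);                  // likely values score zero
    assert(toyAnomalyScore(0.5) < toyAnomalyScore(0.05)); // decreasing in p
    assert(toyAnomalyScore(1e-20) == 100.0);              // saturates at 100
    return 0;
}
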
@@ -361,10 +361,10 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { //! This is a monotonically decreasing function of \f$P(R)\f$ and //! is chosen so that for \f$P(R)\f$ near one it is zero and as //! \f$P(R) \rightarrow 0\f$ it saturates at 100. - static double deviation(double p); + static double anomalyScore(double p); - //! The inverse of the deviation function. - static double inverseDeviation(double deviation); + //! The inverse of the anomalyScore function. + static double inverseAnomalyScore(double deviation); //! \name Differential Entropy //! Compute the differential entropy of the specified distribution.\n\n @@ -662,6 +662,9 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { //! Shift \p x to the right by \p eps times \p x. static double shiftRight(double x, double eps = std::numeric_limits::epsilon()); + //! Compute \f$x^2\f$. + static double pow2(double x) { return x * x; } + //! Sigmoid function of \p p. static double sigmoid(double p) { return 1.0 / (1.0 + 1.0 / p); } @@ -676,6 +679,20 @@ class MATHS_EXPORT CTools : private core::CNonInstantiatable { static double logisticFunction(double x, double width, double x0 = 0.0, double sign = 1.0) { return sigmoid(std::exp(std::copysign(1.0, sign) * (x - x0) / width)); } + + //! A custom, numerically robust, implementation of \f$(1 - x) ^ p\f$. + //! + //! \note It is assumed that p is integer. + static double powOneMinusX(double x, double p); + + //! A custom, numerically robust, implementation of \f$1 - (1 - x) ^ p\f$. + //! + //! \note It is assumed that p is integer. + static double oneMinusPowOneMinusX(double x, double p); + + //! A custom implementation of \f$\log(1 - x)\f$ which handles the + //! cancellation error for small x. + static double logOneMinusX(double x); }; } } diff --git a/include/maths/CTrendComponent.h b/include/maths/CTrendComponent.h index e59b6c5303..8f4d67aab9 100644 --- a/include/maths/CTrendComponent.h +++ b/include/maths/CTrendComponent.h @@ -11,6 +11,9 @@ #include #include +#include +#include +#include #include #include #include @@ -19,6 +22,7 @@ namespace ml { namespace maths { +struct SDistributionRestoreParams; //! \brief Models the trend component of a time series. //! @@ -50,6 +54,8 @@ class MATHS_EXPORT CTrendComponent { using TVectorVecVec = std::vector; using TMatrix = CSymmetricMatrixNxN; using TMatrixVec = std::vector; + using TSeasonalForecast = std::function; + using TWriteForecastResult = std::function; public: CTrendComponent(double decayRate); @@ -61,7 +67,8 @@ class MATHS_EXPORT CTrendComponent { void acceptPersistInserter(core::CStatePersistInserter& inserter) const; //! Initialize by reading state from \p traverser. - bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + bool acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser); //! Check if the trend has been estimated. bool initialized() const; @@ -76,6 +83,17 @@ class MATHS_EXPORT CTrendComponent { //! greater than \p decayRate. void shiftSlope(double decayRate, double shift); + //! Apply a level shift of \p value at \p time and \p value. + void shiftLevel(core_t::TTime time, double value, double shift); + + //! Apply no level shift at \p time and \p value. + //! + //! This updates the model for the probability of a level shift. + void dontShiftLevel(core_t::TTime time, double value); + + //! Apply a linear scale by \p scale. + void linearScale(double scale); + //! Adds a value \f$(t, f(t))\f$ to this component. //! //! 
\param[in] time The time of the point. @@ -84,6 +102,9 @@ class MATHS_EXPORT CTrendComponent { //! less influence it has on the component. void add(core_t::TTime time, double value, double weight = 1.0); + //! Set the data type. + void dataType(maths_t::EDataType dataType); + //! Get the base rate at which models lose information. double defaultDecayRate() const; @@ -106,12 +127,20 @@ class MATHS_EXPORT CTrendComponent { //! variance as a percentage. TDoubleDoublePr variance(double confidence) const; - //! Create \p n sample forecast paths. + //! Forecast the trend model from \p startTime to \p endTime. + //! + //! \param[in] startTime The start time of the forecast interval. + //! \param[in] endTime The end time of the forecast interval. + //! \param[in] step The time step. + //! \param[in] confidence The confidence interval to calculate. + //! \param[in] seasonal Forecasts seasonal components. + //! \param[in] writer Writes out forecast results. void forecast(core_t::TTime startTime, core_t::TTime endTime, core_t::TTime step, double confidence, - TDouble3VecVec& result) const; + const TSeasonalForecast& seasonal, + const TWriteForecastResult& writer) const; //! Get the interval which has been observed so far. core_t::TTime observedInterval() const; @@ -145,6 +174,41 @@ class MATHS_EXPORT CTrendComponent { }; using TModelVec = std::vector; + //! \brief Forecasts the level model by path roll out. + class CForecastLevel : private core::CNonCopyable { + public: + //! The default number of roll out paths to use. + static const std::size_t DEFAULT_NUMBER_PATHS{100u}; + + public: + CForecastLevel(const CNaiveBayes& probability, + const CNormalMeanPrecConjugate& magnitude, + core_t::TTime timeOfLastChange, + std::size_t numberPaths = DEFAULT_NUMBER_PATHS); + + //! Forecast the time series level at \p time. + TDouble3Vec forecast(core_t::TTime time, double prediction, double confidence); + + private: + using TTimeVec = std::vector; + + private: + //! The model of the change probability. + const CNaiveBayes& m_Probability; + //! The model of the change magnitude. + const CNormalMeanPrecConjugate& m_Magnitude; + //! A random number generator for generating roll outs. + CPRNG::CXorOShiro128Plus m_Rng; + //! The current roll outs forecasted levels. + TDoubleVec m_Levels; + //! The current roll outs times of last change. + TTimeVec m_TimesOfLastChange; + //! Maintains the current bucket probability of change. + TDoubleVec m_ProbabilitiesOfChange; + //! Place holder for sampling. + TDoubleVec m_Uniform01; + }; + private: //! Get the factors by which to age the different regression models. TDoubleVec factors(core_t::TTime interval) const; @@ -179,11 +243,18 @@ class MATHS_EXPORT CTrendComponent { //! The start time of the regression models. core_t::TTime m_RegressionOrigin; //! The regression models (we have them for multiple time scales). - TModelVec m_Models; + TModelVec m_TrendModels; //! The variance of the prediction errors. double m_PredictionErrorVariance; //! The mean and variance of the values added to the trend component. TMeanVarAccumulator m_ValueMoments; + + //! The time of the last level change. + core_t::TTime m_TimeOfLastLevelChange; + //! A model of probability of level changes for the trend. + CNaiveBayes m_ProbabilityOfLevelChangeModel; + //! A model of magnitude of level changes for the trend. 
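
CForecastLevel above forecasts the trend's level by rolling out many sampled paths, each keeping its own level, time of last change and change probability, driven by the naive Bayes change-probability model and the Gaussian change-magnitude model. The sketch below is a heavily simplified version of that path roll-out idea, with a fixed per-step change probability and a fixed Gaussian magnitude; names and numbers are illustrative only.

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

// A toy path roll-out: each path keeps its own level; at every step a level
// change occurs with fixed probability and, when it does, the level jumps by a
// Gaussian magnitude. The spread of the paths gives the forecast uncertainty.
std::vector<double> toyForecastLevels(std::size_t numberPaths,
                                      std::size_t numberSteps,
                                      double changeProbability,
                                      double changeMagnitudeSd) {
    std::mt19937 rng{42};
    std::bernoulli_distribution change{changeProbability};
    std::normal_distribution<double> magnitude{0.0, changeMagnitudeSd};
    std::vector<double> levels(numberPaths, 0.0);
    for (std::size_t step = 0; step < numberSteps; ++step) {
        for (auto& level : levels) {
            if (change(rng)) {
                level += magnitude(rng);
            }
        }
    }
    return levels;
}

int main() {
    auto levels = toyForecastLevels(100, 50, 0.05, 2.0);
    // Use, say, the 5th, 50th and 95th percentiles as (lower, median, upper).
    std::sort(levels.begin(), levels.end());
    double lower{levels[5]}, median{levels[50]}, upper{levels[95]};
    return (lower <= median && median <= upper) ? 0 : 1;
}
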
+ CNormalMeanPrecConjugate m_MagnitudeOfLevelChangeModel; }; } } diff --git a/include/maths/CXMeansOnline.h b/include/maths/CXMeansOnline.h index 22f95fabba..c102a38e89 100644 --- a/include/maths/CXMeansOnline.h +++ b/include/maths/CXMeansOnline.h @@ -276,8 +276,8 @@ class CXMeansOnline : public CClusterer> { TCovariances covariances[2]; TSphericalClusterVec clusters; this->sphericalClusters(clusters); - for (std::size_t i = 0u; i < 2; ++i) { - for (std::size_t j = 0u; j < split[i].size(); ++j) { + for (std::size_t i = 0; i < 2; ++i) { + for (std::size_t j = 0; j < split[i].size(); ++j) { covariances[i].add(clusters[split[i][j]]); } } @@ -441,10 +441,10 @@ class CXMeansOnline : public CClusterer> { LOG_TRACE(<< "Checking full split"); TSizeVec assignment(remainder.size()); - for (std::size_t i = 0u; i < remainder.size(); ++i) { + for (std::size_t i = 0; i < remainder.size(); ++i) { assignment[i] = nearest(remainder[i], covariances); } - for (std::size_t i = 0u; i < assignment.size(); ++i) { + for (std::size_t i = 0; i < assignment.size(); ++i) { std::size_t j = assignment[i]; TCovariances ci; ci.add(remainder[i]); @@ -468,8 +468,8 @@ class CXMeansOnline : public CClusterer> { boost::counting_iterator(clusters.size())); COrderings::simultaneousSort( clusters, indexes, typename CSphericalCluster::SLess()); - for (std::size_t i = 0u; i < candidate.size(); ++i) { - for (std::size_t j = 0u; j < candidate[i].size(); ++j) { + for (std::size_t i = 0; i < candidate.size(); ++i) { + for (std::size_t j = 0; j < candidate[i].size(); ++j) { std::size_t k = std::lower_bound( clusters.begin(), clusters.end(), @@ -498,8 +498,8 @@ class CXMeansOnline : public CClusterer> { m_Structure.clusters(result); switch (m_DataType) { case maths_t::E_IntegerData: { - for (std::size_t i = 0u; i < result.size(); ++i) { - result[i].annotation().s_Variance += 1.0 / 12.0; + for (auto& cluster : result) { + cluster.annotation().s_Variance += 1.0 / 12.0; } break; } @@ -671,9 +671,9 @@ class CXMeansOnline : public CClusterer> { //! Persist state by passing information to the supplied inserter. virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const { - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { + for (const auto& cluster : m_Clusters) { inserter.insertLevel(CLUSTER_TAG, boost::bind(&CCluster::acceptPersistInserter, - &m_Clusters[i], _1)); + &cluster, _1)); } inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision); inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength, @@ -706,16 +706,16 @@ class CXMeansOnline : public CClusterer> { //! Set the type of data being clustered. virtual void dataType(maths_t::EDataType dataType) { m_DataType = dataType; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].dataType(dataType); + for (auto& cluster : m_Clusters) { + cluster.dataType(dataType); } } //! Set the rate at which information is aged out. 
virtual void decayRate(double decayRate) { m_DecayRate = decayRate; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].decayRate(decayRate); + for (auto& cluster : m_Clusters) { + cluster.decayRate(decayRate); } } @@ -775,30 +775,30 @@ class CXMeansOnline : public CClusterer> { result.reserve(m_Clusters.size()); double renormalizer = boost::numeric::bounds::lowest(); - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double likelihood = m_Clusters[i].logLikelihoodFromCluster(m_WeightCalc, point); - result.push_back(std::make_pair(m_Clusters[i].index(), likelihood)); + for (const auto& cluster : m_Clusters) { + double likelihood = cluster.logLikelihoodFromCluster(m_WeightCalc, point); + result.emplace_back(cluster.index(), likelihood); renormalizer = std::max(renormalizer, likelihood); } - double normalizer = 0.0; - for (std::size_t i = 0u; i < result.size(); ++i) { - result[i].second = std::exp(result[i].second - renormalizer); - normalizer += result[i].second; + double Z = 0.0; + for (auto& p : result) { + p.second = std::exp(p.second - renormalizer); + Z += p.second; } double pmax = 0.0; - for (std::size_t i = 0u; i < result.size(); ++i) { - result[i].second /= normalizer; - pmax = std::max(pmax, result[i].second); + for (auto& p : result) { + p.second /= Z; + pmax = std::max(pmax, p.second); } result.erase(std::remove_if(result.begin(), result.end(), CProbabilityLessThan(HARD_ASSIGNMENT_THRESHOLD * pmax)), result.end()); - normalizer = 0.0; - for (std::size_t i = 0u; i < result.size(); ++i) { - normalizer += result[i].second; + Z = 0.0; + for (const auto& p : result) { + Z += p.second; } - for (std::size_t i = 0u; i < result.size(); ++i) { - result[i].second *= count / normalizer; + for (auto& p : result) { + p.second *= count / Z; } } @@ -808,7 +808,7 @@ class CXMeansOnline : public CClusterer> { if (m_Clusters.size() == 1) { LOG_TRACE(<< "Adding " << x << " to " << m_Clusters[0].centre()); m_Clusters[0].add(x, count); - clusters.push_back(std::make_pair(m_Clusters[0].index(), count)); + clusters.emplace_back(m_Clusters[0].index(), count); if (this->maybeSplit(m_Clusters.begin())) { this->cluster(x, clusters, count); } @@ -818,9 +818,8 @@ class CXMeansOnline : public CClusterer> { CBasicStatistics::COrderStatisticsStack>; TMaxAccumulator closest; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - closest.add(std::make_pair( - m_Clusters[i].logLikelihoodFromCluster(m_WeightCalc, x), i)); + for (std::size_t i = 0; i < m_Clusters.size(); ++i) { + closest.add({m_Clusters[i].logLikelihoodFromCluster(m_WeightCalc, x), i}); } closest.sort(); LOG_TRACE(<< "closest = " << closest.print()); @@ -842,7 +841,7 @@ class CXMeansOnline : public CClusterer> { if (p1 < HARD_ASSIGNMENT_THRESHOLD * p0) { LOG_TRACE(<< "Adding " << x << " to " << cluster0->centre()); cluster0->add(x, count); - clusters.push_back(std::make_pair(cluster0->index(), count)); + clusters.emplace_back(cluster0->index(), count); if (this->maybeSplit(cluster0) || this->maybeMerge(cluster0)) { this->cluster(x, clusters, count); } @@ -856,8 +855,8 @@ class CXMeansOnline : public CClusterer> { cluster0->add(x, count0); cluster1->add(x, count1); - clusters.push_back(std::make_pair(cluster0->index(), count0)); - clusters.push_back(std::make_pair(cluster1->index(), count1)); + clusters.emplace_back(cluster0->index(), count0); + clusters.emplace_back(cluster1->index(), count1); if (this->maybeSplit(cluster0) || this->maybeSplit(cluster1) || this->maybeMerge(cluster0) || this->maybeMerge(cluster1)) { 
this->cluster(x, clusters, count); @@ -876,8 +875,8 @@ class CXMeansOnline : public CClusterer> { m_Clusters.push_back(CCluster(*this)); } TSizeDoublePr2Vec dummy; - for (std::size_t i = 0u; i < x.size(); ++i) { - this->add(x[i].first, dummy, x[i].second); + for (const auto& x_ : x) { + this->add(x_.first, dummy, x_.second); } } @@ -895,8 +894,8 @@ class CXMeansOnline : public CClusterer> { return; } m_HistoryLength = (m_HistoryLength + time) * std::exp(-m_DecayRate * time); - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].propagateForwardsByTime(time); + for (auto& cluster : m_Clusters) { + cluster.propagateForwardsByTime(time); } } @@ -923,8 +922,7 @@ class CXMeansOnline : public CClusterer> { virtual double probability(std::size_t index) const { double weight = 0.0; double Z = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - const CCluster& cluster = m_Clusters[i]; + for (const auto& cluster : m_Clusters) { if (cluster.index() == index) { weight = cluster.weight(maths_t::E_ClustersFractionWeight); } @@ -963,11 +961,10 @@ class CXMeansOnline : public CClusterer> { //! The total count of points. double count() const { - double result = 0.0; - for (std::size_t i = 0; i < m_Clusters.size(); ++i) { - result += m_Clusters[i].count(); - } - return result; + return std::accumulate(m_Clusters.begin(), m_Clusters.end(), 0.0, + [](double count, const CCluster& cluster) { + return count + cluster.count(); + }); } //! Print a representation of the clusters that can be plotted in octave. @@ -1021,9 +1018,9 @@ class CXMeansOnline : public CClusterer> { //! Get the cluster with the index \p index. const CCluster* cluster(std::size_t index) const { - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - if (m_Clusters[i].index() == index) { - return &m_Clusters[i]; + for (const auto& cluster : m_Clusters) { + if (cluster.index() == index) { + return &cluster; } } return nullptr; @@ -1033,13 +1030,9 @@ class CXMeansOnline : public CClusterer> { double minimumSplitCount() const { double result = m_MinimumClusterCount; if (m_MinimumClusterFraction > 0.0) { - double count = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - count += m_Clusters[i].count(); - } - double scale = std::max( - m_HistoryLength * (1.0 - std::exp(-m_InitialDecayRate)), 1.0); - count *= m_MinimumClusterFraction / scale; + double count = this->count(); + double scale = m_HistoryLength * (1.0 - std::exp(-m_InitialDecayRate)); + count *= m_MinimumClusterFraction / std::max(scale, 1.0); result = std::max(result, count); } LOG_TRACE(<< "minimumSplitCount = " << result); @@ -1107,9 +1100,9 @@ class CXMeansOnline : public CClusterer> { // Get the clusters to prune. 
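
The cluster soft-assignment above turns per-cluster log-likelihoods into weights by subtracting the maximum before exponentiating, normalising, dropping clusters whose probability is a small fraction of the most probable one, and rescaling the survivors to the sample count. The standalone function below reproduces that arithmetic; the 0.01 threshold is a placeholder for HARD_ASSIGNMENT_THRESHOLD, whose actual value is not shown here.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

using SizeDoublePr = std::pair<std::size_t, double>;

// Turn per-cluster log-likelihoods into soft-assignment counts: stabilised
// softmax, then prune clusters far below the most probable one, then rescale
// the survivors so they sum to "count".
std::vector<SizeDoublePr> softAssign(const std::vector<double>& logLikelihoods,
                                     double count,
                                     double hardAssignmentThreshold = 0.01) {
    std::vector<SizeDoublePr> result;
    if (logLikelihoods.empty()) {
        return result;
    }
    double renormalizer{*std::max_element(logLikelihoods.begin(), logLikelihoods.end())};
    double Z{0.0};
    for (std::size_t i = 0; i < logLikelihoods.size(); ++i) {
        double p{std::exp(logLikelihoods[i] - renormalizer)};
        result.emplace_back(i, p);
        Z += p;
    }
    double pmax{0.0};
    for (auto& p : result) {
        p.second /= Z;
        pmax = std::max(pmax, p.second);
    }
    result.erase(std::remove_if(result.begin(), result.end(),
                                [&](const SizeDoublePr& p) {
                                    return p.second < hardAssignmentThreshold * pmax;
                                }),
                 result.end());
    Z = 0.0;
    for (const auto& p : result) {
        Z += p.second;
    }
    for (auto& p : result) {
        p.second *= count / Z;
    }
    return result;
}

int main() {
    auto assignment = softAssign({-1.0, -1.5, -30.0}, 5.0);
    // The third cluster is pruned; the remaining weights sum to the count.
    return assignment.size() == 2 ? 0 : 1;
}
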
for (;;) { TMinAccumulator prune; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { + for (std::size_t i = 0; i < m_Clusters.size(); ++i) { if (m_Clusters[i].count() < minimumCount) { - prune.add(std::make_pair(m_Clusters[i].count(), i)); + prune.add({m_Clusters[i].count(), i}); } } if (prune.count() == 0) { @@ -1148,13 +1141,13 @@ class CXMeansOnline : public CClusterer> { CCluster* result = nullptr; TMinAccumulator min; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - if (cluster.index() == m_Clusters[i].index()) { + for (auto& candidate : m_Clusters) { + if (cluster.index() == candidate.index()) { continue; } - if (min.add(CCluster::BICGain(cluster, m_Clusters[i]))) { - result = &m_Clusters[i]; + if (min.add(CCluster::BICGain(cluster, candidate))) { + result = &candidate; } } if (!result) { diff --git a/include/maths/Constants.h b/include/maths/Constants.h index 99de4b29fa..18707acc67 100644 --- a/include/maths/Constants.h +++ b/include/maths/Constants.h @@ -86,6 +86,10 @@ const double COMPONENT_STATISTICALLY_SIGNIFICANT{0.001}; const double LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE{ std::log(COMPONENT_STATISTICALLY_SIGNIFICANT)}; +//! The default number of regression models used in periodic and +//! calendar cyclic components of the trend decomposition. +const std::size_t COMPONENT_SIZE{36u}; + //! The minimum variance scale for which the likelihood function //! can be accurately adjusted. For smaller scales errors are //! introduced for some priors. diff --git a/include/model/CAnomalyDetectorModelConfig.h b/include/model/CAnomalyDetectorModelConfig.h index f5af5ea896..0dea012c2a 100644 --- a/include/model/CAnomalyDetectorModelConfig.h +++ b/include/model/CAnomalyDetectorModelConfig.h @@ -33,20 +33,18 @@ class CSearchKey; class CModelAutoConfigurer; class CModelFactory; -//! \brief Holds configuration for the anomaly detection models. +//! \brief Responsible for configuring anomaly detection models. //! //! DESCRIPTION:\n -//! Holds configuration state for anomaly detection models. +//! Responsible for configuring classes for performing anomaly detection. +//! It also defines all parameter defaults. //! //! IMPLEMENTATION:\n -//! This wraps up the configuration of the models to encapsulate -//! the details from the calling code. It is intended that at least -//! some of this information will be exposed to the user via a -//! configuration file. -//! -//! Default settings for various modes of operation are provided -//! by the default* factory methods. - +//! This wraps up the configuration of anomaly detection to encapsulate +//! the details from calling code. It is anticipated that: +//! -# Some of this information will be exposed to the user via a +//! configuration file, +//! -# Some may be calculated from data characteristics and so on. class MODEL_EXPORT CAnomalyDetectorModelConfig { public: //! The possible factory types. @@ -89,6 +87,8 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { using TStrDetectionRulePrVecCRef = boost::reference_wrapper; public: + //! \name Data Gathering + //@{ //! The default value used to separate components of a multivariate feature //! in its string value. static const std::string DEFAULT_MULTIVARIATE_COMPONENT_DELIMITER; @@ -113,6 +113,13 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { //! Bucket length corresponding to the default decay and learn rates. static const core_t::TTime STANDARD_BUCKET_LENGTH; + //! The default number of half buckets to store before choosing which + //! 
overlapping bucket has the biggest anomaly + static const std::size_t DEFAULT_BUCKET_RESULTS_DELAY; + //@} + + //! \name Modelling + //@{ //! The default rate at which the model priors decay to non-informative //! per standard bucket length. static const double DEFAULT_DECAY_RATE; @@ -136,20 +143,22 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { //! The default minimum count in a cluster we'll permit in a cluster. static const double DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT; - //! The default minimum frequency of non-empty buckets at which we model - //! all buckets. - static const double DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS; - //! The default proportion of initial count at which we'll delete a //! category from the sketch to cluster. static const double DEFAULT_CATEGORY_DELETE_FRACTION; + //! The default minimum frequency of non-empty buckets at which we model + //! all buckets. + static const double DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS; + //! The default size of the seasonal components we will model. static const std::size_t DEFAULT_COMPONENT_SIZE; - //! The default number of times to sample a person model when computing - //! total probabilities for population models. - static const std::size_t DEFAULT_TOTAL_PROBABILITY_CALC_SAMPLING_SIZE; + //! The default minimum time to detect a change point in a time series. + static const core_t::TTime DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE; + + //! The default maximum time to test for a change point in a time series. + static const core_t::TTime DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE; //! The maximum number of times we'll update a model in a bucketing //! interval. This only applies to our metric statistics, which are @@ -177,10 +186,7 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { //! The default threshold for the Pearson correlation coefficient at //! which a correlate will be modeled. static const double DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION; - - //! The default number of half buckets to store before choosing which - //! overlapping bucket has the biggest anomaly - static const std::size_t DEFAULT_BUCKET_RESULTS_DELAY; + //@} //! \name Anomaly Score Calculation //@{ @@ -214,9 +220,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { static const TDoubleDoublePr DEFAULT_NORMALIZED_SCORE_KNOT_POINTS[9]; //@} - //! The maximum number of samples we use when re-sampling a prior. - static const std::size_t DEFAULT_RESAMPLING_MAX_SAMPLES; - public: //! Create the default configuration. //! diff --git a/include/model/CModelFactory.h b/include/model/CModelFactory.h index b9a3780bab..6fd130a99c 100644 --- a/include/model/CModelFactory.h +++ b/include/model/CModelFactory.h @@ -315,11 +315,6 @@ class MODEL_EXPORT CModelFactory { //! Set the prune window scale factor maximum void pruneWindowScaleMaximum(double factor); - //! Set the number of times we sample the people's attribute - //! distributions to compute raw total probabilities for population - //! models. - void totalProbabilityCalcSamplingSize(std::size_t samplingSize); - //! Set whether multivariate analysis of correlated 'by' fields should //! be performed. void multivariateByFields(bool enabled); diff --git a/include/model/CModelParams.h b/include/model/CModelParams.h index eaad3c56bd..5e2ec1a127 100644 --- a/include/model/CModelParams.h +++ b/include/model/CModelParams.h @@ -25,14 +25,15 @@ namespace ml { namespace maths { struct SDistributionRestoreParams; +struct STimeSeriesDecompositionRestoreParams; } namespace model { //! \brief Wraps up model global parameters. //! //! 
DESCIRIPTION:\n //! The idea of this class is to encapsulate global model configuration -//! to avoid the need of updating the constructor signatures of all the -//! classes in the CModel hierarchy when new parameters added. +//! parameters to avoid the need of updating the constructor signatures +//! of all the classes in the CModel hierarchy when new parameters added. //! //! IMPLEMENTATION:\n //! This is purposely not implemented as a nested class so that it can @@ -43,7 +44,6 @@ struct MODEL_EXPORT SModelParams { using TStrDetectionRulePr = std::pair; using TStrDetectionRulePrVec = std::vector; using TStrDetectionRulePrVecCRef = boost::reference_wrapper; - using TTimeVec = std::vector; explicit SModelParams(core_t::TTime bucketLength); @@ -54,6 +54,10 @@ struct MODEL_EXPORT SModelParams { //! Get the minimum permitted number of points in a sketched point. double minimumCategoryCount() const; + //! Get the parameters supplied when restoring time series decompositions. + maths::STimeSeriesDecompositionRestoreParams + decompositionRestoreParams(maths_t::EDataType dataType) const; + //! Get the parameters supplied when restoring distribution models. maths::SDistributionRestoreParams distributionRestoreParams(maths_t::EDataType dataType) const; @@ -92,6 +96,12 @@ struct MODEL_EXPORT SModelParams { //! The number of points to use for approximating each seasonal component. std::size_t s_ComponentSize; + //! The minimum time to detect a change point in a time series. + core_t::TTime s_MinimumTimeToDetectChange; + + //! The maximum time to test for a change point in a time series. + core_t::TTime s_MaximumTimeToTestForChange; + //! Controls whether to exclude heavy hitters. model_t::EExcludeFrequent s_ExcludeFrequent; @@ -104,10 +114,6 @@ struct MODEL_EXPORT SModelParams { //! The maximum number of times we'll update a metric model in a bucket. double s_MaximumUpdatesPerBucket; - //! The number of times we sample the people's attribute distributions - //! to compute raw total probabilities for population models. - std::size_t s_TotalProbabilityCalcSamplingSize; - //! The minimum value for the influence for which an influencing field //! value is judged to have any influence on a feature value. double s_InfluenceCutoff; diff --git a/lib/api/CResultNormalizer.cc b/lib/api/CResultNormalizer.cc index 7d36823e4f..a20f183c6d 100644 --- a/lib/api/CResultNormalizer.cc +++ b/lib/api/CResultNormalizer.cc @@ -92,7 +92,7 @@ bool CResultNormalizer::handleRecord(const TStrStrUMap& dataRowFields) { const model::CAnomalyScore::CNormalizer* levelNormalizer = nullptr; double score = probability > m_ModelConfig.maximumAnomalousProbability() ? 
0.0 - : maths::CTools::deviation(probability); + : maths::CTools::anomalyScore(probability); if (level == ROOT_LEVEL) { levelNormalizer = &m_Normalizer.bucketNormalizer(); } else if (level == LEAF_LEVEL) { diff --git a/lib/maths/CCalendarComponent.cc b/lib/maths/CCalendarComponent.cc index 49e8d5fa6e..93dcd5c382 100644 --- a/lib/maths/CCalendarComponent.cc +++ b/lib/maths/CCalendarComponent.cc @@ -109,6 +109,11 @@ void CCalendarComponent::clear() { } } +void CCalendarComponent::linearScale(core_t::TTime time, double scale) { + m_Bucketing.linearScale(scale); + this->interpolate(time, false); +} + void CCalendarComponent::add(core_t::TTime time, double value, double weight) { m_Bucketing.add(time, value, weight); } diff --git a/lib/maths/CCalendarComponentAdaptiveBucketing.cc b/lib/maths/CCalendarComponentAdaptiveBucketing.cc index 8bdfd570db..3e518debfc 100644 --- a/lib/maths/CCalendarComponentAdaptiveBucketing.cc +++ b/lib/maths/CCalendarComponentAdaptiveBucketing.cc @@ -102,6 +102,12 @@ void CCalendarComponentAdaptiveBucketing::clear() { clearAndShrink(m_Values); } +void CCalendarComponentAdaptiveBucketing::linearScale(double scale) { + for (auto& value : m_Values) { + CBasicStatistics::moment<0>(value) *= scale; + } +} + void CCalendarComponentAdaptiveBucketing::add(core_t::TTime time, double value, double weight) { std::size_t bucket{0}; if (this->initialized() && this->bucket(time, bucket)) { diff --git a/lib/maths/CCooccurrences.cc b/lib/maths/CCooccurrences.cc index 044f89f755..9c7655be04 100644 --- a/lib/maths/CCooccurrences.cc +++ b/lib/maths/CCooccurrences.cc @@ -53,11 +53,6 @@ struct SCooccurrence { using TMostSignificant = CBasicStatistics::COrderStatisticsHeap; -//! Compute \p x * \p x. -double pow2(double x) { - return x * x; -} - //! Generate a random projection in the positive orthant. //! //! \param[in] dimension The dimension. @@ -168,7 +163,7 @@ void seed(const TPackedBitVectorVec& indicators, TDoubleVec theta(n, 0.0); for (std::size_t i = 0u; i < n; ++i) { for (std::size_t j = 0u; j < projected.size(); ++j) { - theta[i] += pow2(projected[j][i]); + theta[i] += CTools::pow2(projected[j][i]); } theta[i] = std::acos(std::sqrt(theta[i])); } diff --git a/lib/maths/CKMostCorrelated.cc b/lib/maths/CKMostCorrelated.cc index b858157251..b9b5f20fee 100644 --- a/lib/maths/CKMostCorrelated.cc +++ b/lib/maths/CKMostCorrelated.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -86,12 +87,9 @@ class CCloserThan : public std::unary_function { : m_Threshold(threshold), m_X(x) {} bool operator()(const TPointSizePr& y) const { - return pow2(bg::distance(m_X, y.first)) < m_Threshold; + return CTools::pow2(bg::distance(m_X, y.first)) < m_Threshold; } -private: - static double pow2(double x) { return x * x; } - private: double m_Threshold; TPoint m_X; diff --git a/lib/maths/CLogNormalMeanPrecConjugate.cc b/lib/maths/CLogNormalMeanPrecConjugate.cc index c5f7ac647f..a4d8271788 100644 --- a/lib/maths/CLogNormalMeanPrecConjugate.cc +++ b/lib/maths/CLogNormalMeanPrecConjugate.cc @@ -51,11 +51,6 @@ using TDoubleWeightsAry1Vec = maths_t::TDoubleWeightsAry1Vec; using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; -//! Compute x * x. 
-inline double pow2(double x) { - return x * x; -} - const double MINIMUM_LOGNORMAL_SHAPE = 100.0; namespace detail { @@ -309,7 +304,7 @@ class CVarianceKernel { boost::math::normal normal(m_M, std::sqrt(1.0 / x(0) / m_P)); double fx = boost::math::pdf(normal, x(1)) * boost::math::pdf(gamma, x(0)); double m = std::exp(x(1) + 0.5 / x(0)); - result(0) = (m * m * (std::exp(1.0 / x(0)) - 1.0) + pow2(m - m_Mean)) * fx; + result(0) = (m * m * (std::exp(1.0 / x(0)) - 1.0) + CTools::pow2(m - m_Mean)) * fx; result(1) = fx; } catch (const std::exception& e) { LOG_ERROR(<< "Failed to calculate mean kernel: " << e.what() @@ -473,7 +468,7 @@ class CLogMarginalLikelihood : core::CNonCopyable { double impliedShape = m_Shape + 0.5 * m_NumberSamples; double impliedRate = m_Rate + 0.5 * (logSamplesSquareDeviation + m_Precision * weightedNumberSamples * - pow2(logSamplesMean - m_Mean) / + CTools::pow2(logSamplesMean - m_Mean) / (m_Precision + weightedNumberSamples)); result = m_Constant - impliedShape * std::log(impliedRate) - logSamplesSum; @@ -585,7 +580,7 @@ class CLogSampleSquareDeviation : core::CNonCopyable { } double n = maths_t::countForUpdate(m_Weights[i]); residual = std::log(residual + x) - m_Mean; - result += n * pow2(residual); + result += n * CTools::pow2(residual); } return true; } @@ -841,7 +836,7 @@ void CLogNormalMeanPrecConjugate::addSamples(const TDouble1Vec& samples, m_GammaShape += 0.5 * numberSamples; m_GammaRate += 0.5 * (logSamplesSquareDeviation + m_GaussianPrecision * scaledNumberSamples * - pow2(logSamplesMean - m_GaussianMean) / + CTools::pow2(logSamplesMean - m_GaussianMean) / (m_GaussianPrecision + scaledNumberSamples)); m_GaussianMean = (m_GaussianPrecision * m_GaussianMean + scaledNumberSamples * logSamplesMean) / @@ -873,7 +868,8 @@ void CLogNormalMeanPrecConjugate::addSamples(const TDouble1Vec& samples, // // From which we derive the results below. 
- double minimumRate = (2.0 * m_GammaShape - 1.0) * pow2(MINIMUM_COEFFICIENT_OF_VARIATION); + double minimumRate = (2.0 * m_GammaShape - 1.0) * + CTools::pow2(MINIMUM_COEFFICIENT_OF_VARIATION); if (m_GammaRate < minimumRate) { double extraVariation = (minimumRate - m_GammaRate) / diff --git a/lib/maths/CLogTDistribution.cc b/lib/maths/CLogTDistribution.cc index cad91f9a60..05e977fdb3 100644 --- a/lib/maths/CLogTDistribution.cc +++ b/lib/maths/CLogTDistribution.cc @@ -17,13 +17,6 @@ namespace ml { namespace maths { -namespace { - -inline double square(double x) { - return x * x; -} -} - CLogTDistribution::CLogTDistribution(double degreesFreedom, double location, double scale) : m_DegreesFreedom(degreesFreedom), m_Location(location), m_Scale(scale) { } @@ -74,16 +67,17 @@ double mode(const CLogTDistribution& distribution) { // x = exp(m - (n+1) / 2 + ((n+1)^2 / 4 - n * s^2) ^ (1/2)) double degreesFreedom = distribution.degreesFreedom(); - double squareScale = square(distribution.scale()); + double squareScale = CTools::pow2(distribution.scale()); - if (square(degreesFreedom + 1.0) < 4.0 * degreesFreedom * squareScale) { + if (CTools::pow2(degreesFreedom + 1.0) < 4.0 * degreesFreedom * squareScale) { return 0.0; } double location = distribution.location(); return std::exp(location - (degreesFreedom + 1.0) / 2.0 + - std::sqrt(square(degreesFreedom + 1.0) / 4.0 - degreesFreedom * squareScale)); + std::sqrt(CTools::pow2(degreesFreedom + 1.0) / 4.0 - + degreesFreedom * squareScale)); } CLogTDistribution::TOptionalDouble localMinimum(const CLogTDistribution& distribution) { @@ -96,16 +90,17 @@ CLogTDistribution::TOptionalDouble localMinimum(const CLogTDistribution& distrib // See the documentation in the mode function for more details. double degreesFreedom = distribution.degreesFreedom(); - double squareScale = square(distribution.scale()); + double squareScale = CTools::pow2(distribution.scale()); - if (square(degreesFreedom + 1.0) < 4.0 * degreesFreedom * squareScale) { + if (CTools::pow2(degreesFreedom + 1.0) < 4.0 * degreesFreedom * squareScale) { return CLogTDistribution::TOptionalDouble(); } double location = distribution.location(); return std::exp(location - (degreesFreedom + 1.0) / 2.0 - - std::sqrt(square(degreesFreedom + 1.0) / 4.0 - degreesFreedom * squareScale)); + std::sqrt(CTools::pow2(degreesFreedom + 1.0) / 4.0 - + degreesFreedom * squareScale)); } double pdf(const CLogTDistribution& distribution, double x) { diff --git a/lib/maths/CModel.cc b/lib/maths/CModel.cc index dd71c3d2e0..5fd2ab1d2d 100644 --- a/lib/maths/CModel.cc +++ b/lib/maths/CModel.cc @@ -8,6 +8,9 @@ #include #include +#include + +#include #include @@ -53,23 +56,26 @@ double oneSidedEmptyBucketCorrection(maths_t::EProbabilityCalculation calculatio return 0.0; } -const double EFFECTIVE_COUNT[] = {1.0, 0.8, 0.7, 0.65, 0.6, - 0.57, 0.54, 0.52, 0.51}; -const double LEARN_RATE = 1.0; -const double DECAY_RATE = 0.0; +const double EFFECTIVE_COUNT[]{1.0, 0.8, 0.7, 0.65, 0.6, + 0.57, 0.54, 0.52, 0.51}; //! Get the parameters for the stub model. 
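
The CModelParams constructor and testForChange below clamp the minimum time to detect a change to at least twelve buckets, clamp the maximum time to test for a change to at least forty eight buckets, and only permit a change test once the interval of unpredictable values spans at least three buckets and ten minutes. A self-contained restatement of those rules, using simplified stand-in types:

#include <algorithm>
#include <cassert>
#include <cstdint>

using Time = std::int64_t;
constexpr Time MINUTE{60};
constexpr Time HOUR{3600};
constexpr Time DAY{86400};

// Simplified stand-in for the change-test timing logic in CModelParams.
struct ToyChangeParams {
    ToyChangeParams(Time bucketLength, Time minTimeToDetect, Time maxTimeToTest)
        : bucketLength{bucketLength},
          minimumTimeToDetectChange{std::max(minTimeToDetect, 12 * bucketLength)},
          maximumTimeToTestForChange{std::max(maxTimeToTest, 48 * bucketLength)} {}

    // Only start testing once the interval of unpredictable values is at
    // least three buckets and at least ten minutes long.
    bool testForChange(Time changeInterval) const {
        return changeInterval >= std::max(3 * bucketLength, 10 * MINUTE);
    }

    Time bucketLength;
    Time minimumTimeToDetectChange;
    Time maximumTimeToTestForChange;
};

int main() {
    ToyChangeParams fiveMinute{300, 6 * HOUR, DAY};
    assert(fiveMinute.minimumTimeToDetectChange == 6 * HOUR); // already above 12 buckets
    assert(!fiveMinute.testForChange(10 * MINUTE));           // needs 3 buckets = 15 minutes
    assert(fiveMinute.testForChange(15 * MINUTE));

    ToyChangeParams hourly{HOUR, 6 * HOUR, DAY};
    assert(hourly.minimumTimeToDetectChange == 12 * HOUR);    // clamped up to 12 buckets
    return 0;
}
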
CModelParams stubParameters() { - return CModelParams(0, LEARN_RATE, DECAY_RATE, 0.0); + return CModelParams{ + 0, 1.0, 0.0, 0.0, 6 * core::constants::HOUR, core::constants::DAY}; } } CModelParams::CModelParams(core_t::TTime bucketLength, - const double& learnRate, - const double& decayRate, - double minimumSeasonalVarianceScale) + double learnRate, + double decayRate, + double minimumSeasonalVarianceScale, + core_t::TTime minimumTimeToDetectChange, + core_t::TTime maximumTimeToTestForChange) : m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate), m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale), + m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 12 * bucketLength)), + m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 48 * bucketLength)), m_ProbabilityBucketEmpty(0.0) { } @@ -93,6 +99,18 @@ double CModelParams::minimumSeasonalVarianceScale() const { return m_MinimumSeasonalVarianceScale; } +bool CModelParams::testForChange(core_t::TTime changeInterval) const { + return changeInterval >= std::max(3 * m_BucketLength, 10 * core::constants::MINUTE); +} + +core_t::TTime CModelParams::minimumTimeToDetectChange(void) const { + return m_MinimumTimeToDetectChange; +} + +core_t::TTime CModelParams::maximumTimeToTestForChange(void) const { + return m_MaximumTimeToTestForChange; +} + void CModelParams::probabilityBucketEmpty(double probability) { m_ProbabilityBucketEmpty = probability; } @@ -249,6 +267,18 @@ bool CModelProbabilityParams::updateAnomalyModel() const { return m_UpdateAnomalyModel; } +CModel::EUpdateResult CModel::combine(EUpdateResult lhs, EUpdateResult rhs) { + switch (lhs) { + case E_Success: + return rhs; + case E_Reset: + return rhs == E_Failure ? E_Failure : E_Reset; + case E_Failure: + return E_Failure; + } + return E_Failure; +} + CModel::CModel(const CModelParams& params) : m_Params(params) { } @@ -269,11 +299,10 @@ double CModel::correctForEmptyBucket(maths_t::EProbabilityCalculation calculatio bool bucketEmpty, double probabilityBucketEmpty, double probability) { - double pCorrected = (1.0 - probabilityBucketEmpty) * probability; + double pCorrected{(1.0 - probabilityBucketEmpty) * probability}; if (!bucketEmpty) { - double pOneSided = oneSidedEmptyBucketCorrection(calculation, value, - probabilityBucketEmpty); + double pOneSided{oneSidedEmptyBucketCorrection(calculation, value, probabilityBucketEmpty)}; return std::min(pOneSided + pCorrected, 1.0); } @@ -286,24 +315,23 @@ double CModel::correctForEmptyBucket(maths_t::EProbabilityCalculation calculatio const TDouble2Vec& probabilityEmptyBucket, double probability) { if (!bucketEmpty[0] && !bucketEmpty[1]) { - double pState = (1.0 - probabilityEmptyBucket[0]) * - (1.0 - probabilityEmptyBucket[1]); - double pOneSided = oneSidedEmptyBucketCorrection( - calculation, TDouble2Vec{value}, 1.0 - pState); + double pState{(1.0 - probabilityEmptyBucket[0]) * + (1.0 - probabilityEmptyBucket[1])}; + double pOneSided{oneSidedEmptyBucketCorrection(calculation, {value}, 1.0 - pState)}; return std::min(pOneSided + pState * probability, 1.0); } if (!bucketEmpty[0]) { - double pState = (1.0 - probabilityEmptyBucket[0]) * probabilityEmptyBucket[1]; - double pOneSided = oneSidedEmptyBucketCorrection( - calculation, TDouble2Vec{value}, probabilityEmptyBucket[0]); + double pState{(1.0 - probabilityEmptyBucket[0]) * probabilityEmptyBucket[1]}; + double pOneSided{oneSidedEmptyBucketCorrection(calculation, {value}, + probabilityEmptyBucket[0])}; return std::min(pOneSided + pState + (1.0 
- pState) * probability, 1.0); } if (!bucketEmpty[1]) { - double pState = probabilityEmptyBucket[0] * (1.0 - probabilityEmptyBucket[1]); - double pOneSided = oneSidedEmptyBucketCorrection( - calculation, TDouble2Vec{value}, probabilityEmptyBucket[1]); + double pState{probabilityEmptyBucket[0] * (1.0 - probabilityEmptyBucket[1])}; + double pOneSided{oneSidedEmptyBucketCorrection(calculation, {value}, + probabilityEmptyBucket[1])}; return std::min(pOneSided + pState + (1.0 - pState) * probability, 1.0); } @@ -338,23 +366,23 @@ void CModelStub::modelCorrelations(CTimeSeriesCorrelations& /*model*/) { } CModelStub::TSize2Vec1Vec CModelStub::correlates() const { - return TSize2Vec1Vec(); + return {}; } CModelStub::TDouble2Vec CModelStub::mode(core_t::TTime /*time*/, const TDouble2VecWeightsAry& /*weights*/) const { - return TDouble2Vec(); + return {}; } CModelStub::TDouble2Vec1Vec CModelStub::correlateModes(core_t::TTime /*time*/, const TDouble2VecWeightsAry1Vec& /*weights*/) const { - return TDouble2Vec1Vec(); + return {}; } CModelStub::TDouble2Vec1Vec CModelStub::residualModes(const TDouble2VecWeightsAry& /*weights*/) const { - return TDouble2Vec1Vec(); + return {}; } void CModelStub::addBucketValue(const TTimeDouble2VecSizeTrVec& /*value*/) { @@ -376,14 +404,14 @@ void CModelStub::detrend(const TTime2Vec1Vec& /*time*/, CModelStub::TDouble2Vec CModelStub::predict(core_t::TTime /*time*/, const TSizeDoublePr1Vec& /*correlated*/, TDouble2Vec /*hint*/) const { - return TDouble2Vec(); + return {}; } CModelStub::TDouble2Vec3Vec CModelStub::confidenceInterval(core_t::TTime /*time*/, double /*confidenceInterval*/, const TDouble2VecWeightsAry& /*weights*/) const { - return TDouble2Vec3Vec(); + return {}; } bool CModelStub::forecast(core_t::TTime /*startTime*/, @@ -413,12 +441,12 @@ bool CModelStub::probability(const CModelProbabilityParams& /*params*/, CModelStub::TDouble2Vec CModelStub::winsorisationWeight(double /*derate*/, core_t::TTime /*time*/, const TDouble2Vec& /*value*/) const { - return TDouble2Vec(); + return {}; } CModelStub::TDouble2Vec CModelStub::seasonalWeight(double /*confidence*/, core_t::TTime /*time*/) const { - return TDouble2Vec(); + return {}; } std::uint64_t CModelStub::checksum(std::uint64_t seed) const { diff --git a/lib/maths/CNaiveBayes.cc b/lib/maths/CNaiveBayes.cc new file mode 100644 index 0000000000..64cb285d0b --- /dev/null +++ b/lib/maths/CNaiveBayes.cc @@ -0,0 +1,393 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace ml { +namespace maths { +namespace { +const std::string PRIOR_TAG{"a"}; +const std::string CLASS_LABEL_TAG{"b"}; +const std::string CLASS_MODEL_TAG{"c"}; +const std::string MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG{"d"}; +const std::string COUNT_TAG{"e"}; +const std::string CONDITIONAL_DENSITY_FROM_PRIOR_TAG{"f"}; +} + +CNaiveBayesFeatureDensityFromPrior::CNaiveBayesFeatureDensityFromPrior(const CPrior& prior) + : m_Prior(prior.clone()) { +} + +void CNaiveBayesFeatureDensityFromPrior::add(const TDouble1Vec& x) { + m_Prior->addSamples(x, maths_t::CUnitWeights::SINGLE_UNIT); +} + +CNaiveBayesFeatureDensityFromPrior* CNaiveBayesFeatureDensityFromPrior::clone() const { + return new CNaiveBayesFeatureDensityFromPrior(*m_Prior); +} + +bool CNaiveBayesFeatureDensityFromPrior::acceptRestoreTraverser( + const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + do { + const std::string& name{traverser.name()}; + RESTORE(PRIOR_TAG, traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), boost::cref(params), + boost::ref(m_Prior), _1))); + } while (traverser.next()); + return true; +} + +void CNaiveBayesFeatureDensityFromPrior::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + inserter.insertLevel(PRIOR_TAG, boost::bind(CPriorStateSerialiser(), + boost::cref(*m_Prior), _1)); +} + +double CNaiveBayesFeatureDensityFromPrior::logValue(const TDouble1Vec& x) const { + double result; + if (m_Prior->jointLogMarginalLikelihood(x, maths_t::CUnitWeights::SINGLE_UNIT, + result) != maths_t::E_FpNoErrors) { + LOG_ERROR("Bad density value at " << x << " for " << m_Prior->print()); + return boost::numeric::bounds::lowest(); + } + return result; +} + +double CNaiveBayesFeatureDensityFromPrior::logMaximumValue() const { + double result; + if (m_Prior->jointLogMarginalLikelihood({m_Prior->marginalLikelihoodMode()}, + maths_t::CUnitWeights::SINGLE_UNIT, + result) != maths_t::E_FpNoErrors) { + LOG_ERROR("Bad density value for " << m_Prior->print()); + return boost::numeric::bounds::lowest(); + } + return result; +} + +void CNaiveBayesFeatureDensityFromPrior::dataType(maths_t::EDataType dataType) { + m_Prior->dataType(dataType); +} + +void CNaiveBayesFeatureDensityFromPrior::propagateForwardsByTime(double time) { + m_Prior->propagateForwardsByTime(time); +} + +void CNaiveBayesFeatureDensityFromPrior::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + return core::CMemoryDebug::dynamicSize("m_Prior", m_Prior, mem); +} + +std::size_t CNaiveBayesFeatureDensityFromPrior::staticSize() const { + return sizeof(*this); +} + +std::size_t CNaiveBayesFeatureDensityFromPrior::memoryUsage() const { + return core::CMemory::dynamicSize(m_Prior); +} + +uint64_t CNaiveBayesFeatureDensityFromPrior::checksum(uint64_t seed) const { + return CChecksum::calculate(seed, m_Prior); +} + +std::string CNaiveBayesFeatureDensityFromPrior::print() const { + std::string result; + m_Prior->print(" ", result); + return result; +} + +CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, + double decayRate, + TOptionalDouble minMaxLogLikelihoodToUseFeature) + : m_MinMaxLogLikelihoodToUseFeature{minMaxLogLikelihoodToUseFeature}, + m_DecayRate{decayRate}, m_Exemplar{exemplar.clone()}, m_ClassConditionalDensities{2} { +} + +CNaiveBayes::CNaiveBayes(const 
SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) + : m_DecayRate{params.s_DecayRate}, m_ClassConditionalDensities{2} { + traverser.traverseSubLevel(boost::bind(&CNaiveBayes::acceptRestoreTraverser, + this, boost::cref(params), _1)); +} + +bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + std::size_t label; + do { + const std::string& name{traverser.name()}; + RESTORE_BUILT_IN(CLASS_LABEL_TAG, label) + RESTORE_SETUP_TEARDOWN(CLASS_MODEL_TAG, SClass class_, + traverser.traverseSubLevel(boost::bind( + &SClass::acceptRestoreTraverser, + boost::ref(class_), boost::cref(params), _1)), + m_ClassConditionalDensities.emplace(label, class_)) + RESTORE_SETUP_TEARDOWN(MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, double value, + core::CStringUtils::stringToType(traverser.value(), value), + m_MinMaxLogLikelihoodToUseFeature.reset(value)) + } while (traverser.next()); + return true; +} + +void CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + using TSizeClassUMapCItr = TSizeClassUMap::const_iterator; + using TSizeClassUMapCItrVec = std::vector; + TSizeClassUMapCItrVec classes; + classes.reserve(m_ClassConditionalDensities.size()); + for (auto i = m_ClassConditionalDensities.begin(); + i != m_ClassConditionalDensities.end(); ++i) { + classes.push_back(i); + } + std::sort(classes.begin(), classes.end(), + core::CFunctional::SDereference()); + for (const auto& class_ : classes) { + inserter.insertValue(CLASS_LABEL_TAG, class_->first); + inserter.insertLevel(CLASS_MODEL_TAG, + boost::bind(&SClass::acceptPersistInserter, + boost::ref(class_->second), _1)); + } + if (m_MinMaxLogLikelihoodToUseFeature) { + inserter.insertValue(MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, + *m_MinMaxLogLikelihoodToUseFeature, + core::CIEEE754::E_SinglePrecision); + } +} + +void CNaiveBayes::swap(CNaiveBayes& other) { + std::swap(m_DecayRate, other.m_DecayRate); + m_Exemplar.swap(other.m_Exemplar); + m_ClassConditionalDensities.swap(other.m_ClassConditionalDensities); + std::swap(m_MinMaxLogLikelihoodToUseFeature, other.m_MinMaxLogLikelihoodToUseFeature); +} + +bool CNaiveBayes::initialized() const { + return m_ClassConditionalDensities.size() > 0; +} + +void CNaiveBayes::initialClassCounts(const TDoubleSizePrVec& counts) { + for (const auto& count : counts) { + m_ClassConditionalDensities[count.second] = SClass{count.first, {}}; + } +} + +void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec& x) { + if (!this->validate(x)) { + return; + } + + auto& class_ = m_ClassConditionalDensities[label]; + + if (class_.s_ConditionalDensities.empty()) { + class_.s_ConditionalDensities.reserve(x.size()); + std::generate_n( + std::back_inserter(class_.s_ConditionalDensities), x.size(), + [this]() { return TFeatureDensityPtr{m_Exemplar->clone()}; }); + } + + bool updateCount{false}; + for (std::size_t i = 0u; i < x.size(); ++i) { + if (x[i].size() > 0) { + class_.s_ConditionalDensities[i]->add(x[i]); + updateCount = true; + } + } + + if (updateCount) { + class_.s_Count += 1.0; + } else { + LOG_TRACE("Ignoring empty feature vector"); + } +} + +void CNaiveBayes::dataType(maths_t::EDataType dataType) { + for (auto& class_ : m_ClassConditionalDensities) { + for (auto& density : class_.second.s_ConditionalDensities) { + density->dataType(dataType); + } + } +} + +void CNaiveBayes::propagateForwardsByTime(double time) { + double factor{std::exp(-m_DecayRate * time)}; + for (auto& class_ : 
m_ClassConditionalDensities) { + class_.second.s_Count *= factor; + for (auto& density : class_.second.s_ConditionalDensities) { + density->propagateForwardsByTime(time); + } + } +} + +CNaiveBayes::TDoubleSizePrVec +CNaiveBayes::highestClassProbabilities(std::size_t n, const TDouble1VecVec& x) const { + TDoubleSizePrVec p(this->classProbabilities(x)); + n = std::min(n, p.size()); + std::sort(p.begin(), p.begin() + n, std::greater()); + return TDoubleSizePrVec{p.begin(), p.begin() + n}; +} + +double CNaiveBayes::classProbability(std::size_t label, const TDouble1VecVec& x) const { + TDoubleSizePrVec p(this->classProbabilities(x)); + auto i = std::find_if(p.begin(), p.end(), [label](const TDoubleSizePr& p_) { + return p_.second == label; + }); + return i == p.end() ? 0.0 : i->first; +} + +CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecVec& x) const { + if (!this->validate(x)) { + return {}; + } + if (m_ClassConditionalDensities.empty()) { + LOG_ERROR("Trying to compute class probabilities without supplying training data"); + return {}; + } + + using TDoubleVec = std::vector; + using TMaxAccumulator = CBasicStatistics::SMax::TAccumulator; + + TDoubleSizePrVec p; + p.reserve(m_ClassConditionalDensities.size()); + for (const auto& class_ : m_ClassConditionalDensities) { + p.emplace_back(CTools::fastLog(class_.second.s_Count), class_.first); + } + + TDoubleVec logLikelihoods; + for (std::size_t i = 0u; i < x.size(); ++i) { + if (x[i].size() > 0) { + TMaxAccumulator maxLogLikelihood; + logLikelihoods.clear(); + for (const auto& class_ : m_ClassConditionalDensities) { + const auto& density = class_.second.s_ConditionalDensities[i]; + double logLikelihood{density->logValue(x[i])}; + double logMaximumLikelihood{density->logMaximumValue()}; + maxLogLikelihood.add(logLikelihood - logMaximumLikelihood); + logLikelihoods.push_back(logLikelihood); + } + double weight{1.0}; + if (m_MinMaxLogLikelihoodToUseFeature) { + weight = CTools::logisticFunction( + (maxLogLikelihood[0] - *m_MinMaxLogLikelihoodToUseFeature) / + std::fabs(*m_MinMaxLogLikelihoodToUseFeature), + 0.1); + } + for (std::size_t j = 0u; j < logLikelihoods.size(); ++j) { + p[j].first += weight * logLikelihoods[j]; + } + } + } + + double scale{std::max_element(p.begin(), p.end())->first}; + double Z{0.0}; + for (auto& pc : p) { + pc.first = std::exp(pc.first - scale); + Z += pc.first; + } + for (auto& pc : p) { + pc.first /= Z; + } + + return p; +} + +void CNaiveBayes::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + core::CMemoryDebug::dynamicSize("m_Exemplar", m_Exemplar, mem); + core::CMemoryDebug::dynamicSize("m_ClassConditionalDensities", + m_ClassConditionalDensities, mem); +} + +std::size_t CNaiveBayes::memoryUsage() const { + return core::CMemory::dynamicSize(m_Exemplar) + + core::CMemory::dynamicSize(m_ClassConditionalDensities); +} + +uint64_t CNaiveBayes::checksum(uint64_t seed) const { + return CChecksum::calculate(seed, m_ClassConditionalDensities); +} + +std::string CNaiveBayes::print() const { + std::ostringstream result; + result << "\n"; + for (const auto& class_ : m_ClassConditionalDensities) { + result << "CLASS(" << class_.first << ")\n"; + for (const auto& density : class_.second.s_ConditionalDensities) { + result << "---"; + result << density->print() << "\n"; + } + } + return result.str(); +} + +bool CNaiveBayes::validate(const TDouble1VecVec& x) const { + auto class_ = m_ClassConditionalDensities.begin(); + if (class_ != m_ClassConditionalDensities.end() && + 
class_->second.s_ConditionalDensities.size() > 0 && + class_->second.s_ConditionalDensities.size() != x.size()) { + LOG_ERROR("Unexpected feature vector: " << core::CContainerPrinter::print(x)); + return false; + } + return true; +} + +bool CNaiveBayes::SClass::acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + do { + const std::string& name{traverser.name()}; + RESTORE_BUILT_IN(COUNT_TAG, s_Count) + RESTORE_SETUP_TEARDOWN(CONDITIONAL_DENSITY_FROM_PRIOR_TAG, + CNaiveBayesFeatureDensityFromPrior tmp, + traverser.traverseSubLevel(boost::bind( + &CNaiveBayesFeatureDensityFromPrior::acceptRestoreTraverser, + boost::ref(tmp), boost::cref(params), _1)), + s_ConditionalDensities.emplace_back(tmp.clone())) + // Add other implementations' restore code here. + } while (traverser.next()); + return true; +} + +void CNaiveBayes::SClass::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + inserter.insertValue(COUNT_TAG, s_Count, core::CIEEE754::E_SinglePrecision); + for (const auto& density : s_ConditionalDensities) { + if (dynamic_cast(density.get())) { + inserter.insertLevel(CONDITIONAL_DENSITY_FROM_PRIOR_TAG, + boost::bind(&CNaiveBayesFeatureDensity::acceptPersistInserter, + density.get(), _1)); + continue; + } + // Add other implementations' persist code here. + } +} + +void CNaiveBayes::SClass::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + core::CMemoryDebug::dynamicSize("s_ConditionalDensities", s_ConditionalDensities, mem); +} + +std::size_t CNaiveBayes::SClass::memoryUsage() const { + return core::CMemory::dynamicSize(s_ConditionalDensities); +} + +uint64_t CNaiveBayes::SClass::checksum(uint64_t seed) const { + seed = CChecksum::calculate(seed, s_Count); + return CChecksum::calculate(seed, s_ConditionalDensities); +} +} +} diff --git a/lib/maths/CRestoreParams.cc b/lib/maths/CRestoreParams.cc index f736a5dcbc..340f4902d4 100644 --- a/lib/maths/CRestoreParams.cc +++ b/lib/maths/CRestoreParams.cc @@ -11,13 +11,6 @@ namespace ml { namespace maths { -STimeSeriesDecompositionRestoreParams::STimeSeriesDecompositionRestoreParams( - double decayRate, - core_t::TTime minimumBucketLength, - std::size_t componentSize) - : s_DecayRate{decayRate}, s_MinimumBucketLength{minimumBucketLength}, s_ComponentSize{componentSize} { -} - SDistributionRestoreParams::SDistributionRestoreParams(maths_t::EDataType dataType, double decayRate, double minimumClusterFraction, @@ -27,6 +20,23 @@ SDistributionRestoreParams::SDistributionRestoreParams(maths_t::EDataType dataTy s_MinimumClusterCount{minimumClusterCount}, s_MinimumCategoryCount{minimumCategoryCount} { } +STimeSeriesDecompositionRestoreParams::STimeSeriesDecompositionRestoreParams( + double decayRate, + core_t::TTime minimumBucketLength, + std::size_t componentSize, + const SDistributionRestoreParams& changeModelParams) + : s_DecayRate{decayRate}, s_MinimumBucketLength{minimumBucketLength}, + s_ComponentSize{componentSize}, s_ChangeModelParams{changeModelParams} { +} + +STimeSeriesDecompositionRestoreParams::STimeSeriesDecompositionRestoreParams( + double decayRate, + core_t::TTime minimumBucketLength, + const SDistributionRestoreParams& changeModelParams) + : s_DecayRate{decayRate}, s_MinimumBucketLength{minimumBucketLength}, + s_ComponentSize{COMPONENT_SIZE}, s_ChangeModelParams{changeModelParams} { +} + SModelRestoreParams::SModelRestoreParams(const CModelParams& params, const STimeSeriesDecompositionRestoreParams& decompositionParams, const 
SDistributionRestoreParams& distributionParams) diff --git a/lib/maths/CSeasonalComponent.cc b/lib/maths/CSeasonalComponent.cc index c53d902e70..15588eb69b 100644 --- a/lib/maths/CSeasonalComponent.cc +++ b/lib/maths/CSeasonalComponent.cc @@ -139,6 +139,11 @@ void CSeasonalComponent::shiftSlope(double shift) { m_Bucketing.shiftSlope(shift); } +void CSeasonalComponent::linearScale(core_t::TTime time, double scale) { + m_Bucketing.linearScale(scale); + this->interpolate(time, false); +} + void CSeasonalComponent::add(core_t::TTime time, double value, double weight) { double predicted{CBasicStatistics::mean(this->value(this->jitter(time), 0.0))}; m_Bucketing.add(time, value, predicted, weight); diff --git a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc index 4d5902ee86..99ea15b1ea 100644 --- a/lib/maths/CSeasonalComponentAdaptiveBucketing.cc +++ b/lib/maths/CSeasonalComponentAdaptiveBucketing.cc @@ -186,6 +186,12 @@ void CSeasonalComponentAdaptiveBucketing::shiftSlope(double shift) { } } +void CSeasonalComponentAdaptiveBucketing::linearScale(double scale) { + for (auto& bucket : m_Buckets) { + bucket.s_Regression.linearScale(scale); + } +} + void CSeasonalComponentAdaptiveBucketing::add(core_t::TTime time, double value, double prediction, diff --git a/lib/maths/CTimeSeriesChangeDetector.cc b/lib/maths/CTimeSeriesChangeDetector.cc new file mode 100644 index 0000000000..8dad8364c8 --- /dev/null +++ b/lib/maths/CTimeSeriesChangeDetector.cc @@ -0,0 +1,692 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace ml { +namespace maths { +using namespace time_series_change_detector_detail; + +namespace { +using TDouble1Vec = core::CSmallVector; +using TOptionalChangeDescription = CUnivariateTimeSeriesChangeDetector::TOptionalChangeDescription; +const std::string MINIMUM_TIME_TO_DETECT{"a"}; +const std::string MAXIMUM_TIME_TO_DETECT{"b"}; +const std::string MINIMUM_DELTA_BIC_TO_DETECT{"c"}; +const std::string RESIDUAL_MODEL_MODE_TAG{"d"}; +const std::string SAMPLE_COUNT_TAG{"e"}; +const std::string CURRENT_EVIDENCE_OF_CHANGE{"f"}; +const std::string MIN_TIME_TAG{"g"}; +const std::string MAX_TIME_TAG{"h"}; +const std::string CHANGE_MODEL_TAG{"i"}; +const std::string LOG_LIKELIHOOD_TAG{"j"}; +const std::string EXPECTED_LOG_LIKELIHOOD_TAG{"k"}; +const std::string SHIFT_TAG{"l"}; +const std::string SCALE_TAG{"m"}; +const std::string RESIDUAL_MODEL_TAG{"n"}; +const std::size_t EXPECTED_LOG_LIKELIHOOD_NUMBER_INTERVALS{4u}; +const double EXPECTED_EVIDENCE_THRESHOLD_MULTIPLIER{0.9}; +const std::size_t COUNT_TO_INITIALIZE{5u}; +const double MINIMUM_SCALE{0.1}; +const double MAXIMUM_SCALE{10.0}; +const double WINSORISATION_DERATE{1.0}; +} + +SChangeDescription::SChangeDescription(EDescription description, double value, const TPriorPtr& residualModel) + : s_Description{description}, s_Value{value}, s_ResidualModel{residualModel} { +} + +std::string SChangeDescription::print() const { + std::string result; + switch (s_Description) { + case E_LevelShift: + result += "level shift by "; + break; + case E_LinearScale: + result += "linear 
scale by "; + break; + case E_TimeShift: + result += "time shift by "; + break; + } + return result + core::CStringUtils::typeToString(s_Value[0]); +} + +CUnivariateTimeSeriesChangeDetector::CUnivariateTimeSeriesChangeDetector( + const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel, + core_t::TTime minimumTimeToDetect, + core_t::TTime maximumTimeToDetect, + double minimumDeltaBicToDetect) + : m_MinimumTimeToDetect{minimumTimeToDetect}, m_MaximumTimeToDetect{maximumTimeToDetect}, + m_MinimumDeltaBicToDetect{minimumDeltaBicToDetect}, m_SampleCount{0}, m_CurrentEvidenceOfChange{0.0}, + m_ChangeModels{ + std::make_shared(trendModel, residualModel), + std::make_shared(trendModel, residualModel), + std::make_shared(trendModel, residualModel, -core::constants::HOUR), + std::make_shared(trendModel, + residualModel, + +core::constants::HOUR)} { + if (trendModel->seasonalComponents().size() > 0) { + m_ChangeModels.push_back(std::make_shared( + trendModel, residualModel)); + } +} + +bool CUnivariateTimeSeriesChangeDetector::acceptRestoreTraverser( + const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + auto model = m_ChangeModels.begin(); + do { + const std::string name{traverser.name()}; + RESTORE_BUILT_IN(MINIMUM_TIME_TO_DETECT, m_MinimumTimeToDetect) + RESTORE_BUILT_IN(MAXIMUM_TIME_TO_DETECT, m_MaximumTimeToDetect) + RESTORE_BUILT_IN(MINIMUM_DELTA_BIC_TO_DETECT, m_MinimumDeltaBicToDetect) + RESTORE_BUILT_IN(SAMPLE_COUNT_TAG, m_SampleCount) + RESTORE_BUILT_IN(CURRENT_EVIDENCE_OF_CHANGE, m_CurrentEvidenceOfChange) + RESTORE_SETUP_TEARDOWN(MIN_TIME_TAG, core_t::TTime time, + core::CStringUtils::stringToType(traverser.value(), time), + m_TimeRange.add(time)) + RESTORE_SETUP_TEARDOWN(MAX_TIME_TAG, core_t::TTime time, + core::CStringUtils::stringToType(traverser.value(), time), + m_TimeRange.add(time)) + RESTORE(CHANGE_MODEL_TAG, traverser.traverseSubLevel(boost::bind( + &CUnivariateChangeModel::acceptRestoreTraverser, + (model++)->get(), boost::cref(params), _1))) + } while (traverser.next()); + return true; +} + +void CUnivariateTimeSeriesChangeDetector::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + inserter.insertValue(MINIMUM_TIME_TO_DETECT, m_MinimumTimeToDetect); + inserter.insertValue(MAXIMUM_TIME_TO_DETECT, m_MaximumTimeToDetect); + inserter.insertValue(MINIMUM_DELTA_BIC_TO_DETECT, m_MinimumDeltaBicToDetect, + core::CIEEE754::E_SinglePrecision); + inserter.insertValue(SAMPLE_COUNT_TAG, m_SampleCount); + inserter.insertValue(CURRENT_EVIDENCE_OF_CHANGE, m_CurrentEvidenceOfChange, + core::CIEEE754::E_SinglePrecision); + if (m_TimeRange.initialized()) { + inserter.insertValue(MIN_TIME_TAG, m_TimeRange.min()); + inserter.insertValue(MAX_TIME_TAG, m_TimeRange.max()); + } + for (const auto& model : m_ChangeModels) { + inserter.insertLevel(CHANGE_MODEL_TAG, boost::bind(&CUnivariateChangeModel::acceptPersistInserter, + model.get(), _1)); + } +} + +TOptionalChangeDescription CUnivariateTimeSeriesChangeDetector::change() { + if (m_TimeRange.range() > m_MinimumTimeToDetect) { + std::size_t candidate{}; + double p{this->decisionFunction(candidate)}; + + if (p > 1.0) { + return m_ChangeModels[candidate]->change(); + } + + m_CurrentEvidenceOfChange = m_ChangeModels[0]->bic() - + m_ChangeModels[candidate]->bic(); + } + return TOptionalChangeDescription(); +} + +double CUnivariateTimeSeriesChangeDetector::decisionFunction(std::size_t& change) const { + using TChangeModelPtr5VecCItr = TChangeModelPtr5Vec::const_iterator; + using 
TDoubleChangeModelPtr5VecCItrPr = std::pair; + using TMinAccumulator = + CBasicStatistics::COrderStatisticsStack; + + if (m_SampleCount <= COUNT_TO_INITIALIZE) { + return 0.0; + } + + double noChangeBic{m_ChangeModels[0]->bic()}; + TMinAccumulator candidates; + for (auto i = m_ChangeModels.begin() + 1; i != m_ChangeModels.end(); ++i) { + candidates.add({(*i)->bic(), i}); + } + candidates.sort(); + + double evidences[]{noChangeBic - candidates[0].first, + noChangeBic - candidates[1].first}; + double expectedEvidence{noChangeBic - (*candidates[0].second)->expectedBic()}; + + double x[]{evidences[0] / m_MinimumDeltaBicToDetect, + 2.0 * (evidences[0] - evidences[1]) / m_MinimumDeltaBicToDetect, + evidences[0] / EXPECTED_EVIDENCE_THRESHOLD_MULTIPLIER / expectedEvidence, + static_cast(m_TimeRange.range() - m_MinimumTimeToDetect) / + static_cast(m_MaximumTimeToDetect - m_MinimumTimeToDetect)}; + double p{CTools::logisticFunction(x[0], 0.05, 1.0) * + CTools::logisticFunction(x[1], 0.1, 1.0) * + (x[2] < 0.0 ? 1.0 : CTools::logisticFunction(x[2], 0.2, 1.0)) * + CTools::logisticFunction(x[3], 0.2, 0.5)}; + LOG_TRACE("p(" << (*candidates[0].second)->change()->print() << ") = " << p + << " | x = " << core::CContainerPrinter::print(x)); + + change = candidates[0].second - m_ChangeModels.begin(); + + // Note 0.03125 = 0.5^5. This is chosen so that this function + // is equal to one when each of the decision criteria are at + // the centre of the sigmoid functions and the time range is + // equal to "minimum time to detect". This means we'll (just) + // accept the change if all of the individual hard decision + // criteria are satisfied. + + return p / 0.03125; +} + +bool CUnivariateTimeSeriesChangeDetector::stopTesting() const { + core_t::TTime range{m_TimeRange.range()}; + if (range > m_MinimumTimeToDetect) { + double scale{0.5 + CTools::logisticFunction(2.0 * m_CurrentEvidenceOfChange / m_MinimumDeltaBicToDetect, + 0.2, 1.0)}; + return static_cast(range) > + m_MinimumTimeToDetect + + scale * static_cast(m_MaximumTimeToDetect - m_MinimumTimeToDetect); + } + return false; +} + +void CUnivariateTimeSeriesChangeDetector::addSamples(const TTimeDoublePr1Vec& samples, + const TDoubleWeightsAry1Vec& weights) { + for (const auto& sample : samples) { + m_TimeRange.add(sample.first); + } + + ++m_SampleCount; + + for (auto& model : m_ChangeModels) { + model->addSamples(m_SampleCount, samples, weights); + } +} + +void CUnivariateTimeSeriesChangeDetector::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + core::CMemoryDebug::dynamicSize("m_ChangeModels", m_ChangeModels, mem); +} + +std::size_t CUnivariateTimeSeriesChangeDetector::memoryUsage() const { + return core::CMemory::dynamicSize(m_ChangeModels); +} + +uint64_t CUnivariateTimeSeriesChangeDetector::checksum(uint64_t seed) const { + seed = CChecksum::calculate(seed, m_MinimumTimeToDetect); + seed = CChecksum::calculate(seed, m_MaximumTimeToDetect); + seed = CChecksum::calculate(seed, m_MinimumDeltaBicToDetect); + seed = CChecksum::calculate(seed, m_TimeRange); + seed = CChecksum::calculate(seed, m_SampleCount); + seed = CChecksum::calculate(seed, m_CurrentEvidenceOfChange); + return CChecksum::calculate(seed, m_ChangeModels); +} + +namespace time_series_change_detector_detail { + +CUnivariateChangeModel::CUnivariateChangeModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel) + : m_LogLikelihood{0.0}, m_ExpectedLogLikelihood{0.0}, + m_TrendModel{trendModel}, m_ResidualModel{residualModel} { +} + +bool 
CUnivariateChangeModel::acceptRestoreTraverser(const SModelRestoreParams& /*params*/, + core::CStateRestoreTraverser& traverser) { + do { + const std::string name{traverser.name()}; + RESTORE_BUILT_IN(LOG_LIKELIHOOD_TAG, m_LogLikelihood); + RESTORE_BUILT_IN(EXPECTED_LOG_LIKELIHOOD_TAG, m_ExpectedLogLikelihood); + return true; + } while (traverser.next()); + return true; +} + +void CUnivariateChangeModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + inserter.insertValue(LOG_LIKELIHOOD_TAG, m_LogLikelihood, core::CIEEE754::E_SinglePrecision); + inserter.insertValue(EXPECTED_LOG_LIKELIHOOD_TAG, m_ExpectedLogLikelihood, + core::CIEEE754::E_SinglePrecision); +} + +void CUnivariateChangeModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + // Note if the trend and residual models are shallow copied their + // reference count will be updated so core::CMemory::dynamicSize + // will give the correct contribution for these reference. + core::CMemoryDebug::dynamicSize("m_TrendModel", m_TrendModel, mem); + core::CMemoryDebug::dynamicSize("m_ResidualModel", m_ResidualModel, mem); +} + +std::size_t CUnivariateChangeModel::memoryUsage() const { + // See above. + return core::CMemory::dynamicSize(m_TrendModel) + + core::CMemory::dynamicSize(m_ResidualModel); +} + +uint64_t CUnivariateChangeModel::checksum(uint64_t seed) const { + seed = CChecksum::calculate(seed, m_LogLikelihood); + seed = CChecksum::calculate(seed, m_ExpectedLogLikelihood); + seed = CChecksum::calculate(seed, m_TrendModel); + return CChecksum::calculate(seed, m_ResidualModel); +} + +bool CUnivariateChangeModel::restoreResidualModel(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + return traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), boost::cref(params), boost::ref(m_ResidualModel), _1)); +} + +double CUnivariateChangeModel::logLikelihood() const { + return m_LogLikelihood; +} + +double CUnivariateChangeModel::expectedLogLikelihood() const { + return m_ExpectedLogLikelihood; +} + +void CUnivariateChangeModel::updateLogLikelihood(const TDouble1Vec& samples, + const TDoubleWeightsAry1Vec& weights) { + double logLikelihood{}; + if (m_ResidualModel->jointLogMarginalLikelihood(samples, weights, logLikelihood) == + maths_t::E_FpNoErrors) { + m_LogLikelihood += logLikelihood; + } +} + +void CUnivariateChangeModel::updateExpectedLogLikelihood(const TDoubleWeightsAry1Vec& weights) { + for (const auto& weight : weights) { + double expectedLogLikelihood{}; + if (m_ResidualModel->expectation( + maths::CPrior::CLogMarginalLikelihood{*m_ResidualModel, {weight}}, + EXPECTED_LOG_LIKELIHOOD_NUMBER_INTERVALS, expectedLogLikelihood, weight)) { + m_ExpectedLogLikelihood += expectedLogLikelihood; + } + } +} + +const CTimeSeriesDecompositionInterface& CUnivariateChangeModel::trendModel() const { + return *m_TrendModel; +} + +const CPrior& CUnivariateChangeModel::residualModel() const { + return *m_ResidualModel; +} + +CPrior& CUnivariateChangeModel::residualModel() { + return *m_ResidualModel; +} + +CUnivariateChangeModel::TPriorPtr CUnivariateChangeModel::residualModelPtr() const { + return m_ResidualModel; +} + +CUnivariateNoChangeModel::CUnivariateNoChangeModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel) + : CUnivariateChangeModel{trendModel, residualModel} { +} + +bool CUnivariateNoChangeModel::acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + return 
this->CUnivariateChangeModel::acceptRestoreTraverser(params, traverser); +} + +void CUnivariateNoChangeModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + this->CUnivariateChangeModel::acceptPersistInserter(inserter); +} + +double CUnivariateNoChangeModel::bic() const { + return -2.0 * this->logLikelihood(); +} + +double CUnivariateNoChangeModel::expectedBic() const { + // This is irrelevant since this is only used for deciding + // whether to accept a change. + return this->bic(); +} + +TOptionalChangeDescription CUnivariateNoChangeModel::change() const { + return TOptionalChangeDescription(); +} + +void CUnivariateNoChangeModel::addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples_, + TDoubleWeightsAry1Vec weights) { + // See, for example, CUnivariateLevelShiftModel::addSamples + // for an explanation of the delay updating the log-likelihood. + + if (count >= COUNT_TO_INITIALIZE) { + TDouble1Vec samples; + samples.reserve(samples_.size()); + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second}; + double sample{this->trendModel().detrend(time, value, 0.0)}; + samples.push_back(sample); + } + for (auto& weight : weights) { + maths_t::setWinsorisationWeight(1.0, weight); + } + this->updateLogLikelihood(samples, weights); + } +} + +std::size_t CUnivariateNoChangeModel::staticSize() const { + return sizeof(*this); +} + +uint64_t CUnivariateNoChangeModel::checksum(uint64_t seed) const { + return this->CUnivariateChangeModel::checksum(seed); +} + +CUnivariateLevelShiftModel::CUnivariateLevelShiftModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel) + : CUnivariateChangeModel{trendModel, TPriorPtr{residualModel->clone()}}, + m_ResidualModelMode{residualModel->marginalLikelihoodMode()}, m_SampleCount{0.0} { +} + +bool CUnivariateLevelShiftModel::acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + if (this->CUnivariateChangeModel::acceptRestoreTraverser(params, traverser) == false) { + return false; + } + do { + const std::string name{traverser.name()}; + RESTORE(SHIFT_TAG, m_Shift.fromDelimited(traverser.value())) + RESTORE_BUILT_IN(RESIDUAL_MODEL_MODE_TAG, m_ResidualModelMode) + RESTORE_BUILT_IN(SAMPLE_COUNT_TAG, m_SampleCount) + RESTORE(RESIDUAL_MODEL_TAG, + this->restoreResidualModel(params.s_DistributionParams, traverser)) + } while (traverser.next()); + return true; +} + +void CUnivariateLevelShiftModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + this->CUnivariateChangeModel::acceptPersistInserter(inserter); + inserter.insertValue(SHIFT_TAG, m_Shift.toDelimited()); + inserter.insertValue(SAMPLE_COUNT_TAG, m_SampleCount); + inserter.insertLevel(RESIDUAL_MODEL_TAG, + boost::bind(CPriorStateSerialiser(), + boost::cref(this->residualModel()), _1)); +} + +double CUnivariateLevelShiftModel::bic() const { + return -2.0 * this->logLikelihood() + CTools::fastLog(m_SampleCount); +} + +double CUnivariateLevelShiftModel::expectedBic() const { + return -2.0 * this->expectedLogLikelihood() + CTools::fastLog(m_SampleCount); +} + +TOptionalChangeDescription CUnivariateLevelShiftModel::change() const { + return SChangeDescription{SChangeDescription::E_LevelShift, + CBasicStatistics::mean(m_Shift), this->residualModelPtr()}; +} + +void CUnivariateLevelShiftModel::addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples_, + TDoubleWeightsAry1Vec weights) { + const 
CTimeSeriesDecompositionInterface& trendModel{this->trendModel()}; + + // We delay updating the log-likelihood because early on the + // level can change giving us a better apparent fit to the + // data than a fixed step. Five updates was found to be the + // minimum to get empirically similar sum log-likelihood if + // there is no change in the data. + + if (count >= COUNT_TO_INITIALIZE) { + CPrior& residualModel{this->residualModel()}; + + TDouble1Vec samples; + samples.reserve(samples_.size()); + double shift{CBasicStatistics::mean(m_Shift)}; + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second}; + double seasonalScale{maths_t::seasonalVarianceScale(weights[i])}; + double sample{trendModel.detrend(time, value, 0.0) - shift}; + double weight{winsorisation::tailWeight( + residualModel, WINSORISATION_DERATE, seasonalScale, sample)}; + samples.push_back(sample); + maths_t::setWinsorisationWeight(weight, weights[i]); + m_SampleCount += maths_t::count(weights[i]); + } + + residualModel.addSamples(samples, weights); + residualModel.propagateForwardsByTime(1.0); + + for (auto& weight : weights) { + maths_t::setWinsorisationWeight(1.0, weight); + } + this->updateLogLikelihood(samples, weights); + this->updateExpectedLogLikelihood(weights); + } + + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second}; + double shift{trendModel.detrend(time, value, 0.0) - m_ResidualModelMode}; + m_Shift.add(shift); + } +} + +std::size_t CUnivariateLevelShiftModel::staticSize() const { + return sizeof(*this); +} + +uint64_t CUnivariateLevelShiftModel::checksum(uint64_t seed) const { + seed = this->CUnivariateChangeModel::checksum(seed); + seed = CChecksum::calculate(seed, m_Shift); + return CChecksum::calculate(seed, m_SampleCount); +} + +CUnivariateLinearScaleModel::CUnivariateLinearScaleModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel) + : CUnivariateChangeModel{trendModel, TPriorPtr{residualModel->clone()}}, + m_ResidualModelMode{residualModel->marginalLikelihoodMode()}, m_SampleCount{0.0} { +} + +bool CUnivariateLinearScaleModel::acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + if (this->CUnivariateChangeModel::acceptRestoreTraverser(params, traverser) == false) { + return false; + } + do { + const std::string name{traverser.name()}; + RESTORE(SCALE_TAG, m_Scale.fromDelimited(traverser.value())) + RESTORE_BUILT_IN(RESIDUAL_MODEL_MODE_TAG, m_ResidualModelMode) + RESTORE_BUILT_IN(SAMPLE_COUNT_TAG, m_SampleCount) + RESTORE(RESIDUAL_MODEL_TAG, + this->restoreResidualModel(params.s_DistributionParams, traverser)) + } while (traverser.next()); + return true; +} + +void CUnivariateLinearScaleModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + this->CUnivariateChangeModel::acceptPersistInserter(inserter); + inserter.insertValue(SCALE_TAG, m_Scale.toDelimited()); + inserter.insertValue(SAMPLE_COUNT_TAG, m_SampleCount); + inserter.insertLevel(RESIDUAL_MODEL_TAG, + boost::bind(CPriorStateSerialiser(), + boost::cref(this->residualModel()), _1)); +} + +double CUnivariateLinearScaleModel::bic() const { + return -2.0 * this->logLikelihood() + CTools::fastLog(m_SampleCount); +} + +double CUnivariateLinearScaleModel::expectedBic() const { + return -2.0 * this->expectedLogLikelihood() + CTools::fastLog(m_SampleCount); +} + 
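[Editorial aside] Both the level-shift and linear-scale hypotheses fit one extra parameter, so their bic() adds a log(sample count) complexity penalty to -2 times the accumulated log-likelihood; the detector then favours a change only when the best candidate's BIC undercuts the no-change BIC by a sufficient margin (softened through the logistic blend in decisionFunction above, where 0.03125 = 0.5^5 centres the decision). The following standalone sketch shows that comparison under a single hard threshold; the numbers, names and the threshold are illustrative, not the library's behaviour.

// Minimal sketch (illustrative values, hypothetical names): comparing a
// one-parameter change hypothesis against "no change" via a BIC margin,
// mirroring bic()/expectedBic() above.
#include <cmath>
#include <iostream>

int main() {
    double logLikelihoodNoChange{-520.0}; // hypothetical accumulated log-likelihoods
    double logLikelihoodShift{-480.0};
    double n{50.0};                       // samples absorbed so far
    double minimumDeltaBicToDetect{50.0}; // hypothetical detection margin

    double bicNoChange{-2.0 * logLikelihoodNoChange};
    // The one fitted parameter (shift or scale) adds a log(n) complexity penalty.
    double bicShift{-2.0 * logLikelihoodShift + std::log(n)};

    double evidence{bicNoChange - bicShift};
    std::cout << "delta BIC = " << evidence << ", accept change = "
              << (evidence > minimumDeltaBicToDetect) << '\n';
    return 0;
}

In the detector proper this evidence is one of several criteria blended through logistic functions, so a change is only (just) accepted when all of the individual soft criteria are simultaneously satisfied.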
+CUnivariateLinearScaleModel::TOptionalChangeDescription +CUnivariateLinearScaleModel::change() const { + return SChangeDescription{SChangeDescription::E_LinearScale, + CBasicStatistics::mean(m_Scale), this->residualModelPtr()}; +} + +void CUnivariateLinearScaleModel::addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples_, + TDoubleWeightsAry1Vec weights) { + const CTimeSeriesDecompositionInterface& trendModel{this->trendModel()}; + + // We delay updating the log-likelihood because early on the + // scale can change giving us a better apparent fit to the + // data than a fixed scale. Five updates was found to be the + // minimum to get empirically similar sum log-likelihood if + // there is no change in the data. + + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second - m_ResidualModelMode}; + double prediction{CBasicStatistics::mean(trendModel.value(time, 0.0))}; + double scale{std::fabs(value) / std::fabs(prediction)}; + m_Scale.add(value * prediction < 0.0 + ? MINIMUM_SCALE + : CTools::truncate(scale, MINIMUM_SCALE, MAXIMUM_SCALE), + std::fabs(prediction)); + } + + if (count >= COUNT_TO_INITIALIZE) { + CPrior& residualModel{this->residualModel()}; + + TDouble1Vec samples; + samples.reserve(samples_.size()); + double scale{CBasicStatistics::mean(m_Scale)}; + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second}; + double seasonalScale{maths_t::seasonalVarianceScale(weights[i])}; + double prediction{CBasicStatistics::mean(trendModel.value(time, 0.0))}; + double sample{value - scale * prediction}; + double weight{winsorisation::tailWeight( + residualModel, WINSORISATION_DERATE, seasonalScale, sample)}; + samples.push_back(sample); + maths_t::setWinsorisationWeight(weight, weights[i]); + m_SampleCount += maths_t::count(weights[i]); + } + + residualModel.addSamples(samples, weights); + residualModel.propagateForwardsByTime(1.0); + + for (auto& weight : weights) { + maths_t::setWinsorisationWeight(1.0, weight); + } + this->updateLogLikelihood(samples, weights); + this->updateExpectedLogLikelihood(weights); + } +} + +std::size_t CUnivariateLinearScaleModel::staticSize() const { + return sizeof(*this); +} + +uint64_t CUnivariateLinearScaleModel::checksum(uint64_t seed) const { + seed = this->CUnivariateChangeModel::checksum(seed); + seed = CChecksum::calculate(seed, m_Scale); + return CChecksum::calculate(seed, m_SampleCount); +} + +CUnivariateTimeShiftModel::CUnivariateTimeShiftModel(const TDecompositionPtr& trendModel, + const TPriorPtr& residualModel, + core_t::TTime shift) + : CUnivariateChangeModel{trendModel, TPriorPtr{residualModel->clone()}}, m_Shift{shift} { +} + +bool CUnivariateTimeShiftModel::acceptRestoreTraverser(const SModelRestoreParams& params, + core::CStateRestoreTraverser& traverser) { + if (this->CUnivariateChangeModel::acceptRestoreTraverser(params, traverser) == false) { + return false; + } + do { + const std::string name{traverser.name()}; + RESTORE(RESIDUAL_MODEL_TAG, + this->restoreResidualModel(params.s_DistributionParams, traverser)) + } while (traverser.next()); + return true; +} + +void CUnivariateTimeShiftModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + this->CUnivariateChangeModel::acceptPersistInserter(inserter); + inserter.insertLevel(RESIDUAL_MODEL_TAG, + boost::bind(CPriorStateSerialiser(), + boost::cref(this->residualModel()), _1)); +} + +double 
CUnivariateTimeShiftModel::bic() const { + return -2.0 * this->logLikelihood(); +} + +double CUnivariateTimeShiftModel::expectedBic() const { + return -2.0 * this->expectedLogLikelihood(); +} + +TOptionalChangeDescription CUnivariateTimeShiftModel::change() const { + return SChangeDescription{SChangeDescription::E_TimeShift, + static_cast(m_Shift), this->residualModelPtr()}; +} + +void CUnivariateTimeShiftModel::addSamples(const std::size_t count, + const TTimeDoublePr1Vec& samples_, + TDoubleWeightsAry1Vec weights) { + // See, for example, CUnivariateLevelShiftModel::addSamples + // for an explanation of the delay updating the log-likelihood. + + if (count >= COUNT_TO_INITIALIZE) { + CPrior& residualModel{this->residualModel()}; + + TDouble1Vec samples; + samples.reserve(samples_.size()); + for (std::size_t i = 0u; i < samples_.size(); ++i) { + core_t::TTime time{samples_[i].first}; + double value{samples_[i].second}; + double seasonalScale{maths_t::seasonalVarianceScale(weights[i])}; + double sample{this->trendModel().detrend(time + m_Shift, value, 0.0)}; + double weight{winsorisation::tailWeight( + residualModel, WINSORISATION_DERATE, seasonalScale, sample)}; + samples.push_back(sample); + maths_t::setWinsorisationWeight(weight, weights[i]); + } + + residualModel.addSamples(samples, weights); + residualModel.propagateForwardsByTime(1.0); + + for (auto& weight : weights) { + maths_t::setWinsorisationWeight(1.0, weight); + } + this->updateLogLikelihood(samples, weights); + this->updateExpectedLogLikelihood(weights); + } +} + +std::size_t CUnivariateTimeShiftModel::staticSize() const { + return sizeof(*this); +} + +uint64_t CUnivariateTimeShiftModel::checksum(uint64_t seed) const { + seed = this->CUnivariateChangeModel::checksum(seed); + return CChecksum::calculate(seed, m_Shift); +} +} +} +} diff --git a/lib/maths/CTimeSeriesDecomposition.cc b/lib/maths/CTimeSeriesDecomposition.cc index 62fde73014..dbd352d3a4 100644 --- a/lib/maths/CTimeSeriesDecomposition.cc +++ b/lib/maths/CTimeSeriesDecomposition.cc @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include @@ -75,6 +77,7 @@ const std::string LAST_PROPAGATION_TIME_6_3_TAG{"b"}; const std::string PERIODICITY_TEST_6_3_TAG{"c"}; const std::string CALENDAR_CYCLIC_TEST_6_3_TAG{"d"}; const std::string COMPONENTS_6_3_TAG{"e"}; +const std::string TIME_SHIFT_6_3_TAG{"f"}; // Version < 6.3 const std::string DECAY_RATE_OLD_TAG{"a"}; const std::string LAST_VALUE_TIME_OLD_TAG{"b"}; @@ -88,35 +91,40 @@ const std::string EMPTY_STRING; CTimeSeriesDecomposition::CTimeSeriesDecomposition(double decayRate, core_t::TTime bucketLength, std::size_t seasonalComponentSize) - : m_LastValueTime{0}, m_LastPropagationTime{0}, m_PeriodicityTest{decayRate, bucketLength}, - m_CalendarCyclicTest{decayRate, bucketLength}, m_Components{decayRate, bucketLength, - seasonalComponentSize} { + : m_TimeShift{0}, m_LastValueTime{0}, m_LastPropagationTime{0}, + m_PeriodicityTest{decayRate, bucketLength}, m_CalendarCyclicTest{decayRate, bucketLength}, + m_Components{decayRate, bucketLength, seasonalComponentSize} { this->initializeMediator(); } -CTimeSeriesDecomposition::CTimeSeriesDecomposition(double decayRate, - core_t::TTime bucketLength, - std::size_t seasonalComponentSize, +CTimeSeriesDecomposition::CTimeSeriesDecomposition(const STimeSeriesDecompositionRestoreParams& params, core::CStateRestoreTraverser& traverser) - : m_LastValueTime{0}, m_LastPropagationTime{0}, m_PeriodicityTest{decayRate, bucketLength}, - 
m_CalendarCyclicTest{decayRate, bucketLength}, m_Components{decayRate, bucketLength, - seasonalComponentSize} { + : m_TimeShift{0}, m_LastValueTime{0}, m_LastPropagationTime{0}, + m_PeriodicityTest{params.s_DecayRate, params.s_MinimumBucketLength}, + m_CalendarCyclicTest{params.s_DecayRate, params.s_MinimumBucketLength}, + m_Components{params.s_DecayRate, params.s_MinimumBucketLength, params.s_ComponentSize} { traverser.traverseSubLevel( - boost::bind(&CTimeSeriesDecomposition::acceptRestoreTraverser, this, _1)); + boost::bind(&CTimeSeriesDecomposition::acceptRestoreTraverser, this, + boost::cref(params.s_ChangeModelParams), _1)); this->initializeMediator(); } -CTimeSeriesDecomposition::CTimeSeriesDecomposition(const CTimeSeriesDecomposition& other) - : m_LastValueTime{other.m_LastValueTime}, m_LastPropagationTime{other.m_LastPropagationTime}, - m_PeriodicityTest{other.m_PeriodicityTest}, - m_CalendarCyclicTest{other.m_CalendarCyclicTest}, m_Components{other.m_Components} { +CTimeSeriesDecomposition::CTimeSeriesDecomposition(const CTimeSeriesDecomposition& other, + bool isForForecast) + : m_TimeShift{other.m_TimeShift}, m_LastValueTime{other.m_LastValueTime}, + m_LastPropagationTime{other.m_LastPropagationTime}, + m_PeriodicityTest{other.m_PeriodicityTest, isForForecast}, + m_CalendarCyclicTest{other.m_CalendarCyclicTest, isForForecast}, m_Components{ + other.m_Components} { this->initializeMediator(); } -bool CTimeSeriesDecomposition::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { +bool CTimeSeriesDecomposition::acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { if (traverser.name() == VERSION_6_3_TAG) { while (traverser.next()) { const std::string& name{traverser.name()}; + RESTORE_BUILT_IN(TIME_SHIFT_6_3_TAG, m_TimeShift) RESTORE_BUILT_IN(LAST_VALUE_TIME_6_3_TAG, m_LastValueTime) RESTORE_BUILT_IN(LAST_PROPAGATION_TIME_6_3_TAG, m_LastPropagationTime) RESTORE(PERIODICITY_TEST_6_3_TAG, @@ -125,9 +133,9 @@ bool CTimeSeriesDecomposition::acceptRestoreTraverser(core::CStateRestoreTravers RESTORE(CALENDAR_CYCLIC_TEST_6_3_TAG, traverser.traverseSubLevel(boost::bind(&CCalendarTest::acceptRestoreTraverser, &m_CalendarCyclicTest, _1))) - RESTORE(COMPONENTS_6_3_TAG, - traverser.traverseSubLevel(boost::bind( - &CComponents::acceptRestoreTraverser, &m_Components, _1))) + RESTORE(COMPONENTS_6_3_TAG, traverser.traverseSubLevel(boost::bind( + &CComponents::acceptRestoreTraverser, + &m_Components, boost::cref(params), _1))) } } else { // There is no version string this is historic state. 
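[Editorial aside] The restore path above first branches on whether the persisted state begins with the 6.3 version tag, then dispatches on tag names inside a traversal loop, threading the new distribution restore parameters down to the nested components; state without a version tag is treated as historic and read with the old tags. The standalone sketch below illustrates that tag-dispatch pattern with a toy traverser; the traverser type and tag strings are illustrative and are not the core::CStateRestoreTraverser API or the real tag values.

// Minimal sketch (toy traverser, hypothetical tag strings) of the
// version-then-tag dispatch used by acceptRestoreTraverser above.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct ToyTraverser {
    std::vector<std::pair<std::string, std::string>> state;
    std::size_t pos{0};
    const std::string& name() const { return state[pos].first; }
    const std::string& value() const { return state[pos].second; }
    bool next() { return ++pos < state.size(); }
};

bool restore(ToyTraverser& traverser, long& timeShift, long& lastValueTime) {
    if (traverser.name() != "VERSION_6_3_TAG") {
        // No version tag: historic (pre-6.3) state would be read with the old tags here.
        return true;
    }
    while (traverser.next()) {
        const std::string& name{traverser.name()};
        if (name == "TIME_SHIFT_6_3_TAG") { timeShift = std::stol(traverser.value()); }
        if (name == "LAST_VALUE_TIME_6_3_TAG") { lastValueTime = std::stol(traverser.value()); }
    }
    return true;
}

int main() {
    ToyTraverser t{{{"VERSION_6_3_TAG", ""},
                    {"TIME_SHIFT_6_3_TAG", "3600"},
                    {"LAST_VALUE_TIME_6_3_TAG", "1500000000"}},
                   0};
    long shift{0};
    long lastValue{0};
    restore(t, shift, lastValue);
    std::cout << shift << ' ' << lastValue << '\n';
    return 0;
}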
@@ -140,9 +148,9 @@ bool CTimeSeriesDecomposition::acceptRestoreTraverser(core::CStateRestoreTravers RESTORE(CALENDAR_CYCLIC_TEST_OLD_TAG, traverser.traverseSubLevel(boost::bind(&CCalendarTest::acceptRestoreTraverser, &m_CalendarCyclicTest, _1))) - RESTORE(COMPONENTS_OLD_TAG, - traverser.traverseSubLevel(boost::bind( - &CComponents::acceptRestoreTraverser, &m_Components, _1))) + RESTORE(COMPONENTS_OLD_TAG, traverser.traverseSubLevel(boost::bind( + &CComponents::acceptRestoreTraverser, + &m_Components, boost::cref(params), _1))) } while (traverser.next()); this->decayRate(decayRate); } @@ -150,6 +158,7 @@ bool CTimeSeriesDecomposition::acceptRestoreTraverser(core::CStateRestoreTravers } void CTimeSeriesDecomposition::swap(CTimeSeriesDecomposition& other) { + std::swap(m_TimeShift, other.m_TimeShift); std::swap(m_LastValueTime, other.m_LastValueTime); std::swap(m_LastPropagationTime, other.m_LastPropagationTime); m_PeriodicityTest.swap(other.m_PeriodicityTest); @@ -168,6 +177,7 @@ operator=(const CTimeSeriesDecomposition& other) { void CTimeSeriesDecomposition::acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(VERSION_6_3_TAG, ""); + inserter.insertValue(TIME_SHIFT_6_3_TAG, m_TimeShift); inserter.insertValue(LAST_VALUE_TIME_6_3_TAG, m_LastValueTime); inserter.insertValue(LAST_PROPAGATION_TIME_6_3_TAG, m_LastPropagationTime); inserter.insertLevel(PERIODICITY_TEST_6_3_TAG, @@ -180,8 +190,12 @@ void CTimeSeriesDecomposition::acceptPersistInserter(core::CStatePersistInserter &m_Components, _1)); } -CTimeSeriesDecomposition* CTimeSeriesDecomposition::clone() const { - return new CTimeSeriesDecomposition{*this}; +CTimeSeriesDecomposition* CTimeSeriesDecomposition::clone(bool isForForecast) const { + return new CTimeSeriesDecomposition{*this, isForForecast}; +} + +void CTimeSeriesDecomposition::dataType(maths_t::EDataType dataType) { + m_Components.dataType(dataType); } void CTimeSeriesDecomposition::decayRate(double decayRate) { @@ -202,6 +216,8 @@ bool CTimeSeriesDecomposition::addPoint(core_t::TTime time, const maths_t::TDoubleWeightsAry& weights) { CComponents::CScopeNotifyOnStateChange result{m_Components}; + time += m_TimeShift; + core_t::TTime lastTime{std::max(m_LastValueTime, m_LastPropagationTime)}; m_LastValueTime = std::max(m_LastValueTime, time); @@ -211,12 +227,12 @@ bool CTimeSeriesDecomposition::addPoint(core_t::TTime time, lastTime, value, weights, - CBasicStatistics::mean(this->baseline(time, 0.0, E_TrendForced)), - CBasicStatistics::mean(this->baseline(time, 0.0, E_Seasonal)), - CBasicStatistics::mean(this->baseline(time, 0.0, E_Calendar)), + CBasicStatistics::mean(this->value(time, 0.0, E_TrendForced)), + CBasicStatistics::mean(this->value(time, 0.0, E_Seasonal)), + CBasicStatistics::mean(this->value(time, 0.0, E_Calendar)), [this](core_t::TTime time_) { return CBasicStatistics::mean( - this->baseline(time_, 0.0, E_Seasonal | E_Calendar)); + this->value(time_, 0.0, E_Seasonal | E_Calendar)); }, m_Components.periodicityTestConfig()}; @@ -227,6 +243,29 @@ bool CTimeSeriesDecomposition::addPoint(core_t::TTime time, return result.changed(); } +bool CTimeSeriesDecomposition::applyChange(core_t::TTime time, + double value, + const SChangeDescription& change) { + bool result{m_Components.usingTrendForPrediction() == false}; + m_Components.useTrendForPrediction(); + + switch (change.s_Description) { + case SChangeDescription::E_LevelShift: + m_Components.shiftLevel(time, value, change.s_Value[0]); + 
m_PeriodicityTest.clear(CPeriodicityTest::E_Short, time); + break; + case SChangeDescription::E_LinearScale: + m_Components.linearScale(time, change.s_Value[0]); + m_PeriodicityTest.clear(CPeriodicityTest::E_Short, time); + break; + case SChangeDescription::E_TimeShift: + m_TimeShift += static_cast(change.s_Value[0]); + break; + } + + return result; +} + void CTimeSeriesDecomposition::propagateForwardsTo(core_t::TTime time) { if (time > m_LastPropagationTime) { m_PeriodicityTest.propagateForwards(m_LastPropagationTime, time); @@ -236,16 +275,18 @@ void CTimeSeriesDecomposition::propagateForwardsTo(core_t::TTime time) { m_LastPropagationTime = std::max(m_LastPropagationTime, time); } -double CTimeSeriesDecomposition::mean(core_t::TTime time) const { +double CTimeSeriesDecomposition::meanValue(core_t::TTime time) const { return m_Components.meanValue(time); } -TDoubleDoublePr CTimeSeriesDecomposition::baseline(core_t::TTime time, - double confidence, - int components, - bool smooth) const { +TDoubleDoublePr CTimeSeriesDecomposition::value(core_t::TTime time, + double confidence, + int components, + bool smooth) const { TVector2x1 baseline{0.0}; + time += m_TimeShift; + if (components & E_TrendForced) { baseline += vector2x1(m_Components.trend().value(time, confidence)); } else if (components & E_Trend) { @@ -272,9 +313,9 @@ TDoubleDoublePr CTimeSeriesDecomposition::baseline(core_t::TTime time, if (smooth) { baseline += vector2x1( - this->smooth(boost::bind(&CTimeSeriesDecomposition::baseline, this, _1, + this->smooth(boost::bind(&CTimeSeriesDecomposition::value, this, _1, confidence, components & E_Seasonal, false), - time, components)); + time - m_TimeShift, components)); } return pair(baseline); @@ -285,7 +326,7 @@ void CTimeSeriesDecomposition::forecast(core_t::TTime startTime, core_t::TTime step, double confidence, double minimumScale, - TDouble3VecVec& result) { + const TWriteForecastResult& writer) { if (endTime < startTime) { LOG_ERROR(<< "Bad forecast range: [" << startTime << "," << endTime << "]"); return; @@ -295,7 +336,7 @@ void CTimeSeriesDecomposition::forecast(core_t::TTime startTime, return; } - auto predictor = [this, confidence](core_t::TTime time) { + auto seasonal = [this, confidence](core_t::TTime time) { TVector2x1 prediction(0.0); for (const auto& component : m_Components.seasonal()) { if (component.initialized() && component.time().inWindow(time)) { @@ -310,38 +351,41 @@ void CTimeSeriesDecomposition::forecast(core_t::TTime startTime, return pair(prediction); }; + startTime += m_TimeShift; + endTime += m_TimeShift; endTime = startTime + CIntegerTools::ceil(endTime - startTime, step); double trendVariance{CBasicStatistics::mean(m_Components.trend().variance(0.0))}; double seasonalVariance{m_Components.meanVariance() - trendVariance}; double variance{this->meanVariance()}; - double scale0{std::sqrt(std::max( CBasicStatistics::mean(this->scale(startTime, variance, 0.0)), minimumScale))}; TVector2x1 i0{vector2x1(confidenceInterval(confidence, seasonalVariance))}; - m_Components.trend().forecast(startTime, endTime, step, confidence, result); - for (core_t::TTime time = startTime; time < endTime; time += step) { + auto forecastSeasonal = [&](core_t::TTime time) { + m_Components.interpolateForForecast(time); double scale{std::sqrt(std::max( CBasicStatistics::mean(this->scale(time, variance, 0.0)), minimumScale))}; - TVector2x1 prediction{vector2x1(predictor(time)) + - vector2x1(this->smooth(predictor, time, E_Seasonal)) + + TVector2x1 prediction{vector2x1(seasonal(time)) + + 
vector2x1(this->smooth(seasonal, time, E_Seasonal)) + (scale - scale0) * i0}; + return TDouble3Vec{prediction(0), (prediction(0) + prediction(1)) / 2.0, + prediction(1)}; + }; - core_t::TTime index{(time - startTime) / step}; - result[index][0] += prediction(0); - result[index][1] += (prediction(0) + prediction(1)) / 2.0; - result[index][2] += prediction(1); - m_Components.interpolate(SMessage{time, time - step}, false); - } + m_Components.trend().forecast(startTime, endTime, step, confidence, + forecastSeasonal, writer); } -double CTimeSeriesDecomposition::detrend(core_t::TTime time, double value, double confidence) const { +double CTimeSeriesDecomposition::detrend(core_t::TTime time, + double value, + double confidence, + int components) const { if (!this->initialized()) { return value; } - TDoubleDoublePr baseline{this->baseline(time, confidence)}; - return std::min(value - baseline.first, 0.0) + std::max(value - baseline.second, 0.0); + TDoubleDoublePr interval{this->value(time, confidence, components)}; + return std::min(value - interval.first, 0.0) + std::max(value - interval.second, 0.0); } double CTimeSeriesDecomposition::meanVariance() const { @@ -361,6 +405,8 @@ TDoubleDoublePr CTimeSeriesDecomposition::scale(core_t::TTime time, return {1.0, 1.0}; } + time += m_TimeShift; + double components{0.0}; TVector2x1 scale(0.0); if (m_Components.usingTrendForPrediction()) { @@ -430,6 +476,10 @@ std::size_t CTimeSeriesDecomposition::staticSize() const { return sizeof(*this); } +core_t::TTime CTimeSeriesDecomposition::timeShift() const { + return m_TimeShift; +} + const maths_t::TSeasonalComponentVec& CTimeSeriesDecomposition::seasonalComponents() const { return m_Components.seasonal(); } @@ -505,6 +555,5 @@ core_t::TTime CTimeSeriesDecomposition::lastValueTime() const { } const core_t::TTime CTimeSeriesDecomposition::SMOOTHING_INTERVAL{7200}; -const std::size_t CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE{36u}; } } diff --git a/lib/maths/CTimeSeriesDecompositionDetail.cc b/lib/maths/CTimeSeriesDecompositionDetail.cc index 9d83523d58..e06d69bba5 100644 --- a/lib/maths/CTimeSeriesDecompositionDetail.cc +++ b/lib/maths/CTimeSeriesDecompositionDetail.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -73,11 +74,6 @@ const core_t::TTime DAY = core::constants::DAY; const core_t::TTime WEEK = core::constants::WEEK; const core_t::TTime MONTH = 4 * WEEK; -//! Get the square of \p x. -double pow2(double x) { - return x * x; -} - //! Compute the mean of \p mean of \p components. template double meanOf(MEAN_FUNCTION mean, const TSeasonalComponentVec& components) { @@ -454,11 +450,12 @@ CTimeSeriesDecompositionDetail::CPeriodicityTest::CPeriodicityTest(double decayR m_DecayRate{decayRate}, m_BucketLength{bucketLength} { } -CTimeSeriesDecompositionDetail::CPeriodicityTest::CPeriodicityTest(const CPeriodicityTest& other) +CTimeSeriesDecompositionDetail::CPeriodicityTest::CPeriodicityTest(const CPeriodicityTest& other, + bool isForForecast) : m_Machine{other.m_Machine}, m_DecayRate{other.m_DecayRate}, m_BucketLength{ other.m_BucketLength} { // Note that m_Windows is an array. 
- for (std::size_t i = 0u; i < other.m_Windows.size(); ++i) { + for (std::size_t i = 0u; !isForForecast && i < other.m_Windows.size(); ++i) { if (other.m_Windows[i]) { m_Windows[i] = std::make_shared(*other.m_Windows[i]); } @@ -581,6 +578,13 @@ void CTimeSeriesDecompositionDetail::CPeriodicityTest::test(const SAddValue& mes } } +void CTimeSeriesDecompositionDetail::CPeriodicityTest::clear(ETest test, core_t::TTime time) { + if (m_Windows[test] != nullptr) { + m_Windows[test].reset(this->newWindow(test)); + m_Windows[test]->initialize(time); + } +} + void CTimeSeriesDecompositionDetail::CPeriodicityTest::propagateForwards(core_t::TTime start, core_t::TTime end) { stepwisePropagateForwards(DAY, start, end, m_Windows[E_Short]); @@ -736,11 +740,13 @@ CTimeSeriesDecompositionDetail::CCalendarTest::CCalendarTest(double decayRate, m_DecayRate{decayRate}, m_LastMonth{} { } -CTimeSeriesDecompositionDetail::CCalendarTest::CCalendarTest(const CCalendarTest& other) +CTimeSeriesDecompositionDetail::CCalendarTest::CCalendarTest(const CCalendarTest& other, + bool isForForecast) : m_Machine{other.m_Machine}, m_DecayRate{other.m_DecayRate}, - m_LastMonth{other.m_LastMonth}, m_Test{other.m_Test ? new CCalendarCyclicTest( - *other.m_Test) - : nullptr} { + m_LastMonth{other.m_LastMonth}, m_Test{!isForForecast && other.m_Test + ? std::make_shared( + *other.m_Test) + : 0} { } bool CTimeSeriesDecompositionDetail::CCalendarTest::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { @@ -751,7 +757,8 @@ bool CTimeSeriesDecompositionDetail::CCalendarTest::acceptRestoreTraverser(core: &core::CStateMachine::acceptRestoreTraverser, &m_Machine, _1))) RESTORE_BUILT_IN(LAST_MONTH_6_3_TAG, m_LastMonth); RESTORE_SETUP_TEARDOWN( - CALENDAR_TEST_6_3_TAG, m_Test.reset(new CCalendarCyclicTest(m_DecayRate)), + CALENDAR_TEST_6_3_TAG, + m_Test = std::make_shared(m_DecayRate), traverser.traverseSubLevel(boost::bind( &CCalendarCyclicTest::acceptRestoreTraverser, m_Test.get(), _1)), /**/) @@ -890,7 +897,7 @@ void CTimeSeriesDecompositionDetail::CCalendarTest::apply(std::size_t symbol, switch (state) { case CC_TEST: if (!m_Test) { - m_Test.reset(new CCalendarCyclicTest(m_DecayRate)); + m_Test = std::make_shared(m_DecayRate); m_LastMonth = this->month(time) + 2; } break; @@ -945,7 +952,9 @@ CTimeSeriesDecompositionDetail::CComponents::CComponents(const CComponents& othe m_UsingTrendForPrediction{other.m_UsingTrendForPrediction}, m_Watcher{nullptr} { } -bool CTimeSeriesDecompositionDetail::CComponents::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { +bool CTimeSeriesDecompositionDetail::CComponents::acceptRestoreTraverser( + const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { if (traverser.name() == VERSION_6_3_TAG) { while (traverser.next()) { const std::string& name{traverser.name()}; @@ -953,9 +962,9 @@ bool CTimeSeriesDecompositionDetail::CComponents::acceptRestoreTraverser(core::C traverser.traverseSubLevel(boost::bind( &core::CStateMachine::acceptRestoreTraverser, &m_Machine, _1))); RESTORE_BUILT_IN(DECAY_RATE_6_3_TAG, m_DecayRate); - RESTORE(TREND_6_3_TAG, - traverser.traverseSubLevel(boost::bind( - &CTrendComponent::acceptRestoreTraverser, &m_Trend, _1))) + RESTORE(TREND_6_3_TAG, traverser.traverseSubLevel(boost::bind( + &CTrendComponent::acceptRestoreTraverser, + &m_Trend, boost::cref(params), _1))) RESTORE_SETUP_TEARDOWN( SEASONAL_6_3_TAG, m_Seasonal.reset(new SSeasonal), traverser.traverseSubLevel( @@ -1089,6 +1098,7 @@ void 
CTimeSeriesDecompositionDetail::CComponents::handle(const SAddValue& messag core_t::TTime observedInterval{m_Trend.observedInterval()}; m_Trend.add(time, values[0], weight); + m_Trend.dontShiftLevel(time, value); for (std::size_t i = 1u; i <= m; ++i) { CSeasonalComponent* component{seasonalComponents[i - 1]}; CComponentErrors* error_{seasonalErrors[i - 1]}; @@ -1123,6 +1133,9 @@ void CTimeSeriesDecompositionDetail::CComponents::handle(const SAddValue& messag (vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0) : CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0))}; m_UsingTrendForPrediction = (p >= 0.25); + if (m_UsingTrendForPrediction) { + LOG_DEBUG(<< "Detected trend at " << time); + } *m_Watcher = m_UsingTrendForPrediction; } } @@ -1220,8 +1233,27 @@ void CTimeSeriesDecompositionDetail::CComponents::handle(const SDetectedCalendar } } -void CTimeSeriesDecompositionDetail::CComponents::interpolate(const SMessage& message, - bool refine) { +void CTimeSeriesDecompositionDetail::CComponents::useTrendForPrediction(void) { + m_UsingTrendForPrediction = true; +} + +void CTimeSeriesDecompositionDetail::CComponents::shiftLevel(core_t::TTime time, + double value, + double shift) { + m_Trend.shiftLevel(time, value, shift); +} + +void CTimeSeriesDecompositionDetail::CComponents::linearScale(core_t::TTime time, double scale) { + m_Trend.linearScale(scale); + if (m_Seasonal) { + m_Seasonal->linearScale(time, scale); + } + if (m_Calendar) { + m_Calendar->linearScale(time, scale); + } +} + +void CTimeSeriesDecompositionDetail::CComponents::interpolate(const SMessage& message) { core_t::TTime time{message.s_Time}; core_t::TTime lastTime{message.s_LastTime}; @@ -1235,10 +1267,10 @@ void CTimeSeriesDecompositionDetail::CComponents::interpolate(const SMessage& me LOG_TRACE(<< "Interpolating values at " << time); if (m_Seasonal) { - m_Seasonal->interpolate(time, lastTime, refine); + m_Seasonal->interpolate(time, lastTime, true); } if (m_Calendar) { - m_Calendar->interpolate(time, lastTime, refine); + m_Calendar->interpolate(time, lastTime, true); } this->apply(SC_INTERPOLATED, message); @@ -1253,6 +1285,21 @@ void CTimeSeriesDecompositionDetail::CComponents::interpolate(const SMessage& me } } +void CTimeSeriesDecompositionDetail::CComponents::interpolateForForecast(core_t::TTime time) { + if (this->shouldInterpolate(time, time - m_BucketLength)) { + if (m_Seasonal) { + m_Seasonal->interpolate(time, time - m_BucketLength, false); + } + if (m_Calendar) { + m_Calendar->interpolate(time, time - m_BucketLength, true); + } + } +} + +void CTimeSeriesDecompositionDetail::CComponents::dataType(maths_t::EDataType dataType) { + m_Trend.dataType(dataType); +} + void CTimeSeriesDecompositionDetail::CComponents::decayRate(double decayRate) { m_DecayRate = decayRate; m_Trend.decayRate(decayRate); @@ -1704,8 +1751,9 @@ std::string CTimeSeriesDecompositionDetail::CComponents::CComponentErrors::toDel void CTimeSeriesDecompositionDetail::CComponents::CComponentErrors::add(double error, double prediction, double weight) { - double errorWithComponent{winsorise(pow2(error), m_MeanErrorWithComponent)}; - double errorWithoutComponent{winsorise(pow2(error - prediction), m_MeanErrorWithoutComponent)}; + double errorWithComponent{winsorise(CTools::pow2(error), m_MeanErrorWithComponent)}; + double errorWithoutComponent{winsorise(CTools::pow2(error - prediction), + m_MeanErrorWithoutComponent)}; m_MeanErrorWithComponent.add(errorWithComponent, weight); m_MeanErrorWithoutComponent.add(errorWithoutComponent, weight); } @@ -1968,6 
+2016,13 @@ void CTimeSeriesDecompositionDetail::CComponents::SSeasonal::shiftOrigin(core_t: } } +void CTimeSeriesDecompositionDetail::CComponents::SSeasonal::linearScale(core_t::TTime time, + double scale) { + for (auto& component : s_Components) { + component.linearScale(time, scale); + } +} + uint64_t CTimeSeriesDecompositionDetail::CComponents::SSeasonal::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, s_Components); return CChecksum::calculate(seed, s_PredictionErrors); @@ -2121,6 +2176,13 @@ bool CTimeSeriesDecompositionDetail::CComponents::SCalendar::prune(core_t::TTime return s_Components.empty(); } +void CTimeSeriesDecompositionDetail::CComponents::SCalendar::linearScale(core_t::TTime time, + double scale) { + for (auto& component : s_Components) { + component.linearScale(time, scale); + } +} + uint64_t CTimeSeriesDecompositionDetail::CComponents::SCalendar::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, s_Components); return CChecksum::calculate(seed, s_PredictionErrors); diff --git a/lib/maths/CTimeSeriesDecompositionStateSerialiser.cc b/lib/maths/CTimeSeriesDecompositionStateSerialiser.cc index 38b4b2f06b..8bfe1a0c64 100644 --- a/lib/maths/CTimeSeriesDecompositionStateSerialiser.cc +++ b/lib/maths/CTimeSeriesDecompositionStateSerialiser.cc @@ -43,8 +43,7 @@ operator()(const STimeSeriesDecompositionRestoreParams& params, do { const std::string& name = traverser.name(); if (name == TIME_SERIES_DECOMPOSITION_TAG) { - result.reset(new CTimeSeriesDecomposition(params.s_DecayRate, params.s_MinimumBucketLength, - params.s_ComponentSize, traverser)); + result.reset(new CTimeSeriesDecomposition(params, traverser)); ++numResults; } else if (name == TIME_SERIES_DECOMPOSITION_STUB_TAG) { result.reset(new CTimeSeriesDecompositionStub()); diff --git a/lib/maths/CTimeSeriesDecompositionStub.cc b/lib/maths/CTimeSeriesDecompositionStub.cc index 9435e7fdb9..e7249eccaa 100644 --- a/lib/maths/CTimeSeriesDecompositionStub.cc +++ b/lib/maths/CTimeSeriesDecompositionStub.cc @@ -14,10 +14,13 @@ namespace { const maths_t::TSeasonalComponentVec NO_COMPONENTS; } -CTimeSeriesDecompositionStub* CTimeSeriesDecompositionStub::clone() const { +CTimeSeriesDecompositionStub* CTimeSeriesDecompositionStub::clone(bool /*isForForecast*/) const { return new CTimeSeriesDecompositionStub(*this); } +void CTimeSeriesDecompositionStub::dataType(maths_t::EDataType /*dataType*/) { +} + void CTimeSeriesDecompositionStub::decayRate(double /*decayRate*/) { } @@ -35,17 +38,23 @@ bool CTimeSeriesDecompositionStub::addPoint(core_t::TTime /*time*/, return false; } +bool CTimeSeriesDecompositionStub::applyChange(core_t::TTime /*time*/, + double /*value*/, + const SChangeDescription& /*change*/) { + return false; +} + void CTimeSeriesDecompositionStub::propagateForwardsTo(core_t::TTime /*time*/) { } -double CTimeSeriesDecompositionStub::mean(core_t::TTime /*time*/) const { +double CTimeSeriesDecompositionStub::meanValue(core_t::TTime /*time*/) const { return 0.0; } -maths_t::TDoubleDoublePr CTimeSeriesDecompositionStub::baseline(core_t::TTime /*time*/, - double /*confidence*/, - int /*components*/, - bool /*smooth*/) const { +maths_t::TDoubleDoublePr CTimeSeriesDecompositionStub::value(core_t::TTime /*time*/, + double /*confidence*/, + int /*components*/, + bool /*smooth*/) const { return {0.0, 0.0}; } @@ -54,13 +63,13 @@ void CTimeSeriesDecompositionStub::forecast(core_t::TTime /*startTime*/, core_t::TTime /*step*/, double /*confidence*/, double /*minimumScale*/, - TDouble3VecVec& result) { - 
result.clear(); + const TWriteForecastResult& /*writer*/) { } double CTimeSeriesDecompositionStub::detrend(core_t::TTime /*time*/, double value, - double /*confidence*/) const { + double /*confidence*/, + int /*components*/) const { return value; } @@ -94,6 +103,10 @@ std::size_t CTimeSeriesDecompositionStub::staticSize() const { return sizeof(*this); } +core_t::TTime CTimeSeriesDecompositionStub::timeShift() const { + return 0; +} + const maths_t::TSeasonalComponentVec& CTimeSeriesDecompositionStub::seasonalComponents() const { return NO_COMPONENTS; } diff --git a/lib/maths/CTimeSeriesModel.cc b/lib/maths/CTimeSeriesModel.cc index 68e3637d0e..0e2e561452 100644 --- a/lib/maths/CTimeSeriesModel.cc +++ b/lib/maths/CTimeSeriesModel.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -33,125 +34,78 @@ namespace ml { namespace maths { namespace { + using TDoubleDoublePr = std::pair; +using TSizeDoublePr = std::pair; +using TTimeDoublePr = std::pair; +using TSizeVec = std::vector; +using TDouble1Vec = core::CSmallVector; using TDouble2Vec = core::CSmallVector; using TDouble10Vec = core::CSmallVector; using TDouble10Vec1Vec = core::CSmallVector; using TDouble10Vec2Vec = core::CSmallVector; -using TDouble10VecWeightsAry = maths_t::TDouble10VecWeightsAry; -using TDouble10VecWeightsAry1Vec = maths_t::TDouble10VecWeightsAry1Vec; -using TSizeVec = std::vector; +using TSize1Vec = core::CSmallVector; +using TSize2Vec = core::CSmallVector; +using TSize2Vec1Vec = core::CSmallVector; +using TTime1Vec = core::CSmallVector; using TSize10Vec = core::CSmallVector; -using TSizeDoublePr = std::pair; using TSizeDoublePr10Vec = core::CSmallVector; using TTail10Vec = core::CSmallVector; -using TTime1Vec = CTimeSeriesCorrelations::TTime1Vec; -using TDouble1Vec = CTimeSeriesCorrelations::TDouble1Vec; -using TDoubleWeightsAry = maths_t::TDoubleWeightsAry; -using TDoubleWeightsAry1Vec = maths_t::TDoubleWeightsAry1Vec; -using TSize1Vec = CTimeSeriesCorrelations::TSize1Vec; -using TSize2Vec1Vec = CTimeSeriesCorrelations::TSize2Vec1Vec; +using TOptionalSize = boost::optional; +using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; +using TChangeDetectorPtr = std::shared_ptr; using TMultivariatePriorCPtrSizePr1Vec = CTimeSeriesCorrelations::TMultivariatePriorCPtrSizePr1Vec; -const std::size_t SLIDING_WINDOW_SIZE{12}; - -//! Computes the Winsorisation weight for \p value. 
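The computeWinsorisationWeight helper removed below is superseded later in this patch by winsorisation::tailWeight, which keeps the same scheme: a weight of 1.0 above a maximum p-value, a floor weight below a minimum p-value, and a logarithmic interpolation in between. A simplified, self-contained sketch of that interpolation follows; the function name and default arguments are illustrative and the prior/CDF machinery is omitted.

    #include <cmath>

    // Illustrative: weight(p) = 1 for p >= pMax, = wMin for p <= pMin, and
    // exp(-c * log(p) * (log(p) - log(pMax))) in between, with c chosen so
    // that the two endpoints match.
    double winsorisationWeight(double p,
                               double pMax = 1e-3,
                               double pMin = 1e-10,
                               double wMin = 1e-2) {
        if (p >= pMax) {
            return 1.0;
        }
        if (p <= pMin) {
            return wMin;
        }
        double logPMax = std::log(pMax);
        double logPMin = std::log(pMin);
        double c = -std::log(wMin) / logPMin / (logPMin - logPMax);
        double logP = std::log(p);
        return std::exp(-c * logP * (logP - logPMax));
    }

With the constants appearing in the patch (1e-3, 1e-10, 1e-2) this reproduces the endpoints: the weight is 1.0 at p = 1e-3 and 0.01 at p = 1e-10.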
-double computeWinsorisationWeight(const CPrior& prior, double derate, double scale, double value) { - static const double WINSORISED_FRACTION = 1e-4; - static const double MINIMUM_WEIGHT_FRACTION = 1e-12; - static const double MINIMUM_WEIGHT = 0.05; - static const double MINUS_LOG_TOLERANCE = - -std::log(1.0 - 100.0 * std::numeric_limits::epsilon()); - - double deratedMinimumWeight = - MINIMUM_WEIGHT + (0.5 - MINIMUM_WEIGHT) * CTools::truncate(derate, 0.0, 1.0); - - auto weight = maths_t::seasonalVarianceScaleWeight(scale); - double lowerBound; - double upperBound; - if (!prior.minusLogJointCdf({value}, {weight}, lowerBound, upperBound)) { - return 1.0; - } - if (upperBound < MINUS_LOG_TOLERANCE && - !prior.minusLogJointCdfComplement({value}, {weight}, lowerBound, upperBound)) { - return 1.0; - } - - double f = std::exp(-(lowerBound + upperBound) / 2.0); - f = std::min(f, 1.0 - f); - if (f >= WINSORISED_FRACTION) { - return 1.0; - } - if (f <= MINIMUM_WEIGHT_FRACTION) { - return deratedMinimumWeight; - } - - // We interpolate between 1.0 and the minimum weight on the - // interval [WINSORISED_FRACTION, MINIMUM_WEIGHT_FRACTION] - // by fitting (f / WF)^(-c log(f)) where WF is the Winsorised - // fraction and c is determined by solving: - // MW = (MWF / WF)^(-c log(MWF)) - - static const double EXPONENT = - -std::log(MINIMUM_WEIGHT) / std::log(MINIMUM_WEIGHT_FRACTION) / - std::log(MINIMUM_WEIGHT_FRACTION / WINSORISED_FRACTION); - static const double LOG_WINSORISED_FRACTION = std::log(WINSORISED_FRACTION); - - double deratedExponent = EXPONENT; - if (deratedMinimumWeight != MINIMUM_WEIGHT) { - deratedExponent = -std::log(deratedMinimumWeight) / - std::log(MINIMUM_WEIGHT_FRACTION) / - std::log(MINIMUM_WEIGHT_FRACTION / WINSORISED_FRACTION); - } - - double logf = std::log(f); - double result = std::exp(-deratedExponent * logf * (logf - LOG_WINSORISED_FRACTION)); +//! The decay rate controllers we maintain. +enum EDecayRateController { + E_TrendControl = 0, + E_ResidualControl, + E_NumberControls +}; - if (CMathsFuncs::isNan(result)) { - return 1.0; +const std::size_t MAXIMUM_CORRELATIONS{5000}; +const double MINIMUM_CORRELATE_PRIOR_SAMPLE_COUNT{24.0}; +const std::size_t SLIDING_WINDOW_SIZE{12u}; +const TSize10Vec NOTHING_TO_MARGINALIZE; +const TSizeDoublePr10Vec NOTHING_TO_CONDITION; +const double CHANGE_P_VALUE{1e-5}; + +//! Optionally randomly sample from \p indices. +TOptionalSize randomlySample(CPRNG::CXorOShiro128Plus& rng, + const CModelAddSamplesParams& params, + core_t::TTime bucketLength, + const TSizeVec& indices) { + auto addWeight = [](TMeanAccumulator mean, const maths_t::TDouble2VecWeightsAry& weight) { + mean.add(maths_t::winsorisationWeight(weight)[0]); + return mean; + }; + TMeanAccumulator weight{std::accumulate(params.trendWeights().begin(), + params.trendWeights().end(), + TMeanAccumulator{}, addWeight)}; + double p{SLIDING_WINDOW_SIZE * static_cast(bucketLength) / + static_cast(core::constants::DAY) * CBasicStatistics::mean(weight)}; + if (p >= 1.0 || CSampling::uniformSample(rng, 0.0, 1.0) < p) { + std::size_t i{CSampling::uniformSample(rng, 0, indices.size())}; + return indices[i]; } - - LOG_TRACE(<< "sample = " << value << " min(F, 1-F) = " << f << ", weight = " << result); - - return result; + return TOptionalSize{}; } -//! Computes the Winsorisation weight for \p value. 
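The new randomlySample helper above decides whether to admit one of the batch's values into the sliding window: the acceptance probability scales with the bucket length and is damped by the batch's mean Winsorisation weight, so heavily down-weighted outliers are less likely to seed the window. A rough standalone sketch of that acceptance step, with std::mt19937 standing in for the library's CPRNG/CSampling types:

    #include <cstddef>
    #include <random>

    // Illustrative only: accept roughly windowSize samples per day, damped by
    // the mean outlier (Winsorisation) weight of the batch.
    bool acceptIntoWindow(std::mt19937& rng,
                          std::size_t windowSize,
                          double bucketLengthSeconds,
                          double meanWinsorisationWeight) {
        const double secondsPerDay = 86400.0;
        double p = static_cast<double>(windowSize) * bucketLengthSeconds /
                   secondsPerDay * meanWinsorisationWeight;
        if (p >= 1.0) {
            return true;
        }
        std::uniform_real_distribution<double> u01(0.0, 1.0);
        return u01(rng) < p;
    }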
-double computeWinsorisationWeight(const CMultivariatePrior& prior, - std::size_t dimension, - double derate, - double scale, - const TDouble10Vec& value) { - static const TSize10Vec MARGINALIZE; - - std::size_t d = prior.dimension(); - - TSizeDoublePr10Vec condition(d - 1); - for (std::size_t i = 0u, j = 0u; i < d; ++i) { - if (i != dimension) { - condition[j++] = std::make_pair(i, value[i]); - } - } - - std::shared_ptr conditional(prior.univariate(MARGINALIZE, condition).first); - return computeWinsorisationWeight(*conditional, derate, scale, value[dimension]); +//! Convert \p value to comma separated string. +std::string toDelimited(const TTimeDoublePr& value) { + return core::CStringUtils::typeToString(value.first) + ',' + + core::CStringUtils::typeToStringPrecise(value.second, core::CIEEE754::E_SinglePrecision); } -//! Get the count weight to use to initialise the residual model -//! from the sliding window. -double slidingWindowCountWeight(double learnRate) { - return std::max(learnRate, 5.0 / static_cast(SLIDING_WINDOW_SIZE)); +//! Extract \p value from comma separated string. +bool fromDelimited(const std::string& str, TTimeDoublePr& value) { + std::size_t pos{str.find(',')}; + return pos != std::string::npos && + core::CStringUtils::stringToType(str.substr(0, pos), value.first) && + core::CStringUtils::stringToType(str.substr(pos + 1), value.second); } -//! The decay rate controllers we maintain. -enum EDecayRateController { - E_TrendControl = 0, - E_PriorControl, - E_NumberControls -}; - // Models - // Version 6.3 const std::string VERSION_6_3_TAG("6.3"); const std::string ID_6_3_TAG{"a"}; @@ -159,10 +113,13 @@ const std::string IS_NON_NEGATIVE_6_3_TAG{"b"}; const std::string IS_FORECASTABLE_6_3_TAG{"c"}; const std::string RNG_6_3_TAG{"d"}; const std::string CONTROLLER_6_3_TAG{"e"}; -const std::string TREND_6_3_TAG{"f"}; -const std::string PRIOR_6_3_TAG{"g"}; +const std::string TREND_MODEL_6_3_TAG{"f"}; +const std::string RESIDUAL_MODEL_6_3_TAG{"g"}; const std::string ANOMALY_MODEL_6_3_TAG{"h"}; const std::string SLIDING_WINDOW_6_3_TAG{"i"}; +const std::string CANDIDATE_CHANGE_POINT_6_3_TAG{"j"}; +const std::string CURRENT_CHANGE_INTERVAL_6_3_TAG{"k"}; +const std::string CHANGE_DETECTOR_6_3_TAG{"l"}; // Version < 6.3 const std::string ID_OLD_TAG{"a"}; const std::string CONTROLLER_OLD_TAG{"b"}; @@ -175,7 +132,7 @@ const std::string IS_FORECASTABLE_OLD_TAG{"h"}; // Anomaly model const std::string MEAN_ERROR_TAG{"a"}; const std::string ANOMALIES_TAG{"b"}; -const std::string PRIOR_TAG{"d"}; +const std::string ANOMALY_FEATURE_MODEL_TAG{"d"}; // Anomaly model nested const std::string TAG_TAG{"a"}; const std::string OPEN_TIME_TAG{"b"}; @@ -185,24 +142,124 @@ const std::string MEAN_ERROR_NORM_TAG{"d"}; // Correlations const std::string K_MOST_CORRELATED_TAG{"a"}; const std::string CORRELATED_LOOKUP_TAG{"b"}; -const std::string CORRELATED_PRIORS_TAG{"c"}; +const std::string CORRELATION_MODELS_TAG{"c"}; // Correlations nested const std::string FIRST_CORRELATE_ID_TAG{"a"}; const std::string SECOND_CORRELATE_ID_TAG{"b"}; -const std::string CORRELATE_PRIOR_TAG{"c"}; +const std::string CORRELATION_MODEL_TAG{"c"}; const std::string CORRELATION_TAG{"d"}; -const std::size_t MAXIMUM_CORRELATIONS{5000}; -const double MINIMUM_CORRELATE_PRIOR_SAMPLE_COUNT{24.0}; -const TSize10Vec NOTHING_TO_MARGINALIZE; -const TSizeDoublePr10Vec NOTHING_TO_CONDITION; - namespace forecast { const std::string INFO_INSUFFICIENT_HISTORY("Insufficient history to forecast"); const std::string ERROR_MULTIVARIATE("Forecast 
not supported for multivariate features"); } } +namespace winsorisation { +namespace { +const double MAXIMUM_P_VALUE{1e-3}; +const double MINIMUM_P_VALUE{1e-10}; +const double MINIMUM_WEIGHT{1e-2}; +const double LOG_MAXIMUM_P_VALUE{std::log(MAXIMUM_P_VALUE)}; +const double LOG_MINIMUM_P_VALUE{std::log(MINIMUM_P_VALUE)}; +const double LOG_MINIMUM_WEIGHT{std::log(MINIMUM_WEIGHT)}; +const double MINUS_LOG_TOLERANCE{ + -std::log(1.0 - 100.0 * std::numeric_limits::epsilon())}; + +//! Derate the minimum Winsorisation weight. +double deratedMinimumWeight(double derate) { + derate = CTools::truncate(derate, 0.0, 1.0); + return MINIMUM_WEIGHT + (0.5 - MINIMUM_WEIGHT) * derate; +} + +//! Get the one tail p-value from a specified Winsorisation weight. +double pValueFromWeight(double weight) { + if (weight >= 1.0) { + return 1.0; + } + + double logw{std::log(std::max(weight, MINIMUM_WEIGHT))}; + return std::exp(0.5 * (LOG_MAXIMUM_P_VALUE - + std::sqrt(CTools::pow2(LOG_MAXIMUM_P_VALUE) + + 4.0 * logw / LOG_MINIMUM_WEIGHT * LOG_MINIMUM_P_VALUE * + (LOG_MINIMUM_P_VALUE - LOG_MAXIMUM_P_VALUE)))); +} + +//! Computes a Winsorisation weight based on the chance that the +//! time series is currently undergoing a change. +double changeWeight(const TChangeDetectorPtr& detector) { + if (detector != nullptr) { + std::size_t dummy; + return std::max(CTools::logisticFunction(detector->decisionFunction(dummy), + 0.1, 1.0, -1.0), + MINIMUM_WEIGHT); + } + return 1.0; +} +} + +double tailWeight(const CPrior& prior, double derate, double scale, double value) { + double minimumWeight{deratedMinimumWeight(derate)}; + + double f{}; + double lowerBound; + double upperBound; + if (!prior.minusLogJointCdf({value}, {maths_t::seasonalVarianceScaleWeight(scale)}, + lowerBound, upperBound)) { + return 1.0; + } else if (upperBound >= MINUS_LOG_TOLERANCE) { + f = std::exp(-(lowerBound + upperBound) / 2.0); + f = std::min(f, 1.0 - f); + } else if (!prior.minusLogJointCdfComplement( + {value}, {maths_t::seasonalVarianceScaleWeight(scale)}, + lowerBound, upperBound)) { + return 1.0; + } else { + f = std::exp(-(lowerBound + upperBound) / 2.0); + } + + if (f >= MAXIMUM_P_VALUE) { + return 1.0; + } + if (f <= MINIMUM_P_VALUE) { + return minimumWeight; + } + + // We logarithmically interpolate between 1.0 and the minimum weight + // on the interval [MAXIMUM_P_VALUE, MINIMUM_P_VALUE]. + + double maximumExponent{-std::log(minimumWeight) / LOG_MINIMUM_P_VALUE / + (LOG_MINIMUM_P_VALUE - LOG_MAXIMUM_P_VALUE)}; + double logf{std::log(f)}; + double result{std::exp(-maximumExponent * logf * (logf - LOG_MAXIMUM_P_VALUE))}; + + if (CMathsFuncs::isNan(result)) { + return 1.0; + } + + LOG_TRACE(<< "sample = " << value << " min(F, 1-F) = " << f << ", weight = " << result); + + return result; +} + +double tailWeight(const CMultivariatePrior& prior, + std::size_t dimension, + double derate, + double scale, + const core::CSmallVector& value) { + std::size_t dimensions = prior.dimension(); + TSizeDoublePr10Vec condition(dimensions - 1); + for (std::size_t i = 0u, j = 0u; i < dimensions; ++i) { + if (i != dimension) { + condition[j++] = std::make_pair(i, value[i]); + } + } + std::shared_ptr conditional( + prior.univariate(NOTHING_TO_MARGINALIZE, condition).first); + return tailWeight(*conditional, derate, scale, value[dimension]); +} +} + //! \brief A model of anomalous sections of a time series. class CTimeSeriesAnomalyModel { public: @@ -272,13 +329,9 @@ class CTimeSeriesAnomalyModel { std::size_t tag() const { return m_Tag; } //! 
Add a result to the anomaly. - void update(const TDouble2Vec& errors) { - double norm{0.0}; - for (const auto& error : errors) { - norm += std::pow(error, 2.0); - m_Sign += error; - } - m_MeanErrorNorm.add(std::sqrt(norm)); + void update(double norm, double sign) { + m_MeanErrorNorm.add(norm); + m_Sign += sign; } //! Get the weight to apply to this anomaly on update. @@ -348,14 +401,15 @@ class CTimeSeriesAnomalyModel { //! significantly anomalous. static const double LOG_SMALL_PROBABILITY; //! A unit weight. - static const TDouble10VecWeightsAry1Vec UNIT; + static const maths_t::TDouble10VecWeightsAry1Vec UNIT; private: //! Update the appropriate anomaly model with \p anomaly. void sample(core_t::TTime time, const CAnomaly& anomaly, double weight) { std::size_t index(anomaly.positive() ? 0 : 1); TDouble10Vec1Vec features{anomaly.features(this->scale(time))}; - m_Priors[index].addSamples(features, {maths_t::countWeight(weight, 2)}); + m_AnomalyFeatureModels[index].addSamples(features, + {maths_t::countWeight(weight, 2)}); } //! Get the scaled time. @@ -374,21 +428,23 @@ class CTimeSeriesAnomalyModel { TAnomaly1Vec m_Anomalies; //! The model describing features of anomalous time periods. - TMultivariateNormalConjugateVec m_Priors; + TMultivariateNormalConjugateVec m_AnomalyFeatureModels; }; CTimeSeriesAnomalyModel::CTimeSeriesAnomalyModel() : m_BucketLength(0) { - m_Priors.reserve(2); - m_Priors.push_back(TMultivariateNormalConjugate::nonInformativePrior(maths_t::E_ContinuousData)); - m_Priors.push_back(TMultivariateNormalConjugate::nonInformativePrior(maths_t::E_ContinuousData)); + m_AnomalyFeatureModels.reserve(2); + m_AnomalyFeatureModels.push_back( + TMultivariateNormalConjugate::nonInformativePrior(maths_t::E_ContinuousData)); + m_AnomalyFeatureModels.push_back( + TMultivariateNormalConjugate::nonInformativePrior(maths_t::E_ContinuousData)); } CTimeSeriesAnomalyModel::CTimeSeriesAnomalyModel(core_t::TTime bucketLength, double decayRate) : m_BucketLength(bucketLength) { - m_Priors.reserve(2); - m_Priors.push_back(TMultivariateNormalConjugate::nonInformativePrior( + m_AnomalyFeatureModels.reserve(2); + m_AnomalyFeatureModels.push_back(TMultivariateNormalConjugate::nonInformativePrior( maths_t::E_ContinuousData, 0.5 * LARGEST_ANOMALOUS_PROBABILITY * decayRate)); - m_Priors.push_back(TMultivariateNormalConjugate::nonInformativePrior( + m_AnomalyFeatureModels.push_back(TMultivariateNormalConjugate::nonInformativePrior( maths_t::E_ContinuousData, 0.5 * LARGEST_ANOMALOUS_PROBABILITY * decayRate)); } @@ -403,20 +459,18 @@ void CTimeSeriesAnomalyModel::updateAnomaly(const CModelProbabilityParams& param [tag](const CAnomaly& anomaly_) { return anomaly_.tag() == tag; }); if (probability < LARGEST_ANOMALOUS_PROBABILITY) { - m_MeanError.add(std::sqrt( + double norm{std::sqrt( std::accumulate(errors.begin(), errors.end(), 0.0, - [](double n, double x) { return n + x * x; }))); - + [](double n, double x) { return n + x * x; }))}; + m_MeanError.add(norm); double scale{CBasicStatistics::mean(m_MeanError)}; - for (auto& error : errors) { - error = scale == 0.0 ? 1.0 : error / scale; - } - + norm = (scale == 0.0 ? 
1.0 : norm / scale); + double sign{std::accumulate(errors.begin(), errors.end(), 0.0)}; if (anomaly == m_Anomalies.end()) { m_Anomalies.emplace_back(tag, this->scale(time)); anomaly = m_Anomalies.end() - 1; } - anomaly->update(errors); + anomaly->update(norm, sign); } else if (anomaly != m_Anomalies.end()) { this->sample(time, *anomaly, 1.0 - anomaly->weight(this->scale(time))); m_Anomalies.erase(anomaly); @@ -439,9 +493,9 @@ void CTimeSeriesAnomalyModel::sampleAnomaly(const CModelProbabilityParams& param void CTimeSeriesAnomalyModel::reset() { m_MeanError = TMeanAccumulator(); - for (auto& prior : m_Priors) { - prior = TMultivariateNormalConjugate::nonInformativePrior( - maths_t::E_ContinuousData, prior.decayRate()); + for (auto& model : m_AnomalyFeatureModels) { + model = TMultivariateNormalConjugate::nonInformativePrior( + maths_t::E_ContinuousData, model.decayRate()); } } @@ -458,47 +512,48 @@ void CTimeSeriesAnomalyModel::probability(const CModelProbabilityParams& params, double pl, pu; TTail10Vec tail; if (probability < LARGEST_ANOMALOUS_PROBABILITY && - !m_Priors[index].isNonInformative() && - m_Priors[index].probabilityOfLessLikelySamples( + !m_AnomalyFeatureModels[index].isNonInformative() && + m_AnomalyFeatureModels[index].probabilityOfLessLikelySamples( maths_t::E_OneSidedAbove, features, UNIT, pl, pu, tail)) { double logp{CTools::fastLog(probability)}; double alpha{0.5 * std::min((logp - LOG_LARGEST_ANOMALOUS_PROBABILITY) / (LOG_SMALL_PROBABILITY - LOG_LARGEST_ANOMALOUS_PROBABILITY), 1.0)}; double pGivenAnomalous{(pl + pu) / 2.0}; - double pScore{CTools::deviation(probability)}; - double pScoreGivenAnomalous{CTools::deviation(pGivenAnomalous)}; + double pScore{CTools::anomalyScore(probability)}; + double pScoreGivenAnomalous{CTools::anomalyScore(pGivenAnomalous)}; LOG_TRACE(<< "features = " << features << " score(.) 
= " << pScore << " score(.|anomalous) = " << pScoreGivenAnomalous << " p = " << probability); probability = std::min( - CTools::inverseDeviation((1.0 - alpha) * pScore + alpha * pScoreGivenAnomalous), + CTools::inverseAnomalyScore((1.0 - alpha) * pScore + alpha * pScoreGivenAnomalous), LARGEST_ANOMALOUS_PROBABILITY); } } } void CTimeSeriesAnomalyModel::propagateForwardsByTime(double time) { - m_Priors[0].propagateForwardsByTime(time); - m_Priors[1].propagateForwardsByTime(time); + m_AnomalyFeatureModels[0].propagateForwardsByTime(time); + m_AnomalyFeatureModels[1].propagateForwardsByTime(time); } uint64_t CTimeSeriesAnomalyModel::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, m_BucketLength); seed = CChecksum::calculate(seed, m_MeanError); seed = CChecksum::calculate(seed, m_Anomalies); - seed = CChecksum::calculate(seed, m_Priors[0]); - return CChecksum::calculate(seed, m_Priors[1]); + seed = CChecksum::calculate(seed, m_AnomalyFeatureModels[0]); + return CChecksum::calculate(seed, m_AnomalyFeatureModels[1]); } void CTimeSeriesAnomalyModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { mem->setName("CTimeSeriesAnomalyModel"); core::CMemoryDebug::dynamicSize("m_Anomalies", m_Anomalies, mem); - core::CMemoryDebug::dynamicSize("m_Priors", m_Priors, mem); + core::CMemoryDebug::dynamicSize("m_AnomalyFeatureModels", m_AnomalyFeatureModels, mem); } std::size_t CTimeSeriesAnomalyModel::memoryUsage() const { - return core::CMemory::dynamicSize(m_Anomalies) + core::CMemory::dynamicSize(m_Priors); + return core::CMemory::dynamicSize(m_Anomalies) + + core::CMemory::dynamicSize(m_AnomalyFeatureModels); } bool CTimeSeriesAnomalyModel::acceptRestoreTraverser(const SModelRestoreParams& params, @@ -510,9 +565,10 @@ bool CTimeSeriesAnomalyModel::acceptRestoreTraverser(const SModelRestoreParams& RESTORE(MEAN_ERROR_TAG, m_MeanError.fromDelimited(traverser.value())); RESTORE(ANOMALIES_TAG, core::CPersistUtils::restore(ANOMALIES_TAG, m_Anomalies, traverser)); - RESTORE(PRIOR_TAG, traverser.traverseSubLevel(boost::bind( - &TMultivariateNormalConjugate::acceptRestoreTraverser, - &m_Priors[index++], _1))) + RESTORE(ANOMALY_FEATURE_MODEL_TAG, + traverser.traverseSubLevel( + boost::bind(&TMultivariateNormalConjugate::acceptRestoreTraverser, + &m_AnomalyFeatureModels[index++], _1))) } while (traverser.next()); return true; } @@ -520,32 +576,35 @@ bool CTimeSeriesAnomalyModel::acceptRestoreTraverser(const SModelRestoreParams& void CTimeSeriesAnomalyModel::acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(MEAN_ERROR_TAG, m_MeanError.toDelimited()); core::CPersistUtils::persist(ANOMALIES_TAG, m_Anomalies, inserter); - inserter.insertLevel(PRIOR_TAG, boost::bind(&TMultivariateNormalConjugate::acceptPersistInserter, - &m_Priors[0], _1)); - inserter.insertLevel(PRIOR_TAG, boost::bind(&TMultivariateNormalConjugate::acceptPersistInserter, - &m_Priors[1], _1)); + inserter.insertLevel(ANOMALY_FEATURE_MODEL_TAG, + boost::bind(&TMultivariateNormalConjugate::acceptPersistInserter, + &m_AnomalyFeatureModels[0], _1)); + inserter.insertLevel(ANOMALY_FEATURE_MODEL_TAG, + boost::bind(&TMultivariateNormalConjugate::acceptPersistInserter, + &m_AnomalyFeatureModels[1], _1)); } const double CTimeSeriesAnomalyModel::LARGEST_ANOMALOUS_PROBABILITY{0.1}; const double CTimeSeriesAnomalyModel::LOG_LARGEST_ANOMALOUS_PROBABILITY{ CTools::fastLog(LARGEST_ANOMALOUS_PROBABILITY)}; const double 
CTimeSeriesAnomalyModel::LOG_SMALL_PROBABILITY{CTools::fastLog(SMALL_PROBABILITY)}; -const TDouble10VecWeightsAry1Vec CTimeSeriesAnomalyModel::UNIT{ +const maths_t::TDouble10VecWeightsAry1Vec CTimeSeriesAnomalyModel::UNIT{ maths_t::CUnitWeights::unit(2)}; CUnivariateTimeSeriesModel::CUnivariateTimeSeriesModel(const CModelParams& params, std::size_t id, - const CTimeSeriesDecompositionInterface& trend, - const CPrior& prior, + const CTimeSeriesDecompositionInterface& trendModel, + const CPrior& residualModel, const TDecayRateController2Ary* controllers, bool modelAnomalies) : CModel(params), m_Id(id), m_IsNonNegative(false), m_IsForecastable(true), - m_Trend(trend.clone()), m_Prior(prior.clone()), + m_TrendModel(trendModel.clone()), m_ResidualModel(residualModel.clone()), m_AnomalyModel(modelAnomalies ? std::make_shared( params.bucketLength(), params.decayRate()) : TAnomalyModelPtr()), - m_SlidingWindow(SLIDING_WINDOW_SIZE), m_Correlations(nullptr) { + m_CurrentChangeInterval(0), m_SlidingWindow(SLIDING_WINDOW_SIZE), + m_Correlations(nullptr) { if (controllers) { m_Controllers = std::make_shared(*controllers); } @@ -560,7 +619,7 @@ CUnivariateTimeSeriesModel::CUnivariateTimeSeriesModel(const SModelRestoreParams } CUnivariateTimeSeriesModel::~CUnivariateTimeSeriesModel() { - if (m_Correlations) { + if (m_Correlations != nullptr) { m_Correlations->removeTimeSeries(m_Id); } } @@ -571,7 +630,7 @@ std::size_t CUnivariateTimeSeriesModel::identifier() const { CUnivariateTimeSeriesModel* CUnivariateTimeSeriesModel::clone(std::size_t id) const { CUnivariateTimeSeriesModel* result{new CUnivariateTimeSeriesModel{*this, id}}; - if (m_Correlations) { + if (m_Correlations != nullptr) { result->modelCorrelations(*m_Correlations); } return result; @@ -582,11 +641,11 @@ CUnivariateTimeSeriesModel* CUnivariateTimeSeriesModel::cloneForPersistence() co } CUnivariateTimeSeriesModel* CUnivariateTimeSeriesModel::cloneForForecast() const { - return new CUnivariateTimeSeriesModel{*this, m_Id}; + return new CUnivariateTimeSeriesModel{*this, m_Id, true}; } bool CUnivariateTimeSeriesModel::isForecastPossible() const { - return m_IsForecastable && !m_Prior->isNonInformative(); + return m_IsForecastable && !m_ResidualModel->isNonInformative(); } void CUnivariateTimeSeriesModel::modelCorrelations(CTimeSeriesCorrelations& model) { @@ -598,10 +657,9 @@ TSize2Vec1Vec CUnivariateTimeSeriesModel::correlates() const { TSize2Vec1Vec result; TSize1Vec correlated; TSize2Vec1Vec variables; - TMultivariatePriorCPtrSizePr1Vec correlationDistributionModels; + TMultivariatePriorCPtrSizePr1Vec correlationModels; TModelCPtr1Vec correlatedTimeSeriesModels; - this->correlationModels(correlated, variables, correlationDistributionModels, - correlatedTimeSeriesModels); + this->correlationModels(correlated, variables, correlationModels, correlatedTimeSeriesModels); result.resize(correlated.size(), TSize2Vec(2)); for (std::size_t i = 0u; i < correlated.size(); ++i) { result[i][variables[i][0]] = m_Id; @@ -612,8 +670,9 @@ TSize2Vec1Vec CUnivariateTimeSeriesModel::correlates() const { void CUnivariateTimeSeriesModel::addBucketValue(const TTimeDouble2VecSizeTrVec& values) { for (const auto& value : values) { - m_Prior->adjustOffset({m_Trend->detrend(value.first, value.second[0], 0.0)}, - maths_t::CUnitWeights::SINGLE_UNIT); + m_ResidualModel->adjustOffset( + {m_TrendModel->detrend(value.first, value.second[0], 0.0)}, + maths_t::CUnitWeights::SINGLE_UNIT); } } @@ -624,7 +683,6 @@ CUnivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& 
params, return E_Success; } - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TOptionalTimeDoublePr = boost::optional; TSizeVec valueorder(samples.size()); @@ -635,20 +693,23 @@ CUnivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, }); TOptionalTimeDoublePr randomSample; - - double p{SLIDING_WINDOW_SIZE * static_cast(this->params().bucketLength()) / - static_cast(core::constants::DAY)}; - if (p >= 1.0 || CSampling::uniformSample(m_Rng, 0.0, 1.0) < p) { - std::size_t i{CSampling::uniformSample(m_Rng, 0, samples.size())}; - randomSample.reset({samples[valueorder[i]].first, samples[valueorder[i]].second[0]}); + if (TOptionalSize index = randomlySample( + m_Rng, params, this->params().bucketLength(), valueorder)) { + randomSample.reset({samples[*index].first, samples[*index].second[0]}); } + EUpdateResult result{this->testAndApplyChange(params, valueorder, samples)}; + m_IsNonNegative = params.isNonNegative(); - EUpdateResult result{this->updateTrend(samples, params.trendWeights())}; + maths_t::EDataType type{params.type()}; + m_ResidualModel->dataType(type); + m_TrendModel->dataType(type); + + result = CModel::combine(result, this->updateTrend(samples, params.trendWeights())); for (auto& sample : samples) { - sample.second[0] = m_Trend->detrend(sample.first, sample.second[0], 0.0); + sample.second[0] = m_TrendModel->detrend(sample.first, sample.second[0], 0.0); } std::stable_sort(valueorder.begin(), valueorder.end(), @@ -656,68 +717,59 @@ CUnivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, return samples[lhs].second < samples[rhs].second; }); - maths_t::EDataType type{params.type()}; - m_Prior->dataType(type); - TDouble1Vec samples_; - TDoubleWeightsAry1Vec weights; + maths_t::TDoubleWeightsAry1Vec weights_; samples_.reserve(samples.size()); - weights.reserve(samples.size()); + weights_.reserve(samples.size()); TMeanAccumulator averageTime; for (auto i : valueorder) { samples_.push_back(samples[i].second[0]); - TDoubleWeightsAry1Vec wi(1); - for (std::size_t j = 0u; j < maths_t::NUMBER_WEIGHT_STYLES; ++j) { - wi[0][j] = params.priorWeights()[i][j][0]; - } - weights.push_back(wi[0]); + weights_.push_back(unpack(params.priorWeights()[i])); averageTime.add(static_cast(samples[i].first)); } - m_Prior->addSamples(samples_, weights); - m_Prior->propagateForwardsByTime(params.propagationInterval()); - if (m_AnomalyModel) { + m_ResidualModel->addSamples(samples_, weights_); + m_ResidualModel->propagateForwardsByTime(params.propagationInterval()); + if (m_AnomalyModel != nullptr) { m_AnomalyModel->propagateForwardsByTime(params.propagationInterval()); } double multiplier{1.0}; - if (m_Controllers) { + if (m_Controllers != nullptr) { TDouble1VecVec errors[2]; errors[0].reserve(samples.size()); errors[1].reserve(samples.size()); - for (auto i : valueorder) { - this->appendPredictionErrors(params.propagationInterval(), - samples[i].second[0], errors); + for (auto sample : samples_) { + this->appendPredictionErrors(params.propagationInterval(), sample, errors); } { CDecayRateController& controller{(*m_Controllers)[E_TrendControl]}; core_t::TTime time{static_cast(CBasicStatistics::mean(averageTime))}; - TDouble1Vec prediction{m_Trend->mean(time)}; + TDouble1Vec trendMean{m_TrendModel->meanValue(time)}; multiplier = controller.multiplier( - prediction, errors[E_TrendControl], this->params().bucketLength(), + trendMean, errors[E_TrendControl], this->params().bucketLength(), this->params().learnRate(), this->params().decayRate()); if 
(multiplier != 1.0) { - m_Trend->decayRate(multiplier * m_Trend->decayRate()); - LOG_TRACE(<< "trend decay rate = " << m_Trend->decayRate()); + m_TrendModel->decayRate(multiplier * m_TrendModel->decayRate()); + LOG_TRACE(<< "trend decay rate = " << m_TrendModel->decayRate()); } } { - CDecayRateController& controller{(*m_Controllers)[E_PriorControl]}; - TDouble1Vec prediction{m_Prior->marginalLikelihoodMean()}; + CDecayRateController& controller{(*m_Controllers)[E_ResidualControl]}; + TDouble1Vec residualMean{m_ResidualModel->marginalLikelihoodMean()}; multiplier = controller.multiplier( - prediction, errors[E_PriorControl], this->params().bucketLength(), + residualMean, errors[E_ResidualControl], this->params().bucketLength(), this->params().learnRate(), this->params().decayRate()); if (multiplier != 1.0) { - m_Prior->decayRate(multiplier * m_Prior->decayRate()); - LOG_TRACE(<< "prior decay rate = " << m_Prior->decayRate()); + m_ResidualModel->decayRate(multiplier * m_ResidualModel->decayRate()); + LOG_TRACE(<< "prior decay rate = " << m_ResidualModel->decayRate()); } } } - if (m_Correlations) { - m_Correlations->addSamples(m_Id, type, samples, weights, - params.propagationInterval(), multiplier); + if (m_Correlations != nullptr) { + m_Correlations->addSamples(m_Id, params, samples, multiplier); } if (randomSample) { @@ -728,45 +780,35 @@ CUnivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, } void CUnivariateTimeSeriesModel::skipTime(core_t::TTime gap) { - m_Trend->skipTime(gap); + m_TrendModel->skipTime(gap); } CUnivariateTimeSeriesModel::TDouble2Vec -CUnivariateTimeSeriesModel::mode(core_t::TTime time, - const TDouble2VecWeightsAry& weights_) const { - TDoubleWeightsAry weights; - for (std::size_t i = 0u; i < weights_.size(); ++i) { - weights[i] = weights_[i][0]; - } - return {m_Prior->marginalLikelihoodMode(weights) + - CBasicStatistics::mean(m_Trend->baseline(time))}; +CUnivariateTimeSeriesModel::mode(core_t::TTime time, const TDouble2VecWeightsAry& weights) const { + return {m_ResidualModel->marginalLikelihoodMode(unpack(weights)) + + CBasicStatistics::mean(m_TrendModel->value(time))}; } CUnivariateTimeSeriesModel::TDouble2Vec1Vec CUnivariateTimeSeriesModel::correlateModes(core_t::TTime time, - const TDouble2VecWeightsAry1Vec& weights_) const { - + const TDouble2VecWeightsAry1Vec& weights) const { TDouble2Vec1Vec result; TSize1Vec correlated; TSize2Vec1Vec variables; - TMultivariatePriorCPtrSizePr1Vec correlationDistributionModels; + TMultivariatePriorCPtrSizePr1Vec correlationModels; TModelCPtr1Vec correlatedTimeSeriesModels; - if (this->correlationModels(correlated, variables, correlationDistributionModels, + if (this->correlationModels(correlated, variables, correlationModels, correlatedTimeSeriesModels)) { result.resize(correlated.size(), TDouble10Vec(2)); double baseline[2]; - baseline[0] = CBasicStatistics::mean(m_Trend->baseline(time)); + baseline[0] = CBasicStatistics::mean(m_TrendModel->value(time)); for (std::size_t i = 0u; i < correlated.size(); ++i) { baseline[1] = CBasicStatistics::mean( - correlatedTimeSeriesModels[i]->m_Trend->baseline(time)); - TDouble10VecWeightsAry weights; - for (std::size_t j = 0u; j < weights_[i].size(); ++j) { - weights[j] = weights_[i][j]; - } - TDouble10Vec mode( - correlationDistributionModels[i].first->marginalLikelihoodMode(weights)); + correlatedTimeSeriesModels[i]->m_TrendModel->value(time)); + TDouble10Vec mode(correlationModels[i].first->marginalLikelihoodMode( + CMultivariateTimeSeriesModel::unpack(weights[i]))); 
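The controller renaming above (E_PriorControl becomes E_ResidualControl) leaves the control loop unchanged: each CDecayRateController turns recent prediction errors into a multiplier and the corresponding model's decay rate is simply rescaled by it. A stripped-down sketch of that pattern; computeMultiplier is a placeholder for the controller's logic, not the library API.

    #include <vector>

    struct Model {
        double decayRate = 0.01;
    };

    // Placeholder for CDecayRateController::multiplier: in the real code the
    // multiplier is derived from statistics of the recent prediction errors.
    double computeMultiplier(const std::vector<double>& /*predictionErrors*/) {
        return 1.0;
    }

    void controlDecayRates(Model& trendModel,
                           Model& residualModel,
                           const std::vector<double>& trendErrors,
                           const std::vector<double>& residualErrors) {
        double trendMultiplier = computeMultiplier(trendErrors);
        if (trendMultiplier != 1.0) {
            trendModel.decayRate *= trendMultiplier; // forget the trend faster or slower
        }
        double residualMultiplier = computeMultiplier(residualErrors);
        if (residualMultiplier != 1.0) {
            residualModel.decayRate *= residualMultiplier; // likewise for the residual model
        }
    }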
result[i][variables[i][0]] = baseline[0] + mode[variables[i][0]]; result[i][variables[i][1]] = baseline[1] + mode[variables[i][1]]; } @@ -776,47 +818,38 @@ CUnivariateTimeSeriesModel::correlateModes(core_t::TTime time, } CUnivariateTimeSeriesModel::TDouble2Vec1Vec -CUnivariateTimeSeriesModel::residualModes(const TDouble2VecWeightsAry& weights_) const { - +CUnivariateTimeSeriesModel::residualModes(const TDouble2VecWeightsAry& weights) const { TDouble2Vec1Vec result; - - TDoubleWeightsAry weights; - for (std::size_t i = 0u; i < weights_.size(); ++i) { - weights[i] = weights_[i][0]; - } - - TDouble1Vec modes(m_Prior->marginalLikelihoodModes(weights)); + TDouble1Vec modes(m_ResidualModel->marginalLikelihoodModes(unpack(weights))); result.reserve(modes.size()); for (auto mode : modes) { result.push_back({mode}); } - return result; } void CUnivariateTimeSeriesModel::detrend(const TTime2Vec1Vec& time, double confidenceInterval, TDouble2Vec1Vec& value) const { - if (value.empty()) { return; } if (value[0].size() == 1) { - value[0][0] = m_Trend->detrend(time[0][0], value[0][0], confidenceInterval); + value[0][0] = m_TrendModel->detrend(time[0][0], value[0][0], confidenceInterval); } else { TSize1Vec correlated; TSize2Vec1Vec variables; - TMultivariatePriorCPtrSizePr1Vec correlationDistributionModels; + TMultivariatePriorCPtrSizePr1Vec correlationModels; TModelCPtr1Vec correlatedTimeSeriesModels; - if (this->correlationModels(correlated, variables, correlationDistributionModels, + if (this->correlationModels(correlated, variables, correlationModels, correlatedTimeSeriesModels)) { for (std::size_t i = 0u; i < variables.size(); ++i) { if (!value[i].empty()) { - value[i][variables[i][0]] = m_Trend->detrend( + value[i][variables[i][0]] = m_TrendModel->detrend( time[i][variables[i][0]], value[i][variables[i][0]], confidenceInterval); value[i][variables[i][1]] = - correlatedTimeSeriesModels[i]->m_Trend->detrend( + correlatedTimeSeriesModels[i]->m_TrendModel->detrend( time[i][variables[i][1]], value[i][variables[i][1]], confidenceInterval); } @@ -838,7 +871,7 @@ CUnivariateTimeSeriesModel::predict(core_t::TTime time, TModelCPtr1Vec correlatedModel; if (m_Correlations->correlationModels(m_Id, correlated, variables, correlationModel, correlatedModel)) { - double sample{correlatedModel[0]->m_Trend->detrend( + double sample{correlatedModel[0]->m_TrendModel->detrend( time, correlatedValue[0].second, 0.0)}; TSize10Vec marginalize{variables[0][1]}; TSizeDoublePr10Vec condition{{variables[0][1], sample}}; @@ -854,21 +887,22 @@ CUnivariateTimeSeriesModel::predict(core_t::TTime time, double scale{1.0 - this->params().probabilityBucketEmpty()}; - double seasonalOffset{0.0}; - if (m_Trend->initialized()) { - seasonalOffset = CBasicStatistics::mean(m_Trend->baseline(time)); + double trend{0.0}; + if (m_TrendModel->initialized()) { + trend = CBasicStatistics::mean(m_TrendModel->value(time)); } if (hint.size() == 1) { - hint[0] = m_Trend->detrend(time, hint[0], 0.0); + hint[0] = m_TrendModel->detrend(time, hint[0], 0.0); } - double median{m_Prior->isNonInformative() - ? m_Prior->marginalLikelihoodMean() - : (hint.empty() - ? CBasicStatistics::mean(m_Prior->marginalLikelihoodConfidenceInterval(0.0)) - : m_Prior->nearestMarginalLikelihoodMean(hint[0]))}; - double result{scale * (seasonalOffset + median + correlateCorrection)}; + double median{ + m_ResidualModel->isNonInformative() + ? m_ResidualModel->marginalLikelihoodMean() + : (hint.empty() + ? 
CBasicStatistics::mean(m_ResidualModel->marginalLikelihoodConfidenceInterval(0.0)) + : m_ResidualModel->nearestMarginalLikelihoodMean(hint[0]))}; + double result{scale * (trend + median + correlateCorrection)}; return {m_IsNonNegative ? std::max(result, 0.0) : result}; } @@ -877,30 +911,24 @@ CUnivariateTimeSeriesModel::TDouble2Vec3Vec CUnivariateTimeSeriesModel::confidenceInterval(core_t::TTime time, double confidenceInterval, const TDouble2VecWeightsAry& weights_) const { - - if (m_Prior->isNonInformative()) { + if (m_ResidualModel->isNonInformative()) { return TDouble2Vec3Vec(); } double scale{1.0 - this->params().probabilityBucketEmpty()}; - double seasonalOffset{m_Trend->initialized() - ? CBasicStatistics::mean(m_Trend->baseline(time, confidenceInterval)) - : 0.0}; - - TDoubleWeightsAry weights; - for (std::size_t i = 0u; i < weights_.size(); ++i) { - weights[i] = weights_[i][0]; - } + double trend{m_TrendModel->initialized() + ? CBasicStatistics::mean(m_TrendModel->value(time, confidenceInterval)) + : 0.0}; + TDoubleWeightsAry weights(unpack(weights_)); double median{CBasicStatistics::mean( - m_Prior->marginalLikelihoodConfidenceInterval(0.0, weights))}; - TDoubleDoublePr interval{ - m_Prior->marginalLikelihoodConfidenceInterval(confidenceInterval, weights)}; + m_ResidualModel->marginalLikelihoodConfidenceInterval(0.0, weights))}; + TDoubleDoublePr interval{m_ResidualModel->marginalLikelihoodConfidenceInterval( + confidenceInterval, weights)}; - double result[]{scale * (seasonalOffset + interval.first), - scale * (seasonalOffset + median), - scale * (seasonalOffset + interval.second)}; + double result[]{scale * (trend + interval.first), scale * (trend + median), + scale * (trend + interval.second)}; return {{m_IsNonNegative ? std::max(result[0], 0.0) : result[0]}, {m_IsNonNegative ? std::max(result[1], 0.0) : result[1]}, @@ -914,35 +942,28 @@ bool CUnivariateTimeSeriesModel::forecast(core_t::TTime startTime, const TDouble2Vec& maximum_, const TForecastPushDatapointFunc& forecastPushDataPointFunc, std::string& messageOut) { - if (m_Prior->isNonInformative()) { + if (m_ResidualModel->isNonInformative()) { messageOut = forecast::INFO_INSUFFICIENT_HISTORY; return true; } using TDouble3Vec = core::CSmallVector; - using TDouble3VecVec = std::vector; core_t::TTime bucketLength{this->params().bucketLength()}; double minimum{m_IsNonNegative ? std::max(minimum_[0], 0.0) : minimum_[0]}; double maximum{m_IsNonNegative ? 
std::max(maximum_[0], 0.0) : maximum_[0]}; - TDouble3VecVec predictions; - m_Trend->forecast(startTime, endTime, bucketLength, confidenceInterval, - this->params().minimumSeasonalVarianceScale(), predictions); - - core_t::TTime time{startTime}; - for (const auto& prediction : predictions) { - SErrorBar errorBar; - errorBar.s_Time = time; - errorBar.s_BucketLength = bucketLength; - errorBar.s_LowerBound = CTools::truncate( - prediction[0], minimum, maximum + prediction[0] - prediction[1]); - errorBar.s_Predicted = CTools::truncate(prediction[1], minimum, maximum); - errorBar.s_UpperBound = CTools::truncate( - prediction[2], minimum + prediction[2] - prediction[1], maximum); + auto writer = [&](core_t::TTime time, const TDouble3Vec& prediction) { + SErrorBar errorBar{ + time, bucketLength, + CTools::truncate(prediction[0], minimum, maximum + prediction[0] - prediction[1]), + CTools::truncate(prediction[1], minimum, maximum), + CTools::truncate(prediction[2], minimum + prediction[2] - prediction[1], maximum)}; forecastPushDataPointFunc(errorBar); - time += bucketLength; - } + }; + + m_TrendModel->forecast(startTime, endTime, bucketLength, confidenceInterval, + this->params().minimumSeasonalVarianceScale(), writer); return true; } @@ -966,18 +987,14 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para if (value[0].size() == 1) { core_t::TTime time{time_[0][0]}; - TDouble1Vec sample{m_Trend->detrend(time, value[0][0], - params.seasonalConfidenceInterval())}; - - TDoubleWeightsAry1Vec weights(1); - for (std::size_t i = 0u; i < params.weights()[0].size(); ++i) { - weights[0][i] = params.weights()[0][i][0]; - } + TDouble1Vec sample{m_TrendModel->detrend( + time, value[0][0], params.seasonalConfidenceInterval())}; + maths_t::TDoubleWeightsAry1Vec weights{unpack(params.weights()[0])}; double pl, pu; maths_t::ETail tail_; - if (m_Prior->probabilityOfLessLikelySamples(params.calculation(0), sample, - weights, pl, pu, tail_)) { + if (m_ResidualModel->probabilityOfLessLikelySamples( + params.calculation(0), sample, weights, pl, pu, tail_)) { LOG_TRACE(<< "P(" << sample << " | weight = " << weights << ", time = " << time << ") = " << (pl + pu) / 2.0); } else { @@ -989,9 +1006,9 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para params.calculation(0), value[0], params.bucketEmpty()[0][0], this->params().probabilityBucketEmpty(), (pl + pu) / 2.0); - if (m_AnomalyModel) { + if (m_AnomalyModel != nullptr) { TDouble2Vec residual{ - (sample[0] - m_Prior->nearestMarginalLikelihoodMean(sample[0])) / + (sample[0] - m_ResidualModel->nearestMarginalLikelihoodMean(sample[0])) / std::max(std::sqrt(this->seasonalWeight(0.0, time)[0]), 1.0)}; m_AnomalyModel->updateAnomaly(params, time, residual, probability); m_AnomalyModel->probability(params, time, probability); @@ -1001,9 +1018,9 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para } else { TSize1Vec correlated; TSize2Vec1Vec variables; - TMultivariatePriorCPtrSizePr1Vec correlationDistributionModels; + TMultivariatePriorCPtrSizePr1Vec correlationModels; TModelCPtr1Vec correlatedTimeSeriesModels; - if (!this->correlationModels(correlated, variables, correlationDistributionModels, + if (!this->correlationModels(correlated, variables, correlationModels, correlatedTimeSeriesModels)) { return false; } @@ -1015,32 +1032,29 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para // Declared outside the loop to minimize the number of times they are 
created. TSize10Vec variable(1); TDouble10Vec1Vec sample{TDouble10Vec(2)}; - TDouble10VecWeightsAry1Vec weights{maths_t::CUnitWeights::unit(2)}; + maths_t::TDouble10VecWeightsAry1Vec weights{ + maths_t::CUnitWeights::unit(2)}; TDouble2Vec probabilityBucketEmpty(2); TDouble10Vec2Vec pli, pui; TTail10Vec ti; core_t::TTime mostAnomalousTime{0}; double mostAnomalousSample{0.0}; - TPriorPtr mostAnomalousPrior; + TPriorPtr mostAnomalousCorrelationModel; for (std::size_t i = 0u; i < variables.size(); ++i) { if (!value[i].empty() || (!params.mostAnomalousCorrelate() || i == *params.mostAnomalousCorrelate())) { variable[0] = variables[i][0]; - sample[0][variables[i][0]] = m_Trend->detrend( + sample[0][variables[i][0]] = m_TrendModel->detrend( time_[i][variables[i][0]], value[i][variables[i][0]], params.seasonalConfidenceInterval()); sample[0][variables[i][1]] = - correlatedTimeSeriesModels[i]->m_Trend->detrend( + correlatedTimeSeriesModels[i]->m_TrendModel->detrend( time_[i][variables[i][1]], value[i][variables[i][1]], params.seasonalConfidenceInterval()); - for (std::size_t j = 0u; j < params.weights()[i].size(); ++j) { - for (std::size_t d = 0u; d < 2; ++d) { - weights[0][j][d] = params.weights()[i][j][d]; - } - } + weights[0] = CMultivariateTimeSeriesModel::unpack(params.weights()[i]); - if (correlationDistributionModels[i].first->probabilityOfLessLikelySamples( + if (correlationModels[i].first->probabilityOfLessLikelySamples( params.calculation(0), sample, weights, variable, pli, pui, ti)) { LOG_TRACE(<< "Marginal P(" << sample << " | weight = " << weights << ", coordinate = " << variable @@ -1066,24 +1080,21 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para aggregator.add(p, neff); if (minProbability.add(p)) { - static TSizeDoublePr10Vec CONDITION; - static TSize10Vec MARGINALIZE; - tail[0] = ti[0]; mostAnomalousCorrelate.assign(1, i); conditional = ((pli[1][0] + pui[1][0]) < (pli[0][0] + pui[0][0])); mostAnomalousTime = time_[0][variables[i][0]]; mostAnomalousSample = sample[0][variables[i][0]]; - mostAnomalousPrior = + mostAnomalousCorrelationModel = conditional - ? correlationDistributionModels[i] + ? 
correlationModels[i] .first - ->univariate({variables[i][1]}, CONDITION) + ->univariate({variables[i][1]}, NOTHING_TO_CONDITION) .first - : correlationDistributionModels[i] + : correlationModels[i] .first ->univariate( - MARGINALIZE, + NOTHING_TO_MARGINALIZE, {{variables[i][1], sample[0][variables[i][1]]}}) .first; } @@ -1093,10 +1104,10 @@ bool CUnivariateTimeSeriesModel::probability(const CModelProbabilityParams& para } aggregator.calculate(probability); - if (m_AnomalyModel) { + if (m_AnomalyModel != nullptr) { TDouble2Vec residual{ - (mostAnomalousSample - - mostAnomalousPrior->nearestMarginalLikelihoodMean(mostAnomalousSample)) / + (mostAnomalousSample - mostAnomalousCorrelationModel->nearestMarginalLikelihoodMean( + mostAnomalousSample)) / std::max(std::sqrt(this->seasonalWeight(0.0, mostAnomalousTime)[0]), 1.0)}; m_AnomalyModel->updateAnomaly(params, mostAnomalousTime, residual, probability); m_AnomalyModel->probability(params, mostAnomalousTime, probability); @@ -1112,22 +1123,27 @@ CUnivariateTimeSeriesModel::winsorisationWeight(double derate, core_t::TTime time, const TDouble2Vec& value) const { double scale{this->seasonalWeight(0.0, time)[0]}; - double sample{m_Trend->detrend(time, value[0], 0.0)}; - return {computeWinsorisationWeight(*m_Prior, derate, scale, sample)}; + double sample{m_TrendModel->detrend(time, value[0], 0.0)}; + return {winsorisation::tailWeight(*m_ResidualModel, derate, scale, sample) * + winsorisation::changeWeight(m_ChangeDetector)}; } CUnivariateTimeSeriesModel::TDouble2Vec CUnivariateTimeSeriesModel::seasonalWeight(double confidence, core_t::TTime time) const { - double scale{ - m_Trend->scale(time, m_Prior->marginalLikelihoodVariance(), confidence).second}; + double scale{m_TrendModel + ->scale(time, m_ResidualModel->marginalLikelihoodVariance(), confidence) + .second}; return {std::max(scale, this->params().minimumSeasonalVarianceScale())}; } uint64_t CUnivariateTimeSeriesModel::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, m_IsNonNegative); seed = CChecksum::calculate(seed, m_Controllers); - seed = CChecksum::calculate(seed, m_Trend); - seed = CChecksum::calculate(seed, m_Prior); + seed = CChecksum::calculate(seed, m_TrendModel); + seed = CChecksum::calculate(seed, m_ResidualModel); + seed = CChecksum::calculate(seed, m_CandidateChangePoint); + seed = CChecksum::calculate(seed, m_CurrentChangeInterval); + seed = CChecksum::calculate(seed, m_ChangeDetector); seed = CChecksum::calculate(seed, m_AnomalyModel); seed = CChecksum::calculate(seed, m_SlidingWindow); return CChecksum::calculate(seed, m_Correlations != nullptr); @@ -1136,16 +1152,19 @@ uint64_t CUnivariateTimeSeriesModel::checksum(uint64_t seed) const { void CUnivariateTimeSeriesModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { mem->setName("CUnivariateTimeSeriesModel"); core::CMemoryDebug::dynamicSize("m_Controllers", m_Controllers, mem); - core::CMemoryDebug::dynamicSize("m_Trend", m_Trend, mem); - core::CMemoryDebug::dynamicSize("m_Prior", m_Prior, mem); + core::CMemoryDebug::dynamicSize("m_TrendModel", m_TrendModel, mem); + core::CMemoryDebug::dynamicSize("m_ResidualModel", m_ResidualModel, mem); core::CMemoryDebug::dynamicSize("m_AnomalyModel", m_AnomalyModel, mem); + core::CMemoryDebug::dynamicSize("m_ChangeDetector", m_ChangeDetector, mem); core::CMemoryDebug::dynamicSize("m_SlidingWindow", m_SlidingWindow, mem); } std::size_t CUnivariateTimeSeriesModel::memoryUsage() const { return core::CMemory::dynamicSize(m_Controllers) + - 
core::CMemory::dynamicSize(m_Trend) + core::CMemory::dynamicSize(m_Prior) + + core::CMemory::dynamicSize(m_TrendModel) + + core::CMemory::dynamicSize(m_ResidualModel) + core::CMemory::dynamicSize(m_AnomalyModel) + + core::CMemory::dynamicSize(m_ChangeDetector) + core::CMemory::dynamicSize(m_SlidingWindow); } @@ -1163,14 +1182,14 @@ bool CUnivariateTimeSeriesModel::acceptRestoreTraverser(const SModelRestoreParam m_Controllers = std::make_shared(), core::CPersistUtils::restore(CONTROLLER_6_3_TAG, *m_Controllers, traverser), /**/) - RESTORE(TREND_6_3_TAG, traverser.traverseSubLevel(boost::bind( - CTimeSeriesDecompositionStateSerialiser(), - boost::cref(params.s_DecompositionParams), - boost::ref(m_Trend), _1))) - RESTORE(PRIOR_6_3_TAG, traverser.traverseSubLevel(boost::bind( - CPriorStateSerialiser(), - boost::cref(params.s_DistributionParams), - boost::ref(m_Prior), _1))) + RESTORE(TREND_MODEL_6_3_TAG, traverser.traverseSubLevel(boost::bind( + CTimeSeriesDecompositionStateSerialiser(), + boost::cref(params.s_DecompositionParams), + boost::ref(m_TrendModel), _1))) + RESTORE(RESIDUAL_MODEL_6_3_TAG, + traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), boost::cref(params.s_DistributionParams), + boost::ref(m_ResidualModel), _1))) RESTORE_SETUP_TEARDOWN( ANOMALY_MODEL_6_3_TAG, m_AnomalyModel = std::make_shared(), @@ -1178,6 +1197,17 @@ bool CUnivariateTimeSeriesModel::acceptRestoreTraverser(const SModelRestoreParam boost::bind(&CTimeSeriesAnomalyModel::acceptRestoreTraverser, m_AnomalyModel.get(), boost::cref(params), _1)), /**/) + RESTORE(CANDIDATE_CHANGE_POINT_6_3_TAG, + fromDelimited(traverser.value(), m_CandidateChangePoint)) + RESTORE_BUILT_IN(CURRENT_CHANGE_INTERVAL_6_3_TAG, m_CurrentChangeInterval) + RESTORE_SETUP_TEARDOWN( + CHANGE_DETECTOR_6_3_TAG, + m_ChangeDetector = std::make_shared( + m_TrendModel, m_ResidualModel), + traverser.traverseSubLevel(boost::bind( + &CUnivariateTimeSeriesChangeDetector::acceptRestoreTraverser, + m_ChangeDetector.get(), boost::cref(params), _1)), + /**/) RESTORE(SLIDING_WINDOW_6_3_TAG, core::CPersistUtils::restore(SLIDING_WINDOW_6_3_TAG, m_SlidingWindow, traverser)) @@ -1197,11 +1227,11 @@ bool CUnivariateTimeSeriesModel::acceptRestoreTraverser(const SModelRestoreParam RESTORE(TREND_OLD_TAG, traverser.traverseSubLevel(boost::bind( CTimeSeriesDecompositionStateSerialiser(), boost::cref(params.s_DecompositionParams), - boost::ref(m_Trend), _1))) + boost::ref(m_TrendModel), _1))) RESTORE(PRIOR_OLD_TAG, traverser.traverseSubLevel(boost::bind( CPriorStateSerialiser(), boost::cref(params.s_DistributionParams), - boost::ref(m_Prior), _1))) + boost::ref(m_ResidualModel), _1))) RESTORE_SETUP_TEARDOWN( ANOMALY_MODEL_OLD_TAG, m_AnomalyModel = std::make_shared(), @@ -1226,12 +1256,20 @@ void CUnivariateTimeSeriesModel::acceptPersistInserter(core::CStatePersistInsert if (m_Controllers) { core::CPersistUtils::persist(CONTROLLER_6_3_TAG, *m_Controllers, inserter); } - inserter.insertLevel(TREND_6_3_TAG, + inserter.insertLevel(TREND_MODEL_6_3_TAG, boost::bind(CTimeSeriesDecompositionStateSerialiser(), - boost::cref(*m_Trend), _1)); - inserter.insertLevel(PRIOR_6_3_TAG, boost::bind(CPriorStateSerialiser(), - boost::cref(*m_Prior), _1)); - if (m_AnomalyModel) { + boost::cref(*m_TrendModel), _1)); + inserter.insertLevel(RESIDUAL_MODEL_6_3_TAG, + boost::bind(CPriorStateSerialiser(), + boost::cref(*m_ResidualModel), _1)); + inserter.insertValue(CANDIDATE_CHANGE_POINT_6_3_TAG, toDelimited(m_CandidateChangePoint)); + 
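The candidate change point restored and persisted here travels as a single "time,value" string via the toDelimited/fromDelimited helpers added earlier in this patch. A minimal, standard-library-only equivalent of that round trip (the real code goes through CStringUtils and single-precision formatting):

    #include <cstdint>
    #include <exception>
    #include <sstream>
    #include <string>
    #include <utility>

    using TimeDoublePr = std::pair<std::int64_t, double>;

    std::string toDelimited(const TimeDoublePr& value) {
        std::ostringstream result;
        result << value.first << ',' << value.second;
        return result.str();
    }

    bool fromDelimited(const std::string& str, TimeDoublePr& value) {
        std::size_t pos = str.find(',');
        if (pos == std::string::npos) {
            return false;
        }
        try {
            value.first = std::stoll(str.substr(0, pos));
            value.second = std::stod(str.substr(pos + 1));
        } catch (const std::exception&) {
            return false;
        }
        return true;
    }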
inserter.insertValue(CURRENT_CHANGE_INTERVAL_6_3_TAG, m_CurrentChangeInterval); + if (m_ChangeDetector != nullptr) { + inserter.insertLevel(CHANGE_DETECTOR_6_3_TAG, + boost::bind(&CUnivariateTimeSeriesChangeDetector::acceptPersistInserter, + m_ChangeDetector.get(), _1)); + } + if (m_AnomalyModel != nullptr) { inserter.insertLevel(ANOMALY_MODEL_6_3_TAG, boost::bind(&CTimeSeriesAnomalyModel::acceptPersistInserter, m_AnomalyModel.get(), _1)); @@ -1240,7 +1278,32 @@ void CUnivariateTimeSeriesModel::acceptPersistInserter(core::CStatePersistInsert } maths_t::EDataType CUnivariateTimeSeriesModel::dataType() const { - return m_Prior->dataType(); + return m_ResidualModel->dataType(); +} + +CUnivariateTimeSeriesModel::TDoubleWeightsAry +CUnivariateTimeSeriesModel::unpack(const TDouble2VecWeightsAry& weights) { + TDoubleWeightsAry result{maths_t::CUnitWeights::UNIT}; + for (std::size_t i = 0u; i < weights.size(); ++i) { + result[i] = weights[i][0]; + } + return result; +} + +void CUnivariateTimeSeriesModel::reinitializeResidualModel(double learnRate, + const TDecompositionPtr& trend, + const TTimeDoublePrCBuf& slidingWindow, + CPrior& residualModel) { + residualModel.setToNonInformative(0.0, residualModel.decayRate()); + if (!slidingWindow.empty()) { + double slidingWindowLength{static_cast(slidingWindow.size())}; + maths_t::TDoubleWeightsAry1Vec weight{maths_t::countWeight( + std::max(learnRate, std::min(5.0 / slidingWindowLength, 1.0)))}; + for (const auto& value : slidingWindow) { + TDouble1Vec sample{trend->detrend(value.first, value.second, 0.0)}; + residualModel.addSamples(sample, weight); + } + } } const CUnivariateTimeSeriesModel::TTimeDoublePrCBuf& @@ -1248,28 +1311,105 @@ CUnivariateTimeSeriesModel::slidingWindow() const { return m_SlidingWindow; } -const CTimeSeriesDecompositionInterface& CUnivariateTimeSeriesModel::trend() const { - return *m_Trend; +const CTimeSeriesDecompositionInterface& CUnivariateTimeSeriesModel::trendModel() const { + return *m_TrendModel; } -const CPrior& CUnivariateTimeSeriesModel::prior() const { - return *m_Prior; +const CPrior& CUnivariateTimeSeriesModel::residualModel() const { + return *m_ResidualModel; } CUnivariateTimeSeriesModel::CUnivariateTimeSeriesModel(const CUnivariateTimeSeriesModel& other, - std::size_t id) + std::size_t id, + bool isForForecast) : CModel(other.params()), m_Id(id), m_IsNonNegative(other.m_IsNonNegative), m_IsForecastable(other.m_IsForecastable), m_Rng(other.m_Rng), - m_Trend(other.m_Trend->clone()), m_Prior(other.m_Prior->clone()), - m_AnomalyModel(other.m_AnomalyModel + m_TrendModel(other.m_TrendModel->clone()), + m_ResidualModel(other.m_ResidualModel->clone()), + m_AnomalyModel(!isForForecast && other.m_AnomalyModel ? std::make_shared(*other.m_AnomalyModel) : TAnomalyModelPtr()), - m_SlidingWindow(other.m_SlidingWindow), m_Correlations(nullptr) { - if (other.m_Controllers) { + m_CandidateChangePoint(other.m_CandidateChangePoint), + m_CurrentChangeInterval(other.m_CurrentChangeInterval), + m_ChangeDetector(!isForForecast && other.m_ChangeDetector + ? std::make_shared( + *other.m_ChangeDetector) + : TChangeDetectorPtr()), + m_SlidingWindow(!isForForecast ? 
other.m_SlidingWindow : TTimeDoublePrCBuf{}), + m_Correlations(nullptr) { + if (!isForForecast && other.m_Controllers != nullptr) { m_Controllers = std::make_shared(*other.m_Controllers); } } +CUnivariateTimeSeriesModel::EUpdateResult +CUnivariateTimeSeriesModel::testAndApplyChange(const CModelAddSamplesParams& params, + const TSizeVec& order, + const TTimeDouble2VecSizeTrVec& values) { + std::size_t median{order[order.size() / 2]}; + TDoubleWeightsAry weights{unpack(params.priorWeights()[median])}; + core_t::TTime time{values[median].first}; + + if (m_ChangeDetector == nullptr) { + core_t::TTime minimumTimeToDetect{this->params().minimumTimeToDetectChange()}; + core_t::TTime maximumTimeToTest{this->params().maximumTimeToTestForChange()}; + double weight{maths_t::winsorisationWeight(weights)}; + if (minimumTimeToDetect < maximumTimeToTest && + winsorisation::pValueFromWeight(weight) <= CHANGE_P_VALUE) { + m_CurrentChangeInterval += this->params().bucketLength(); + if (this->params().testForChange(m_CurrentChangeInterval)) { + m_ChangeDetector = std::make_shared( + m_TrendModel, m_ResidualModel, minimumTimeToDetect, maximumTimeToTest); + m_CurrentChangeInterval = 0; + } + } else { + m_CandidateChangePoint = {time, values[median].second[0]}; + m_CurrentChangeInterval = 0; + } + } + + if (m_ChangeDetector != nullptr) { + m_ChangeDetector->addSamples({{time, values[median].second[0]}}, {weights}); + + if (m_ChangeDetector->stopTesting()) { + m_ChangeDetector.reset(); + } else if (auto change = m_ChangeDetector->change()) { + LOG_DEBUG("Detected " << change->print() << " at " << values[median].first); + m_ChangeDetector.reset(); + return this->applyChange(*change); + } + } + + return E_Success; +} + +CUnivariateTimeSeriesModel::EUpdateResult +CUnivariateTimeSeriesModel::applyChange(const SChangeDescription& change) { + for (auto& value : m_SlidingWindow) { + switch (change.s_Description) { + case SChangeDescription::E_LevelShift: + value.second += change.s_Value[0]; + break; + case SChangeDescription::E_LinearScale: + value.second *= change.s_Value[0]; + break; + case SChangeDescription::E_TimeShift: + value.first += static_cast(change.s_Value[0]); + break; + } + } + + if (m_TrendModel->applyChange(m_CandidateChangePoint.first, + m_CandidateChangePoint.second, change)) { + this->reinitializeStateGivenNewComponent(); + } else { + change.s_ResidualModel->decayRate(m_ResidualModel->decayRate()); + m_ResidualModel = change.s_ResidualModel; + } + + return E_Success; +} + CUnivariateTimeSeriesModel::EUpdateResult CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples, const TDouble2VecWeightsAryVec& weights) { @@ -1280,6 +1420,8 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples, } } + EUpdateResult result = E_Success; + // Time order is not reliable, for example if the data are polled // or for count feature, the times of all samples will be the same. 
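applyChange above first rewrites the values held in the sliding window so they remain consistent with the detected change, then either reinitialises the model state or adopts the detector's residual model. A simplified sketch of just the window rewrite, with the types reduced to a plain vector of (time, value) pairs:

    #include <cstdint>
    #include <utility>
    #include <vector>

    enum class ChangeType { LevelShift, LinearScale, TimeShift };

    struct ChangeDescription {
        ChangeType type;
        double value; // shift, scale factor or time shift in seconds
    };

    using TimeDoublePr = std::pair<std::int64_t, double>;

    void applyChangeToWindow(std::vector<TimeDoublePr>& window,
                             const ChangeDescription& change) {
        for (auto& point : window) {
            switch (change.type) {
            case ChangeType::LevelShift:
                point.second += change.value; // shift every value up or down
                break;
            case ChangeType::LinearScale:
                point.second *= change.value; // rescale every value
                break;
            case ChangeType::TimeShift:
                point.first += static_cast<std::int64_t>(change.value); // move in time
                break;
            }
        }
    }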
TSizeVec timeorder(samples.size()); @@ -1291,43 +1433,17 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples, samples[rhs].first, samples[rhs].second); }); - EUpdateResult result = E_Success; - { - TDoubleWeightsAry weight; - for (auto i : timeorder) { - core_t::TTime time{samples[i].first}; - double value{samples[i].second[0]}; - for (std::size_t j = 0u; j < weights[i].size(); ++j) { - weight[j] = weights[i][j][0]; - } - if (m_Trend->addPoint(time, value, weight)) { - result = E_Reset; - } + for (auto i : timeorder) { + core_t::TTime time{samples[i].first}; + double value{samples[i].second[0]}; + TDoubleWeightsAry weight{unpack(weights[i])}; + if (m_TrendModel->addPoint(time, value, weight)) { + result = E_Reset; } } + if (result == E_Reset) { - m_Prior->setToNonInformative(0.0, m_Prior->decayRate()); - TDoubleWeightsAry1Vec weight{maths_t::countWeight( - slidingWindowCountWeight(this->params().learnRate()))}; - for (const auto& value : m_SlidingWindow) { - TDouble1Vec sample{m_Trend->detrend(value.first, value.second, 0.0)}; - m_Prior->addSamples(sample, weight); - } - if (m_Correlations) { - m_Correlations->removeTimeSeries(m_Id); - } - if (m_Controllers) { - m_Prior->decayRate(m_Prior->decayRate() / - (*m_Controllers)[E_PriorControl].multiplier()); - m_Trend->decayRate(m_Trend->decayRate() / - (*m_Controllers)[E_TrendControl].multiplier()); - for (auto& controller : *m_Controllers) { - controller.reset(); - } - } - if (m_AnomalyModel) { - m_AnomalyModel->reset(); - } + this->reinitializeStateGivenNewComponent(); } return result; @@ -1338,22 +1454,43 @@ void CUnivariateTimeSeriesModel::appendPredictionErrors(double interval, TDouble1VecVec (&result)[2]) { using TDecompositionPtr1Vec = core::CSmallVector; TDouble1Vec sample{sample_}; - TDecompositionPtr1Vec trend{m_Trend}; - if (auto error = predictionError(interval, m_Prior, sample)) { - result[E_PriorControl].push_back(*error); + TDecompositionPtr1Vec trend{m_TrendModel}; + if (auto error = predictionError(interval, m_ResidualModel, sample)) { + result[E_ResidualControl].push_back(*error); } if (auto error = predictionError(trend, sample)) { result[E_TrendControl].push_back(*error); } } +void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent() { + reinitializeResidualModel(this->params().learnRate(), m_TrendModel, + m_SlidingWindow, *m_ResidualModel); + if (m_Correlations != nullptr) { + m_Correlations->removeTimeSeries(m_Id); + } + if (m_Controllers != nullptr) { + m_ResidualModel->decayRate(m_ResidualModel->decayRate() / + (*m_Controllers)[E_ResidualControl].multiplier()); + m_TrendModel->decayRate(m_TrendModel->decayRate() / + (*m_Controllers)[E_TrendControl].multiplier()); + for (auto& controller : *m_Controllers) { + controller.reset(); + } + } + if (m_AnomalyModel != nullptr) { + m_AnomalyModel->reset(); + } + m_ChangeDetector.reset(); +} + bool CUnivariateTimeSeriesModel::correlationModels(TSize1Vec& correlated, TSize2Vec1Vec& variables, - TMultivariatePriorCPtrSizePr1Vec& correlationDistributionModels, + TMultivariatePriorCPtrSizePr1Vec& correlationModels, TModelCPtr1Vec& correlatedTimeSeriesModels) const { if (m_Correlations) { correlated = m_Correlations->correlated(m_Id); - m_Correlations->correlationModels(m_Id, correlated, variables, correlationDistributionModels, + m_Correlations->correlationModels(m_Id, correlated, variables, correlationModels, correlatedTimeSeriesModels); } return correlated.size() > 0; @@ -1409,7 +1546,7 @@ void CTimeSeriesCorrelations::processSamples() { 
core::CFunctional::SDereference()); TDouble10Vec1Vec multivariateSamples; - TDouble10VecWeightsAry1Vec multivariateWeights; + maths_t::TDouble10VecWeightsAry1Vec multivariateWeights; for (auto i : iterators) { std::size_t pid1{i->first.first}; std::size_t pid2{i->first.second}; @@ -1592,7 +1729,7 @@ void CTimeSeriesCorrelations::refresh(const CTimeSeriesCorrelateModelAllocator& } const CTimeSeriesCorrelations::TSizeSizePrMultivariatePriorPtrDoublePrUMap& -CTimeSeriesCorrelations::correlatePriors() const { +CTimeSeriesCorrelations::correlationModels() const { return m_CorrelationDistributionModels; } @@ -1621,8 +1758,8 @@ bool CTimeSeriesCorrelations::acceptRestoreTraverser(const SDistributionRestoreP &CKMostCorrelated::acceptRestoreTraverser, &m_Correlations, _1))) RESTORE(CORRELATED_LOOKUP_TAG, core::CPersistUtils::restore(CORRELATED_LOOKUP_TAG, m_CorrelatedLookup, traverser)) - RESTORE(CORRELATED_PRIORS_TAG, - traverser.traverseSubLevel(boost::bind(&CTimeSeriesCorrelations::restoreCorrelatePriors, + RESTORE(CORRELATION_MODELS_TAG, + traverser.traverseSubLevel(boost::bind(&CTimeSeriesCorrelations::restoreCorrelationModels, this, boost::cref(params), _1))) } while (traverser.next()); return true; @@ -1638,16 +1775,16 @@ void CTimeSeriesCorrelations::acceptPersistInserter(core::CStatePersistInserter& &m_Correlations, _1)); core::CPersistUtils::persist(CORRELATED_LOOKUP_TAG, m_CorrelatedLookup, inserter); inserter.insertLevel( - CORRELATED_PRIORS_TAG, - boost::bind(&CTimeSeriesCorrelations::persistCorrelatePriors, this, _1)); + CORRELATION_MODELS_TAG, + boost::bind(&CTimeSeriesCorrelations::persistCorrelationModels, this, _1)); } -bool CTimeSeriesCorrelations::restoreCorrelatePriors(const SDistributionRestoreParams& params, - core::CStateRestoreTraverser& traverser) { +bool CTimeSeriesCorrelations::restoreCorrelationModels(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { do { const std::string& name{traverser.name()}; RESTORE_SETUP_TEARDOWN( - CORRELATE_PRIOR_TAG, TSizeSizePrMultivariatePriorPtrDoublePrPr prior, + CORRELATION_MODEL_TAG, TSizeSizePrMultivariatePriorPtrDoublePrPr prior, traverser.traverseSubLevel(boost::bind(&restore, boost::cref(params), boost::ref(prior), _1)), m_CorrelationDistributionModels.insert(prior)) @@ -1655,8 +1792,7 @@ bool CTimeSeriesCorrelations::restoreCorrelatePriors(const SDistributionRestoreP return true; } -void CTimeSeriesCorrelations::persistCorrelatePriors(core::CStatePersistInserter& inserter) const { - +void CTimeSeriesCorrelations::persistCorrelationModels(core::CStatePersistInserter& inserter) const { using TSizeSizePrMultivariatePriorPtrDoublePrUMapCItrVec = std::vector; TSizeSizePrMultivariatePriorPtrDoublePrUMapCItrVec ordered; @@ -1668,35 +1804,35 @@ void CTimeSeriesCorrelations::persistCorrelatePriors(core::CStatePersistInserter std::sort(ordered.begin(), ordered.end(), core::CFunctional::SDereference()); for (auto prior : ordered) { - inserter.insertLevel(CORRELATE_PRIOR_TAG, + inserter.insertLevel(CORRELATION_MODEL_TAG, boost::bind(&persist, boost::cref(*prior), _1)); } } bool CTimeSeriesCorrelations::restore(const SDistributionRestoreParams& params, - TSizeSizePrMultivariatePriorPtrDoublePrPr& prior, + TSizeSizePrMultivariatePriorPtrDoublePrPr& model, core::CStateRestoreTraverser& traverser) { do { const std::string& name{traverser.name()}; - RESTORE_BUILT_IN(FIRST_CORRELATE_ID_TAG, prior.first.first) - RESTORE_BUILT_IN(SECOND_CORRELATE_ID_TAG, prior.first.second) - RESTORE(CORRELATE_PRIOR_TAG, 
traverser.traverseSubLevel(boost::bind( - CPriorStateSerialiser(), boost::cref(params), - boost::ref(prior.second.first), _1))) - RESTORE_BUILT_IN(CORRELATION_TAG, prior.second.second) + RESTORE_BUILT_IN(FIRST_CORRELATE_ID_TAG, model.first.first) + RESTORE_BUILT_IN(SECOND_CORRELATE_ID_TAG, model.first.second) + RESTORE(CORRELATION_MODEL_TAG, traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), boost::cref(params), + boost::ref(model.second.first), _1))) + RESTORE_BUILT_IN(CORRELATION_TAG, model.second.second) } while (traverser.next()); return true; } -void CTimeSeriesCorrelations::persist(const TSizeSizePrMultivariatePriorPtrDoublePrPr& prior, +void CTimeSeriesCorrelations::persist(const TSizeSizePrMultivariatePriorPtrDoublePrPr& model, core::CStatePersistInserter& inserter) { - inserter.insertValue(FIRST_CORRELATE_ID_TAG, prior.first.first); - inserter.insertValue(SECOND_CORRELATE_ID_TAG, prior.first.second); - inserter.insertLevel(CORRELATE_PRIOR_TAG, + inserter.insertValue(FIRST_CORRELATE_ID_TAG, model.first.first); + inserter.insertValue(SECOND_CORRELATE_ID_TAG, model.first.second); + inserter.insertLevel(CORRELATION_MODEL_TAG, boost::bind(CPriorStateSerialiser(), - boost::cref(*prior.second.first), _1)); - inserter.insertValue(CORRELATION_TAG, prior.second.second, core::CIEEE754::E_SinglePrecision); + boost::cref(*model.second.first), _1)); + inserter.insertValue(CORRELATION_TAG, model.second.second, core::CIEEE754::E_SinglePrecision); } void CTimeSeriesCorrelations::addTimeSeries(std::size_t id, @@ -1722,23 +1858,22 @@ void CTimeSeriesCorrelations::removeTimeSeries(std::size_t id) { } void CTimeSeriesCorrelations::addSamples(std::size_t id, - maths_t::EDataType type, + const CModelAddSamplesParams& params, const TTimeDouble2VecSizeTrVec& samples, - const TDoubleWeightsAry1Vec& weights, - double interval, double multiplier) { SSampleData& data{m_SampleData[id]}; - data.s_Type = type; + data.s_Type = params.type(); data.s_Times.reserve(samples.size()); data.s_Samples.reserve(samples.size()); data.s_Tags.reserve(samples.size()); - for (const auto& sample : samples) { - data.s_Times.push_back(sample.first); - data.s_Samples.push_back(sample.second[0]); - data.s_Tags.push_back(sample.third); - } - data.s_Weights = weights; - data.s_Interval = interval; + for (std::size_t i = 0u; i < samples.size(); ++i) { + data.s_Times.push_back(samples[i].first); + data.s_Samples.push_back(samples[i].second[0]); + data.s_Tags.push_back(samples[i].third); + data.s_Weights.push_back( + CUnivariateTimeSeriesModel::unpack(params.priorWeights()[i])); + } + data.s_Interval = params.propagationInterval(); data.s_Multiplier = multiplier; m_Correlations.add(id, CBasicStatistics::median(data.s_Samples)); } @@ -1751,11 +1886,11 @@ TSize1Vec CTimeSeriesCorrelations::correlated(std::size_t id) const { bool CTimeSeriesCorrelations::correlationModels(std::size_t id, TSize1Vec& correlated, TSize2Vec1Vec& variables, - TMultivariatePriorCPtrSizePr1Vec& correlationDistributionModels, + TMultivariatePriorCPtrSizePr1Vec& correlationModels, TModelCPtr1Vec& correlatedTimeSeriesModels) const { variables.clear(); - correlationDistributionModels.clear(); + correlationModels.clear(); correlatedTimeSeriesModels.clear(); if (correlated.empty()) { @@ -1763,7 +1898,7 @@ bool CTimeSeriesCorrelations::correlationModels(std::size_t id, } variables.reserve(correlated.size()); - correlationDistributionModels.reserve(correlated.size()); + correlationModels.reserve(correlated.size()); 
correlatedTimeSeriesModels.reserve(correlated.size()); std::size_t end{0u}; for (auto correlate : correlated) { @@ -1788,7 +1923,7 @@ bool CTimeSeriesCorrelations::correlationModels(std::size_t id, } correlated[end] = correlate; variables.push_back(std::move(variable)); - correlationDistributionModels.push_back({i->second.first.get(), variable[0]}); + correlationModels.push_back({i->second.first.get(), variable[0]}); ++end; } @@ -1797,7 +1932,7 @@ bool CTimeSeriesCorrelations::correlationModels(std::size_t id, correlatedTimeSeriesModels.push_back(m_TimeSeriesModels[correlate]); } - return correlationDistributionModels.size() > 0; + return correlationModels.size() > 0; } void CTimeSeriesCorrelations::refreshLookup() { @@ -1816,10 +1951,10 @@ void CTimeSeriesCorrelations::refreshLookup() { CMultivariateTimeSeriesModel::CMultivariateTimeSeriesModel( const CModelParams& params, const CTimeSeriesDecompositionInterface& trend, - const CMultivariatePrior& prior, + const CMultivariatePrior& residualModel, const TDecayRateController2Ary* controllers, bool modelAnomalies) - : CModel(params), m_IsNonNegative(false), m_Prior(prior.clone()), + : CModel(params), m_IsNonNegative(false), m_ResidualModel(residualModel.clone()), m_AnomalyModel(modelAnomalies ? std::make_shared( params.bucketLength(), params.decayRate()) @@ -1829,13 +1964,13 @@ CMultivariateTimeSeriesModel::CMultivariateTimeSeriesModel( m_Controllers = std::make_shared(*controllers); } for (std::size_t d = 0u; d < this->dimension(); ++d) { - m_Trend.emplace_back(trend.clone()); + m_TrendModel.emplace_back(trend.clone()); } } CMultivariateTimeSeriesModel::CMultivariateTimeSeriesModel(const CMultivariateTimeSeriesModel& other) : CModel(other.params()), m_IsNonNegative(other.m_IsNonNegative), - m_Prior(other.m_Prior->clone()), + m_ResidualModel(other.m_ResidualModel->clone()), m_AnomalyModel(other.m_AnomalyModel ? 
std::make_shared(*other.m_AnomalyModel) : TAnomalyModelPtr()), @@ -1843,9 +1978,9 @@ CMultivariateTimeSeriesModel::CMultivariateTimeSeriesModel(const CMultivariateTi if (other.m_Controllers) { m_Controllers = std::make_shared(*other.m_Controllers); } - m_Trend.reserve(other.m_Trend.size()); - for (const auto& trend : other.m_Trend) { - m_Trend.emplace_back(trend->clone()); + m_TrendModel.reserve(other.m_TrendModel.size()); + for (const auto& trend : other.m_TrendModel) { + m_TrendModel.emplace_back(trend->clone()); } } @@ -1896,7 +2031,6 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, return E_Success; } - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TOptionalTimeDouble2VecPr = boost::optional; TSizeVec valueorder(samples.size()); @@ -1907,16 +2041,19 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, }); TOptionalTimeDouble2VecPr randomSample; - - double p{SLIDING_WINDOW_SIZE * static_cast(this->params().bucketLength()) / - static_cast(core::constants::DAY)}; - if (p >= 1.0 || CSampling::uniformSample(m_Rng, 0.0, 1.0) < p) { - std::size_t i{CSampling::uniformSample(m_Rng, 0, samples.size())}; - randomSample.reset({samples[valueorder[i]].first, samples[valueorder[i]].second}); + if (TOptionalSize index = randomlySample( + m_Rng, params, this->params().bucketLength(), valueorder)) { + randomSample.reset({samples[*index].first, samples[*index].second}); } m_IsNonNegative = params.isNonNegative(); + maths_t::EDataType type{params.type()}; + m_ResidualModel->dataType(type); + for (auto& trendModel : m_TrendModel) { + trendModel->dataType(type); + } + std::size_t dimension{this->dimension()}; EUpdateResult result{this->updateTrend(samples, params.trendWeights())}; @@ -1929,7 +2066,7 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, } core_t::TTime time{sample.first}; for (std::size_t d = 0u; d < sample.second.size(); ++d) { - sample.second[d] = m_Trend[d]->detrend(time, sample.second[d], 0.0); + sample.second[d] = m_TrendModel[d]->detrend(time, sample.second[d], 0.0); } } @@ -1938,35 +2075,25 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, return samples[lhs].second < samples[rhs].second; }); - maths_t::EDataType type{params.type()}; - m_Prior->dataType(type); - TDouble10Vec1Vec samples_; - TDouble10VecWeightsAry1Vec weights; + maths_t::TDouble10VecWeightsAry1Vec weights_; samples_.reserve(samples.size()); - weights.reserve(samples.size()); + weights_.reserve(samples.size()); TMeanAccumulator averageTime; for (auto i : valueorder) { samples_.push_back(samples[i].second); - TDouble10VecWeightsAry wi(maths_t::CUnitWeights::unit(dimension)); - for (std::size_t j = 0u; j < maths_t::NUMBER_WEIGHT_STYLES; ++j) { - const TDouble2Vec& weight{params.priorWeights()[i][j]}; - for (std::size_t d = 0u; d < dimension; ++d) { - wi[j][d] = weight[d]; - } - } - weights.push_back(wi); + weights_.push_back(unpack(params.priorWeights()[i])); averageTime.add(static_cast(samples[i].first)); } - m_Prior->addSamples(samples_, weights); - m_Prior->propagateForwardsByTime(params.propagationInterval()); - if (m_AnomalyModel) { + m_ResidualModel->addSamples(samples_, weights_); + m_ResidualModel->propagateForwardsByTime(params.propagationInterval()); + if (m_AnomalyModel != nullptr) { m_AnomalyModel->propagateForwardsByTime(params.propagationInterval()); } - if (m_Controllers) { + if (m_Controllers != nullptr) { TDouble1VecVec errors[2]; 
errors[0].reserve(samples.size()); errors[1].reserve(samples.size()); @@ -1976,30 +2103,30 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, } { CDecayRateController& controller{(*m_Controllers)[E_TrendControl]}; - TDouble1Vec prediction(dimension); + TDouble1Vec trendMean(dimension); core_t::TTime time{static_cast(CBasicStatistics::mean(averageTime))}; for (std::size_t d = 0u; d < dimension; ++d) { - prediction[d] = m_Trend[d]->mean(time); + trendMean[d] = m_TrendModel[d]->meanValue(time); } double multiplier{controller.multiplier( - prediction, errors[E_TrendControl], this->params().bucketLength(), + trendMean, errors[E_TrendControl], this->params().bucketLength(), this->params().learnRate(), this->params().decayRate())}; if (multiplier != 1.0) { - for (const auto& trend : m_Trend) { + for (const auto& trend : m_TrendModel) { trend->decayRate(multiplier * trend->decayRate()); } - LOG_TRACE(<< "trend decay rate = " << m_Trend[0]->decayRate()); + LOG_TRACE(<< "trend decay rate = " << m_TrendModel[0]->decayRate()); } } { - CDecayRateController& controller{(*m_Controllers)[E_PriorControl]}; - TDouble1Vec prediction(m_Prior->marginalLikelihoodMean()); + CDecayRateController& controller{(*m_Controllers)[E_ResidualControl]}; + TDouble1Vec residualMean(m_ResidualModel->marginalLikelihoodMean()); double multiplier{controller.multiplier( - prediction, errors[E_PriorControl], this->params().bucketLength(), + residualMean, errors[E_ResidualControl], this->params().bucketLength(), this->params().learnRate(), this->params().decayRate())}; if (multiplier != 1.0) { - m_Prior->decayRate(multiplier * m_Prior->decayRate()); - LOG_TRACE(<< "prior decay rate = " << m_Prior->decayRate()); + m_ResidualModel->decayRate(multiplier * m_ResidualModel->decayRate()); + LOG_TRACE(<< "prior decay rate = " << m_ResidualModel->decayRate()); } } } @@ -2012,30 +2139,20 @@ CMultivariateTimeSeriesModel::addSamples(const CModelAddSamplesParams& params, } void CMultivariateTimeSeriesModel::skipTime(core_t::TTime gap) { - for (const auto& trend : m_Trend) { + for (const auto& trend : m_TrendModel) { trend->skipTime(gap); } } CMultivariateTimeSeriesModel::TDouble2Vec CMultivariateTimeSeriesModel::mode(core_t::TTime time, - const TDouble2VecWeightsAry& weights_) const { - - std::size_t dimension = this->dimension(); - + const TDouble2VecWeightsAry& weights) const { + std::size_t dimension{this->dimension()}; TDouble2Vec result(dimension); - - TDouble10VecWeightsAry weights; - for (std::size_t i = 0u; i < weights_.size(); ++i) { - weights[i] = weights_[i]; - } - - TDouble10Vec mode(m_Prior->marginalLikelihoodMode(weights)); - + TDouble10Vec mode(m_ResidualModel->marginalLikelihoodMode(unpack(weights))); for (std::size_t d = 0u; d < dimension; ++d) { - result[d] = mode[d] + CBasicStatistics::mean(m_Trend[d]->baseline(time)); + result[d] = mode[d] + CBasicStatistics::mean(m_TrendModel[d]->value(time)); } - return result; } @@ -2046,13 +2163,8 @@ CMultivariateTimeSeriesModel::correlateModes(core_t::TTime /*time*/, } CMultivariateTimeSeriesModel::TDouble2Vec1Vec -CMultivariateTimeSeriesModel::residualModes(const TDouble2VecWeightsAry& weights_) const { - - TDouble10VecWeightsAry weights; - for (std::size_t i = 0u; i < weights_.size(); ++i) { - weights[i] = weights_[i]; - } - TDouble10Vec1Vec modes(m_Prior->marginalLikelihoodModes(weights)); +CMultivariateTimeSeriesModel::residualModes(const TDouble2VecWeightsAry& weights) const { + TDouble10Vec1Vec 
modes(m_ResidualModel->marginalLikelihoodModes(unpack(weights))); TDouble2Vec1Vec result; result.reserve(modes.size()); for (const auto& mode : modes) { @@ -2067,7 +2179,7 @@ void CMultivariateTimeSeriesModel::detrend(const TTime2Vec1Vec& time_, std::size_t dimension{this->dimension()}; core_t::TTime time{time_[0][0]}; for (std::size_t d = 0u; d < dimension; ++d) { - value[0][d] = m_Trend[d]->detrend(time, value[0][d], confidenceInterval); + value[0][d] = m_TrendModel[d]->detrend(time, value[0][d], confidenceInterval); } } @@ -2077,14 +2189,12 @@ CMultivariateTimeSeriesModel::predict(core_t::TTime time, TDouble2Vec hint) const { using TUnivariatePriorPtr = std::shared_ptr; - static const TSizeDoublePr10Vec CONDITION; - std::size_t dimension{this->dimension()}; double scale{1.0 - this->params().probabilityBucketEmpty()}; if (hint.size() == dimension) { for (std::size_t d = 0u; d < dimension; ++d) { - hint[d] = m_Trend[d]->detrend(time, hint[d], 0.0); + hint[d] = m_TrendModel[d]->detrend(time, hint[d], 0.0); } } @@ -2092,23 +2202,23 @@ CMultivariateTimeSeriesModel::predict(core_t::TTime time, std::iota(marginalize.begin(), marginalize.end(), 1); TDouble2Vec result(dimension); - TDouble10Vec mean(m_Prior->marginalLikelihoodMean()); + TDouble10Vec mean(m_ResidualModel->marginalLikelihoodMean()); for (std::size_t d = 0u; d < dimension; --marginalize[std::min(d, dimension - 2)], ++d) { - double seasonalOffset{0.0}; - if (m_Trend[d]->initialized()) { - seasonalOffset = CBasicStatistics::mean(m_Trend[d]->baseline(time)); + double trend{0.0}; + if (m_TrendModel[d]->initialized()) { + trend = CBasicStatistics::mean(m_TrendModel[d]->value(time)); } double median{mean[d]}; - if (!m_Prior->isNonInformative()) { + if (!m_ResidualModel->isNonInformative()) { TUnivariatePriorPtr marginal{ - m_Prior->univariate(marginalize, CONDITION).first}; + m_ResidualModel->univariate(marginalize, NOTHING_TO_CONDITION).first}; median = hint.empty() ? CBasicStatistics::mean( marginal->marginalLikelihoodConfidenceInterval(0.0)) : marginal->nearestMarginalLikelihoodMean(hint[d]); } - result[d] = scale * (seasonalOffset + median); + result[d] = scale * (trend + median); if (m_IsNonNegative) { result[d] = std::max(result[d], 0.0); } @@ -2122,14 +2232,12 @@ CMultivariateTimeSeriesModel::confidenceInterval(core_t::TTime time, double confidenceInterval, const TDouble2VecWeightsAry& weights_) const { - if (m_Prior->isNonInformative()) { + if (m_ResidualModel->isNonInformative()) { return TDouble2Vec3Vec(); } using TUnivariatePriorPtr = std::shared_ptr; - static const TSizeDoublePr10Vec CONDITION; - std::size_t dimension{this->dimension()}; double scale{1.0 - this->params().probabilityBucketEmpty()}; @@ -2138,26 +2246,27 @@ CMultivariateTimeSeriesModel::confidenceInterval(core_t::TTime time, TDouble2Vec3Vec result(3, TDouble2Vec(dimension)); - TDoubleWeightsAry weights; + maths_t::TDoubleWeightsAry weights{maths_t::CUnitWeights::UNIT}; for (std::size_t d = 0u; d < dimension; --marginalize[std::min(d, dimension - 2)], ++d) { - double seasonalOffset{m_Trend[d]->initialized() - ? CBasicStatistics::mean(m_Trend[d]->baseline(time, confidenceInterval)) - : 0.0}; + double trend{m_TrendModel[d]->initialized() + ? 
CBasicStatistics::mean(m_TrendModel[d]->value(time, confidenceInterval)) + : 0.0}; for (std::size_t i = 0u; i < maths_t::NUMBER_WEIGHT_STYLES; ++i) { weights[i] = weights_[i][d]; } - TUnivariatePriorPtr marginal{m_Prior->univariate(marginalize, CONDITION).first}; + TUnivariatePriorPtr marginal{ + m_ResidualModel->univariate(marginalize, NOTHING_TO_CONDITION).first}; double median{CBasicStatistics::mean( - marginal->marginalLikelihoodConfidenceInterval(0.0))}; + marginal->marginalLikelihoodConfidenceInterval(0.0, weights))}; TDoubleDoublePr interval{marginal->marginalLikelihoodConfidenceInterval( confidenceInterval, weights)}; - result[0][d] = scale * (seasonalOffset + interval.first); - result[1][d] = scale * (seasonalOffset + median); - result[2][d] = scale * (seasonalOffset + interval.second); + result[0][d] = scale * (trend + interval.first); + result[1][d] = scale * (trend + median); + result[2][d] = scale * (trend + interval.second); if (m_IsNonNegative) { result[0][d] = std::max(result[0][d], 0.0); result[1][d] = std::max(result[1][d], 0.0); @@ -2202,16 +2311,11 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa std::size_t dimension{this->dimension()}; core_t::TTime time{time_[0][0]}; TDouble10Vec1Vec sample{TDouble10Vec(dimension)}; - TDouble10VecWeightsAry1Vec weights{maths_t::CUnitWeights::unit(dimension)}; for (std::size_t d = 0u; d < dimension; ++d) { - sample[0][d] = m_Trend[d]->detrend(time, value[0][d], - params.seasonalConfidenceInterval()); - } - for (std::size_t i = 0u; i < maths_t::NUMBER_WEIGHT_STYLES; ++i) { - for (std::size_t d = 0u; d < dimension; ++d) { - weights[0][i][d] = params.weights()[0][i][d]; - } + sample[0][d] = m_TrendModel[d]->detrend( + time, value[0][d], params.seasonalConfidenceInterval()); } + maths_t::TDouble10VecWeightsAry1Vec weights{unpack(params.weights()[0])}; bool bucketEmpty{params.bucketEmpty()[0][0]}; double probabilityBucketEmpty{this->params().probabilityBucketEmpty()}; @@ -2225,8 +2329,8 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa for (std::size_t i = 0u; i < coordinates.size(); ++i) { maths_t::EProbabilityCalculation calculation = params.calculation(i); coordinate[0] = coordinates[i]; - if (!m_Prior->probabilityOfLessLikelySamples(calculation, sample, weights, - coordinate, pls, pus, tail_)) { + if (!m_ResidualModel->probabilityOfLessLikelySamples( + calculation, sample, weights, coordinate, pls, pus, tail_)) { LOG_ERROR(<< "Failed to compute P(" << sample << " | weight = " << weights << ")"); return false; } @@ -2248,9 +2352,9 @@ bool CMultivariateTimeSeriesModel::probability(const CModelProbabilityParams& pa probability = (std::sqrt(pl[0] * pl[1]) + std::sqrt(pu[0] * pu[1])) / 2.0; - if (m_AnomalyModel) { + if (m_AnomalyModel != nullptr) { TDouble2Vec residual(dimension); - TDouble10Vec nearest(m_Prior->nearestMarginalLikelihoodMean(sample[0])); + TDouble10Vec nearest(m_ResidualModel->nearestMarginalLikelihoodMean(sample[0])); TDouble2Vec scale(this->seasonalWeight(0.0, time)); for (std::size_t i = 0u; i < dimension; ++i) { residual[i] = (sample[0][i] - nearest[i]) / std::max(std::sqrt(scale[i]), 1.0); @@ -2274,11 +2378,11 @@ CMultivariateTimeSeriesModel::winsorisationWeight(double derate, TDouble2Vec scale(this->seasonalWeight(0.0, time)); TDouble10Vec sample(dimension); for (std::size_t d = 0u; d < dimension; ++d) { - sample[d] = m_Trend[d]->detrend(time, value[d], 0.0); + sample[d] = m_TrendModel[d]->detrend(time, value[d], 0.0); } for (std::size_t d = 0u; d < 
dimension; ++d) { - result[d] = computeWinsorisationWeight(*m_Prior, d, derate, scale[d], sample); + result[d] = winsorisation::tailWeight(*m_ResidualModel, d, derate, scale[d], sample); } return result; @@ -2287,9 +2391,9 @@ CMultivariateTimeSeriesModel::winsorisationWeight(double derate, CMultivariateTimeSeriesModel::TDouble2Vec CMultivariateTimeSeriesModel::seasonalWeight(double confidence, core_t::TTime time) const { TDouble2Vec result(this->dimension()); - TDouble10Vec variances(m_Prior->marginalLikelihoodVariances()); + TDouble10Vec variances(m_ResidualModel->marginalLikelihoodVariances()); for (std::size_t d = 0u, dimension = this->dimension(); d < dimension; ++d) { - double scale{m_Trend[d]->scale(time, variances[d], confidence).second}; + double scale{m_TrendModel[d]->scale(time, variances[d], confidence).second}; result[d] = std::max(scale, this->params().minimumSeasonalVarianceScale()); } return result; @@ -2298,8 +2402,8 @@ CMultivariateTimeSeriesModel::seasonalWeight(double confidence, core_t::TTime ti uint64_t CMultivariateTimeSeriesModel::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, m_IsNonNegative); seed = CChecksum::calculate(seed, m_Controllers); - seed = CChecksum::calculate(seed, m_Trend); - seed = CChecksum::calculate(seed, m_Prior); + seed = CChecksum::calculate(seed, m_TrendModel); + seed = CChecksum::calculate(seed, m_ResidualModel); seed = CChecksum::calculate(seed, m_AnomalyModel); return CChecksum::calculate(seed, m_SlidingWindow); } @@ -2307,15 +2411,16 @@ uint64_t CMultivariateTimeSeriesModel::checksum(uint64_t seed) const { void CMultivariateTimeSeriesModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { mem->setName("CUnivariateTimeSeriesModel"); core::CMemoryDebug::dynamicSize("m_Controllers", m_Controllers, mem); - core::CMemoryDebug::dynamicSize("m_Trend", m_Trend, mem); - core::CMemoryDebug::dynamicSize("m_Prior", m_Prior, mem); + core::CMemoryDebug::dynamicSize("m_TrendModel", m_TrendModel, mem); + core::CMemoryDebug::dynamicSize("m_ResidualModel", m_ResidualModel, mem); core::CMemoryDebug::dynamicSize("m_AnomalyModel", m_AnomalyModel, mem); core::CMemoryDebug::dynamicSize("m_SlidingWindow", m_SlidingWindow, mem); } std::size_t CMultivariateTimeSeriesModel::memoryUsage() const { return core::CMemory::dynamicSize(m_Controllers) + - core::CMemory::dynamicSize(m_Trend) + core::CMemory::dynamicSize(m_Prior) + + core::CMemory::dynamicSize(m_TrendModel) + + core::CMemory::dynamicSize(m_ResidualModel) + core::CMemory::dynamicSize(m_AnomalyModel) + core::CMemory::dynamicSize(m_SlidingWindow); } @@ -2332,16 +2437,17 @@ bool CMultivariateTimeSeriesModel::acceptRestoreTraverser(const SModelRestorePar m_Controllers = std::make_shared(), core::CPersistUtils::restore(CONTROLLER_6_3_TAG, *m_Controllers, traverser), /**/) - RESTORE_SETUP_TEARDOWN(TREND_6_3_TAG, m_Trend.push_back(TDecompositionPtr()), - traverser.traverseSubLevel(boost::bind( - CTimeSeriesDecompositionStateSerialiser(), - boost::cref(params.s_DecompositionParams), - boost::ref(m_Trend.back()), _1)), - /**/) - RESTORE(PRIOR_6_3_TAG, traverser.traverseSubLevel(boost::bind( - CPriorStateSerialiser(), - boost::cref(params.s_DistributionParams), - boost::ref(m_Prior), _1))) + RESTORE_SETUP_TEARDOWN( + TREND_MODEL_6_3_TAG, m_TrendModel.push_back(TDecompositionPtr()), + traverser.traverseSubLevel( + boost::bind(CTimeSeriesDecompositionStateSerialiser(), + boost::cref(params.s_DecompositionParams), + boost::ref(m_TrendModel.back()), _1)), + /**/) + 
RESTORE(RESIDUAL_MODEL_6_3_TAG, + traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), boost::cref(params.s_DistributionParams), + boost::ref(m_ResidualModel), _1))) RESTORE_SETUP_TEARDOWN( ANOMALY_MODEL_6_3_TAG, m_AnomalyModel = std::make_shared(), @@ -2362,16 +2468,17 @@ bool CMultivariateTimeSeriesModel::acceptRestoreTraverser(const SModelRestorePar m_Controllers = std::make_shared(), core::CPersistUtils::restore(CONTROLLER_6_3_TAG, *m_Controllers, traverser), /**/) - RESTORE_SETUP_TEARDOWN(TREND_OLD_TAG, m_Trend.push_back(TDecompositionPtr()), - traverser.traverseSubLevel(boost::bind( - CTimeSeriesDecompositionStateSerialiser(), - boost::cref(params.s_DecompositionParams), - boost::ref(m_Trend.back()), _1)), - /**/) + RESTORE_SETUP_TEARDOWN( + TREND_OLD_TAG, m_TrendModel.push_back(TDecompositionPtr()), + traverser.traverseSubLevel( + boost::bind(CTimeSeriesDecompositionStateSerialiser(), + boost::cref(params.s_DecompositionParams), + boost::ref(m_TrendModel.back()), _1)), + /**/) RESTORE(PRIOR_OLD_TAG, traverser.traverseSubLevel(boost::bind( CPriorStateSerialiser(), boost::cref(params.s_DistributionParams), - boost::ref(m_Prior), _1))) + boost::ref(m_ResidualModel), _1))) RESTORE_SETUP_TEARDOWN( ANOMALY_MODEL_OLD_TAG, m_AnomalyModel = std::make_shared(), @@ -2393,14 +2500,15 @@ void CMultivariateTimeSeriesModel::acceptPersistInserter(core::CStatePersistInse if (m_Controllers) { core::CPersistUtils::persist(CONTROLLER_6_3_TAG, *m_Controllers, inserter); } - for (const auto& trend : m_Trend) { - inserter.insertLevel(TREND_6_3_TAG, + for (const auto& trend : m_TrendModel) { + inserter.insertLevel(TREND_MODEL_6_3_TAG, boost::bind(CTimeSeriesDecompositionStateSerialiser(), boost::cref(*trend), _1)); } - inserter.insertLevel(PRIOR_6_3_TAG, boost::bind(CPriorStateSerialiser(), - boost::cref(*m_Prior), _1)); - if (m_AnomalyModel) { + inserter.insertLevel(RESIDUAL_MODEL_6_3_TAG, + boost::bind(CPriorStateSerialiser(), + boost::cref(*m_ResidualModel), _1)); + if (m_AnomalyModel != nullptr) { inserter.insertLevel(ANOMALY_MODEL_6_3_TAG, boost::bind(&CTimeSeriesAnomalyModel::acceptPersistInserter, m_AnomalyModel.get(), _1)); @@ -2409,7 +2517,37 @@ void CMultivariateTimeSeriesModel::acceptPersistInserter(core::CStatePersistInse } maths_t::EDataType CMultivariateTimeSeriesModel::dataType() const { - return m_Prior->dataType(); + return m_ResidualModel->dataType(); +} + +CMultivariateTimeSeriesModel::TDouble10VecWeightsAry +CMultivariateTimeSeriesModel::unpack(const TDouble2VecWeightsAry& weights) { + TDouble10VecWeightsAry result{maths_t::CUnitWeights::unit(weights[0])}; + for (std::size_t i = 0u; i < weights.size(); ++i) { + result[i] = weights[i]; + } + return result; +} + +void CMultivariateTimeSeriesModel::reinitializeResidualModel( + double learnRate, + const TDecompositionPtr10Vec& trend, + const TTimeDouble2VecPrCBuf& slidingWindow, + CMultivariatePrior& residualModel) { + residualModel.setToNonInformative(0.0, residualModel.decayRate()); + if (!slidingWindow.empty()) { + std::size_t dimension{residualModel.dimension()}; + double slidingWindowLength{static_cast(slidingWindow.size())}; + maths_t::TDouble10VecWeightsAry1Vec weight{maths_t::countWeight(TDouble10Vec( + dimension, std::max(learnRate, std::min(5.0 / slidingWindowLength, 1.0))))}; + for (const auto& value : slidingWindow) { + TDouble10Vec1Vec sample{TDouble10Vec(dimension)}; + for (std::size_t i = 0u; i < dimension; ++i) { + sample[0][i] = trend[i]->detrend(value.first, value.second[i], 0.0); + } + 
residualModel.addSamples(sample, weight); + } + } } const CMultivariateTimeSeriesModel::TTimeDouble2VecPrCBuf& @@ -2418,12 +2556,12 @@ CMultivariateTimeSeriesModel::slidingWindow() const { } const CMultivariateTimeSeriesModel::TDecompositionPtr10Vec& -CMultivariateTimeSeriesModel::trend() const { - return m_Trend; +CMultivariateTimeSeriesModel::trendModel() const { + return m_TrendModel; } -const CMultivariatePrior& CMultivariateTimeSeriesModel::prior() const { - return *m_Prior; +const CMultivariatePrior& CMultivariateTimeSeriesModel::residualModel() const { + return *m_ResidualModel; } CMultivariateTimeSeriesModel::EUpdateResult @@ -2434,7 +2572,7 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample for (const auto& sample : samples) { if (sample.second.size() != dimension) { LOG_ERROR(<< "Dimension mismatch: '" << sample.second.size() - << " != " << m_Trend.size() << "'"); + << " != " << m_TrendModel.size() << "'"); return E_Failure; } } @@ -2452,7 +2590,7 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample EUpdateResult result{E_Success}; { - TDoubleWeightsAry weight; + maths_t::TDoubleWeightsAry weight; for (auto i : timeorder) { core_t::TTime time{samples[i].first}; TDouble10Vec value(samples[i].second); @@ -2460,37 +2598,14 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample for (std::size_t j = 0u; j < maths_t::NUMBER_WEIGHT_STYLES; ++j) { weight[j] = weights[i][j][d]; } - if (m_Trend[d]->addPoint(time, value[d], weight)) { + if (m_TrendModel[d]->addPoint(time, value[d], weight)) { result = E_Reset; } } } } if (result == E_Reset) { - m_Prior->setToNonInformative(0.0, m_Prior->decayRate()); - TDouble10VecWeightsAry1Vec weight{maths_t::countWeight( - slidingWindowCountWeight(this->params().learnRate()), dimension)}; - for (const auto& value : m_SlidingWindow) { - TDouble10Vec1Vec sample{TDouble10Vec(dimension)}; - for (std::size_t i = 0u; i < dimension; ++i) { - sample[0][i] = m_Trend[i]->detrend(value.first, value.second[i], 0.0); - } - m_Prior->addSamples(sample, weight); - } - if (m_Controllers) { - m_Prior->decayRate(m_Prior->decayRate() / - (*m_Controllers)[E_PriorControl].multiplier()); - for (auto& trend : m_Trend) { - trend->decayRate(trend->decayRate() / - (*m_Controllers)[E_TrendControl].multiplier()); - } - for (auto& controller : *m_Controllers) { - controller.reset(); - } - } - if (m_AnomalyModel) { - m_AnomalyModel->reset(); - } + this->reinitializeStateGivenNewComponent(); } return result; @@ -2499,16 +2614,35 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample void CMultivariateTimeSeriesModel::appendPredictionErrors(double interval, const TDouble2Vec& sample, TDouble1VecVec (&result)[2]) { - if (auto error = predictionError(interval, m_Prior, sample)) { - result[E_PriorControl].push_back(*error); + if (auto error = predictionError(interval, m_ResidualModel, sample)) { + result[E_ResidualControl].push_back(*error); } - if (auto error = predictionError(m_Trend, sample)) { + if (auto error = predictionError(m_TrendModel, sample)) { result[E_TrendControl].push_back(*error); } } +void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent() { + reinitializeResidualModel(this->params().learnRate(), m_TrendModel, + m_SlidingWindow, *m_ResidualModel); + if (m_Controllers != nullptr) { + m_ResidualModel->decayRate(m_ResidualModel->decayRate() / + (*m_Controllers)[E_ResidualControl].multiplier()); + for (auto& trend : m_TrendModel) { + 
trend->decayRate(trend->decayRate() / + (*m_Controllers)[E_TrendControl].multiplier()); + } + for (auto& controller : *m_Controllers) { + controller.reset(); + } + } + if (m_AnomalyModel != nullptr) { + m_AnomalyModel->reset(); + } +} + std::size_t CMultivariateTimeSeriesModel::dimension() const { - return m_Prior->dimension(); + return m_ResidualModel->dimension(); } } } diff --git a/lib/maths/CTools.cc b/lib/maths/CTools.cc index b1fe78cb85..17aa2cffdb 100644 --- a/lib/maths/CTools.cc +++ b/lib/maths/CTools.cc @@ -82,11 +82,6 @@ inline TDoubleBoolPr stationaryPoint(const boost::math::beta_distribution<>& bet return {boost::math::mode(beta), true}; } -//! Compute \f$x^2\f$. -inline double square(double x) { - return x * x; -} - //! \brief p.d.f function adapter. //! //! DESCRIPTION:\n @@ -599,10 +594,10 @@ operator()(const lognormal& logNormal, double x, maths_t::ETail& tail) const { // + 2 * s^2 * (log(x) - m))^(1/2)) if x > mode double logx = std::log(x); - double squareScale = square(logNormal.scale()); - double discriminant = std::sqrt( - square(squareScale) + (logx - logNormal.location() + 2.0 * squareScale) * - (logx - logNormal.location())); + double squareScale = CTools::pow2(logNormal.scale()); + double discriminant = std::sqrt(CTools::pow2(squareScale) + + (logx - logNormal.location() + 2.0 * squareScale) * + (logx - logNormal.location())); double m = boost::math::mode(logNormal); this->tail(x, m, tail); double y = m * std::exp(x > m ? -discriminant : discriminant); @@ -1709,16 +1704,16 @@ double CTools::safeCdfComplement(const chi_squared& chi2, double x) { //////// deviation Implementation //////// namespace { -const double SMALL_PROBABILITY_DEVIATION = 1.0; -const double MINUSCULE_PROBABILITY_DEVIATION = 50.0; -const double MAX_DEVIATION = 100.0; +const double SMALL_PROBABILITY_ANOMALY_SCORE = 1.0; +const double MINUSCULE_PROBABILITY_ANOMALY_SCORE = 50.0; +const double MAX_ANOMALY_SCORE = 100.0; const double INV_LARGEST_SIGNIFICANT_PROBABILITY = 1.0 / LARGEST_SIGNIFICANT_PROBABILITY; const double INV_SMALL_PROBABILITY = 1.0 / SMALL_PROBABILITY; const double MINUS_LOG_SMALL_PROBABILITY = -std::log(SMALL_PROBABILITY); const double MINUS_LOG_MINUSCULE_PROBABILITY = -std::log(MINUSCULE_PROBABILITY); } -double CTools::deviation(double p) { +double CTools::anomalyScore(double p) { const double MINUS_LOG_SMALLEST_PROBABILITY = -std::log(smallestProbability()); double result = 0.0; @@ -1728,62 +1723,63 @@ double CTools::deviation(double p) { if (adjP >= SMALL_PROBABILITY) { // We use a linear scaling based on the inverse probability // into the range (0.0, 1.0]. - result = SMALL_PROBABILITY_DEVIATION * + result = SMALL_PROBABILITY_ANOMALY_SCORE * (1.0 / adjP - INV_LARGEST_SIGNIFICANT_PROBABILITY) / (INV_SMALL_PROBABILITY - INV_LARGEST_SIGNIFICANT_PROBABILITY); } else if (adjP >= MINUSCULE_PROBABILITY) { // We use a linear scaling based on the log probability into // the range (1.0, 50.0]. - result = SMALL_PROBABILITY_DEVIATION + - (MINUSCULE_PROBABILITY_DEVIATION - SMALL_PROBABILITY_DEVIATION) * + result = SMALL_PROBABILITY_ANOMALY_SCORE + + (MINUSCULE_PROBABILITY_ANOMALY_SCORE - SMALL_PROBABILITY_ANOMALY_SCORE) * (-std::log(adjP) - MINUS_LOG_SMALL_PROBABILITY) / (MINUS_LOG_MINUSCULE_PROBABILITY - MINUS_LOG_SMALL_PROBABILITY); } else { // We use a linear scaling based on the log probability into // the range (50.0, 100.0]. 
- result = MINUSCULE_PROBABILITY_DEVIATION + - (MAX_DEVIATION - MINUSCULE_PROBABILITY_DEVIATION) * + result = MINUSCULE_PROBABILITY_ANOMALY_SCORE + + (MAX_ANOMALY_SCORE - MINUSCULE_PROBABILITY_ANOMALY_SCORE) * (-std::log(adjP) - MINUS_LOG_MINUSCULE_PROBABILITY) / (MINUS_LOG_SMALLEST_PROBABILITY - MINUS_LOG_MINUSCULE_PROBABILITY); } } - if (!(result >= 0.0 && result <= MAX_DEVIATION)) { + if (!(result >= 0.0 && result <= MAX_ANOMALY_SCORE)) { LOG_ERROR(<< "Deviation " << result << " out of range, p =" << p); } return result; } -double CTools::inverseDeviation(double deviation) { +double CTools::inverseAnomalyScore(double deviation) { const double MINUS_LOG_SMALLEST_PROBABILITY = -std::log(smallestProbability()); double result = 0.0; - double adjDeviation = truncate(deviation, 0.0, MAX_DEVIATION); + double adjDeviation = truncate(deviation, 0.0, MAX_ANOMALY_SCORE); if (adjDeviation == 0.0) { result = (1.0 + LARGEST_SIGNIFICANT_PROBABILITY) / 2.0; - } else if (adjDeviation <= SMALL_PROBABILITY_DEVIATION) { + } else if (adjDeviation <= SMALL_PROBABILITY_ANOMALY_SCORE) { // We invert the linear scaling of the inverse probability // into the range (0.0, 1.0]. result = 1.0 / (INV_LARGEST_SIGNIFICANT_PROBABILITY + (INV_SMALL_PROBABILITY - INV_LARGEST_SIGNIFICANT_PROBABILITY) * - deviation / SMALL_PROBABILITY_DEVIATION); - } else if (adjDeviation <= MINUSCULE_PROBABILITY_DEVIATION) { + deviation / SMALL_PROBABILITY_ANOMALY_SCORE); + } else if (adjDeviation <= MINUSCULE_PROBABILITY_ANOMALY_SCORE) { // We invert the linear scaling of the log probability // into the range (1.0, 50.0]. result = std::exp( -(MINUS_LOG_SMALL_PROBABILITY + (MINUS_LOG_MINUSCULE_PROBABILITY - MINUS_LOG_SMALL_PROBABILITY) * - (deviation - SMALL_PROBABILITY_DEVIATION) / - (MINUSCULE_PROBABILITY_DEVIATION - SMALL_PROBABILITY_DEVIATION))); + (deviation - SMALL_PROBABILITY_ANOMALY_SCORE) / + (MINUSCULE_PROBABILITY_ANOMALY_SCORE - SMALL_PROBABILITY_ANOMALY_SCORE))); } else { // We invert the linear scaling of the log probability // into the range (50.0, 100.0]. - result = std::exp(-(MINUS_LOG_MINUSCULE_PROBABILITY + - (MINUS_LOG_SMALLEST_PROBABILITY - MINUS_LOG_MINUSCULE_PROBABILITY) * - (deviation - MINUSCULE_PROBABILITY_DEVIATION) / - (MAX_DEVIATION - MINUSCULE_PROBABILITY_DEVIATION))); + result = std::exp( + -(MINUS_LOG_MINUSCULE_PROBABILITY + + (MINUS_LOG_SMALLEST_PROBABILITY - MINUS_LOG_MINUSCULE_PROBABILITY) * + (deviation - MINUSCULE_PROBABILITY_ANOMALY_SCORE) / + (MAX_ANOMALY_SCORE - MINUSCULE_PROBABILITY_ANOMALY_SCORE))); } if (!(result >= 0.0 && result <= 1.0)) { @@ -1835,7 +1831,7 @@ double CTools::differentialEntropy(const lognormal& logNormal) { double location = logNormal.location(); double scale = logNormal.scale(); return 0.5 * std::log(boost::math::double_constants::two_pi * - boost::math::double_constants::e * square(scale)) + + boost::math::double_constants::e * CTools::pow2(scale)) + location; } @@ -1890,6 +1886,15 @@ double CTools::CGroup::rightEndpoint(double separation) const { const CTools::CLookupTableForFastLog CTools::FAST_LOG_TABLE; +//////// Miscellaneous Implementations //////// + +namespace { +const double EPS{0.1}; +const double COEFFS[]{-1.0, +1.0 / 2.0, -1.0 / 6.0, + +1.0 / 24.0, -1.0 / 120.0, +1.0 / 720.0}; +const std::size_t N{boost::size(COEFFS)}; +} + double CTools::shiftLeft(double x, double eps) { if (x == NEG_INF) { return x; @@ -1903,5 +1908,102 @@ double CTools::shiftRight(double x, double eps) { } return (x < 0.0 ? 
1.0 - eps : 1.0 + eps) * x;
}
+
+double CTools::powOneMinusX(double x, double p) {
+    // For large p,
+    // (1 - x) ^ p ~= exp(-p * x).
+    //
+    // and this doesn't suffer from cancellation errors in the limit
+    // p -> inf and x -> 0. For p * x << 1 we get much better precision
+    // using the Taylor expansion:
+    // (1 - x) ^ p = 1 - p * x + p * (p - 1) * x^2 / 2! + ...
+    //
+    // and canceling the leading terms.
+
+    if (x == 1.0) {
+        return 0.0;
+    }
+    if (p == 1.0) {
+        return 1.0 - x;
+    }
+
+    double y = p * x;
+    if (std::fabs(y) < EPS) {
+        double remainder = 0.0;
+        double ti = 1.0;
+        for (std::size_t i = 0u; i < N && p != 0.0; ++i, p -= 1.0) {
+            ti *= p * x;
+            remainder += COEFFS[i] * ti;
+        }
+        return 1.0 + remainder;
+    } else if (p > 1000.0) {
+        return std::exp(-y);
+    }
+
+    if (x > 1.0) {
+        double sign = static_cast<int>(p) % 2 ? -1.0 : 1.0;
+        return sign * std::exp(p * std::log(x - 1.0));
+    }
+
+    return std::exp(p * std::log(1.0 - x));
+}
+
+double CTools::oneMinusPowOneMinusX(double x, double p) {
+    // For large p,
+    // (1 - x) ^ p ~= exp(-p * x).
+    //
+    // and this doesn't suffer from cancellation errors in the limit
+    // p -> inf and x -> 0. For p * x << 1 we get much better precision
+    // using the Taylor expansion:
+    // (1 - x) ^ p = 1 - p * x + p * (p - 1) * x^2 / 2! + ...
+    //
+    // Note that this doesn't make use of powOneMinusX because we can
+    // avoid the cancellation errors by using:
+    // 1 - (1 - x) ^ p = p * x - p * (p - 1) * x^2 / 2 + ...
+    //
+    // when p * x is small.
+
+    if (x == 1.0) {
+        return 1.0;
+    }
+    if (p == 1.0) {
+        return x;
+    }
+
+    double y = p * x;
+    if (std::fabs(y) < EPS) {
+        double result = 0.0;
+        double ti = 1.0;
+        for (std::size_t i = 0u; i < N && p != 0.0; ++i, p -= 1.0) {
+            ti *= p * x;
+            result -= COEFFS[i] * ti;
+        }
+        return result;
+    } else if (p > 1000.0) {
+        return 1.0 - std::exp(-y);
+    }
+
+    if (x > 1.0) {
+        double sign = static_cast<int>(p) % 2 ? -1.0 : 1.0;
+        return 1.0 - sign * std::exp(p * std::log(x - 1.0));
+    }
+
+    return 1.0 - std::exp(p * std::log(1.0 - x));
+}
+
+double CTools::logOneMinusX(double x) {
+    // For small |x| use the Taylor expansion
+    // log(1 - x) = -(x + x^2 / 2 + x^3 / 3 + ...)
+    // which avoids the cancellation error in evaluating 1 - x directly.
+
+    double result = 0.0;
+
+    if (std::fabs(x) < EPS) {
+        double xi = -x;
+        for (std::size_t i = 0u; i < 6; ++i, xi *= x) {
+            result += xi / static_cast<double>(i + 1);
+        }
+    } else {
+        result = std::log(1.0 - x);
+    }
+
+    return result;
+}
 }
 }
diff --git a/lib/maths/CTrendComponent.cc b/lib/maths/CTrendComponent.cc
index d26510db29..87c97a549d 100644
--- a/lib/maths/CTrendComponent.cc
+++ b/lib/maths/CTrendComponent.cc
@@ -30,6 +30,15 @@ namespace ml {
 namespace maths {
 namespace {
+using TOptionalDoubleDoublePr = boost::optional<std::pair<double, double>>;
+
+const double TIME_SCALES[]{144.0, 72.0, 36.0, 12.0, 4.0, 1.0, 0.25, 0.05};
+const std::size_t NUMBER_MODELS{boost::size(TIME_SCALES)};
+const double MINIMUM_WEIGHT_TO_USE_MODEL_FOR_PREDICTION{0.01};
+const double MAX_CONDITION{1e12};
+const core_t::TTime UNSET_TIME{0};
+const std::size_t NO_CHANGE_LABEL{0};
+const std::size_t LEVEL_CHANGE_LABEL{1};
 
 //! Get the desired weight for the regression model.
 double modelWeight(double targetDecayRate, double modelDecayRate) {
@@ -45,6 +54,32 @@ double scaleTime(core_t::TTime time, core_t::TTime origin) {
     return static_cast<double>(time - origin) / static_cast<double>(core::constants::WEEK);
 }
 
+//! Get the \p confidence interval for \p prediction and \p variance.
+TOptionalDoubleDoublePr confidenceInterval(double prediction, double variance, double confidence) { + try { + boost::math::normal normal{prediction, std::sqrt(variance)}; + double ql{boost::math::quantile(normal, (100.0 - confidence) / 200.0)}; + double qu{boost::math::quantile(normal, (100.0 + confidence) / 200.0)}; + return std::make_pair(ql, qu); + } catch (const std::exception& e) { + LOG_ERROR("Failed calculating confidence interval: " + << e.what() << ", prediction = " << prediction + << ", variance = " << variance << ", confidence = " << confidence); + } + return TOptionalDoubleDoublePr{}; +} + +CNaiveBayes initialProbabilityOfChangeModel(double decayRate) { + decayRate *= TIME_SCALES[NUMBER_MODELS - 1]; + return CNaiveBayes{CNaiveBayesFeatureDensityFromPrior{CNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, decayRate)}, + decayRate, -20.0}; +} + +CNormalMeanPrecConjugate initialMagnitudeOfChangeModel(double decayRate) { + return CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, decayRate); +} + const std::string TARGET_DECAY_RATE_TAG{"a"}; const std::string FIRST_UPDATE_TAG{"b"}; const std::string LAST_UPDATE_TAG{"c"}; @@ -52,23 +87,23 @@ const std::string REGRESSION_ORIGIN_TAG{"d"}; const std::string MODEL_TAG{"e"}; const std::string PREDICTION_ERROR_VARIANCE_TAG{"f"}; const std::string VALUE_MOMENTS_TAG{"g"}; +const std::string TIME_OF_LAST_LEVEL_CHANGE_TAG{"h"}; +const std::string PROBABILITY_OF_LEVEL_CHANGE_MODEL_TAG{"i"}; +const std::string MAGNITUDE_OF_LEVEL_CHANGE_MODEL_TAG{"j"}; const std::string WEIGHT_TAG{"a"}; const std::string REGRESSION_TAG{"b"}; const std::string RESIDUAL_MOMENTS_TAG{"c"}; - -const double TIME_SCALES[]{144.0, 72.0, 36.0, 12.0, 4.0, 1.0, 0.25, 0.05}; -const std::size_t NUMBER_MODELS{boost::size(TIME_SCALES)}; -const double MAX_CONDITION{1e12}; -const double MINIMUM_WEIGHT_TO_USE_MODEL_FOR_PREDICTION{0.01}; -const core_t::TTime UNSET_TIME{0}; } CTrendComponent::CTrendComponent(double decayRate) : m_DefaultDecayRate(decayRate), m_TargetDecayRate(decayRate), m_FirstUpdate(UNSET_TIME), m_LastUpdate(UNSET_TIME), - m_RegressionOrigin(UNSET_TIME), m_PredictionErrorVariance(0.0) { + m_RegressionOrigin(UNSET_TIME), m_PredictionErrorVariance(0.0), + m_TimeOfLastLevelChange(UNSET_TIME), + m_ProbabilityOfLevelChangeModel(initialProbabilityOfChangeModel(decayRate)), + m_MagnitudeOfLevelChangeModel(initialMagnitudeOfChangeModel(decayRate)) { for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - m_Models.emplace_back(modelWeight(1.0, TIME_SCALES[i])); + m_TrendModels.emplace_back(modelWeight(1.0, TIME_SCALES[i])); } } @@ -78,9 +113,12 @@ void CTrendComponent::swap(CTrendComponent& other) { std::swap(m_FirstUpdate, other.m_FirstUpdate); std::swap(m_LastUpdate, other.m_LastUpdate); std::swap(m_RegressionOrigin, other.m_RegressionOrigin); - m_Models.swap(other.m_Models); + m_TrendModels.swap(other.m_TrendModels); std::swap(m_PredictionErrorVariance, other.m_PredictionErrorVariance); std::swap(m_ValueMoments, other.m_ValueMoments); + std::swap(m_TimeOfLastLevelChange, other.m_TimeOfLastLevelChange); + m_ProbabilityOfLevelChangeModel.swap(other.m_ProbabilityOfLevelChangeModel); + m_MagnitudeOfLevelChangeModel.swap(other.m_MagnitudeOfLevelChangeModel); } void CTrendComponent::acceptPersistInserter(core::CStatePersistInserter& inserter) const { @@ -88,16 +126,24 @@ void CTrendComponent::acceptPersistInserter(core::CStatePersistInserter& inserte inserter.insertValue(FIRST_UPDATE_TAG, m_FirstUpdate); 
inserter.insertValue(LAST_UPDATE_TAG, m_LastUpdate); inserter.insertValue(REGRESSION_ORIGIN_TAG, m_RegressionOrigin); - for (const auto& model : m_Models) { + for (const auto& model : m_TrendModels) { inserter.insertLevel( MODEL_TAG, boost::bind(&SModel::acceptPersistInserter, &model, _1)); } inserter.insertValue(PREDICTION_ERROR_VARIANCE_TAG, m_PredictionErrorVariance, core::CIEEE754::E_DoublePrecision); inserter.insertValue(VALUE_MOMENTS_TAG, m_ValueMoments.toDelimited()); + inserter.insertValue(TIME_OF_LAST_LEVEL_CHANGE_TAG, m_TimeOfLastLevelChange); + inserter.insertLevel(PROBABILITY_OF_LEVEL_CHANGE_MODEL_TAG, + boost::bind(&CNaiveBayes::acceptPersistInserter, + &m_ProbabilityOfLevelChangeModel, _1)); + inserter.insertLevel(MAGNITUDE_OF_LEVEL_CHANGE_MODEL_TAG, + boost::bind(&CNormalMeanPrecConjugate::acceptPersistInserter, + &m_MagnitudeOfLevelChangeModel, _1)); } -bool CTrendComponent::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { +bool CTrendComponent::acceptRestoreTraverser(const SDistributionRestoreParams& params, + core::CStateRestoreTraverser& traverser) { std::size_t i{0}; do { const std::string& name{traverser.name()}; @@ -106,9 +152,15 @@ bool CTrendComponent::acceptRestoreTraverser(core::CStateRestoreTraverser& trave RESTORE_BUILT_IN(LAST_UPDATE_TAG, m_LastUpdate) RESTORE_BUILT_IN(REGRESSION_ORIGIN_TAG, m_RegressionOrigin) RESTORE(MODEL_TAG, traverser.traverseSubLevel(boost::bind( - &SModel::acceptRestoreTraverser, &m_Models[i++], _1))) + &SModel::acceptRestoreTraverser, &m_TrendModels[i++], _1))) RESTORE_BUILT_IN(PREDICTION_ERROR_VARIANCE_TAG, m_PredictionErrorVariance) RESTORE(VALUE_MOMENTS_TAG, m_ValueMoments.fromDelimited(traverser.value())) + RESTORE_BUILT_IN(TIME_OF_LAST_LEVEL_CHANGE_TAG, m_TimeOfLastLevelChange) + RESTORE_NO_ERROR(PROBABILITY_OF_LEVEL_CHANGE_MODEL_TAG, + m_ProbabilityOfLevelChangeModel = CNaiveBayes(params, traverser)) + RESTORE_NO_ERROR(MAGNITUDE_OF_LEVEL_CHANGE_MODEL_TAG, + m_MagnitudeOfLevelChangeModel = + CNormalMeanPrecConjugate(params, traverser)) } while (traverser.next()); return true; } @@ -122,17 +174,20 @@ void CTrendComponent::clear() { m_LastUpdate = UNSET_TIME; m_RegressionOrigin = UNSET_TIME; for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - m_Models[i] = SModel(modelWeight(1.0, TIME_SCALES[i])); + m_TrendModels[i] = SModel(modelWeight(1.0, TIME_SCALES[i])); } m_PredictionErrorVariance = 0.0; m_ValueMoments = TMeanVarAccumulator(); + m_TimeOfLastLevelChange = UNSET_TIME; + m_ProbabilityOfLevelChangeModel = initialProbabilityOfChangeModel(m_DefaultDecayRate); + m_MagnitudeOfLevelChangeModel = initialMagnitudeOfChangeModel(m_DefaultDecayRate); } void CTrendComponent::shiftOrigin(core_t::TTime time) { time = CIntegerTools::floor(time, core::constants::WEEK); double scaledShift{scaleTime(time, m_RegressionOrigin)}; if (scaledShift > 0.0) { - for (auto& model : m_Models) { + for (auto& model : m_TrendModels) { model.s_Regression.shiftAbscissa(-scaledShift); } m_RegressionOrigin = time; @@ -142,7 +197,34 @@ void CTrendComponent::shiftOrigin(core_t::TTime time) { void CTrendComponent::shiftSlope(double decayRate, double shift) { for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { double shift_{std::min(m_DefaultDecayRate * TIME_SCALES[i] / decayRate, 1.0) * shift}; - m_Models[i].s_Regression.shiftGradient(shift_); + m_TrendModels[i].s_Regression.shiftGradient(shift_); + } +} + +void CTrendComponent::shiftLevel(core_t::TTime time, double value, double shift) { + for (auto& model : m_TrendModels) { + 
model.s_Regression.shiftOrdinate(shift); + } + if (m_TimeOfLastLevelChange != UNSET_TIME) { + double dt{static_cast(time - m_TimeOfLastLevelChange)}; + m_ProbabilityOfLevelChangeModel.addTrainingDataPoint(LEVEL_CHANGE_LABEL, + {{dt}, {value}}); + } + m_MagnitudeOfLevelChangeModel.addSamples({shift}, maths_t::CUnitWeights::SINGLE_UNIT); + m_TimeOfLastLevelChange = time; +} + +void CTrendComponent::dontShiftLevel(core_t::TTime time, double value) { + if (m_TimeOfLastLevelChange != UNSET_TIME) { + double dt{static_cast(time - m_TimeOfLastLevelChange)}; + m_ProbabilityOfLevelChangeModel.addTrainingDataPoint(NO_CHANGE_LABEL, + {{dt}, {value}}); + } +} + +void CTrendComponent::linearScale(double scale) { + for (auto& model : m_TrendModels) { + model.s_Regression.linearScale(scale); } } @@ -151,7 +233,7 @@ void CTrendComponent::add(core_t::TTime time, double value, double weight) { // relative difference in the component scale and the target scale. for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - m_Models[i].s_Weight.add( + m_TrendModels[i].s_Weight.add( modelWeight(m_TargetDecayRate, m_DefaultDecayRate * TIME_SCALES[i])); } @@ -172,7 +254,7 @@ void CTrendComponent::add(core_t::TTime time, double value, double weight) { } double scaledTime{scaleTime(time, m_RegressionOrigin)}; - for (auto& model : m_Models) { + for (auto& model : m_TrendModels) { model.s_Regression.add(scaledTime, value, weight); model.s_ResidualMoments.add(value - model.s_Regression.predict(scaledTime, MAX_CONDITION)); } @@ -182,6 +264,11 @@ void CTrendComponent::add(core_t::TTime time, double value, double weight) { m_LastUpdate = std::max(m_LastUpdate, time); } +void CTrendComponent::dataType(maths_t::EDataType dataType) { + m_ProbabilityOfLevelChangeModel.dataType(dataType); + m_MagnitudeOfLevelChangeModel.dataType(dataType); +} + double CTrendComponent::defaultDecayRate() const { return m_DefaultDecayRate; } @@ -194,10 +281,14 @@ void CTrendComponent::propagateForwardsByTime(core_t::TTime interval) { TDoubleVec factors(this->factors(interval)); double median{CBasicStatistics::median(factors)}; for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - m_Models[i].s_Weight.age(median); - m_Models[i].s_Regression.age(factors[i]); - m_Models[i].s_ResidualMoments.age(std::sqrt(factors[i])); + m_TrendModels[i].s_Weight.age(median); + m_TrendModels[i].s_Regression.age(factors[i]); + m_TrendModels[i].s_ResidualMoments.age(std::sqrt(factors[i])); } + double interval_{static_cast(interval) / + static_cast(core::constants::DAY)}; + m_ProbabilityOfLevelChangeModel.propagateForwardsByTime(interval_); + m_MagnitudeOfLevelChangeModel.propagateForwardsByTime(interval_); } CTrendComponent::TDoubleDoublePr CTrendComponent::value(core_t::TTime time, @@ -215,12 +306,12 @@ CTrendComponent::TDoubleDoublePr CTrendComponent::value(core_t::TTime time, TDoubleVec weights(this->factors(std::abs(time - m_LastUpdate))); double Z{0.0}; for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - weights[i] *= CBasicStatistics::mean(m_Models[i].s_Weight); + weights[i] *= CBasicStatistics::mean(m_TrendModels[i].s_Weight); Z += weights[i]; } for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { if (weights[i] > MINIMUM_WEIGHT_TO_USE_MODEL_FOR_PREDICTION * Z) { - prediction_.add(m_Models[i].s_Regression.predict(scaledTime, MAX_CONDITION), + prediction_.add(m_TrendModels[i].s_Regression.predict(scaledTime, MAX_CONDITION), weights[i]); } } @@ -232,15 +323,8 @@ CTrendComponent::TDoubleDoublePr CTrendComponent::value(core_t::TTime time, double variance{a * 
m_PredictionErrorVariance / std::max(this->count(), 1.0) + b * CBasicStatistics::variance(m_ValueMoments) / std::max(CBasicStatistics::count(m_ValueMoments), 1.0)}; - try { - boost::math::normal normal{prediction, std::sqrt(variance)}; - double ql{boost::math::quantile(normal, (100.0 - confidence) / 200.0)}; - double qu{boost::math::quantile(normal, (100.0 + confidence) / 200.0)}; - return {ql, qu}; - } catch (const std::exception& e) { - LOG_ERROR(<< "Failed calculating confidence interval: " << e.what() - << ", prediction = " << prediction << ", variance = " << variance - << ", confidence = " << confidence); + if (auto interval = confidenceInterval(prediction, variance, confidence)) { + return *interval; } } @@ -274,9 +358,8 @@ void CTrendComponent::forecast(core_t::TTime startTime, core_t::TTime endTime, core_t::TTime step, double confidence, - TDouble3VecVec& result) const { - result.clear(); - + const TSeasonalForecast& seasonal, + const TWriteForecastResult& writer) const { if (endTime < startTime) { LOG_ERROR(<< "Bad forecast range: [" << startTime << "," << endTime << "]"); return; @@ -288,34 +371,33 @@ void CTrendComponent::forecast(core_t::TTime startTime, endTime = startTime + CIntegerTools::ceil(endTime - startTime, step); - core_t::TTime steps{(endTime - startTime) / step}; - result.resize(steps, TDouble3Vec(3)); - LOG_TRACE(<< "forecasting = " << this->print()); TDoubleVec factors(this->factors(step)); - TDoubleVec modelWeights(this->initialForecastModelWeights()); TDoubleVec errorWeights(this->initialForecastErrorWeights()); TRegressionArrayVec models(NUMBER_MODELS); TMatrixVec modelCovariances(NUMBER_MODELS); TDoubleVec residualVariances(NUMBER_MODELS); for (std::size_t i = 0u; i < NUMBER_MODELS; ++i) { - m_Models[i].s_Regression.parameters(models[i], MAX_CONDITION); - m_Models[i].s_Regression.covariances(m_PredictionErrorVariance, - modelCovariances[i], MAX_CONDITION); - modelCovariances[i] /= std::max(m_Models[i].s_Regression.count(), 1.0); - residualVariances[i] = - std::pow(CBasicStatistics::mean(m_Models[i].s_ResidualMoments), 2.0) + - CBasicStatistics::variance(m_Models[i].s_ResidualMoments); - LOG_TRACE(<< "params = " << core::CContainerPrinter::print(models[i])); - LOG_TRACE(<< "covariances = " << modelCovariances[i].toDelimited()) + const SModel& model{m_TrendModels[i]}; + model.s_Regression.parameters(models[i], MAX_CONDITION); + model.s_Regression.covariances(m_PredictionErrorVariance, + modelCovariances[i], MAX_CONDITION); + modelCovariances[i] /= std::max(model.s_Regression.count(), 1.0); + residualVariances[i] = CTools::pow2(CBasicStatistics::mean(model.s_ResidualMoments)) + + CBasicStatistics::variance(model.s_ResidualMoments); + LOG_TRACE("params = " << core::CContainerPrinter::print(models[i])); + LOG_TRACE("covariances = " << modelCovariances[i].toDelimited()) + LOG_TRACE("variances = " << residualVariances[i]); } LOG_TRACE(<< "long time variance = " << CBasicStatistics::variance(m_ValueMoments)); + CForecastLevel level{m_ProbabilityOfLevelChangeModel, + m_MagnitudeOfLevelChangeModel, m_TimeOfLastLevelChange}; + TDoubleVec variances(NUMBER_MODELS + 1); for (core_t::TTime time = startTime; time < endTime; time += step) { - core_t::TTime pillar{(time - startTime) / step}; double scaledDt{scaleTime(time, startTime)}; TVector times({0.0, scaledDt, scaledDt * scaledDt}); @@ -324,7 +406,7 @@ void CTrendComponent::forecast(core_t::TTime startTime, for (std::size_t j = 0u; j < NUMBER_MODELS; ++j) { modelWeights[j] *= factors[j]; - errorWeights[j] *= 
std::pow(factors[j], 2.0); + errorWeights[j] *= CTools::pow2(factors[j]); } for (std::size_t j = 0u; j < NUMBER_MODELS; ++j) { @@ -341,22 +423,20 @@ void CTrendComponent::forecast(core_t::TTime startTime, double prediction{this->value(modelWeights, models, scaleTime(time, m_RegressionOrigin))}; + TDouble3Vec seasonal_(seasonal(time)); + TDouble3Vec level_(level.forecast(time, seasonal_[1] + prediction, confidence)); + double ql{0.0}; double qu{0.0}; double variance{a * CBasicStatistics::mean(variance_) + b * CBasicStatistics::variance(m_ValueMoments)}; - try { - boost::math::normal normal{0.0, std::sqrt(variance)}; - ql = boost::math::quantile(normal, (100.0 - confidence) / 200.0); - qu = boost::math::quantile(normal, (100.0 + confidence) / 200.0); - } catch (const std::exception& e) { - LOG_ERROR(<< "Failed calculating confidence interval: " << e.what() - << ", variance = " << variance << ", confidence = " << confidence); + if (auto interval = confidenceInterval(0.0, variance, confidence)) { + boost::tie(ql, qu) = *interval; } - result[pillar][0] = prediction + ql; - result[pillar][1] = prediction; - result[pillar][2] = prediction + qu; + writer(time, {level_[0] + seasonal_[0] + prediction + ql, + level_[1] + seasonal_[1] + prediction, + level_[2] + seasonal_[2] + prediction + qu}); } } @@ -372,16 +452,27 @@ uint64_t CTrendComponent::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, m_TargetDecayRate); seed = CChecksum::calculate(seed, m_FirstUpdate); seed = CChecksum::calculate(seed, m_LastUpdate); - seed = CChecksum::calculate(seed, m_Models); + seed = CChecksum::calculate(seed, m_TrendModels); seed = CChecksum::calculate(seed, m_PredictionErrorVariance); - return CChecksum::calculate(seed, m_ValueMoments); + seed = CChecksum::calculate(seed, m_ValueMoments); + seed = CChecksum::calculate(seed, m_TimeOfLastLevelChange); + seed = CChecksum::calculate(seed, m_ProbabilityOfLevelChangeModel); + return CChecksum::calculate(seed, m_MagnitudeOfLevelChangeModel); } std::string CTrendComponent::print() const { std::ostringstream result; - for (const auto& model : m_Models) { - result << model.s_Regression.print() << "\n"; + result << "\n===\n"; + result << "Trend Models:"; + for (const auto& model : m_TrendModels) { + result << "\n" << model.s_Regression.print(); } + result << "\n===\n"; + result << "Probability of Change Model:"; + result << m_ProbabilityOfLevelChangeModel.print(); + result << "===\n"; + result << "Magnitude of Change Model:"; + result << m_MagnitudeOfLevelChangeModel.print(); return result.str(); } @@ -416,7 +507,7 @@ CTrendComponent::TDoubleVec CTrendComponent::initialForecastErrorWeights() const double CTrendComponent::count() const { TMeanAccumulator result; - for (const auto& model : m_Models) { + for (const auto& model : m_TrendModels) { result.add(CTools::fastLog(model.s_Regression.count()), CBasicStatistics::mean(model.s_Weight)); } @@ -476,5 +567,56 @@ uint64_t CTrendComponent::SModel::checksum(uint64_t seed) const { seed = CChecksum::calculate(seed, s_Regression); return CChecksum::calculate(seed, s_ResidualMoments); } + +CTrendComponent::CForecastLevel::CForecastLevel(const CNaiveBayes& probability, + const CNormalMeanPrecConjugate& magnitude, + core_t::TTime timeOfLastChange, + std::size_t numberPaths) + : m_Probability(probability), m_Magnitude(magnitude), m_Levels(numberPaths), + m_TimesOfLastChange(numberPaths, timeOfLastChange), + m_ProbabilitiesOfChange(numberPaths, 0.0) { + m_Uniform01.reserve(numberPaths); +} + +CTrendComponent::TDouble3Vec 
+CTrendComponent::CForecastLevel::forecast(core_t::TTime time, double prediction, double confidence) { + TDouble3Vec result{0.0, 0.0, 0.0}; + + if (m_Probability.initialized()) { + CSampling::uniformSample(0.0, 1.0, m_Levels.size(), m_Uniform01); + bool reorder{false}; + for (std::size_t i = 0u; i < m_Levels.size(); ++i) { + double dt{static_cast(time - m_TimesOfLastChange[i])}; + double x = m_Levels[i] + prediction; + double p{m_Probability.classProbability(LEVEL_CHANGE_LABEL, {{dt}, {x}})}; + m_ProbabilitiesOfChange[i] = std::max(m_ProbabilitiesOfChange[i], p); + if (m_Uniform01[i] < m_ProbabilitiesOfChange[i]) { + double stepMean{m_Magnitude.marginalLikelihoodMean()}; + double stepVariance{m_Magnitude.marginalLikelihoodVariance()}; + m_Levels[i] += CSampling::normalSample(m_Rng, stepMean, stepVariance); + m_TimesOfLastChange[i] = time; + m_ProbabilitiesOfChange[i] = 0.0; + reorder = true; + } + } + if (reorder) { + COrderings::simultaneousSort(m_Levels, m_TimesOfLastChange, m_ProbabilitiesOfChange); + } + + double rollouts{static_cast(m_Levels.size())}; + std::size_t lower{std::min( + static_cast((100.0 - confidence) / 200.0 * rollouts + 0.5), + m_Levels.size())}; + std::size_t upper{std::min( + static_cast((100.0 + confidence) / 200.0 * rollouts + 0.5), + m_Levels.size() - 1)}; + + result[0] = m_Levels[lower]; + result[1] = CBasicStatistics::median(m_Levels); + result[2] = m_Levels[upper]; + } + + return result; +} } } diff --git a/lib/maths/CXMeansOnline1d.cc b/lib/maths/CXMeansOnline1d.cc index d52cfe81e8..0938a9c6de 100644 --- a/lib/maths/CXMeansOnline1d.cc +++ b/lib/maths/CXMeansOnline1d.cc @@ -69,11 +69,6 @@ struct SClusterCentreLess { } }; -//! Get \p x time \p x. -double pow2(double x) { - return x * x; -} - //! Get the minimum of \p x, \p y and \p z. 
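// --- Illustrative sketch, not part of this patch ---
// CTrendComponent::CForecastLevel::forecast above propagates level-shift
// uncertainty by Monte Carlo: each path applies a sampled level change with
// the modelled change probability, and the forecast band is read off the
// empirical quantiles of the per-path levels. The same idea reduced to the
// standard library (the fixed change probability and step moments stand in
// for the naive Bayes and normal models used by the real code):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

std::vector<double> rollOutLevels(std::size_t numberPaths, std::size_t numberSteps,
                                  double changeProbability,
                                  double stepMean, double stepVariance) {
    std::mt19937 rng{42};
    std::uniform_real_distribution<double> u01{0.0, 1.0};
    std::normal_distribution<double> step{stepMean, std::sqrt(stepVariance)};
    std::vector<double> levels(numberPaths, 0.0);
    for (std::size_t t = 0; t < numberSteps; ++t) {
        for (auto& level : levels) {
            if (u01(rng) < changeProbability) {
                level += step(rng); // this path experiences a level change
            }
        }
    }
    std::sort(levels.begin(), levels.end());
    // Empirical quantiles of 'levels' give the lower/median/upper forecast levels.
    return levels;
}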
double min(double x, double y, double z) { return std::min(std::min(x, y), z); @@ -272,14 +267,15 @@ void BICGain(maths_t::EDataType dataType, } // Log-normal (method of moments) - double s = std::log(1.0 + v / pow2(m + logNormalOffset)); + double s = std::log(1.0 + v / CTools::pow2(m + logNormalOffset)); double l = std::log(m + logNormalOffset) - s / 2.0; // Gamma (method of moments) - double a = pow2(m + gammaOffset) / v; + double a = CTools::pow2(m + gammaOffset) / v; double b = (m + gammaOffset) / v; double smin = std::max(logNormalOffset, gammaOffset); - double vmin = std::min(MIN_RELATIVE_VARIANCE * std::max(v, pow2(smin)), MIN_ABSOLUTE_VARIANCE); + double vmin = std::min(MIN_RELATIVE_VARIANCE * std::max(v, CTools::pow2(smin)), + MIN_ABSOLUTE_VARIANCE); // Mixture of normals double wl = CBasicStatistics::count(mvl) / n; @@ -291,23 +287,27 @@ void BICGain(maths_t::EDataType dataType, try { // Mixture of log-normals (method of moments) - double sl = std::log(1.0 + vl / pow2(ml + logNormalOffset)); + double sl = std::log(1.0 + vl / CTools::pow2(ml + logNormalOffset)); double ll = std::log(ml + logNormalOffset) - sl / 2.0; - double sr = std::log(1.0 + vr / pow2(mr + logNormalOffset)); + double sr = std::log(1.0 + vr / CTools::pow2(mr + logNormalOffset)); double lr = std::log(mr + logNormalOffset) - sr / 2.0; // Mixture of gammas (method of moments) - double al = pow2(ml + gammaOffset) / vl; + double al = CTools::pow2(ml + gammaOffset) / vl; double bl = (ml + gammaOffset) / vl; - double ar = pow2(mr + gammaOffset) / vr; + double ar = CTools::pow2(mr + gammaOffset) / vr; double br = (mr + gammaOffset) / vr; double log2piv = std::log(boost::math::double_constants::two_pi * v); double log2pis = std::log(boost::math::double_constants::two_pi * s); double loggn = boost::math::lgamma(a) - a * std::log(b); - double log2pivl = std::log(boost::math::double_constants::two_pi * vl / pow2(wl)); - double log2pivr = std::log(boost::math::double_constants::two_pi * vr / pow2(wr)); - double log2pisl = std::log(boost::math::double_constants::two_pi * sl / pow2(wl)); - double log2pisr = std::log(boost::math::double_constants::two_pi * sr / pow2(wr)); + double log2pivl = + std::log(boost::math::double_constants::two_pi * vl / CTools::pow2(wl)); + double log2pivr = + std::log(boost::math::double_constants::two_pi * vr / CTools::pow2(wr)); + double log2pisl = + std::log(boost::math::double_constants::two_pi * sl / CTools::pow2(wl)); + double log2pisr = + std::log(boost::math::double_constants::two_pi * sr / CTools::pow2(wr)); double loggnl = boost::math::lgamma(al) - al * std::log(bl) - std::log(wl); double loggnr = boost::math::lgamma(ar) - ar * std::log(br) - std::log(wr); @@ -318,20 +318,20 @@ void BICGain(maths_t::EDataType dataType, if (vi == 0.0) { double li = std::log(mi + logNormalOffset); - ll1n += ni * ((vi + pow2(mi - m)) / v + log2piv); - ll1l += ni * (pow2(li - l) / s + 2.0 * li + log2pis); + ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv); + ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis); ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn); - ll2nl += ni * ((vi + pow2(mi - ml)) / vl + log2pivl); - ll2ll += ni * (pow2(li - ll) / sl + 2.0 * li + log2pisl); + ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl); + ll2ll += ni * (CTools::pow2(li - ll) / sl + 2.0 * li + log2pisl); ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl); } else { - double si = std::log(1.0 + vi / pow2(mi + logNormalOffset)); + double si = std::log(1.0 + 
vi / CTools::pow2(mi + logNormalOffset)); double li = std::log(mi + logNormalOffset) - si / 2.0; - ll1n += ni * ((vi + pow2(mi - m)) / v + log2piv); - ll1l += ni * ((si + pow2(li - l)) / s + 2.0 * li + log2pis); + ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv); + ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis); ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn); - ll2nl += ni * ((vi + pow2(mi - ml)) / vl + log2pivl); - ll2ll += ni * ((si + pow2(li - ll)) / sl + 2.0 * li + log2pisl); + ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl); + ll2ll += ni * ((si + CTools::pow2(li - ll)) / sl + 2.0 * li + log2pisl); ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl); } } @@ -343,20 +343,20 @@ void BICGain(maths_t::EDataType dataType, if (vi == 0.0) { double li = std::log(mi + logNormalOffset); - ll1n += ni * ((vi + pow2(mi - m)) / v + log2piv); - ll1l += ni * (pow2(li - l) / s + 2.0 * li + log2pis); + ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv); + ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis); ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn); - ll2nr += ni * ((vi + pow2(mi - mr)) / vr + log2pivr); - ll2lr += ni * (pow2(li - lr) / sr + 2.0 * li + log2pisr); + ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr); + ll2lr += ni * (CTools::pow2(li - lr) / sr + 2.0 * li + log2pisr); ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr); } else { - double si = std::log(1.0 + vi / pow2(mi + logNormalOffset)); + double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset)); double li = std::log(mi + logNormalOffset) - si / 2.0; - ll1n += ni * ((vi + pow2(mi - m)) / v + log2piv); - ll1l += ni * ((si + pow2(li - l)) / s + 2.0 * li + log2pis); + ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv); + ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis); ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn); - ll2nr += ni * ((vi + pow2(mi - mr)) / vr + log2pivr); - ll2lr += ni * ((si + pow2(li - lr)) / sr + 2.0 * li + log2pisr); + ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr); + ll2lr += ni * ((si + CTools::pow2(li - lr)) / sr + 2.0 * li + log2pisr); ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr); } } @@ -723,9 +723,9 @@ std::string CXMeansOnline1d::persistenceTag() const { } void CXMeansOnline1d::acceptPersistInserter(core::CStatePersistInserter& inserter) const { - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - inserter.insertLevel(CLUSTER_TAG, boost::bind(&CCluster::acceptPersistInserter, - &m_Clusters[i], _1)); + for (const auto& cluster : m_Clusters) { + inserter.insertLevel( + CLUSTER_TAG, boost::bind(&CCluster::acceptPersistInserter, &cluster, _1)); } inserter.insertValue(AVAILABLE_DISTRIBUTIONS_TAG, m_AvailableDistributions.toString()); inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision); @@ -758,15 +758,15 @@ std::size_t CXMeansOnline1d::numberClusters() const { void CXMeansOnline1d::dataType(maths_t::EDataType dataType) { m_DataType = dataType; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].dataType(dataType); + for (auto& cluster : m_Clusters) { + cluster.dataType(dataType); } } void CXMeansOnline1d::decayRate(double decayRate) { m_DecayRate = decayRate; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].decayRate(decayRate); + for (auto& cluster : m_Clusters) { + 
cluster.decayRate(decayRate); } } @@ -904,9 +904,9 @@ void CXMeansOnline1d::add(const double& point, TSizeDoublePr2Vec& clusters, doub double renormalizer = std::max(likelihoodLeft, likelihoodRight); double pLeft = std::exp(likelihoodLeft - renormalizer); double pRight = std::exp(likelihoodRight - renormalizer); - double normalizer = pLeft + pRight; - pLeft /= normalizer; - pRight /= normalizer; + double pLeftPlusRight = pLeft + pRight; + pLeft /= pLeftPlusRight; + pRight /= pLeftPlusRight; if (pLeft < HARD_ASSIGNMENT_THRESHOLD * pRight) { LOG_TRACE(<< "Adding " << point << " to " << rightCluster->centre()); @@ -948,11 +948,11 @@ void CXMeansOnline1d::add(const double& point, TSizeDoublePr2Vec& clusters, doub void CXMeansOnline1d::add(const TDoubleDoublePrVec& points) { if (m_Clusters.empty()) { - m_Clusters.push_back(CCluster(*this)); + m_Clusters.emplace_back(*this); } TSizeDoublePr2Vec dummy; - for (std::size_t i = 0u; i < points.size(); ++i) { - this->add(points[i].first, dummy, points[i].second); + for (const auto& point : points) { + this->add(point.first, dummy, point.second); } } @@ -962,8 +962,8 @@ void CXMeansOnline1d::propagateForwardsByTime(double time) { return; } m_HistoryLength = (m_HistoryLength + time) * std::exp(-m_DecayRate * time); - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - m_Clusters[i].propagateForwardsByTime(time); + for (auto& cluster : m_Clusters) { + cluster.propagateForwardsByTime(time); } } @@ -979,15 +979,14 @@ bool CXMeansOnline1d::sample(std::size_t index, std::size_t numberSamples, TDoub double CXMeansOnline1d::probability(std::size_t index) const { double weight = 0.0; - double weightSum = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - const CCluster& cluster = m_Clusters[i]; + double Z = 0.0; + for (const auto& cluster : m_Clusters) { if (cluster.index() == index) { weight = cluster.weight(maths_t::E_ClustersFractionWeight); } - weightSum += cluster.weight(maths_t::E_ClustersFractionWeight); + Z += cluster.weight(maths_t::E_ClustersFractionWeight); } - return weightSum == 0.0 ? 0.0 : weight / weightSum; + return Z == 0.0 ? 
0.0 : weight / Z; } void CXMeansOnline1d::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { @@ -1015,11 +1014,10 @@ uint64_t CXMeansOnline1d::checksum(uint64_t seed) const { } double CXMeansOnline1d::count() const { - double result = 0.0; - for (std::size_t i = 0; i < m_Clusters.size(); ++i) { - result += m_Clusters[i].count(); - } - return result; + return std::accumulate(m_Clusters.begin(), m_Clusters.end(), 0.0, + [](double count, const CCluster& cluster) { + return count + cluster.count(); + }); } const CXMeansOnline1d::TClusterVec& CXMeansOnline1d::clusters() const { @@ -1043,16 +1041,16 @@ std::string CXMeansOnline1d::printClusters() const { TDoubleDoublePr range(boost::numeric::bounds::highest(), boost::numeric::bounds::lowest()); - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - const CPrior& prior = m_Clusters[i].prior(); + for (const auto& cluster : m_Clusters) { + const CPrior& prior = cluster.prior(); TDoubleDoublePr clusterRange = prior.marginalLikelihoodConfidenceInterval(RANGE); range.first = std::min(range.first, clusterRange.first); range.second = std::max(range.second, clusterRange.second); } - double weightSum = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - weightSum += m_Clusters[i].weight(m_WeightCalc); + double Z = 0.0; + for (const auto& cluster : m_Clusters) { + Z += cluster.weight(m_WeightCalc); } TDouble1Vec x{range.first}; @@ -1062,15 +1060,14 @@ std::string CXMeansOnline1d::printClusters() const { std::ostringstream likelihoodStr; coordinatesStr << "x = ["; likelihoodStr << "likelihood = ["; - for (unsigned int i = 0u; i < POINTS; ++i, x[0] += increment) { + for (unsigned int i = 0; i < POINTS; ++i, x[0] += increment) { double likelihood = 0.0; - for (std::size_t j = 0u; j < m_Clusters.size(); ++j) { + for (const auto& cluster : m_Clusters) { double logLikelihood; - const CPrior& prior = m_Clusters[j].prior(); + const CPrior& prior = cluster.prior(); if (!(prior.jointLogMarginalLikelihood(x, maths_t::CUnitWeights::SINGLE_UNIT, logLikelihood) & (maths_t::E_FpFailed | maths_t::E_FpOverflowed))) { - likelihood += m_Clusters[j].weight(m_WeightCalc) / weightSum * - std::exp(logLikelihood); + likelihood += cluster.weight(m_WeightCalc) / Z * std::exp(logLikelihood); } } coordinatesStr << x[0] << " "; @@ -1119,9 +1116,9 @@ bool CXMeansOnline1d::acceptRestoreTraverser(const SDistributionRestoreParams& p } const CXMeansOnline1d::CCluster* CXMeansOnline1d::cluster(std::size_t index) const { - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - if (m_Clusters[i].index() == index) { - return &m_Clusters[i]; + for (const auto& cluster : m_Clusters) { + if (cluster.index() == index) { + return &cluster; } } return nullptr; @@ -1130,13 +1127,9 @@ const CXMeansOnline1d::CCluster* CXMeansOnline1d::cluster(std::size_t index) con double CXMeansOnline1d::minimumSplitCount() const { double result = m_MinimumClusterCount; if (m_MinimumClusterFraction > 0.0) { - double count = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - count += m_Clusters[i].count(); - } - double scale = - std::max(m_HistoryLength * (1.0 - std::exp(-m_InitialDecayRate)), 1.0); - count *= m_MinimumClusterFraction / scale; + double count = this->count(); + double scale = m_HistoryLength * (1.0 - std::exp(-m_InitialDecayRate)); + count *= m_MinimumClusterFraction / std::max(scale, 1.0); result = std::max(result, count); } LOG_TRACE(<< "minimumSplitCount = " << result); @@ -1144,11 +1137,9 @@ double CXMeansOnline1d::minimumSplitCount() const { } bool 
CXMeansOnline1d::maybeSplit(TClusterVecItr cluster) { - if (cluster == m_Clusters.end()) { return false; } - TDoubleDoublePr interval = this->winsorisationInterval(); if (TOptionalClusterClusterPr split = cluster->split(m_AvailableDistributions, this->minimumSplitCount(), @@ -1161,16 +1152,13 @@ bool CXMeansOnline1d::maybeSplit(TClusterVecItr cluster) { (this->splitFunc())(index, split->first.index(), split->second.index()); return true; } - return false; } bool CXMeansOnline1d::maybeMerge(TClusterVecItr cluster1, TClusterVecItr cluster2) { - if (cluster1 == m_Clusters.end() || cluster2 == m_Clusters.end()) { return false; } - TDoubleDoublePr interval = this->winsorisationInterval(); if (cluster1->shouldMerge(*cluster2, m_AvailableDistributions, m_Smallest[0], interval)) { LOG_TRACE(<< "Merging cluster " << cluster1->index() << " at " @@ -1184,7 +1172,6 @@ bool CXMeansOnline1d::maybeMerge(TClusterVecItr cluster1, TClusterVecItr cluster (this->mergeFunc())(index1, index2, merged.index()); return true; } - return false; } @@ -1234,11 +1221,7 @@ TDoubleDoublePr CXMeansOnline1d::winsorisationInterval() const { // Winsorisation confidence interval, i.e. we truncate the // data to the 1 - f central confidence interval. - double totalCount = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - totalCount += m_Clusters[i].count(); - } - + double totalCount = this->count(); double leftCount = f * totalCount; double rightCount = (1.0 - f) * totalCount; LOG_TRACE(<< "totalCount = " << totalCount << " interval = [" << leftCount @@ -1248,15 +1231,15 @@ TDoubleDoublePr CXMeansOnline1d::winsorisationInterval() const { TDoubleDoublePr result; double partialCount = 0.0; - for (std::size_t i = 0u; i < m_Clusters.size(); ++i) { - double count = m_Clusters[i].count(); + for (const auto& cluster : m_Clusters) { + double count = cluster.count(); if (partialCount < leftCount && partialCount + count >= leftCount) { double p = 100.0 * (leftCount - partialCount) / count; - result.first = m_Clusters[i].percentile(p); + result.first = cluster.percentile(p); } if (partialCount < rightCount && partialCount + count >= rightCount) { double p = 100.0 * (rightCount - partialCount) / count; - result.second = m_Clusters[i].percentile(p); + result.second = cluster.percentile(p); break; } partialCount += count; @@ -1394,7 +1377,7 @@ CXMeansOnline1d::CCluster::split(CAvailableModeDistributions distributions, LOG_TRACE(<< "split"); if (m_Structure.buffering()) { - return TOptionalClusterClusterPr(); + return {}; } maths_t::EDataType dataType = m_Prior.dataType(); @@ -1402,19 +1385,19 @@ CXMeansOnline1d::CCluster::split(CAvailableModeDistributions distributions, std::size_t n = m_Structure.size(); if (n < 2) { - return TOptionalClusterClusterPr(); + return {}; } TSizeVec split; { TTupleVec categories; m_Structure.categories(n, 0, categories); - for (std::size_t i = 0u; i < categories.size(); ++i) { - detail::winsorise(interval, categories[i]); + for (auto& category : categories) { + detail::winsorise(interval, category); } if (!detail::splitSearch(minimumCount, MINIMUM_SPLIT_DISTANCE, dataType, distributions, smallest, categories, split)) { - return TOptionalClusterClusterPr(); + return {}; } } @@ -1433,8 +1416,8 @@ CXMeansOnline1d::CCluster::split(CAvailableModeDistributions distributions, CNormalMeanPrecConjugate leftNormal(dataType, categories[0], decayRate); CNormalMeanPrecConjugate rightNormal(dataType, categories[1], decayRate); - return TClusterClusterPr(CCluster(index1, leftNormal, classifiers[0]), - 
CCluster(index2, rightNormal, classifiers[1])); + return TClusterClusterPr{CCluster(index1, leftNormal, classifiers[0]), + CCluster(index2, rightNormal, classifiers[1])}; } bool CXMeansOnline1d::CCluster::shouldMerge(CCluster& other, @@ -1457,8 +1440,8 @@ bool CXMeansOnline1d::CCluster::shouldMerge(CCluster& other, return false; } - for (std::size_t i = 0u; i < categories.size(); ++i) { - detail::winsorise(interval, categories[i]); + for (auto& category : categories) { + detail::winsorise(interval, category); } double distance; diff --git a/lib/maths/Makefile b/lib/maths/Makefile index 20af85b3ce..22a5fd7a79 100644 --- a/lib/maths/Makefile +++ b/lib/maths/Makefile @@ -60,6 +60,7 @@ CMultivariateNormalConjugateFactory.cc \ CMultivariateOneOfNPrior.cc \ CMultivariateOneOfNPriorFactory.cc \ CMultivariatePrior.cc \ +CNaiveBayes.cc \ CNaturalBreaksClassifier.cc \ CNormalMeanPrecConjugate.cc \ COneOfNPrior.cc \ @@ -85,6 +86,7 @@ CSeasonalTime.cc \ CSignal.cc \ CSpline.cc \ CStatisticalTests.cc \ +CTimeSeriesChangeDetector.cc \ CTimeSeriesDecomposition.cc \ CTimeSeriesDecompositionDetail.cc \ CTimeSeriesDecompositionStateSerialiser.cc \ diff --git a/lib/maths/ProbabilityAggregators.cc b/lib/maths/ProbabilityAggregators.cc index e7b23a1367..b4449f5ecb 100644 --- a/lib/maths/ProbabilityAggregators.cc +++ b/lib/maths/ProbabilityAggregators.cc @@ -23,15 +23,10 @@ namespace ml { namespace maths { - namespace { -using TDoubleVec = std::vector; -using TDoubleDoublePr = std::pair; -//! Compute \f$x^2\f$. -inline double square(double x) { - return x * x; -} +using TDoubleDoublePr = std::pair; +using TDoubleVec = std::vector; //! Compute the deviation corresponding to a probability of less likely //! samples \p p. @@ -50,7 +45,7 @@ inline double square(double x) { bool deviation(double p, double& result) { try { boost::math::normal_distribution<> normal(0.0, 1.0); - result = square(boost::math::quantile(normal, p / 2.0)); + result = CTools::pow2(boost::math::quantile(normal, p / 2.0)); return true; } catch (const std::exception& e) { LOG_ERROR(<< "Unable to compute quantile: " << e.what() << ", probability = " << p); @@ -58,125 +53,6 @@ bool deviation(double p, double& result) { return false; } -const double EPS = 0.1; - -//! A custom, numerically robust, implementation of \f$(1 - x) ^ p\f$. -//! -//! \note It is assumed that p is integer. -double powOneMinusX(double x, double p) { - // For large p, - // (1 - x) ^ p ~= exp(-p * x). - // - // and this doesn't suffer from cancellation errors in the limit - // p -> inf and x -> 0. For p * x << 1 we get much better precision - // using the Taylor expansion: - // (1 - x) ^ p = 1 - p * x + p * (p - 1) * x^2 / 2! + ... - // - // and canceling the leading terms. - - if (x == 1.0) { - return 0.0; - } - if (p == 1.0) { - return 1.0 - x; - } - - double y = p * x; - if (std::fabs(y) < EPS) { - static const double COEFFS[] = {-1.0, +1.0 / 2.0, - -1.0 / 6.0, +1.0 / 24.0, - -1.0 / 120.0, +1.0 / 720.0}; - static const std::size_t N = boost::size(COEFFS); - - double remainder = 0.0; - double ti = 1.0; - for (std::size_t i = 0u; i < N && p != 0.0; ++i, p -= 1.0) { - ti *= p * x; - remainder += COEFFS[i] * ti; - } - return 1.0 + remainder; - } else if (p > 1000.0) { - return std::exp(-y); - } - - if (x > 1.0) { - double sign = static_cast(p) % 2 ? -1.0 : 1.0; - return sign * std::exp(p * std::log(x - 1.0)); - } - - return std::exp(p * std::log(1.0 - x)); -} - -//! A custom, numerically robust, implementation of \f$1 - (1 - x) ^ p\f$. -//! -//! 
\note It is assumed that p is integer. -double oneMinusPowOneMinusX(double x, double p) { - // For large p, - // (1 - x) ^ p ~= exp(-p * x). - // - // and this doesn't suffer from cancellation errors in the limit - // p -> inf and x -> 0. For p * x << 1 we get much better precision - // using the Taylor expansion: - // (1 - x) ^ p = 1 - p * x + p * (p - 1) * x^2 / 2! + ... - // - // Note that this doesn't make use of powOneMinusX because we can - // avoid the cancellation errors by using: - // 1 - (1 - x) ^ p = p * x - p * (p - 1) * x^2 / 2 + ... - // - // when p * x is small. - - if (x == 1.0) { - return 1.0; - } - if (p == 1.0) { - return x; - } - - double y = p * x; - if (std::fabs(y) < EPS) { - static const double COEFFS[] = {+1.0, -1.0 / 2.0, - +1.0 / 6.0, -1.0 / 24.0, - +1.0 / 120.0, -1.0 / 720.0}; - static const std::size_t N = boost::size(COEFFS); - - double result = 0.0; - - double ti = 1.0; - for (std::size_t i = 0u; i < N && p != 0.0; ++i, p -= 1.0) { - ti *= p * x; - result += COEFFS[i] * ti; - } - - return result; - } else if (p > 1000.0) { - return 1.0 - std::exp(-y); - } - - if (x > 1.0) { - double sign = static_cast(p) % 2 ? -1.0 : 1.0; - return 1.0 - sign * std::exp(p * std::log(x - 1.0)); - } - - return 1.0 - std::exp(p * std::log(1.0 - x)); -} - -//! A custom implementation of \f$\log(1 - x)\f$ which handles the -//! cancellation error for small x. -double logOneMinusX(double x) { - double result = 0.0; - - if (std::fabs(x) < EPS) { - double xi = -x; - for (std::size_t i = 0u; i < 6; ++i, xi *= -x) { - result += xi / static_cast(i + 1); - } - } else { - result = std::log(1.0 - x); - } - - return result; -} - //! \brief Calculates the probability of the m most extreme samples. //! //! DESCRIPTION:\n @@ -222,7 +98,7 @@ class CNumericalLogProbabilityOfMFromNExtremeSamples { //! Evaluate the i'th integral at \p x. 
double evaluate(double x) const { if (m_I == m_M) { - return static_cast(m_N - m_M) * logOneMinusX(x); + return static_cast(m_N - m_M) * CTools::logOneMinusX(x); } double result; CLogIntegrand f(*m_Limits, *m_Corrections, m_N, m_M, m_I + 1u); @@ -631,7 +507,8 @@ bool CLogJointProbabilityOfLessLikelySamples::calculateLowerBound(double& result b1 = -1.0 - 0.5 * logm + m * (1.0 + logx - logm); } else if (E * x / m != 1.0) { double r = 1.0 - E * x / m; - b1 = -1.0 - 0.5 * logm + std::log(oneMinusPowOneMinusX(r, m + 1.0) / r); + b1 = -1.0 - 0.5 * logm + + std::log(CTools::oneMinusPowOneMinusX(r, m + 1.0) / r); } else { // Use L'Hopital's rule to show that: // lim { (1 - r^(m+1)) / (1 - r) } = m + 1 @@ -650,7 +527,7 @@ bool CLogJointProbabilityOfLessLikelySamples::calculateLowerBound(double& result } else if (E * x / p != 1.0) { double r = 1.0 - E * x / p; t = m + (m + 1.0) * logx - (m + 1.5) * logp + - std::log(oneMinusPowOneMinusX(r, p - m) / r); + std::log(CTools::oneMinusPowOneMinusX(r, p - m) / r); } else { // Use L'Hopital's rule to show that: // lim { (1 - r^(p - m)) / (1 - r) } = p - m @@ -772,7 +649,7 @@ bool CLogJointProbabilityOfLessLikelySamples::calculateUpperBound(double& result b1 = (p + 1.0) * std::log(p / x) - std::log(p / x - 1.0); } else if (p != x) { double r = 1.0 - p / x; - b1 = std::log(oneMinusPowOneMinusX(r, p + 1.0) / r); + b1 = std::log(CTools::oneMinusPowOneMinusX(r, p + 1.0) / r); } else { // Use L'Hopital's rule to show that: // lim { (1 - r^(p+1)) / (1 - r) } = p + 1 @@ -836,7 +713,7 @@ bool CProbabilityOfExtremeSample::calculate(double& result) const { result = 1.0; if (m_NumberSamples > 0) { result = CTools::truncate( - oneMinusPowOneMinusX(m_MinValue[0], m_NumberSamples), 0.0, 1.0); + CTools::oneMinusPowOneMinusX(m_MinValue[0], m_NumberSamples), 0.0, 1.0); } return true; } @@ -945,7 +822,7 @@ bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) { for (std::size_t i = 0u; i < coeffs.size(); ++i) { double index = static_cast(coeffs.size() - i); coeffs[i] /= index; - sum += coeffs[i] * powOneMinusX(p / 2.0, index); + sum += coeffs[i] * CTools::powOneMinusX(p / 2.0, index); } LOG_TRACE(<< "sum = " << sum); @@ -955,8 +832,8 @@ bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) { // that the following calculation can't use the re-normalized // "c" directly because it might be infinite. Instead, we make // use the fact that c * (1 - p)^(N - M + m) won't overflow. 
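// --- Illustrative aside, not part of this patch ---
// The CTools::powOneMinusX / CTools::oneMinusPowOneMinusX helpers used just
// below replace the file-local versions removed above. The series expansion
// they implement avoids the catastrophic cancellation of the naive expression
// when p * x is very small:

#include <cmath>
#include <iostream>

// Naive form: loses all precision once 1.0 - x rounds to exactly 1.0.
double oneMinusPowOneMinusXNaive(double x, double p) {
    return 1.0 - std::pow(1.0 - x, p);
}

// Leading terms of the binomial series 1 - (1 - x)^p = p*x - p*(p-1)*x^2/2! + ...,
// which remain accurate when |p * x| is small.
double oneMinusPowOneMinusXSeries(double x, double p) {
    return p * x - p * (p - 1.0) * x * x / 2.0 +
           p * (p - 1.0) * (p - 2.0) * x * x * x / 6.0;
}

int main() {
    // With x = 1e-17 and p = 10 the naive form prints 0 (since 1.0 - x == 1.0
    // in double precision) whereas the series form prints ~1e-16, the true value.
    std::cout << oneMinusPowOneMinusXNaive(1e-17, 10.0) << " "
              << oneMinusPowOneMinusXSeries(1e-17, 10.0) << "\n";
    return 0;
}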
- double q = CTools::truncate(powOneMinusX(p, static_cast(N - M + m)), - 0.0, 1.0); + double q = CTools::truncate( + CTools::powOneMinusX(p, static_cast(N - M + m)), 0.0, 1.0); coeffs.push_back(-sum - q * std::exp(logc - logLargestCoeff)); LOG_TRACE(<< "c(0) = " << coeffs.back()); @@ -994,7 +871,7 @@ bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) { double pM = m_MinValues[0]; LOG_TRACE(<< "p(" << M << ") = " << pM); - double pMin = oneMinusPowOneMinusX(pM, static_cast(N)); + double pMin = CTools::oneMinusPowOneMinusX(pM, static_cast(N)); LOG_TRACE(<< "1 - (1 - p(" << M << "))^" << N << " = " << pMin); if (M > 1) { @@ -1011,7 +888,7 @@ bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) { for (std::size_t i = 0u; i < coeffs.size(); ++i) { double index = static_cast(coeffs.size() - i); double c = coeffs[i] / index; - double p = oneMinusPowOneMinusX(pM / 2.0, index); + double p = CTools::oneMinusPowOneMinusX(pM / 2.0, index); LOG_TRACE(<< "term(" << index << ") = " << (c * p) << " (c(" << index << ") = " << c << ", 1 - (1 - p(M)/2)^" << index << " = " << p << ")"); diff --git a/lib/maths/unittest/CForecastTest.cc b/lib/maths/unittest/CForecastTest.cc index 58c6357d13..dd93281379 100644 --- a/lib/maths/unittest/CForecastTest.cc +++ b/lib/maths/unittest/CForecastTest.cc @@ -56,8 +56,12 @@ maths::CModelParams params(core_t::TTime bucketLength) { static TTimeDoubleMap learnRates; learnRates[bucketLength] = static_cast(bucketLength) / 1800.0; double minimumSeasonalVarianceScale{0.25}; - return maths::CModelParams{bucketLength, learnRates[bucketLength], - DECAY_RATE, minimumSeasonalVarianceScale}; + return maths::CModelParams{bucketLength, + learnRates[bucketLength], + DECAY_RATE, + minimumSeasonalVarianceScale, + 6 * core::constants::HOUR, + core::constants::DAY}; } maths::CUnivariateTimeSeriesModel::TDecayRateController2Ary decayRateControllers() { diff --git a/lib/maths/unittest/CLinearAlgebraTest.cc b/lib/maths/unittest/CLinearAlgebraTest.cc index e7205e16f2..e184fd2139 100644 --- a/lib/maths/unittest/CLinearAlgebraTest.cc +++ b/lib/maths/unittest/CLinearAlgebraTest.cc @@ -1041,7 +1041,8 @@ void CLinearAlgebraTest::testProjected() { } void CLinearAlgebraTest::testPersist() { - // Check conversion to and from delimited is idempotent. + // Check conversion to and from delimited is idempotent and parsing + // bad input produces an error. 
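// --- Illustrative sketch, not part of this patch ---
// The persistence tests in this change set (this one, CNaiveBayesTest::testPersist
// and CTimeSeriesChangeDetectorTest::testPersist below) all use the same
// round-trip pattern: persist, restore, persist again, and require the two
// serialised forms to be identical. A toy, self-contained version of that pattern:

#include <cassert>
#include <sstream>
#include <string>

// Toy stand-in for the real state classes: persists two parameters as delimited text.
struct SToyState {
    double s_DecayRate{0.0};
    double s_Count{0.0};

    std::string toDelimited() const {
        std::ostringstream result;
        result << s_DecayRate << ";" << s_Count;
        return result.str();
    }
    static SToyState fromDelimited(const std::string& state) {
        SToyState result;
        char delimiter;
        std::istringstream stream{state};
        stream >> result.s_DecayRate >> delimiter >> result.s_Count;
        return result;
    }
};

int main() {
    SToyState original{0.05, 300.0};
    std::string persisted{original.toDelimited()};
    SToyState restored{SToyState::fromDelimited(persisted)};
    // Restoring and persisting again must reproduce the original representation.
    assert(restored.toDelimited() == persisted);
    return 0;
}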
{ double matrix_[][4] = {{1.0, 2.1, 1.5, 0.1}, diff --git a/lib/maths/unittest/CModelTest.cc b/lib/maths/unittest/CModelTest.cc index 1d4a8af80f..61406dd52d 100644 --- a/lib/maths/unittest/CModelTest.cc +++ b/lib/maths/unittest/CModelTest.cc @@ -7,6 +7,7 @@ #include "CModelTest.h" #include +#include #include #include @@ -24,8 +25,8 @@ void CModelTest::testAll() { double learnRate{0.5}; double decayRate{0.001}; double minimumSeasonalVarianceScale{0.3}; - maths::CModelParams params(bucketLength, learnRate, decayRate, - minimumSeasonalVarianceScale); + maths::CModelParams params(bucketLength, learnRate, decayRate, minimumSeasonalVarianceScale, + 6 * core::constants::HOUR, core::constants::DAY); CPPUNIT_ASSERT_EQUAL(bucketLength, params.bucketLength()); CPPUNIT_ASSERT_EQUAL(learnRate, params.learnRate()); CPPUNIT_ASSERT_EQUAL(decayRate, params.decayRate()); @@ -34,6 +35,8 @@ void CModelTest::testAll() { CPPUNIT_ASSERT_EQUAL(0.0, params.probabilityBucketEmpty()); params.probabilityBucketEmpty(0.2); CPPUNIT_ASSERT_EQUAL(0.2, params.probabilityBucketEmpty()); + CPPUNIT_ASSERT_EQUAL(6 * core::constants::HOUR, params.minimumTimeToDetectChange()); + CPPUNIT_ASSERT_EQUAL(core::constants::DAY, params.maximumTimeToTestForChange()); } { maths_t::TDouble2VecWeightsAry weight1(maths_t::CUnitWeights::unit(2)); diff --git a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc index 39735ea51e..a81721c64a 100644 --- a/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc +++ b/lib/maths/unittest/CMultivariateMultimodalPriorTest.cc @@ -658,7 +658,7 @@ void CMultivariateMultimodalPriorTest::testMarginalLikelihoodMean() { for (std::size_t i = 0u; i < samples.size(); ++i) { filter.addSamples({samples[i]}, maths_t::CUnitWeights::singleUnit(2)); - expectedMean.add(samples[i]); + expectedMean.add(TVector2(samples[i])); if (i % 10 == 0) { LOG_DEBUG(<< "sample mean = " << maths::CBasicStatistics::mean(expectedMean)); diff --git a/lib/maths/unittest/CNaiveBayesTest.cc b/lib/maths/unittest/CNaiveBayesTest.cc new file mode 100644 index 0000000000..b0de6d00d0 --- /dev/null +++ b/lib/maths/unittest/CNaiveBayesTest.cc @@ -0,0 +1,335 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include "CNaiveBayesTest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include +#include + +using namespace ml; + +using TDoubleVec = std::vector; +using TDouble1Vec = core::CSmallVector; +using TDouble1VecVec = std::vector; +using TDoubleSizePr = std::pair; +using TDoubleSizePrVec = std::vector; +using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; +using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; + +void CNaiveBayesTest::testClassification() { + // We'll test classification using Gaussian naive Bayes. 
We + // test: + // - We get the probabilities we expect using if the underlying + // classes are consistent with the assumptions, + // - Test with missing data + + // We test two features with true density + // - x(1) ~ N(0,12) | C(1), + // - x(2) ~ N(10,16) | C(1), + // - x(1) ~ N(3,14) | C(2), + // - x(2) ~ N(-5,24) | C(2) + + test::CRandomNumbers rng; + + TDoubleVec trainingData[4]; + rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); + rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); + rng.generateNormalSamples(3.0, 14.0, 200, trainingData[2]); + rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]); + + TMeanAccumulator meanMeanError; + + for (auto initialCount : {0.0, 100.0}) { + maths::CNormalMeanPrecConjugate normal{ + maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)}; + maths::CNaiveBayes nb{maths::CNaiveBayesFeatureDensityFromPrior(normal)}; + + if (initialCount > 0) { + nb.initialClassCounts({{initialCount, 1}, {initialCount, 2}}); + } + + for (std::size_t i = 0u; i < 100; ++i) { + nb.addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}}); + } + for (std::size_t i = 0u; i < 200; ++i) { + nb.addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}}); + } + + TMeanVarAccumulator moments[4]; + moments[0].add(trainingData[0]); + moments[1].add(trainingData[1]); + moments[2].add(trainingData[2]); + moments[3].add(trainingData[3]); + + // The training data sizes are 100 and 200 so we expect the + // class probabilities to be: + // - P(1) = (initialCount + 100) / (2*initialCount + 300) + // - P(2) = (initialCount + 200) / (2*initialCount + 300) + + TDoubleSizePrVec probabilities(nb.highestClassProbabilities(2, {{}, {}})); + + double P1{(initialCount + 100.0) / (2.0 * initialCount + 300.0)}; + double P2{(initialCount + 200.0) / (2.0 * initialCount + 300.0)}; + + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities.size()); + CPPUNIT_ASSERT_DOUBLES_EQUAL(P1, probabilities[1].first, 1e-5); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[1].second); + CPPUNIT_ASSERT_DOUBLES_EQUAL(P2, probabilities[0].first, 1e-5); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[0].second); + + // If we supply feature values we should approximately + // get these modulated by the product of the true density + // ratios for those feature values. + + boost::math::normal class1[]{ + boost::math::normal{maths::CBasicStatistics::mean(moments[0]), + std::sqrt(maths::CBasicStatistics::variance(moments[0]))}, + boost::math::normal{maths::CBasicStatistics::mean(moments[1]), + std::sqrt(maths::CBasicStatistics::variance(moments[1]))}}; + boost::math::normal class2[]{ + boost::math::normal{maths::CBasicStatistics::mean(moments[2]), + std::sqrt(maths::CBasicStatistics::variance(moments[2]))}, + boost::math::normal{maths::CBasicStatistics::mean(moments[3]), + std::sqrt(maths::CBasicStatistics::variance(moments[3]))}}; + + TDoubleVec xtest; + rng.generateNormalSamples(0.0, 64.0, 40, xtest); + + TMeanAccumulator meanErrors[3]; + + for (std::size_t i = 0u; i < xtest.size(); i += 2) { + auto test = [i](double p1, double p2, const TDoubleSizePrVec& p, + TMeanAccumulator& meanError) { + double Z{p1 + p2}; + p1 /= Z; + p2 /= Z; + double p1_{p[0].second == 1 ? p[0].first : p[1].first}; + double p2_{p[0].second == 1 ? 
p[1].first : p[0].first}; + + if (i % 10 == 0) { + LOG_DEBUG(<< i << ") expected P(1) = " << p1 << ", P(2) = " << p2 + << " got P(1) = " << p1_ << ", P(2) = " << p2_); + } + + CPPUNIT_ASSERT_EQUAL(std::size_t(2), p.size()); + CPPUNIT_ASSERT_DOUBLES_EQUAL(p1, p1_, 0.03); + CPPUNIT_ASSERT_DOUBLES_EQUAL(p2, p2_, 0.03); + if (p1 > 0.001) { + meanError.add(std::fabs((p1 - p1_) / p1)); + } + if (p2 > 0.001) { + meanError.add(std::fabs((p2 - p2_) / p2)); + } + }; + + // Supply both feature values. + double p1{P1 * maths::CTools::safePdf(class1[0], xtest[i]) * + maths::CTools::safePdf(class1[1], xtest[i + 1])}; + double p2{P2 * maths::CTools::safePdf(class2[0], xtest[i]) * + maths::CTools::safePdf(class2[1], xtest[i + 1])}; + probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {xtest[i + 1]}}); + test(p1, p2, probabilities, meanErrors[0]); + + // Miss out the first feature value. + p1 = P1 * maths::CTools::safePdf(class1[1], xtest[i + 1]); + p2 = P2 * maths::CTools::safePdf(class2[1], xtest[i + 1]); + probabilities = nb.highestClassProbabilities(2, {{}, {xtest[i + 1]}}); + test(p1, p2, probabilities, meanErrors[1]); + + // Miss out the second feature value. + p1 = P1 * maths::CTools::safePdf(class1[0], xtest[i]); + p2 = P2 * maths::CTools::safePdf(class2[0], xtest[i]); + probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {}}); + test(p1, p2, probabilities, meanErrors[2]); + } + + for (std::size_t i = 0u; i < 3; ++i) { + LOG_DEBUG(<< "Mean relative error = " + << maths::CBasicStatistics::mean(meanErrors[i])); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanErrors[i]) < 0.05); + meanMeanError += meanErrors[i]; + } + } +} + +void CNaiveBayesTest::testPropagationByTime() { + // Make feature distributions drift over time and verify that + // the classifier adapts. + + test::CRandomNumbers rng; + + maths::CNormalMeanPrecConjugate normal{maths::CNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, 0.05)}; + maths::CNaiveBayes nb[]{ + maths::CNaiveBayes{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}, + maths::CNaiveBayes{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}}; + + TDoubleVec trainingData[4]; + for (std::size_t i = 0u; i < 1000; ++i) { + double x{static_cast(i)}; + rng.generateNormalSamples(0.02 * x - 14.0, 16.0, 1, trainingData[0]); + rng.generateNormalSamples(0.02 * x - 14.0, 16.0, 1, trainingData[1]); + rng.generateNormalSamples(-0.02 * x + 14.0, 16.0, 1, trainingData[2]); + rng.generateNormalSamples(-0.02 * x + 14.0, 16.0, 1, trainingData[3]); + + nb[0].addTrainingDataPoint(1, {{trainingData[0][0]}, {trainingData[1][0]}}); + nb[0].addTrainingDataPoint(2, {{trainingData[2][0]}, {trainingData[3][0]}}); + nb[0].propagateForwardsByTime(1.0); + + nb[1].addTrainingDataPoint(1, {{trainingData[0][0]}, {trainingData[1][0]}}); + nb[1].addTrainingDataPoint(2, {{trainingData[2][0]}, {trainingData[3][0]}}); + } + + // Check that the value: + // - (-10,-10) gets assigned to class 2 + // - ( 10, 10) gets assigned to class 1 + // for the aged classifier and vice versa. 
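// --- Illustrative sketch, not part of this patch ---
// The probabilities asserted in testClassification above follow the standard
// Gaussian naive Bayes posterior: P(C | x) is proportional to P(C) times the
// product over features i of N(x_i | m_Ci, v_Ci). A minimal standalone version
// for two classes:

#include <cmath>
#include <vector>

// Gaussian density written out to avoid any library dependency.
double gaussian(double x, double mean, double variance) {
    static const double TWO_PI{6.283185307179586};
    return std::exp(-(x - mean) * (x - mean) / (2.0 * variance)) /
           std::sqrt(TWO_PI * variance);
}

// P(class 1 | x) for two classes with one (mean, variance) pair per feature.
double classOnePosterior(const std::vector<double>& x,
                         double priorOne,
                         const std::vector<double>& meanOne,
                         const std::vector<double>& varianceOne,
                         double priorTwo,
                         const std::vector<double>& meanTwo,
                         const std::vector<double>& varianceTwo) {
    double likelihoodOne{priorOne};
    double likelihoodTwo{priorTwo};
    for (std::size_t i = 0; i < x.size(); ++i) {
        likelihoodOne *= gaussian(x[i], meanOne[i], varianceOne[i]);
        likelihoodTwo *= gaussian(x[i], meanTwo[i], varianceTwo[i]);
    }
    return likelihoodOne / (likelihoodOne + likelihoodTwo);
}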
+ + { + TDoubleSizePrVec probabilities[]{ + nb[0].highestClassProbabilities(2, {{-10.0}, {-10.0}}), + nb[1].highestClassProbabilities(2, {{-10.0}, {-10.0}})}; + LOG_DEBUG(<< "Aged class probabilities = " + << core::CContainerPrinter::print(probabilities[0])); + LOG_DEBUG(<< "Class probabilities = " + << core::CContainerPrinter::print(probabilities[1])); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[0][0].second); + CPPUNIT_ASSERT(probabilities[0][0].first > 0.99); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[1][0].second); + CPPUNIT_ASSERT(probabilities[1][0].first > 0.95); + } + { + TDoubleSizePrVec probabilities[]{ + nb[0].highestClassProbabilities(2, {{10.0}, {10.0}}), + nb[1].highestClassProbabilities(2, {{10.0}, {10.0}})}; + LOG_DEBUG(<< "Aged class probabilities = " + << core::CContainerPrinter::print(probabilities[0])); + LOG_DEBUG(<< "Class probabilities = " + << core::CContainerPrinter::print(probabilities[1])); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[0][0].second); + CPPUNIT_ASSERT(probabilities[0][0].first > 0.99); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[1][0].second); + CPPUNIT_ASSERT(probabilities[1][0].first > 0.95); + } +} + +void CNaiveBayesTest::testMemoryUsage() { + // Check invariants. + + using TMemoryUsagePtr = std::unique_ptr; + using TNaiveBayesPtr = std::shared_ptr; + + test::CRandomNumbers rng; + + TDoubleVec trainingData[4]; + rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); + rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); + rng.generateNormalSamples(3.0, 14.0, 200, trainingData[2]); + rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]); + + TMeanAccumulator meanMeanError; + + maths::CNormalMeanPrecConjugate normal{maths::CNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, 0.1)}; + TNaiveBayesPtr nb{new maths::CNaiveBayes{ + maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.1}}; + + for (std::size_t i = 0u; i < 100; ++i) { + nb->addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}}); + } + for (std::size_t i = 0u; i < 200; ++i) { + nb->addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}}); + } + + std::size_t memoryUsage{nb->memoryUsage()}; + TMemoryUsagePtr mem{new core::CMemoryUsage}; + nb->debugMemoryUsage(mem.get()); + + LOG_DEBUG(<< "Memory = " << memoryUsage); + CPPUNIT_ASSERT_EQUAL(memoryUsage, mem->usage()); + + LOG_DEBUG(<< "Memory = " << core::CMemory::dynamicSize(nb)); + CPPUNIT_ASSERT_EQUAL(memoryUsage + sizeof(maths::CNaiveBayes), + core::CMemory::dynamicSize(nb)); +} + +void CNaiveBayesTest::testPersist() { + test::CRandomNumbers rng; + + TDoubleVec trainingData[4]; + rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); + rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); + rng.generateNormalSamples(3.0, 14.0, 200, trainingData[2]); + rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]); + + TMeanAccumulator meanMeanError; + + maths::CNormalMeanPrecConjugate normal{maths::CNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, 0.1)}; + maths::CNaiveBayes origNb{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.1}; + + for (std::size_t i = 0u; i < 100; ++i) { + origNb.addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}}); + } + for (std::size_t i = 0u; i < 200; ++i) { + origNb.addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}}); + } + + std::string origXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + 
origNb.acceptPersistInserter(inserter); + inserter.toXml(origXml); + } + + LOG_DEBUG(<< "Naive Bayes XML representation:\n" << origXml); + + core::CRapidXmlParser parser; + CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); + core::CRapidXmlStateRestoreTraverser traverser(parser); + + maths::SDistributionRestoreParams params{maths_t::E_ContinuousData, 0.1, 0.0, 0.0, 0.0}; + maths::CNaiveBayes restoredNb{params, traverser}; + + CPPUNIT_ASSERT_EQUAL(origNb.checksum(), restoredNb.checksum()); + + std::string restoredXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + origNb.acceptPersistInserter(inserter); + inserter.toXml(restoredXml); + } + CPPUNIT_ASSERT_EQUAL(origXml, restoredXml); +} + +CppUnit::Test* CNaiveBayesTest::suite() { + CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CNaiveBayesTest"); + + suiteOfTests->addTest(new CppUnit::TestCaller( + "CNaiveBayesTest::testClassification", &CNaiveBayesTest::testClassification)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CNaiveBayesTest::testPropagationByTime", &CNaiveBayesTest::testPropagationByTime)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CNaiveBayesTest::testMemoryUsage", &CNaiveBayesTest::testMemoryUsage)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CNaiveBayesTest::testPersist", &CNaiveBayesTest::testPersist)); + + return suiteOfTests; +} diff --git a/lib/maths/unittest/CNaiveBayesTest.h b/lib/maths/unittest/CNaiveBayesTest.h new file mode 100644 index 0000000000..ac4ed35cd7 --- /dev/null +++ b/lib/maths/unittest/CNaiveBayesTest.h @@ -0,0 +1,22 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_CNaiveBayesTest_h +#define INCLUDED_CNaiveBayesTest_h + +#include + +class CNaiveBayesTest : public CppUnit::TestFixture { +public: + void testClassification(); + void testPropagationByTime(); + void testMemoryUsage(); + void testPersist(); + + static CppUnit::Test* suite(); +}; + +#endif // INCLUDED_CNaiveBayesTest_h diff --git a/lib/maths/unittest/CRegressionTest.cc b/lib/maths/unittest/CRegressionTest.cc index 2243fcd52a..2235614fcb 100644 --- a/lib/maths/unittest/CRegressionTest.cc +++ b/lib/maths/unittest/CRegressionTest.cc @@ -323,6 +323,42 @@ void CRegressionTest::testShiftGradient() { CPPUNIT_ASSERT_DOUBLES_EQUAL(params1[3], params2[3], 1e-6 * std::fabs(params1[3])); } +void CRegressionTest::testLinearScale() { + // Test that linearly scaling a regression linearly + // scales all the parameters. 
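// --- Illustrative note, not part of this patch ---
// The test that follows relies on the fact that linearly scaling the fitted
// curve scales every polynomial coefficient by the same factor:
// s * (a0 + a1*x + a2*x^2 + a3*x^3) = (s*a0) + (s*a1)*x + (s*a2)*x^2 + (s*a3)*x^3,
// which is why scaling by 0.1 and then by 100.0 leaves a net factor of 10.0
// on the original parameters.

#include <array>

std::array<double, 4> linearScale(std::array<double, 4> parameters, double scale) {
    for (auto& parameter : parameters) {
        parameter *= scale; // each coefficient picks up the same factor
    }
    return parameters;
}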
+ + maths::CRegression::CLeastSquaresOnline<3, double> regression; + for (double x = 0.0; x < 100.0; x += 1.0) { + regression.add(x, 0.01 * x * x * x - 0.2 * x * x + 1.0 * x + 10.0); + } + + TDoubleArray4 params1; + regression.parameters(params1); + + regression.linearScale(0.1); + + TDoubleArray4 params2; + regression.parameters(params2); + + LOG_DEBUG("parameters 1 = " << core::CContainerPrinter::print(params1)); + LOG_DEBUG("parameters 2 = " << core::CContainerPrinter::print(params2)); + + for (std::size_t i = 0u; i < 4; ++i) { + CPPUNIT_ASSERT_DOUBLES_EQUAL(0.1 * params1[i], params2[i], 1e-6); + } + + regression.linearScale(100.0); + + regression.parameters(params2); + + LOG_DEBUG("parameters 1 = " << core::CContainerPrinter::print(params1)); + LOG_DEBUG("parameters 2 = " << core::CContainerPrinter::print(params2)); + + for (std::size_t i = 0u; i < 4; ++i) { + CPPUNIT_ASSERT_DOUBLES_EQUAL(10.0 * params1[i], params2[i], 1e-6); + } +} + void CRegressionTest::testAge() { // Test that the regression is mean reverting. @@ -1041,6 +1077,8 @@ CppUnit::Test* CRegressionTest::suite() { "CRegressionTest::testShiftOrdinate", &CRegressionTest::testShiftOrdinate)); suiteOfTests->addTest(new CppUnit::TestCaller( "CRegressionTest::testShiftGradient", &CRegressionTest::testShiftGradient)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CRegressionTest::testLinearScale", &CRegressionTest::testLinearScale)); suiteOfTests->addTest(new CppUnit::TestCaller( "CRegressionTest::testAge", &CRegressionTest::testAge)); suiteOfTests->addTest(new CppUnit::TestCaller( diff --git a/lib/maths/unittest/CRegressionTest.h b/lib/maths/unittest/CRegressionTest.h index 70fb06756a..2e64708dcd 100644 --- a/lib/maths/unittest/CRegressionTest.h +++ b/lib/maths/unittest/CRegressionTest.h @@ -16,6 +16,7 @@ class CRegressionTest : public CppUnit::TestFixture { void testShiftAbscissa(); void testShiftOrdinate(); void testShiftGradient(); + void testLinearScale(); void testAge(); void testPrediction(); void testCombination(); diff --git a/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc new file mode 100644 index 0000000000..c87863fd81 --- /dev/null +++ b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc @@ -0,0 +1,325 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +#include "CTimeSeriesChangeDetectorTest.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "TestUtils.h" + +#include +#include +#include + +using namespace ml; + +namespace { + +using TDoubleVec = std::vector; +using TDouble2Vec = core::CSmallVector; +using TTimeDoublePr = std::pair; +using TTimeDoublePrCBuf = boost::circular_buffer; +using TDecompositionPtr = std::shared_ptr; +using TPriorPtr = std::shared_ptr; +using TPriorPtrVec = std::vector; + +core_t::TTime BUCKET_LENGTH{1800}; +const double DECAY_RATE{0.0002}; + +TPriorPtr makeResidualModel() { + maths::CGammaRateConjugate gamma{maths::CGammaRateConjugate::nonInformativePrior( + maths_t::E_ContinuousData, 0.1, DECAY_RATE)}; + maths::CLogNormalMeanPrecConjugate lognormal{maths::CLogNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, 1.0, DECAY_RATE)}; + maths::CNormalMeanPrecConjugate normal{maths::CNormalMeanPrecConjugate::nonInformativePrior( + maths_t::E_ContinuousData, DECAY_RATE)}; + + TPriorPtrVec mode; + mode.reserve(3u); + mode.emplace_back(gamma.clone()); + mode.emplace_back(lognormal.clone()); + mode.emplace_back(normal.clone()); + maths::COneOfNPrior modePrior{mode, maths_t::E_ContinuousData, DECAY_RATE}; + maths::CXMeansOnline1d clusterer{maths_t::E_ContinuousData, + maths::CAvailableModeDistributions::ALL, + maths_t::E_ClustersFractionWeight, + DECAY_RATE, + 0.05, + 12.0, + 1.0}; + maths::CMultimodalPrior multimodal{maths_t::E_ContinuousData, clusterer, + modePrior, DECAY_RATE}; + + TPriorPtrVec models; + mode.emplace_back(gamma.clone()); + mode.emplace_back(lognormal.clone()); + mode.emplace_back(normal.clone()); + mode.emplace_back(multimodal.clone()); + + return TPriorPtr{ + maths::COneOfNPrior{mode, maths_t::E_ContinuousData, DECAY_RATE}.clone()}; +} +} + +void CTimeSeriesChangeDetectorTest::testNoChange() { + test::CRandomNumbers rng; + + TDoubleVec variances{1.0, 10.0, 20.0, 30.0, 100.0, 1000.0}; + TDoubleVec scales{0.1, 1.0, 2.0, 3.0, 5.0, 8.0}; + + TDoubleVec samples; + for (std::size_t t = 0u; t < 100; ++t) { + if (t % 10 == 0) { + LOG_DEBUG(<< t << "%"); + } + + switch (t % 3) { + case 0: + rng.generateNormalSamples(10.0, variances[(t / 3) % variances.size()], + 1000, samples); + break; + case 1: + rng.generateLogNormalSamples(1.0, scales[(t / 3) % scales.size()], 1000, samples); + break; + case 2: + rng.generateGammaSamples(10.0, 10.0 * scales[(t / 3) % scales.size()], + 1000, samples); + break; + } + + TDecompositionPtr trendModel( + new maths::CTimeSeriesDecomposition{DECAY_RATE, BUCKET_LENGTH}); + TPriorPtr residualModel(makeResidualModel()); + + auto addSampleToModel = [&trendModel, &residualModel](core_t::TTime time, double x) { + trendModel->addPoint(time, x); + double detrended{trendModel->detrend(time, x, 0.0)}; + residualModel->addSamples({detrended}, maths_t::CUnitWeights::SINGLE_UNIT); + residualModel->propagateForwardsByTime(1.0); + }; + + core_t::TTime time{0}; + for (std::size_t i = 0u; i < 950; ++i) { + addSampleToModel(time, samples[i]); + time += BUCKET_LENGTH; + } + + maths::CUnivariateTimeSeriesChangeDetector detector{ + trendModel, residualModel, 6 * core::constants::HOUR, + 24 * core::constants::HOUR, 14.0}; + for (std::size_t i = 950u; i < samples.size(); ++i) { + addSampleToModel(time, samples[i]); + detector.addSamples({{time, samples[i]}}, maths_t::CUnitWeights::SINGLE_UNIT); + if (detector.stopTesting()) { + break; + } + + 
CPPUNIT_ASSERT(!detector.change()); + + time += BUCKET_LENGTH; + } + } +} + +void CTimeSeriesChangeDetectorTest::testLevelShift() { + TGeneratorVec trends{constant, ramp, smoothDaily, weekends, spikeyDaily}; + this->testChange( + trends, maths::SChangeDescription::E_LevelShift, + [](TGenerator trend, core_t::TTime time) { return trend(time) + 0.5; }, 5.0, 16.0); +} + +void CTimeSeriesChangeDetectorTest::testLinearScale() { + TGeneratorVec trends{smoothDaily, spikeyDaily}; + this->testChange( + trends, maths::SChangeDescription::E_LinearScale, + [](TGenerator trend, core_t::TTime time) { return 3.0 * trend(time); }, 3.0, 16.0); +} + +void CTimeSeriesChangeDetectorTest::testTimeShift() { + TGeneratorVec trends{smoothDaily, spikeyDaily}; + this->testChange(trends, maths::SChangeDescription::E_TimeShift, + [](TGenerator trend, core_t::TTime time) { + return trend(time - core::constants::HOUR); + }, + -static_cast(core::constants::HOUR), 24.0); + this->testChange(trends, maths::SChangeDescription::E_TimeShift, + [](TGenerator trend, core_t::TTime time) { + return trend(time + core::constants::HOUR); + }, + +static_cast(core::constants::HOUR), 24.0); +} + +void CTimeSeriesChangeDetectorTest::testPersist() { + test::CRandomNumbers rng; + + TDoubleVec samples; + rng.generateNormalSamples(10.0, 10.0, 1000, samples); + + TDecompositionPtr trendModel(new maths::CTimeSeriesDecomposition{DECAY_RATE, BUCKET_LENGTH}); + TPriorPtr residualModel(makeResidualModel()); + + auto addSampleToModel = [&trendModel, &residualModel](core_t::TTime time, double x) { + trendModel->addPoint(time, x); + double detrended{trendModel->detrend(time, x, 0.0)}; + residualModel->addSamples({detrended}, maths_t::CUnitWeights::SINGLE_UNIT); + residualModel->propagateForwardsByTime(1.0); + }; + + core_t::TTime time{0}; + for (std::size_t i = 0u; i < 990; ++i) { + addSampleToModel(time, samples[i]); + time += BUCKET_LENGTH; + } + + maths::CUnivariateTimeSeriesChangeDetector origDetector{ + trendModel, residualModel, 6 * core::constants::HOUR, + 24 * core::constants::HOUR, 12.0}; + + maths::CModelParams modelParams{ + BUCKET_LENGTH, 1.0, 0.0, 1.0, 6 * core::constants::HOUR, 24 * core::constants::HOUR}; + maths::SDistributionRestoreParams distributionParams{maths_t::E_ContinuousData, DECAY_RATE}; + maths::STimeSeriesDecompositionRestoreParams decompositionParams{ + DECAY_RATE, BUCKET_LENGTH, distributionParams}; + maths::SModelRestoreParams params{modelParams, decompositionParams, distributionParams}; + + for (std::size_t i = 990u; i < samples.size(); ++i) { + addSampleToModel(time, samples[i]); + std::string origXml; + { + ml::core::CRapidXmlStatePersistInserter inserter{"root"}; + origDetector.acceptPersistInserter(inserter); + inserter.toXml(origXml); + } + + maths::CUnivariateTimeSeriesChangeDetector restoredDetector{ + trendModel, residualModel, 6 * core::constants::HOUR, + 24 * core::constants::HOUR, 12.0}; + core::CRapidXmlParser parser; + CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); + core::CRapidXmlStateRestoreTraverser traverser(parser); + traverser.traverseSubLevel(boost::bind( + &maths::CUnivariateTimeSeriesChangeDetector::acceptRestoreTraverser, + &restoredDetector, boost::cref(params), _1)); + + LOG_DEBUG(<< "expected " << origDetector.checksum() << " got " + << restoredDetector.checksum()); + CPPUNIT_ASSERT_EQUAL(origDetector.checksum(), restoredDetector.checksum()); + } +} + +CppUnit::Test* CTimeSeriesChangeDetectorTest::suite() { + CppUnit::TestSuite* suiteOfTests = new 
CppUnit::TestSuite("CTimeSeriesChangeDetectorTest"); + + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesChangeDetectorTest::testNoChange", + &CTimeSeriesChangeDetectorTest::testNoChange)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesChangeDetectorTest::testLevelShift", + &CTimeSeriesChangeDetectorTest::testLevelShift)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesChangeDetectorTest::testLinearScale", + &CTimeSeriesChangeDetectorTest::testLinearScale)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesChangeDetectorTest::testTimeShift", + &CTimeSeriesChangeDetectorTest::testTimeShift)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesChangeDetectorTest::testPersist", + &CTimeSeriesChangeDetectorTest::testPersist)); + + return suiteOfTests; +} + +void CTimeSeriesChangeDetectorTest::testChange(const TGeneratorVec& trends, + maths::SChangeDescription::EDescription description, + TChange applyChange, + double expectedChange, + double expectedMeanBucketsToDetectChange) { + using TOptionalSize = boost::optional; + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; + + test::CRandomNumbers rng; + + TMeanAccumulator meanBucketsToDetect; + + TDoubleVec samples; + for (std::size_t t = 0u; t < 100; ++t) { + if (t % 10 == 0) { + LOG_DEBUG(<< t << "%"); + } + + rng.generateNormalSamples(0.0, 1.0, 1000, samples); + + TDecompositionPtr trendModel( + new maths::CTimeSeriesDecomposition{DECAY_RATE, BUCKET_LENGTH}); + TPriorPtr residualModel(makeResidualModel()); + + auto addSampleToModel = [&trendModel, &residualModel]( + core_t::TTime time, double x, double weight) { + trendModel->addPoint(time, x, maths_t::countWeight(weight)); + double detrended{trendModel->detrend(time, x, 0.0)}; + residualModel->addSamples({detrended}, {maths_t::countWeight(weight)}); + residualModel->propagateForwardsByTime(1.0); + }; + + core_t::TTime time{0}; + for (std::size_t i = 0u; i < 950; ++i) { + double x{10.0 * trends[t % trends.size()](time) + samples[i]}; + addSampleToModel(time, x, 1.0); + time += BUCKET_LENGTH; + } + + maths::CUnivariateTimeSeriesChangeDetector detector{ + trendModel, residualModel, 6 * core::constants::HOUR, + 24 * core::constants::HOUR, 14.0}; + + TOptionalSize bucketsToDetect; + for (std::size_t i = 950u; i < samples.size(); ++i) { + double x{10.0 * applyChange(trends[t % trends.size()], time) + samples[i]}; + + addSampleToModel(time, x, 0.5); + detector.addSamples({{time, x}}, maths_t::CUnitWeights::SINGLE_UNIT); + + auto change = detector.change(); + if (change) { + if (!bucketsToDetect) { + bucketsToDetect.reset(i - 949); + } + CPPUNIT_ASSERT_EQUAL(change->s_Description, description); + CPPUNIT_ASSERT_DOUBLES_EQUAL(expectedChange, change->s_Value[0], + 0.5 * std::fabs(expectedChange)); + break; + } + if (detector.stopTesting()) { + break; + } + + time += BUCKET_LENGTH; + } + CPPUNIT_ASSERT(bucketsToDetect); + meanBucketsToDetect.add(static_cast(*bucketsToDetect)); + } + + LOG_DEBUG(<< "buckets to detect = " << maths::CBasicStatistics::mean(meanBucketsToDetect)); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanBucketsToDetect) < + expectedMeanBucketsToDetectChange); +} diff --git a/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h new file mode 100644 index 0000000000..350cf927cc --- /dev/null +++ b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_CTimeSeriesChangeDetectorTest_h +#define INCLUDED_CTimeSeriesChangeDetectorTest_h + +#include + +#include + +#include + +class CTimeSeriesChangeDetectorTest : public CppUnit::TestFixture { +public: + void testNoChange(); + void testLevelShift(); + void testLinearScale(); + void testTimeShift(); + void testPersist(); + + static CppUnit::Test* suite(); + +private: + using TGenerator = std::function; + using TGeneratorVec = std::vector; + using TChange = std::function; + +private: + void testChange(const TGeneratorVec& trends, + ml::maths::SChangeDescription::EDescription description, + TChange applyChange, + double expectedChange, + double expectedMeanBucketsToDetectChange); +}; + +#endif // INCLUDED_CTimeSeriesChangeDetectorTest_h diff --git a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc index 0e697589e8..ed88fa4413 100644 --- a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc +++ b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -107,18 +109,18 @@ void CTimeSeriesDecompositionTest::testSuperpositionOfSines() { double percentileError = 0.0; for (core_t::TTime t = lastWeek; t < lastWeek + WEEK; t += HALF_HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(t, 70.0); - double residual = std::fabs(trend[t / HALF_HOUR] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(t, 70.0); + double residual = std::fabs(trend[t / HALF_HOUR] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[t / HALF_HOUR]); maxValue = std::max(maxValue, std::fabs(trend[t / HALF_HOUR])); percentileError += - std::max(std::max(baseline.first - trend[t / HALF_HOUR], - trend[t / HALF_HOUR] - baseline.second), + std::max(std::max(prediction.first - trend[t / HALF_HOUR], + trend[t / HALF_HOUR] - prediction.second), 0.0); - //f.push_back(mean(baseline)); - //r.push_back(mean(baseline) - trend[t / HALF_HOUR]); + //f.push_back(mean(value)); + //r.push_back(mean(value) - trend[t / HALF_HOUR]); } LOG_DEBUG(<< "'sum residual' / 'sum value' = " << sumResidual / sumValue); @@ -286,21 +288,19 @@ void CTimeSeriesDecompositionTest::testDistortedPeriodic() { tt < lastWeek + WEEK && static_cast(tt / HOUR) < boost::size(timeseries); tt += HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(tt, 70.0); - - double residual = std::fabs(timeseries[tt / HOUR] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(tt, 70.0); + double residual = std::fabs(timeseries[tt / HOUR] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(timeseries[tt / HOUR]); maxValue = std::max(maxValue, std::fabs(timeseries[tt / HOUR])); percentileError += - std::max(std::max(baseline.first - timeseries[tt / HOUR], - timeseries[tt / HOUR] - baseline.second), + std::max(std::max(prediction.first - timeseries[tt / HOUR], + timeseries[tt / HOUR] - prediction.second), 0.0); - //t.push_back(tt); //f.push_back(timeseries[tt / HOUR]); - //fe.push_back(mean(baseline)); + //fe.push_back(mean(value)); } LOG_DEBUG(<< "'sum residual' / 'sum value' = " << sumResidual / sumValue); @@ -392,19 +392,17 @@ void 
CTimeSeriesDecompositionTest::testMinimizeLongComponents() { double percentileError = 0.0; for (core_t::TTime t = lastWeek; t < lastWeek + WEEK; t += HALF_HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(t, 70.0); - - double residual = std::fabs(trend[t / HALF_HOUR] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(t, 70.0); + double residual = std::fabs(trend[t / HALF_HOUR] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[t / HALF_HOUR]); maxValue = std::max(maxValue, std::fabs(trend[t / HALF_HOUR])); percentileError += - std::max(std::max(baseline.first - trend[t / HALF_HOUR], - trend[t / HALF_HOUR] - baseline.second), + std::max(std::max(prediction.first - trend[t / HALF_HOUR], + trend[t / HALF_HOUR] - prediction.second), 0.0); - - //f.push_back(mean(baseline)); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -509,19 +507,17 @@ void CTimeSeriesDecompositionTest::testWeekend() { double percentileError = 0.0; for (core_t::TTime t = lastWeek; t < lastWeek + WEEK; t += HALF_HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(t, 70.0); - - double residual = std::fabs(trend[t / HALF_HOUR] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(t, 70.0); + double residual = std::fabs(trend[t / HALF_HOUR] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[t / HALF_HOUR]); maxValue = std::max(maxValue, std::fabs(trend[t / HALF_HOUR])); percentileError += - std::max(std::max(baseline.first - trend[t / HALF_HOUR], - trend[t / HALF_HOUR] - baseline.second), + std::max(std::max(prediction.first - trend[t / HALF_HOUR], + trend[t / HALF_HOUR] - prediction.second), 0.0); - - //f.push_back(mean(baseline)); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -610,20 +606,18 @@ void CTimeSeriesDecompositionTest::testSinglePeriodicity() { double percentileError = 0.0; for (core_t::TTime t = lastWeek; t < lastWeek + WEEK; t += HALF_HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(t, 70.0); - + TDoubleDoublePr prediction = decomposition.value(t, 70.0); double residual = - std::fabs(trend[t / HALF_HOUR] + noiseMean - mean(baseline)); + std::fabs(trend[t / HALF_HOUR] + noiseMean - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[t / HALF_HOUR]); maxValue = std::max(maxValue, std::fabs(trend[t / HALF_HOUR])); percentileError += std::max( - std::max(baseline.first - (trend[t / HALF_HOUR] + noiseMean), - (trend[t / HALF_HOUR] + noiseMean) - baseline.second), + std::max(prediction.first - (trend[t / HALF_HOUR] + noiseMean), + (trend[t / HALF_HOUR] + noiseMean) - prediction.second), 0.0); - - //f.push_back(mean(baseline)); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -681,13 +675,13 @@ void CTimeSeriesDecompositionTest::testSeasonalOnset() { TTimeVec times; TDoubleVec trend; for (core_t::TTime time = 0; time < 150 * WEEK + 1; time += HOUR) { - double baseline = 0.0; + double value = 0.0; if (time > 10 * WEEK) { - baseline += daily[(time % DAY) / HOUR]; - baseline *= weekly[(time % WEEK) / DAY]; + value += daily[(time % DAY) / HOUR]; + value *= weekly[(time % WEEK) / DAY]; } times.push_back(time); - trend.push_back(baseline); + trend.push_back(value); } test::CRandomNumbers rng; @@ -727,17 +721,17 @@ void CTimeSeriesDecompositionTest::testSeasonalOnset() { double maxValue = 0.0; double percentileError = 0.0; 
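         // For every bucket in the last week compare the actual trend value with
         // the decomposition's 70% prediction interval: the residual is measured
         // against the interval midpoint and percentileError accumulates how far
         // the truth falls outside the interval.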
for (core_t::TTime t = lastWeek; t < lastWeek + WEEK; t += HOUR) { - TDoubleDoublePr baseline = decomposition.baseline(t, 70.0); - - double residual = std::fabs(trend[t / HOUR] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(t, 70.0); + double residual = std::fabs(trend[t / HOUR] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[t / HOUR]); maxValue = std::max(maxValue, std::fabs(trend[t / HOUR])); - percentileError += std::max(std::max(baseline.first - trend[t / HOUR], - trend[t / HOUR] - baseline.second), - 0.0); - //f.push_back(mean(baseline)); + percentileError += + std::max(std::max(prediction.first - trend[t / HOUR], + trend[t / HOUR] - prediction.second), + 0.0); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -798,15 +792,15 @@ void CTimeSeriesDecompositionTest::testVarianceScale() { for (std::size_t i = 0u; i < 50; ++i) { for (core_t::TTime t = 0; t < DAY; t += TEN_MINS) { - double baseline = 1.0; + double value = 1.0; double variance = 1.0; if (t >= 3600 && t < 7200) { - baseline = 5.0; + value = 5.0; variance = 10.0; } - TDoubleVec value; - rng.generateNormalSamples(baseline, variance, 1, value); - decomposition.addPoint(time + t, value[0]); + TDoubleVec noise; + rng.generateNormalSamples(value, variance, 1, noise); + decomposition.addPoint(time + t, noise[0]); } time += DAY; } @@ -847,16 +841,16 @@ void CTimeSeriesDecompositionTest::testVarianceScale() { for (std::size_t i = 0u; i < 50; ++i) { for (core_t::TTime t = 0; t < DAY; t += TEN_MINS) { - double baseline = 5.0 * std::sin(boost::math::double_constants::two_pi * - static_cast(t) / - static_cast(DAY)); + double value = 5.0 * std::sin(boost::math::double_constants::two_pi * + static_cast(t) / + static_cast(DAY)); double variance = 1.0; if (t >= 3600 && t < 7200) { variance = 10.0; } - TDoubleVec value; - rng.generateNormalSamples(0.0, variance, 1, value); - decomposition.addPoint(time + t, baseline + value[0]); + TDoubleVec noise; + rng.generateNormalSamples(0.0, variance, 1, noise); + decomposition.addPoint(time + t, value + noise[0]); } time += DAY; } @@ -970,17 +964,16 @@ void CTimeSeriesDecompositionTest::testSpikeyDataProblemCase() { double percentileError = 0.0; for (std::size_t j = 0u; j < lastWeekTimeseries.size(); ++j) { - TDoubleDoublePr baseline = - decomposition.baseline(lastWeekTimeseries[j].first, 70.0); - - double residual = std::fabs(lastWeekTimeseries[j].second - mean(baseline)); + TDoubleDoublePr prediction = + decomposition.value(lastWeekTimeseries[j].first, 70.0); + double residual = std::fabs(lastWeekTimeseries[j].second - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(lastWeekTimeseries[j].second); maxValue = std::max(maxValue, std::fabs(lastWeekTimeseries[j].second)); percentileError += std::max( - std::max(baseline.first - lastWeekTimeseries[j].second, - lastWeekTimeseries[j].second - baseline.second), + std::max(prediction.first - lastWeekTimeseries[j].second, + lastWeekTimeseries[j].second - prediction.second), 0.0); } @@ -1024,7 +1017,7 @@ void CTimeSeriesDecompositionTest::testSpikeyDataProblemCase() { //file.open("results.m"); //TTimeVec times; //TDoubleVec raw; - //TDoubleVec baseline; + //TDoubleVec values; //TDoubleVec scales; //TDoubleVec probs; @@ -1047,7 +1040,7 @@ void CTimeSeriesDecompositionTest::testSpikeyDataProblemCase() { //times.push_back(time); //raw.push_back(value); - 
//baseline.push_back(mean(decomposition.baseline(time, 70.0))); + //values.push_back(mean(decomposition.value(time, 70.0))); //scales.push_back(mean(decomposition.scale(time, variance, 70.0))); //probs.push_back(-std::log(pScaled)); @@ -1061,7 +1054,7 @@ void CTimeSeriesDecompositionTest::testSpikeyDataProblemCase() { //file << "hold on;\n"; //file << "t = " << core::CContainerPrinter::print(times) << ";\n"; //file << "r = " << core::CContainerPrinter::print(raw) << ";\n"; - //file << "b = " << core::CContainerPrinter::print(baseline) << ";\n"; + //file << "b = " << core::CContainerPrinter::print(values) << ";\n"; //file << "s = " << core::CContainerPrinter::print(scales) << ";\n"; //file << "p = " << core::CContainerPrinter::print(probs) << ";\n"; //file << "subplot(3,1,1); hold on; plot(t, r, 'b'); plot(t, b, 'r');\n"; @@ -1116,22 +1109,20 @@ void CTimeSeriesDecompositionTest::testVeryLargeValuesProblemCase() { double percentileError = 0.0; for (std::size_t j = 0u; j < lastWeekTimeseries.size(); ++j) { - TDoubleDoublePr baseline = - decomposition.baseline(lastWeekTimeseries[j].first, 70.0); - - double residual = std::fabs(lastWeekTimeseries[j].second - mean(baseline)); + TDoubleDoublePr prediction = + decomposition.value(lastWeekTimeseries[j].first, 70.0); + double residual = std::fabs(lastWeekTimeseries[j].second - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(lastWeekTimeseries[j].second); maxValue = std::max(maxValue, std::fabs(lastWeekTimeseries[j].second)); percentileError += std::max( - std::max(baseline.first - lastWeekTimeseries[j].second, - lastWeekTimeseries[j].second - baseline.second), + std::max(prediction.first - lastWeekTimeseries[j].second, + lastWeekTimeseries[j].second - prediction.second), 0.0); - //times.push_back(lastWeekTimeseries[j].first); //values.push_back(lastWeekTimeseries[j].second); - //f.push_back(mean(baseline)); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -1230,22 +1221,20 @@ void CTimeSeriesDecompositionTest::testMixedSmoothAndSpikeyDataProblemCase() { double percentileError = 0.0; for (std::size_t j = 0u; j < lastWeekTimeseries.size(); ++j) { - TDoubleDoublePr baseline = - decomposition.baseline(lastWeekTimeseries[j].first, 70.0); - - double residual = std::fabs(lastWeekTimeseries[j].second - mean(baseline)); + TDoubleDoublePr prediction = + decomposition.value(lastWeekTimeseries[j].first, 70.0); + double residual = std::fabs(lastWeekTimeseries[j].second - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(lastWeekTimeseries[j].second); maxValue = std::max(maxValue, std::fabs(lastWeekTimeseries[j].second)); percentileError += std::max( - std::max(baseline.first - lastWeekTimeseries[j].second, - lastWeekTimeseries[j].second - baseline.second), + std::max(prediction.first - lastWeekTimeseries[j].second, + lastWeekTimeseries[j].second - prediction.second), 0.0); - //times.push_back(lastWeekTimeseries[j].first); //values.push_back(lastWeekTimeseries[j].second); - //f.push_back(mean(baseline)); + //f.push_back(mean(value)); //r.push_back(residual); } @@ -1319,12 +1308,12 @@ void CTimeSeriesDecompositionTest::testDiurnalPeriodicityWithMissingValues() { if (decomposition.initialized()) { error.add(std::fabs((value + noise[0] - maths::CBasicStatistics::mean( - decomposition.baseline(time, 0.0)))) / + decomposition.value(time, 0.0)))) / std::fabs(value + noise[0])); } //times.push_back(time); //values.push_back(value + 
noise[0]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(time, 0.0))); } time += HALF_HOUR; } @@ -1378,12 +1367,12 @@ void CTimeSeriesDecompositionTest::testDiurnalPeriodicityWithMissingValues() { if (decomposition.initialized()) { error.add(std::fabs((value + noise[0] - maths::CBasicStatistics::mean( - decomposition.baseline(time, 0.0)))) / + decomposition.value(time, 0.0)))) / std::fabs(value + noise[0])); } //times.push_back(time); //values.push_back(value + noise[0]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(time, 0.0))); } time += HOUR; } @@ -1443,12 +1432,9 @@ void CTimeSeriesDecompositionTest::testLongTermTrend() { double sumValue = 0.0; double maxValue = 0.0; - TDoubleVec baselines; - for (std::size_t j = i - 48; j < i; ++j) { - TDoubleDoublePr baseline = decomposition.baseline(times[j], 70.0); - baselines.push_back(mean(baseline)); - double residual = std::fabs(trend[j] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(times[j], 70.0); + double residual = std::fabs(trend[j] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[j]); @@ -1471,7 +1457,7 @@ void CTimeSeriesDecompositionTest::testLongTermTrend() { lastDay += DAY; } //values.push_back(trend[i] + noise[i]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(times[i]))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(times[i]))); } LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue); @@ -1526,12 +1512,9 @@ void CTimeSeriesDecompositionTest::testLongTermTrend() { double sumValue = 0.0; double maxValue = 0.0; - TDoubleVec baselines; - for (std::size_t j = i - 48; j < i; ++j) { - TDoubleDoublePr baseline = decomposition.baseline(times[j], 70.0); - baselines.push_back(mean(baseline)); - double residual = std::fabs(trend[j] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(times[j], 70.0); + double residual = std::fabs(trend[j] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[j]); @@ -1551,7 +1534,7 @@ void CTimeSeriesDecompositionTest::testLongTermTrend() { lastDay += DAY; } //values.push_back(trend[i] + 0.3*noise[i]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(times[i]))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(times[i]))); } LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue); @@ -1615,12 +1598,9 @@ void CTimeSeriesDecompositionTest::testLongTermTrendAndPeriodicity() { double sumValue = 0.0; double maxValue = 0.0; - TDoubleVec baselines; - for (std::size_t j = i - 48; j < i; ++j) { - TDoubleDoublePr baseline = decomposition.baseline(times[j], 70.0); - baselines.push_back(mean(baseline)); - double residual = std::fabs(trend[j] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(times[j], 70.0); + double residual = std::fabs(trend[j] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[j]); @@ -1643,7 +1623,7 @@ void CTimeSeriesDecompositionTest::testLongTermTrendAndPeriodicity() { lastDay += DAY; } //values.push_back(trend[i] + 0.3 * noise[i]); - 
//f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(times[i]))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(times[i]))); } LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue); @@ -1709,13 +1689,10 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() { double sumValue = 0.0; double maxValue = 0.0; - TDoubleVec baselines; - for (std::size_t j = i - 12; j < i; ++j) { - TDoubleDoublePr baseline = - decomposition.baseline(times[j], 70.0); - baselines.push_back(mean(baseline)); - double residual = std::fabs(trends[t][j] - mean(baseline)); + TDoubleDoublePr prediction = + decomposition.value(times[j], 70.0); + double residual = std::fabs(trends[t][j] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trends[t][j]); @@ -1738,7 +1715,7 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() { lastHour += HOUR; } //values.push_back(trends[t][i] + noise[i]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(times[i]))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(times[i]))); } LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " @@ -1800,12 +1777,9 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() { double sumValue = 0.0; double maxValue = 0.0; - TDoubleVec baselines; - for (std::size_t j = i - 288; j < i; ++j) { - TDoubleDoublePr baseline = decomposition.baseline(times[j], 70.0); - baselines.push_back(mean(baseline)); - double residual = std::fabs(trend[j] - mean(baseline)); + TDoubleDoublePr prediction = decomposition.value(times[j], 70.0); + double residual = std::fabs(trend[j] - mean(prediction)); sumResidual += residual; maxResidual = std::max(maxResidual, residual); sumValue += std::fabs(trend[j]); @@ -1828,7 +1802,7 @@ void CTimeSeriesDecompositionTest::testNonDiurnal() { lastTwoDay += 2 * DAY; } //values.push_back(trend[i] + noise[i]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(times[i]))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(times[i]))); } LOG_DEBUG(<< "total 'sum residual' / 'sum value' = " << totalSumResidual / totalSumValue); @@ -1865,7 +1839,7 @@ void CTimeSeriesDecompositionTest::testYearly() { rng.generateNormalSamples(0.0, 1.0, 1, noise); decomposition.addPoint(time, trend + noise[0]); if (decomposition.initialized()) { - TDouble1Vec prediction{decomposition.mean(time)}; + TDouble1Vec prediction{decomposition.meanValue(time)}; TDouble1Vec predictionError{decomposition.detrend(time, trend, 0.0)}; double multiplier{controller.multiplier(prediction, {predictionError}, 4 * HOUR, 1.0, 0.0005)}; @@ -1887,8 +1861,7 @@ void CTimeSeriesDecompositionTest::testYearly() { static_cast(time) / static_cast(YEAR))) + 7.5 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / static_cast(DAY)); - double prediction = - maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0)); + double prediction = maths::CBasicStatistics::mean(decomposition.value(time, 0.0)); double error = std::fabs((prediction - trend) / trend); meanError.add(error); //times.push_back(time); @@ -1955,7 +1928,7 @@ void CTimeSeriesDecompositionTest::testWithOutliers() { TMeanAccumulator error; for (core_t::TTime endTime = time + DAY; time < endTime; time += TEN_MINS) { double prediction = - maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0)); + maths::CBasicStatistics::mean(decomposition.value(time, 0.0)); error.add(std::fabs(prediction - trend(time)) / 
trend(time)); //times.push_back(time); //values.push_back(trend(time)); @@ -2025,7 +1998,7 @@ void CTimeSeriesDecompositionTest::testCalendar() { for (core_t::TTime time_ = time - DAY; time_ < time; time_ += TEN_MINS) { double prediction = - maths::CBasicStatistics::mean(decomposition.baseline(time_)); + maths::CBasicStatistics::mean(decomposition.value(time_)); double variance = 4.0 * maths::CBasicStatistics::mean( decomposition.scale(time_, 4.0, 0.0)); double actual = trend(time_); @@ -2044,7 +2017,7 @@ void CTimeSeriesDecompositionTest::testCalendar() { //times.push_back(time); //values.push_back(trend(time) + noise[0]); - //f.push_back(maths::CBasicStatistics::mean(decomposition.baseline(time, 0.0))); + //f.push_back(maths::CBasicStatistics::mean(decomposition.value(time, 0.0))); } //file << "t = " << core::CContainerPrinter::print(times) << ";\n"; @@ -2151,10 +2124,11 @@ void CTimeSeriesDecompositionTest::testPersist() { core::CRapidXmlParser parser; CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); core::CRapidXmlStateRestoreTraverser traverser(parser); - - maths::CTimeSeriesDecomposition restoredDecomposition( + maths::STimeSeriesDecompositionRestoreParams params{ decayRate + 0.1, bucketLength, - maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE, traverser); + maths::SDistributionRestoreParams{maths_t::E_ContinuousData, decayRate + 0.1}}; + + maths::CTimeSeriesDecomposition restoredDecomposition(params, traverser); std::string newXml; { @@ -2186,6 +2160,8 @@ void CTimeSeriesDecompositionTest::testUpgrade() { return TDoubleDoublePr{first, second}; }; + maths::STimeSeriesDecompositionRestoreParams params{ + 0.1, HALF_HOUR, maths::SDistributionRestoreParams{maths_t::E_ContinuousData, 0.1}}; std::string empty; LOG_DEBUG(<< "*** Seasonal and Calendar Components ***"); @@ -2212,15 +2188,14 @@ void CTimeSeriesDecompositionTest::testUpgrade() { CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(xml)); core::CRapidXmlStateRestoreTraverser traverser(parser); - maths::CTimeSeriesDecomposition decomposition( - 0.1, HALF_HOUR, maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE, traverser); + maths::CTimeSeriesDecomposition decomposition(params, traverser); // Check that the decay rates match and the values and variances // predictions match the values obtained from 6.2. 
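        // The persisted state here was written by 6.2, which used the old
        // (decayRate, bucketLength, componentSize, traverser) constructor; it is
        // now restored through the STimeSeriesDecompositionRestoreParams overload
        // and the predictions are compared against the hard-coded 6.2 expectations.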
CPPUNIT_ASSERT_EQUAL(0.01, decomposition.decayRate()); - double meanValue{decomposition.mean(60480000)}; + double meanValue{decomposition.meanValue(60480000)}; double meanVariance{decomposition.meanVariance()}; LOG_DEBUG(<< "restored mean value = " << meanValue); LOG_DEBUG(<< "restored mean variance = " << meanVariance); @@ -2232,7 +2207,7 @@ void CTimeSeriesDecompositionTest::testUpgrade() { time += HALF_HOUR, ++i) { TDoubleDoublePr expectedValue{stringToPair(expectedValues[i])}; TDoubleDoublePr expectedScale{stringToPair(expectedScales[i])}; - TDoubleDoublePr value{decomposition.baseline(time, 10.0)}; + TDoubleDoublePr value{decomposition.value(time, 10.0)}; TDoubleDoublePr scale{decomposition.scale(time, 286374.0, 10.0)}; CPPUNIT_ASSERT_DOUBLES_EQUAL(expectedValue.first, value.first, 0.005 * std::fabs(expectedValue.first)); @@ -2271,8 +2246,7 @@ void CTimeSeriesDecompositionTest::testUpgrade() { CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(xml)); core::CRapidXmlStateRestoreTraverser traverser(parser); - maths::CTimeSeriesDecomposition decomposition( - 0.1, HALF_HOUR, maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE, traverser); + maths::CTimeSeriesDecomposition decomposition(params, traverser); // Check that the decay rates match and the values and variances // predictions are close to the values obtained from 6.2. We can't @@ -2281,7 +2255,7 @@ void CTimeSeriesDecompositionTest::testUpgrade() { CPPUNIT_ASSERT_EQUAL(0.024, decomposition.decayRate()); - double meanValue{decomposition.mean(10366200)}; + double meanValue{decomposition.meanValue(10366200)}; double meanVariance{decomposition.meanVariance()}; LOG_DEBUG(<< "restored mean value = " << meanValue); LOG_DEBUG(<< "restored mean variance = " << meanVariance); @@ -2295,7 +2269,7 @@ void CTimeSeriesDecompositionTest::testUpgrade() { time += HALF_HOUR, ++i) { TDoubleDoublePr expectedValue{stringToPair(expectedValues[i])}; TDoubleDoublePr expectedScale{stringToPair(expectedScales[i])}; - TDoubleDoublePr value{decomposition.baseline(time, 10.0)}; + TDoubleDoublePr value{decomposition.value(time, 10.0)}; TDoubleDoublePr scale{decomposition.scale(time, 96.1654, 10.0)}; CPPUNIT_ASSERT_DOUBLES_EQUAL(expectedValue.first, value.first, 0.1 * std::fabs(expectedValue.first)); diff --git a/lib/maths/unittest/CTimeSeriesModelTest.cc b/lib/maths/unittest/CTimeSeriesModelTest.cc index 6a2ac96003..100218086b 100644 --- a/lib/maths/unittest/CTimeSeriesModelTest.cc +++ b/lib/maths/unittest/CTimeSeriesModelTest.cc @@ -23,8 +23,12 @@ #include #include #include +#include #include +#include + +#include "TestUtils.h" #include #include @@ -32,17 +36,15 @@ using namespace ml; namespace { +using namespace handy_typedefs; using TBool2Vec = core::CSmallVector; using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; -using TDouble1Vec = core::CSmallVector; using TDoubleWeightsAry1Vec = maths_t::TDoubleWeightsAry1Vec; using TDouble2Vec = core::CSmallVector; -using TDouble10Vec = core::CSmallVector; using TDouble2Vec1Vec = core::CSmallVector; using TDouble2VecWeightsAry = maths_t::TDouble2VecWeightsAry; using TDouble2VecWeightsAryVec = std::vector; -using TDouble10Vec1Vec = core::CSmallVector; using TDouble10VecWeightsAry1Vec = maths_t::TDouble10VecWeightsAry1Vec; using TSize1Vec = core::CSmallVector; using TTime2Vec = core::CSmallVector; @@ -87,46 +89,70 @@ class CTimeSeriesCorrelateModelAllocator : public maths::CTimeSeriesCorrelateMod } }; -maths::CModelParams params(core_t::TTime bucketLength) { +maths::CModelParams modelParams(core_t::TTime 
bucketLength) { using TTimeDoubleMap = std::map; static TTimeDoubleMap learnRates; learnRates[bucketLength] = static_cast(bucketLength) / 1800.0; double minimumSeasonalVarianceScale{MINIMUM_SEASONAL_SCALE}; - return maths::CModelParams{bucketLength, learnRates[bucketLength], - DECAY_RATE, minimumSeasonalVarianceScale}; + return maths::CModelParams{bucketLength, + learnRates[bucketLength], + DECAY_RATE, + minimumSeasonalVarianceScale, + 12 * core::constants::HOUR, + core::constants::DAY}; +} + +maths::CModelAddSamplesParams +addSampleParams(double interval, const TDouble2VecWeightsAryVec& weights) { + maths::CModelAddSamplesParams params; + params.integer(false).propagationInterval(interval).trendWeights(weights).priorWeights(weights); + return params; +} + +maths::CModelAddSamplesParams addSampleParams(const TDouble2VecWeightsAryVec& weights) { + return addSampleParams(1.0, weights); } -maths::CNormalMeanPrecConjugate univariateNormal() { +maths::CModelProbabilityParams computeProbabilityParams(const TDouble2VecWeightsAry& weight) { + maths::CModelProbabilityParams params; + params.addCalculation(maths_t::E_TwoSided) + .seasonalConfidenceInterval(50.0) + .addBucketEmpty({false}) + .addWeights(weight); + return params; +} + +maths::CNormalMeanPrecConjugate univariateNormal(double decayRate = DECAY_RATE) { return maths::CNormalMeanPrecConjugate::nonInformativePrior( - maths_t::E_ContinuousData, DECAY_RATE); + maths_t::E_ContinuousData, decayRate); } -maths::CLogNormalMeanPrecConjugate univariateLogNormal() { +maths::CLogNormalMeanPrecConjugate univariateLogNormal(double decayRate = DECAY_RATE) { return maths::CLogNormalMeanPrecConjugate::nonInformativePrior( - maths_t::E_ContinuousData, 0.0, DECAY_RATE); + maths_t::E_ContinuousData, 0.0, decayRate); } -maths::CMultimodalPrior univariateMultimodal() { +maths::CMultimodalPrior univariateMultimodal(double decayRate = DECAY_RATE) { maths::CXMeansOnline1d clusterer{maths_t::E_ContinuousData, maths::CAvailableModeDistributions::ALL, - maths_t::E_ClustersFractionWeight, DECAY_RATE}; + maths_t::E_ClustersFractionWeight, decayRate}; return maths::CMultimodalPrior{maths_t::E_ContinuousData, clusterer, - univariateNormal(), DECAY_RATE}; + univariateNormal(), decayRate}; } -maths::CMultivariateNormalConjugate<3> multivariateNormal() { +maths::CMultivariateNormalConjugate<3> multivariateNormal(double decayRate = DECAY_RATE) { return maths::CMultivariateNormalConjugate<3>::nonInformativePrior( - maths_t::E_ContinuousData, DECAY_RATE); + maths_t::E_ContinuousData, decayRate); } -maths::CMultivariateMultimodalPrior<3> multivariateMultimodal() { +maths::CMultivariateMultimodalPrior<3> multivariateMultimodal(double decayRate = DECAY_RATE) { maths::CXMeansOnline clusterer( - maths_t::E_ContinuousData, maths_t::E_ClustersFractionWeight, DECAY_RATE); + maths_t::E_ContinuousData, maths_t::E_ClustersFractionWeight, decayRate); return maths::CMultivariateMultimodalPrior<3>( maths_t::E_ContinuousData, clusterer, maths::CMultivariateNormalConjugate<3>::nonInformativePrior( - maths_t::E_ContinuousData, DECAY_RATE), - DECAY_RATE); + maths_t::E_ContinuousData, decayRate), + decayRate); } maths::CUnivariateTimeSeriesModel::TDecayRateController2Ary @@ -176,7 +202,7 @@ void CTimeSeriesModelTest::testClone() { maths::CTimeSeriesDecomposition trend{DECAY_RATE, bucketLength}; auto controllers = decayRateControllers(1); maths::CTimeSeriesCorrelations correlations{MINIMUM_SIGNIFICANT_CORRELATION, DECAY_RATE}; - maths::CUnivariateTimeSeriesModel model(params(bucketLength), 1, 
trend, + maths::CUnivariateTimeSeriesModel model(modelParams(bucketLength), 1, trend, univariateNormal(), &controllers); model.modelCorrelations(correlations); @@ -185,9 +211,8 @@ void CTimeSeriesModelTest::testClone() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; core_t::TTime time{0}; for (auto sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } @@ -202,7 +227,7 @@ void CTimeSeriesModelTest::testClone() { maths::CTimeSeriesDecomposition trend{DECAY_RATE, bucketLength}; auto controllers = decayRateControllers(3); maths::CMultivariateTimeSeriesModel model( - params(bucketLength), trend, multivariateNormal(), &controllers); + modelParams(bucketLength), trend, multivariateNormal(), &controllers); TDoubleVec mean{13.0, 9.0, 10.0}; TDoubleVecVec covariance{{3.5, 2.9, 0.5}, {2.9, 3.6, 0.1}, {0.5, 0.1, 2.1}}; @@ -212,9 +237,8 @@ void CTimeSeriesModelTest::testClone() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; core_t::TTime time{0}; for (const auto& sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } @@ -244,7 +268,7 @@ void CTimeSeriesModelTest::testMode() { maths::CTimeSeriesDecomposition trend{DECAY_RATE, bucketLength}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; core_t::TTime time{0}; for (auto sample : samples) { @@ -258,12 +282,11 @@ void CTimeSeriesModelTest::testMode() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; time = 0; for (auto sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } - double expectedMode{maths::CBasicStatistics::mean(trend.baseline(time)) + + double expectedMode{maths::CBasicStatistics::mean(trend.value(time)) + prior.marginalLikelihoodMode()}; TDouble2Vec mode(model.mode(time, maths_t::CUnitWeights::unit(1))); @@ -280,22 +303,20 @@ void CTimeSeriesModelTest::testMode() { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; core_t::TTime time{0}; for (auto& sample : samples) { sample += 20.0 + 10.0 * std::sin(boost::math::double_constants::two_pi * - static_cast(time) / - static_cast(core::constants::DAY)); + static_cast(time) / 86400.0); time += bucketLength; } TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; time = 0; for (auto sample : samples) { - 
maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); if (trend.addPoint(time, sample)) { prior.setToNonInformative(0.0, DECAY_RATE); for (const auto& value : model.slidingWindow()) { @@ -309,7 +330,7 @@ void CTimeSeriesModelTest::testMode() { time += bucketLength; } - double expectedMode{maths::CBasicStatistics::mean(trend.baseline(time)) + + double expectedMode{maths::CBasicStatistics::mean(trend.value(time)) + prior.marginalLikelihoodMode()}; TDouble2Vec mode(model.mode(time, maths_t::CUnitWeights::unit(1))); @@ -327,11 +348,12 @@ void CTimeSeriesModelTest::testMode() { rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples); TDecompositionPtr10Vec trends{ - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}}; + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - maths::CMultivariateTimeSeriesModel model{params(bucketLength), *trends[0], prior}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), + *trends[0], prior}; core_t::TTime time{0}; for (const auto& sample : samples) { @@ -348,15 +370,14 @@ void CTimeSeriesModelTest::testMode() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; time = 0; for (const auto& sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } TDouble2Vec expectedMode(prior.marginalLikelihoodMode( maths_t::CUnitWeights::unit(3))); for (std::size_t i = 0u; i < trends.size(); ++i) { - expectedMode[i] += maths::CBasicStatistics::mean(trends[i]->baseline(time)); + expectedMode[i] += maths::CBasicStatistics::mean(trends[i]->value(time)); } TDouble2Vec mode(model.mode(time, maths_t::CUnitWeights::unit(3))); @@ -375,23 +396,22 @@ void CTimeSeriesModelTest::testMode() { TDoubleVecVec samples; rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples); - double learnRate{params(bucketLength).learnRate()}; + double learnRate{modelParams(bucketLength).learnRate()}; TDecompositionPtr10Vec trends{ - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}}; + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - 
maths::CMultivariateTimeSeriesModel model{params(bucketLength), *trends[0], prior}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), + *trends[0], prior}; core_t::TTime time{0}; for (auto& sample : samples) { double amplitude{10.0}; for (std::size_t i = 0u; i < sample.size(); ++i) { - sample[i] += - 30.0 + amplitude * - std::sin(boost::math::double_constants::two_pi * - static_cast(time) / - static_cast(core::constants::DAY)); + sample[i] += 30.0 + amplitude * + std::sin(boost::math::double_constants::two_pi * + static_cast(time) / 86400.0); amplitude += 4.0; } time += bucketLength; @@ -400,9 +420,8 @@ void CTimeSeriesModelTest::testMode() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; time = 0; for (const auto& sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); bool reinitialize{false}; TDouble10Vec1Vec detrended{TDouble10Vec(3)}; @@ -422,7 +441,7 @@ void CTimeSeriesModelTest::testMode() { TDouble2Vec expectedMode(prior.marginalLikelihoodMode( maths_t::CUnitWeights::unit(3))); for (std::size_t i = 0u; i < trends.size(); ++i) { - expectedMode[i] += maths::CBasicStatistics::mean(trends[i]->baseline(time)); + expectedMode[i] += maths::CBasicStatistics::mean(trends[i]->value(time)); } TDouble2Vec mode(model.mode(time, maths_t::CUnitWeights::unit(3))); @@ -442,7 +461,7 @@ void CTimeSeriesModelTest::testAddBucketValue() { core_t::TTime bucketLength{600}; maths::CTimeSeriesDecompositionStub trend; maths::CLogNormalMeanPrecConjugate prior{univariateLogNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; TTimeDouble2VecSizeTrVec samples{ core::make_triple(core_t::TTime{20}, TDouble2Vec{3.5}, TAG), @@ -462,12 +481,10 @@ void CTimeSeriesModelTest::testAddBucketValue() { prior.propagateForwardsByTime(1.0); prior.adjustOffset({-1.0}, maths_t::CUnitWeights::SINGLE_UNIT); - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(modelWeights).priorWeights(modelWeights); - model.addSamples(params, samples); + model.addSamples(addSampleParams(modelWeights), samples); model.addBucketValue({core::make_triple(core_t::TTime{20}, TDouble2Vec{-1.0}, TAG)}); - CPPUNIT_ASSERT_EQUAL(prior.checksum(), model.prior().checksum()); + CPPUNIT_ASSERT_EQUAL(prior.checksum(), model.residualModel().checksum()); } void CTimeSeriesModelTest::testAddSamples() { @@ -483,7 +500,7 @@ void CTimeSeriesModelTest::testAddSamples() { { maths::CTimeSeriesDecompositionStub trend; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; TTimeDouble2VecSizeTrVec samples{ core::make_triple(core_t::TTime{20}, TDouble2Vec{3.5}, TAG), @@ -495,13 +512,7 @@ void CTimeSeriesModelTest::testAddSamples() { maths_t::countWeight(TDouble2Vec{weights[1]}), maths_t::countWeight(TDouble2Vec{weights[2]})}; - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(1.0) - .trendWeights(modelWeights) - .priorWeights(modelWeights); - - model.addSamples(params, samples); + 
model.addSamples(addSampleParams(modelWeights), samples); trend.addPoint(samples[1].first, samples[1].second[0], maths_t::countWeight(weights[1])); @@ -516,11 +527,11 @@ void CTimeSeriesModelTest::testAddSamples() { prior.propagateForwardsByTime(1.0); uint64_t checksum1{trend.checksum()}; - uint64_t checksum2{model.trend().checksum()}; + uint64_t checksum2{model.trendModel().checksum()}; LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); checksum1 = prior.checksum(); - checksum2 = model.prior().checksum(); + checksum2 = model.residualModel().checksum(); LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); } @@ -532,7 +543,8 @@ void CTimeSeriesModelTest::testAddSamples() { TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}, TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - maths::CMultivariateTimeSeriesModel model{params(bucketLength), *trends[0], prior}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), + *trends[0], prior}; TTimeDouble2VecSizeTrVec samples{ core::make_triple(core_t::TTime{20}, TDouble2Vec{3.5, 3.4, 3.3}, TAG), @@ -544,13 +556,7 @@ void CTimeSeriesModelTest::testAddSamples() { maths_t::countWeight(TDouble2Vec(weights[1], weights[1] + 3)), maths_t::countWeight(TDouble2Vec(weights[2], weights[2] + 3))}; - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(1.0) - .trendWeights(modelWeights) - .priorWeights(modelWeights); - - model.addSamples(params, samples); + model.addSamples(addSampleParams(modelWeights), samples); for (std::size_t i = 0u; i < trends.size(); ++i) { trends[i]->addPoint(samples[1].first, samples[1].second[i], @@ -571,12 +577,12 @@ void CTimeSeriesModelTest::testAddSamples() { for (std::size_t i = 0u; i < trends.size(); ++i) { uint64_t checksum1{trends[i]->checksum()}; - uint64_t checksum2{model.trend()[i]->checksum()}; + uint64_t checksum2{model.trendModel()[i]->checksum()}; LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); } uint64_t checksum1{prior.checksum()}; - uint64_t checksum2{model.prior().checksum()}; + uint64_t checksum2{model.residualModel().checksum()}; LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); } @@ -585,7 +591,7 @@ void CTimeSeriesModelTest::testAddSamples() { { maths::CTimeSeriesDecompositionStub trend; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; double interval[]{1.0, 1.1, 0.4}; TDouble2Vec samples[]{{10.0}, {13.9}, {27.1}}; @@ -597,12 +603,7 @@ void CTimeSeriesModelTest::testAddSamples() { core_t::TTime time{0}; for (std::size_t i = 0u; i < 3; ++i) { TTimeDouble2VecSizeTrVec sample{core::make_triple(time, samples[i], TAG)}; - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(interval[i]) - .trendWeights(weights) - .priorWeights(weights); - model.addSamples(params, sample); + model.addSamples(addSampleParams(interval[i], weights), sample); TDoubleWeightsAry1Vec weight{maths_t::CUnitWeights::UNIT}; for (std::size_t j = 0u; j < weights[0].size(); ++j) { @@ -612,7 +613,7 @@ void 
CTimeSeriesModelTest::testAddSamples() { prior.propagateForwardsByTime(interval[i]); uint64_t checksum1{prior.checksum()}; - uint64_t checksum2{model.prior().checksum()}; + uint64_t checksum2{model.residualModel().checksum()}; LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); @@ -627,7 +628,8 @@ void CTimeSeriesModelTest::testAddSamples() { TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}, TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - maths::CMultivariateTimeSeriesModel model{params(bucketLength), *trends[0], prior}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), + *trends[0], prior}; double interval[]{1.0, 1.1, 0.4}; TDouble2Vec samples[]{{13.5, 13.4, 13.3}, {13.9, 13.8, 13.7}, {20.1, 20.0, 10.9}}; @@ -639,12 +641,7 @@ void CTimeSeriesModelTest::testAddSamples() { core_t::TTime time{0}; for (std::size_t i = 0u; i < 3; ++i) { TTimeDouble2VecSizeTrVec sample{core::make_triple(time, samples[i], TAG)}; - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(interval[i]) - .trendWeights(weights) - .priorWeights(weights); - model.addSamples(params, sample); + model.addSamples(addSampleParams(interval[i], weights), sample); TDouble10VecWeightsAry1Vec weight{maths_t::CUnitWeights::unit(3)}; for (std::size_t j = 0u; j < weights[0].size(); ++j) { @@ -654,7 +651,7 @@ void CTimeSeriesModelTest::testAddSamples() { prior.propagateForwardsByTime(interval[i]); uint64_t checksum1{prior.checksum()}; - uint64_t checksum2{model.prior().checksum()}; + uint64_t checksum2{model.residualModel().checksum()}; LOG_DEBUG(<< "checksum1 = " << checksum1 << " checksum2 = " << checksum2); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); @@ -667,8 +664,8 @@ void CTimeSeriesModelTest::testAddSamples() { maths::CTimeSeriesDecomposition trend{DECAY_RATE, bucketLength}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; auto controllers = decayRateControllers(1); - maths::CUnivariateTimeSeriesModel model(params(bucketLength), 1, trend, - prior, &controllers); + maths::CUnivariateTimeSeriesModel model(modelParams(bucketLength), 1, + trend, prior, &controllers); TDoubleVec samples; rng.generateNormalSamples(1.0, 4.0, 2000, samples); @@ -681,12 +678,10 @@ void CTimeSeriesModelTest::testAddSamples() { 4.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0) + (time / bucketLength > 1800 ? 
10.0 : 0.0) + noise}; - TTimeDouble2VecSizeTrVec sample_{ core::make_triple(time, TDouble2Vec{sample}, TAG)}; - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, sample_); + + model.addSamples(addSampleParams(weights), sample_); if (trend.addPoint(time, sample)) { trend.decayRate(trend.decayRate() / controllers[0].multiplier()); @@ -705,7 +700,7 @@ void CTimeSeriesModelTest::testAddSamples() { if (trend.initialized()) { double multiplier{controllers[0].multiplier( - {trend.mean(time)}, {{detrended}}, bucketLength, + {trend.meanValue(time)}, {{detrended}}, bucketLength, model.params().learnRate(), DECAY_RATE)}; trend.decayRate(multiplier * trend.decayRate()); } @@ -718,10 +713,10 @@ void CTimeSeriesModelTest::testAddSamples() { } uint64_t checksum1{trend.checksum()}; - uint64_t checksum2{model.trend().checksum()}; + uint64_t checksum2{model.trendModel().checksum()}; CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); checksum1 = prior.checksum(); - checksum2 = model.prior().checksum(); + checksum2 = model.residualModel().checksum(); CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); time += bucketLength; @@ -730,14 +725,14 @@ void CTimeSeriesModelTest::testAddSamples() { LOG_DEBUG(<< "Decay rate control multivariate"); { - double learnRate{params(bucketLength).learnRate()}; + double learnRate{modelParams(bucketLength).learnRate()}; TDecompositionPtr10Vec trends{ - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}}; + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, + TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; auto controllers = decayRateControllers(3); - maths::CMultivariateTimeSeriesModel model{params(bucketLength), + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), *trends[0], prior, &controllers}; TDoubleVecVec samples; @@ -764,19 +759,15 @@ void CTimeSeriesModelTest::testAddSamples() { (time / bucketLength > 1800 ? 
10.0 : 0.0) + sample[i]; reinitialize |= trends[i]->addPoint(time, sample[i]); detrended[0][i] = trends[i]->detrend(time, sample[i], 0.0); - mean[i] = trends[i]->mean(time); + mean[i] = trends[i]->meanValue(time); hasTrend |= true; amplitude += 4.0; } TTimeDouble2VecSizeTrVec sample_{ core::make_triple(time, TDouble2Vec(sample), TAG)}; - maths::CModelAddSamplesParams params_; - params_.integer(false) - .propagationInterval(1.0) - .trendWeights(weights) - .priorWeights(weights); - model.addSamples(params_, sample_); + + model.addSamples(addSampleParams(weights), sample_); if (reinitialize) { reinitializePrior(learnRate, model, trends, prior, &controllers); @@ -806,11 +797,11 @@ void CTimeSeriesModelTest::testAddSamples() { for (std::size_t i = 0u; i < trends.size(); ++i) { uint64_t checksum1{trends[i]->checksum()}; - uint64_t checksum2{model.trend()[i]->checksum()}; + uint64_t checksum2{model.trendModel()[i]->checksum()}; CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); } uint64_t checksum1{prior.checksum()}; - uint64_t checksum2{model.prior().checksum()}; + uint64_t checksum2{model.residualModel().checksum()}; CPPUNIT_ASSERT_EQUAL(checksum1, checksum2); time += bucketLength; @@ -830,8 +821,8 @@ void CTimeSeriesModelTest::testPredict() { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; auto controllers = decayRateControllers(1); - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, - prior, &controllers}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, + trend, prior, &controllers}; TDoubleVec samples; rng.generateNormalSamples(0.0, 4.0, 1008, samples); @@ -841,9 +832,8 @@ void CTimeSeriesModelTest::testPredict() { sample += 10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0); - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); if (trend.addPoint(time, sample)) { prior.setToNonInformative(0.0, DECAY_RATE); @@ -863,7 +853,7 @@ void CTimeSeriesModelTest::testPredict() { for (core_t::TTime time_ = time; time_ < time + 86400; time_ += 3600) { double trend_{10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time_) / 86400.0)}; - double expected{maths::CBasicStatistics::mean(trend.baseline(time_)) + + double expected{maths::CBasicStatistics::mean(trend.value(time_)) + maths::CBasicStatistics::mean( prior.marginalLikelihoodConfidenceInterval(0.0))}; double predicted{model.predict(time_)[0]}; @@ -882,7 +872,7 @@ void CTimeSeriesModelTest::testPredict() { { maths::CTimeSeriesDecompositionStub trend; maths::CMultimodalPrior prior{univariateMultimodal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; TMeanAccumulator modes[2]; TDoubleVec samples, samples_; @@ -896,9 +886,8 @@ void CTimeSeriesModelTest::testPredict() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; core_t::TTime time{0}; for (auto sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, 
TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } @@ -920,21 +909,19 @@ void CTimeSeriesModelTest::testPredict() { LOG_DEBUG(<< "Multivariate Seasonal"); { - double learnRate{params(bucketLength).learnRate()}; + double learnRate{modelParams(bucketLength).learnRate()}; TDecompositionPtr10Vec trends{ TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}, TDecompositionPtr{new maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength}}}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; maths::CMultivariateTimeSeriesModel model{maths::CMultivariateTimeSeriesModel{ - params(bucketLength), *trends[0], prior}}; + modelParams(bucketLength), *trends[0], prior}}; TDoubleVecVec samples; TDoubleVec mean{0.0, 2.0, 1.0}; - { - TDoubleVecVec covariance{{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}; - rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples); - } + rng.generateMultivariateNormalSamples( + mean, {{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}, 1000, samples); TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; core_t::TTime time{0}; @@ -956,9 +943,8 @@ void CTimeSeriesModelTest::testPredict() { maths_t::CUnitWeights::singleUnit(3)); prior.propagateForwardsByTime(1.0); - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } @@ -972,7 +958,7 @@ void CTimeSeriesModelTest::testPredict() { static_cast(time_) / 86400.0)}; maths::CMultivariatePrior::TUnivariatePriorPtr margin{ prior.univariate(marginalize, condition).first}; - double expected{maths::CBasicStatistics::mean(trends[i]->baseline(time_)) + + double expected{maths::CBasicStatistics::mean(trends[i]->value(time_)) + maths::CBasicStatistics::mean( margin->marginalLikelihoodConfidenceInterval(0.0))}; double predicted{model.predict(time_)[i]}; @@ -992,8 +978,8 @@ void CTimeSeriesModelTest::testPredict() { TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}, TDecompositionPtr{new maths::CTimeSeriesDecompositionStub{}}}; maths::CMultivariateMultimodalPrior<3> prior{multivariateMultimodal()}; - maths::CMultivariateTimeSeriesModel model{maths::CMultivariateTimeSeriesModel{ - params(bucketLength), *trends[0], prior}}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), + *trends[0], prior}; TMeanAccumulator2Vec modes[2]{TMeanAccumulator2Vec(3), TMeanAccumulator2Vec(3)}; TDoubleVecVec samples; @@ -1020,9 +1006,8 @@ void CTimeSeriesModelTest::testPredict() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; core_t::TTime time{0}; for (const auto& sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } @@ -1045,9 +1030,11 @@ void CTimeSeriesModelTest::testPredict() { } void CTimeSeriesModelTest::testProbability() { - // Test: 1) Calculation, seasonal confidence 
interval, weights, etc.
-    //       2) Test with and without trend.
-    //       3) Test with anomalies.
+    // Test: 1) The calculation matches the values we expect given the
+    //          trend and residual models for different calculations,
+    //          seasonal confidence intervals, weights and so on.
+    //       2) Test the calculation with and without trend.
+    //       3) Test manually injected anomalies have low probabilities.

     using TDoubleSizePr = std::pair;
     using TSizeVec = std::vector;
@@ -1059,11 +1046,11 @@ void CTimeSeriesModelTest::testProbability() {
     LOG_DEBUG(<< "Univariate");
     {
         maths::CUnivariateTimeSeriesModel models[]{
-            maths::CUnivariateTimeSeriesModel{params(bucketLength), 1,
+            maths::CUnivariateTimeSeriesModel{modelParams(bucketLength), 1,
                                               maths::CTimeSeriesDecompositionStub{},
-                                              univariateNormal(), nullptr, false},
+                                              univariateNormal(), 0, false},
             maths::CUnivariateTimeSeriesModel{
-                params(bucketLength), 1,
+                modelParams(bucketLength), 1,
                 maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength},
                 univariateNormal(), nullptr, false}};
@@ -1075,20 +1062,13 @@ void CTimeSeriesModelTest::testProbability() {
            const TDouble2VecWeightsAryVec weight{
                maths_t::CUnitWeights::unit(1)};
            for (auto sample : samples) {
-               maths::CModelAddSamplesParams params;
-               params.integer(false)
-                   .propagationInterval(1.0)
-                   .trendWeights(weight)
-                   .priorWeights(weight);
-
                double trend{5.0 + 5.0 * std::sin(boost::math::double_constants::two_pi *
                                                  static_cast(time) / 86400.0)};
-
-               models[0].addSamples(
-                   params, {core::make_triple(time, TDouble2Vec{sample}, TAG)});
+               models[0].addSamples(addSampleParams(weight),
+                                    {core::make_triple(time, TDouble2Vec{sample}, TAG)});
                models[1].addSamples(
-                   params, {core::make_triple(time, TDouble2Vec{trend + sample}, TAG)});
-
+                   addSampleParams(weight),
+                   {core::make_triple(time, TDouble2Vec{trend + sample}, TAG)});
                time += bucketLength;
            }
        }
@@ -1122,12 +1102,12 @@ void CTimeSeriesModelTest::testProbability() {
                    weight_[i] = weight[i][0];
                }
                double lb[2], ub[2];
-               models[0].prior().probabilityOfLessLikelySamples(
+               models[0].residualModel().probabilityOfLessLikelySamples(
                    calculation, sample, {weight_}, lb[0], ub[0], expectedTail[0]);
-               models[1].prior().probabilityOfLessLikelySamples(
+               models[1].residualModel().probabilityOfLessLikelySamples(
                    calculation,
-                   {models[1].trend().detrend(time, sample[0], confidence)},
+                   {models[1].trendModel().detrend(time, sample[0], confidence)},
                    {weight_}, lb[1], ub[1], expectedTail[1]);
                expectedProbability[0] = (lb[0] + ub[0]) / 2.0;
                expectedProbability[1] = (lb[1] + ub[1]) / 2.0;
@@ -1164,42 +1144,33 @@ void CTimeSeriesModelTest::testProbability() {
     LOG_DEBUG(<< "Multivariate");
     {
         maths::CMultivariateTimeSeriesModel models[]{
+            maths::CMultivariateTimeSeriesModel{modelParams(bucketLength),
+                                                maths::CTimeSeriesDecompositionStub{},
+                                                multivariateNormal(), 0, false},
             maths::CMultivariateTimeSeriesModel{
-                params(bucketLength), maths::CTimeSeriesDecompositionStub{},
-                multivariateNormal(), nullptr, false},
-            maths::CMultivariateTimeSeriesModel{
-                params(bucketLength),
+                modelParams(bucketLength),
                 maths::CTimeSeriesDecomposition{24.0 * DECAY_RATE, bucketLength},
                 multivariateNormal(), nullptr, false}};

         TDoubleVecVec samples;
-        {
-            TDoubleVec mean{10.0, 15.0, 11.0};
-            TDoubleVecVec covariance{{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}};
-            rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples);
-        }
+        rng.generateMultivariateNormalSamples(
+            {10.0, 15.0, 11.0},
+            {{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}, 1000, samples);
core_t::TTime time{0}; { TDouble2VecWeightsAryVec weight{maths_t::CUnitWeights::unit(3)}; for (auto& sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(1.0) - .trendWeights(weight) - .priorWeights(weight); - TDouble2Vec sample_(sample); - models[0].addSamples(params, {core::make_triple(time, sample_, TAG)}); - + models[0].addSamples(addSampleParams(weight), + {core::make_triple(time, sample_, TAG)}); double trend{5.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0)}; for (auto& component : sample_) { component += trend; } - - models[1].addSamples(params, {core::make_triple(time, sample_, TAG)}); - + models[1].addSamples(addSampleParams(weight), + {core::make_triple(time, sample_, TAG)}); time += bucketLength; } } @@ -1234,15 +1205,15 @@ void CTimeSeriesModelTest::testProbability() { weight_[i] = weight[i]; } double lb[2], ub[2]; - models[0].prior().probabilityOfLessLikelySamples( + models[0].residualModel().probabilityOfLessLikelySamples( calculation, {TDouble10Vec(sample)}, {weight_}, lb[0], ub[0], expectedTail[0]); TDouble10Vec detrended; for (std::size_t j = 0u; j < sample.size(); ++j) { - detrended.push_back(models[1].trend()[j]->detrend( + detrended.push_back(models[1].trendModel()[j]->detrend( time, sample[j], confidence)); } - models[1].prior().probabilityOfLessLikelySamples( + models[1].residualModel().probabilityOfLessLikelySamples( calculation, {detrended}, {weight_}, lb[1], ub[1], expectedTail[1]); expectedProbability[0] = (lb[0] + ub[0]) / 2.0; @@ -1282,8 +1253,8 @@ void CTimeSeriesModelTest::testProbability() { LOG_DEBUG(<< "Anomalies"); { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 1, trend, - univariateNormal()}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 1, + trend, univariateNormal()}; TSizeVec anomalies; rng.generateUniformSamples(100, 1000, 10, anomalies); @@ -1296,33 +1267,20 @@ void CTimeSeriesModelTest::testProbability() { TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; std::size_t bucket{0}; core_t::TTime time{0}; - for (auto& sample : samples) { + for (auto sample : samples) { if (std::binary_search(anomalies.begin(), anomalies.end(), bucket++)) { sample += 10.0; } - { - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(1.0) - .trendWeights(weights) - .priorWeights(weights); - model.addSamples( - params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); - } - { - maths::CModelProbabilityParams params; - params.addCalculation(maths_t::E_TwoSided) - .seasonalConfidenceInterval(50.0) - .addBucketEmpty({false}) - .addWeights(weights[0]); - TTail2Vec tail; - double probability; - bool conditional; - TSize1Vec mostAnomalousCorrelate; - model.probability(params, {{time}}, {{sample}}, probability, - tail, conditional, mostAnomalousCorrelate); - smallest.add({probability, bucket - 1}); - } + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + TTail2Vec tail; + double probability; + bool conditional; + TSize1Vec mostAnomalousCorrelate; + model.probability(computeProbabilityParams(weights[0]), {{time}}, + {{sample}}, probability, tail, conditional, + mostAnomalousCorrelate); + smallest.add({probability, bucket - 1}); time += bucketLength; } @@ -1339,6 +1297,13 @@ void CTimeSeriesModelTest::testProbability() { } void CTimeSeriesModelTest::testWeights() { + // Check 
that the seasonal weight matches the value we expect given + // 1) the trend and residual model + // 2) the variation in the input data + // + // And that the Winsorisation weight is monotonic decreasing with + // increasing distance from the expected value. + core_t::TTime bucketLength{1800}; test::CRandomNumbers rng; @@ -1347,7 +1312,7 @@ void CTimeSeriesModelTest::testWeights() { { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 0, trend, prior}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, prior}; TDoubleVec samples; rng.generateNormalSamples(0.0, 4.0, 1008, samples); @@ -1357,21 +1322,8 @@ void CTimeSeriesModelTest::testWeights() { double scale{10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0)}; sample = scale * (1.0 + 0.1 * sample); - - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); - - if (trend.addPoint(time, sample)) { - prior.setToNonInformative(0.0, DECAY_RATE); - for (const auto& value : model.slidingWindow()) { - prior.addSamples({trend.detrend(value.first, value.second, 0.0)}, - maths_t::CUnitWeights::SINGLE_UNIT); - } - } - prior.addSamples({trend.detrend(time, sample, 0.0)}, - maths_t::CUnitWeights::SINGLE_UNIT); - + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } @@ -1384,7 +1336,9 @@ void CTimeSeriesModelTest::testWeights() { 2.0)}; double expectedScale{ - trend.scale(time_, prior.marginalLikelihoodVariance(), 0.0).second}; + model.trendModel() + .scale(time_, model.residualModel().marginalLikelihoodVariance(), 0.0) + .second}; double scale{model.seasonalWeight(0.0, time_)[0]}; LOG_DEBUG(<< "expected weight = " << expectedScale << ", weight = " << scale @@ -1400,7 +1354,7 @@ void CTimeSeriesModelTest::testWeights() { TDouble2Vec prediction(model.predict(time)); double lastWeight = 1.0; for (std::size_t i = 0u; i < 10; ++i) { - double weight_{model.winsorisationWeight(1.0, time, prediction)[0]}; + double weight_{model.winsorisationWeight(0.0, time, prediction)[0]}; LOG_DEBUG(<< "weight = " << weight_); CPPUNIT_ASSERT(weight_ <= lastWeight); lastWeight = weight_; @@ -1410,44 +1364,25 @@ void CTimeSeriesModelTest::testWeights() { LOG_DEBUG(<< "Multivariate"); { - double learnRate{params(bucketLength).learnRate()}; - TDecompositionPtr10Vec trends{ - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}, - TDecompositionPtr{new maths::CTimeSeriesDecomposition{DECAY_RATE, bucketLength}}}; + maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - maths::CMultivariateTimeSeriesModel model{params(bucketLength), *trends[0], prior}; + maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), trend, prior}; TDoubleVecVec samples; - { - TDoubleVec mean{10.0, 15.0, 11.0}; - TDoubleVecVec covariance{{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}; - rng.generateMultivariateNormalSamples(mean, covariance, 1008, samples); - } + rng.generateMultivariateNormalSamples( + {10.0, 15.0, 11.0}, + {{3.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, 
{0.5, 0.1, 2.0}}, 1008, samples); TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; core_t::TTime time{0}; for (auto& sample : samples) { double scale{10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0)}; - - bool reinitialize{false}; - TDouble10Vec1Vec detrended{TDouble10Vec(3)}; - for (std::size_t i = 0u; i < sample.size(); ++i) { - sample[i] = scale * (1.0 + 0.1 * sample[i]); - reinitialize |= trends[i]->addPoint(time, sample[i]); - detrended[0][i] = trends[i]->detrend(time, sample[i], 0.0); - } - if (reinitialize) { - reinitializePrior(learnRate, model, trends, prior); + for (auto& component : sample) { + component = scale * (1.0 + 0.1 * component); } - prior.addSamples(detrended, - maths_t::CUnitWeights::singleUnit(3)); - - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - model.addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); - + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } @@ -1461,8 +1396,8 @@ void CTimeSeriesModelTest::testWeights() { for (std::size_t i = 0u; i < 3; ++i) { double expectedScale{ - trends[i] - ->scale(time_, prior.marginalLikelihoodVariances()[i], 0.0) + model.trendModel()[i] + ->scale(time_, model.residualModel().marginalLikelihoodVariances()[i], 0.0) .second}; double scale{model.seasonalWeight(0.0, time_)[i]}; LOG_DEBUG(<< "expected weight = " << expectedScale << ", weight = " << scale @@ -1478,7 +1413,7 @@ void CTimeSeriesModelTest::testWeights() { TDouble2Vec prediction(model.predict(time)); double lastWeight = 1.0; for (std::size_t i = 0u; i < 10; ++i) { - double weight_{model.winsorisationWeight(1.0, time, prediction)[0]}; + double weight_{model.winsorisationWeight(0.0, time, prediction)[0]}; LOG_DEBUG(<< "weight = " << weight_); CPPUNIT_ASSERT(weight_ <= lastWeight); lastWeight = weight_; @@ -1499,19 +1434,18 @@ void CTimeSeriesModelTest::testMemoryUsage() { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; auto controllers = decayRateControllers(1); std::unique_ptr model{new maths::CUnivariateTimeSeriesModel{ - params(bucketLength), 0, trend, univariateNormal(), &controllers}}; + modelParams(bucketLength), 0, trend, univariateNormal(), &controllers}}; TDoubleVec samples; rng.generateNormalSamples(1.0, 4.0, 1000, samples); TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; core_t::TTime time{0}; for (auto sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); sample += 10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0); trend.addPoint(time, sample); - model->addSamples(params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + model->addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } @@ -1536,19 +1470,18 @@ void CTimeSeriesModelTest::testMemoryUsage() { maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; auto controllers = decayRateControllers(3); std::unique_ptr model{new maths::CMultivariateTimeSeriesModel{ - params(bucketLength), trend, prior, &controllers}}; + modelParams(bucketLength), trend, prior, &controllers}}; TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)}; core_t::TTime time{0}; for (auto& sample : samples) { - 
maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); for (auto& coordinate : sample) { coordinate += 10.0 + 5.0 * std::sin(boost::math::double_constants::two_pi * static_cast(time) / 86400.0); } trend.addPoint(time, sample[0]); - model->addSamples(params, {core::make_triple(time, TDouble2Vec(sample), TAG)}); + model->addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec(sample), TAG)}); time += bucketLength; } @@ -1566,10 +1499,10 @@ void CTimeSeriesModelTest::testMemoryUsage() { } void CTimeSeriesModelTest::testPersist() { - // Test persist then restore is idempotent. + // Test the restored model checksum matches the persisted model. core_t::TTime bucketLength{600}; - maths::CModelParams params_{params(bucketLength)}; + maths::CModelParams params{modelParams(bucketLength)}; test::CRandomNumbers rng; @@ -1578,17 +1511,15 @@ void CTimeSeriesModelTest::testPersist() { maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; auto controllers = decayRateControllers(1); maths::CUnivariateTimeSeriesModel origModel{ - params_, 1, trend, univariateNormal(), &controllers}; + params, 1, trend, univariateNormal(), &controllers}; TDoubleVec samples; rng.generateNormalSamples(1.0, 4.0, 1000, samples); TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(1)}; core_t::TTime time{0}; for (auto sample : samples) { - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - origModel.addSamples( - params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + origModel.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); time += bucketLength; } @@ -1599,19 +1530,19 @@ void CTimeSeriesModelTest::testPersist() { inserter.toXml(origXml); } - //LOG_DEBUG(<< "model XML representation:\n" << origXml); + LOG_TRACE(<< "model XML representation:\n" << origXml); + LOG_DEBUG(<< "model XML size: " << origXml.size()); // Restore the XML into a new filter core::CRapidXmlParser parser; CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); core::CRapidXmlStateRestoreTraverser traverser(parser); + maths::SDistributionRestoreParams distributionParams{maths_t::E_ContinuousData, + DECAY_RATE}; maths::STimeSeriesDecompositionRestoreParams decompositionParams{ - 24.0 * DECAY_RATE, bucketLength, - maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE}; - maths::SDistributionRestoreParams distributionParams{ - maths_t::E_ContinuousData, DECAY_RATE, 0.5, 24.0, 12}; - maths::SModelRestoreParams restoreParams{params_, decompositionParams, distributionParams}; + 24.0 * DECAY_RATE, bucketLength, distributionParams}; + maths::SModelRestoreParams restoreParams{params, decompositionParams, distributionParams}; maths::CUnivariateTimeSeriesModel restoredModel{restoreParams, traverser}; CPPUNIT_ASSERT_EQUAL(origModel.checksum(), restoredModel.checksum()); @@ -1619,24 +1550,22 @@ void CTimeSeriesModelTest::testPersist() { LOG_DEBUG(<< "Multivariate"); { - TDoubleVec mean{11.0, 10.0, 12.0}; - TDoubleVecVec covariance{{4.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}; TDoubleVecVec samples; - rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples); + rng.generateMultivariateNormalSamples( + {11.0, 10.0, 12.0}, + {{4.0, 2.9, 0.5}, {2.9, 2.6, 0.1}, {0.5, 0.1, 2.0}}, 1000, samples); maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CMultivariateNormalConjugate<3> 
prior{multivariateNormal()};
         auto controllers = decayRateControllers(3);
-        maths::CMultivariateTimeSeriesModel origModel{params(bucketLength),
+        maths::CMultivariateTimeSeriesModel origModel{modelParams(bucketLength),
                                                       trend, prior, &controllers};

         TDouble2VecWeightsAryVec weights{maths_t::CUnitWeights::unit(3)};
         core_t::TTime time{0};
         for (const auto& sample : samples) {
-            maths::CModelAddSamplesParams params;
-            params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights);
-            origModel.addSamples(
-                params, {core::make_triple(time, TDouble2Vec(sample), TAG)});
+            origModel.addSamples(addSampleParams(weights),
+                                 {core::make_triple(time, TDouble2Vec(sample), TAG)});
             time += bucketLength;
         }
@@ -1647,19 +1576,19 @@
             inserter.toXml(origXml);
         }

-        //LOG_DEBUG(<< "model XML representation:\n" << origXml);
+        LOG_TRACE(<< "model XML representation:\n" << origXml);
+        LOG_DEBUG(<< "model XML size: " << origXml.size());

         // Restore the XML into a new filter
         core::CRapidXmlParser parser;
         CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml));
         core::CRapidXmlStateRestoreTraverser traverser(parser);

+        maths::SDistributionRestoreParams distributionParams{maths_t::E_ContinuousData,
+                                                             DECAY_RATE};
         maths::STimeSeriesDecompositionRestoreParams decompositionParams{
-            24.0 * DECAY_RATE, bucketLength,
-            maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE};
-        maths::SDistributionRestoreParams distributionParams{
-            maths_t::E_ContinuousData, DECAY_RATE, 0.5, 24.0, 12};
-        maths::SModelRestoreParams restoreParams{params_, decompositionParams, distributionParams};
+            24.0 * DECAY_RATE, bucketLength, distributionParams};
+        maths::SModelRestoreParams restoreParams{params, decompositionParams, distributionParams};
         maths::CMultivariateTimeSeriesModel restoredModel{restoreParams, traverser};

         CPPUNIT_ASSERT_EQUAL(origModel.checksum(), restoredModel.checksum());
@@ -1669,6 +1598,12 @@
 }

 void CTimeSeriesModelTest::testUpgrade() {
+    // Test upgrade is minimally disruptive. We compare the upgraded
+    // model's predicted confidence intervals with the values we obtain
+    // from the previous model. Note the confidence interval depends on
+    // both the trend and residual model, so this test is sensitive to
+    // problems restoring either.
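+    // The model state in the test data files is assumed to have been
+    // persisted by an earlier version of the code; we restore it here and
+    // compare the confidence intervals we compute against the recorded values.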
+ using TStrVec = std::vector; auto load = [](const std::string& name, std::string& result) { std::ifstream file; @@ -1680,7 +1615,7 @@ void CTimeSeriesModelTest::testUpgrade() { core_t::TTime bucketLength{600}; core_t::TTime halfHour{1800}; - maths::CModelParams params_{params(bucketLength)}; + maths::CModelParams params{modelParams(bucketLength)}; std::string empty; LOG_DEBUG(<< "Univariate"); @@ -1699,12 +1634,11 @@ void CTimeSeriesModelTest::testUpgrade() { CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(xml)); core::CRapidXmlStateRestoreTraverser traverser(parser); + maths::SDistributionRestoreParams distributionParams{maths_t::E_ContinuousData, + DECAY_RATE}; maths::STimeSeriesDecompositionRestoreParams decompositionParams{ - 24.0 * DECAY_RATE, bucketLength, - maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE}; - maths::SDistributionRestoreParams distributionParams{ - maths_t::E_ContinuousData, DECAY_RATE, 0.5, 24.0, 12}; - maths::SModelRestoreParams restoreParams{params_, decompositionParams, distributionParams}; + 24.0 * DECAY_RATE, bucketLength, distributionParams}; + maths::SModelRestoreParams restoreParams{params, decompositionParams, distributionParams}; maths::CUnivariateTimeSeriesModel restoredModel{restoreParams, traverser}; TStrVec expectedInterval; @@ -1749,12 +1683,11 @@ void CTimeSeriesModelTest::testUpgrade() { CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(xml)); core::CRapidXmlStateRestoreTraverser traverser(parser); + maths::SDistributionRestoreParams distributionParams{maths_t::E_ContinuousData, + DECAY_RATE}; maths::STimeSeriesDecompositionRestoreParams decompositionParams{ - 24.0 * DECAY_RATE, bucketLength, - maths::CTimeSeriesDecomposition::DEFAULT_COMPONENT_SIZE}; - maths::SDistributionRestoreParams distributionParams{ - maths_t::E_ContinuousData, DECAY_RATE, 0.5, 24.0, 12}; - maths::SModelRestoreParams restoreParams{params_, decompositionParams, distributionParams}; + 24.0 * DECAY_RATE, bucketLength, distributionParams}; + maths::SModelRestoreParams restoreParams{params, decompositionParams, distributionParams}; maths::CMultivariateTimeSeriesModel restoredModel{restoreParams, traverser}; TStrVec expectedInterval; @@ -1792,17 +1725,16 @@ void CTimeSeriesModelTest::testAddSamplesWithCorrelations() { test::CRandomNumbers rng; { - TDoubleVec mean{10.0, 15.0}; - TDoubleVecVec covariance{{3.0, 2.9}, {2.9, 2.6}}; TDoubleVecVec samples; - rng.generateMultivariateNormalSamples(mean, covariance, 1000, samples); + rng.generateMultivariateNormalSamples({10.0, 15.0}, {{3.0, 2.9}, {2.9, 2.6}}, + 1000, samples); maths::CTimeSeriesDecomposition trend{DECAY_RATE, bucketLength}; maths::CTimeSeriesCorrelations correlations{MINIMUM_SIGNIFICANT_CORRELATION, DECAY_RATE}; maths::CNormalMeanPrecConjugate prior{univariateNormal()}; maths::CUnivariateTimeSeriesModel models[]{ - {params(bucketLength), 0, trend, prior, nullptr}, - {params(bucketLength), 1, trend, prior, nullptr}}; + {modelParams(bucketLength), 0, trend, prior, nullptr}, + {modelParams(bucketLength), 1, trend, prior, nullptr}}; models[0].modelCorrelations(correlations); models[1].modelCorrelations(correlations); CTimeSeriesCorrelateModelAllocator allocator; @@ -1811,12 +1743,10 @@ void CTimeSeriesModelTest::testAddSamplesWithCorrelations() { core_t::TTime time{0}; for (auto sample : samples) { correlations.refresh(allocator); - maths::CModelAddSamplesParams params; - params.integer(false).propagationInterval(1.0).trendWeights(weights).priorWeights(weights); - models[0].addSamples( - params, {core::make_triple(time, 
TDouble2Vec{sample[0]}, TAG)}); - models[1].addSamples( - params, {core::make_triple(time, TDouble2Vec{sample[1]}, TAG)}); + models[0].addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample[0]}, TAG)}); + models[1].addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample[1]}, TAG)}); correlations.processSamples(); time += bucketLength; } @@ -1830,14 +1760,17 @@ void CTimeSeriesModelTest::testProbabilityWithCorrelations() { } void CTimeSeriesModelTest::testAnomalyModel() { + // We test we can find the "odd anomaly out". + using TSizeVec = std::vector; using TDoubleSizePr = std::pair; test::CRandomNumbers rng; - LOG_DEBUG(<< "Univariate") { - std::size_t length = 2000; + std::size_t length = 2000; + LOG_DEBUG(<< "Univariate"); + { TSizeVec anomalies; rng.generateUniformSamples(0, length, 30, anomalies); std::sort(anomalies.begin(), anomalies.end()); @@ -1847,8 +1780,8 @@ void CTimeSeriesModelTest::testAnomalyModel() { core_t::TTime bucketLength{600}; maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; - maths::CUnivariateTimeSeriesModel model{params(bucketLength), 1, trend, - univariateNormal()}; + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 1, + trend, univariateNormal()}; //std::ofstream file; //file.open("results.m"); @@ -1865,30 +1798,17 @@ void CTimeSeriesModelTest::testAnomalyModel() { if (bucket >= length - 100 && bucket < length - 92) { sample += 8.0; } - { - maths::CModelAddSamplesParams params; - params.integer(false) - .propagationInterval(1.0) - .trendWeights(weights) - .priorWeights(weights); - model.addSamples( - params, {core::make_triple(time, TDouble2Vec{sample}, TAG)}); - } - { - maths::CModelProbabilityParams params; - params.addCalculation(maths_t::E_TwoSided) - .seasonalConfidenceInterval(50.0) - .addBucketEmpty({false}) - .addWeights(weights[0]); - TTail2Vec tail; - double probability; - bool conditional; - TSize1Vec mostAnomalousCorrelate; - model.probability(params, {{time}}, {{sample}}, probability, - tail, conditional, mostAnomalousCorrelate); - mostAnomalous.add({::log(probability), bucket}); - //scores.push_back(maths::CTools::deviation(probability)); - } + model.addSamples(addSampleParams(weights), + {core::make_triple(time, TDouble2Vec{sample}, TAG)}); + TTail2Vec tail; + double probability; + bool conditional; + TSize1Vec mostAnomalousCorrelate; + model.probability(computeProbabilityParams(weights[0]), {{time}}, + {{sample}}, probability, tail, conditional, + mostAnomalousCorrelate); + mostAnomalous.add({std::log(probability), bucket}); + //scores.push_back(maths::CTools::deviation(probability)); time += bucketLength; } @@ -1919,21 +1839,20 @@ void CTimeSeriesModelTest::testAnomalyModel() { //file << "plot([1:length(s)], s, 'r');\n"; } - LOG_DEBUG(<< "Multivariate") { - std::size_t length = 2000; - + LOG_DEBUG("Multivariate"); + { TSizeVec anomalies; rng.generateUniformSamples(0, length, 30, anomalies); std::sort(anomalies.begin(), anomalies.end()); core_t::TTime bucketLength{600}; - TDoubleVec mean{10.0, 10.0, 10.0}; - TDoubleVecVec covariance{{4.0, 0.9, 0.5}, {0.9, 2.6, 0.1}, {0.5, 0.1, 3.0}}; TDoubleVecVec samples; - rng.generateMultivariateNormalSamples(mean, covariance, length, samples); + rng.generateMultivariateNormalSamples( + {10.0, 10.0, 10.0}, + {{4.0, 0.9, 0.5}, {0.9, 2.6, 0.1}, {0.5, 0.1, 3.0}}, length, samples); maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; maths::CMultivariateNormalConjugate<3> prior{multivariateNormal()}; - 
maths::CMultivariateTimeSeriesModel model{params(bucketLength), trend, prior};
+        maths::CMultivariateTimeSeriesModel model{modelParams(bucketLength), trend, prior};

         //std::ofstream file;
         //file.open("results.m");
@@ -1953,30 +1872,17 @@ void CTimeSeriesModelTest::testAnomalyModel() {
                 }
             }
             ++bucket;
-            {
-                maths::CModelAddSamplesParams params;
-                params.integer(false)
-                    .propagationInterval(1.0)
-                    .trendWeights(weights)
-                    .priorWeights(weights);
-                model.addSamples(
-                    params, {core::make_triple(time, TDouble2Vec(sample), TAG)});
-            }
-            {
-                maths::CModelProbabilityParams params;
-                params.addCalculation(maths_t::E_TwoSided)
-                    .seasonalConfidenceInterval(50.0)
-                    .addBucketEmpty({false})
-                    .addWeights(weights[0]);
-                TTail2Vec tail;
-                double probability;
-                bool conditional;
-                TSize1Vec mostAnomalousCorrelate;
-                model.probability(params, {{time}}, {(sample)}, probability,
-                                  tail, conditional, mostAnomalousCorrelate);
-                mostAnomalous.add({::log(probability), bucket});
-                //scores.push_back(maths::CTools::deviation(probability));
-            }
+            model.addSamples(addSampleParams(weights),
+                             {core::make_triple(time, TDouble2Vec(sample), TAG)});
+            TTail2Vec tail;
+            double probability;
+            bool conditional;
+            TSize1Vec mostAnomalousCorrelate;
+            model.probability(computeProbabilityParams(weights[0]), {{time}},
+                              {(sample)}, probability, tail, conditional,
+                              mostAnomalousCorrelate);
+            mostAnomalous.add({std::log(probability), bucket});
+            //scores.push_back(maths::CTools::deviation(probability));
             time += bucketLength;
         }
@@ -2013,6 +1919,389 @@
     }
 }

+void CTimeSeriesModelTest::testStepChangeDiscontinuities() {
+    // Check detection and modelling of step changes in data with
+    // 1) Piecewise constant,
+    // 2) Saw tooth.
+
+    using TDouble3Vec = core::CSmallVector;
+    using TDouble3VecVec = std::vector;
+
+    TDouble2VecWeightsAryVec weight{maths_t::CUnitWeights::unit(1)};
+    auto updateModel = [&](core_t::TTime time, double value,
+                           maths::CUnivariateTimeSeriesModel& model) {
+        maths_t::setWinsorisationWeight(
+            model.winsorisationWeight(0.0, time, {value}), weight[0]);
+        model.addSamples(addSampleParams(1.0, weight),
+                         {core::make_triple(time, TDouble2Vec{value}, TAG)});
+    };
+
+    //std::ostringstream actual, modelBounds;
+    //actual << "r = [";
+    //modelBounds << "x = [";
+    //auto updateTestDebug = [&](core_t::TTime time, double value,
+    //                           const maths::CUnivariateTimeSeriesModel &model)
+    //    {
+    //        actual << value << std::endl;
+    //        auto x = model.confidenceInterval(time, 90.0, {maths_t::E_SampleCountWeight}, {{1.0}});
+    //        if (x.size() == 3)
+    //        {
+    //            modelBounds << x[0][0] << "," << x[1][0] << "," << x[2][0] << std::endl;
+    //        }
+    //    };
+
+    test::CRandomNumbers rng;
+
+    LOG_DEBUG("Univariate: Piecewise Constant");
+    {
+        core_t::TTime bucketLength{600};
+        maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength};
+        auto controllers = decayRateControllers(1);
+        maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend,
+                                                univariateNormal(DECAY_RATE / 3.0),
+                                                &controllers};
+
+        // Add some data to the model.
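+        // The data are piecewise constant: the level is shifted by each step
+        // in turn and each new level is held for several hundred buckets of
+        // low variance normal samples before the next shift.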
+ + core_t::TTime time{0}; + TDoubleVec samples; + double level{20.0}; + for (auto dl : {10.0, 20.0, 15.0, 50.0, 30.0, 40.0, 15.0, 40.0, 25.0}) { + level += dl; + rng.generateNormalSamples( + level, 2.0, 300 + static_cast(2.0 * dl), samples); + for (auto sample : samples) { + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + } + level += 30.0; + rng.generateNormalSamples(level, 2.0, 100, samples); + for (auto sample : samples) { + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + + // Generate expected values from the same process. + + TDoubleVec expected; + rng.generateNormalSamples(level, 2.0, 260, expected); + for (auto dl : {25.0, 40.0}) { + level += dl; + rng.generateNormalSamples( + level, 2.0, 300 + static_cast(2.0 * dl), samples); + expected.insert(expected.end(), samples.begin(), samples.end()); + } + //std::for_each(expected.begin(), expected.end(), + // [&actual](double sample) { actual << sample << std::endl; }); + + //std::ofstream file; + //file.open("forecast.m"); + //file << actual.str() << "];"; + //file << modelBounds.str() << "];"; + //file << "y = ["; + TDouble3VecVec forecast; + auto pushErrorBar = [&](const maths::SErrorBar& errorBar) { + forecast.push_back({errorBar.s_LowerBound, errorBar.s_Predicted, + errorBar.s_UpperBound}); + //file << errorBar.s_LowerBound << "," + // << errorBar.s_Predicted << "," + // << errorBar.s_UpperBound << std::endl; + }; + + std::string m; + model.forecast(time, time + 800 * bucketLength, 90.0, {-1000.0}, + {1000.0}, pushErrorBar, m); + + //file << "];"; + + double outOfBounds{0.0}; + for (std::size_t i = 0u; i < forecast.size(); ++i) { + CPPUNIT_ASSERT_DOUBLES_EQUAL(expected[i], forecast[i][1], 0.1 * expected[i]); + outOfBounds += static_cast(expected[i] < forecast[i][0] || + expected[i] > forecast[i][2]); + } + double percentageOutOfBounds{100.0 * outOfBounds / + static_cast(forecast.size())}; + LOG_DEBUG("% out-of-bounds = " << percentageOutOfBounds); + CPPUNIT_ASSERT(percentageOutOfBounds < 1.0); + } + + LOG_DEBUG("Univariate: Saw Tooth"); + { + core_t::TTime bucketLength{1800}; + maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; + auto controllers = decayRateControllers(1); + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, + univariateNormal(), &controllers}; + + // Add some data to the model. + + core_t::TTime time{0}; + double value{10.0}; + TDoubleVec noise; + for (auto slope : {0.08, 0.056, 0.028, 0.044, 0.06, 0.03}) { + value = 5.0; + while (value < 95.0) { + rng.generateNormalSamples(0.0, 2.0, 1, noise); + updateModel(time, value + noise[0], model); + //updateTestDebug(time, value + noise[0], model); + time += bucketLength; + value += slope; + } + } + for (auto slope : {0.042}) { + value = 5.0; + for (std::size_t i = 0u; i < 1500; ++i) { + rng.generateNormalSamples(0.0, 2.0, 1, noise); + updateModel(time, value + noise[0], model); + //updateTestDebug(time, value + noise[0], model); + time += bucketLength; + value += slope; + } + } + + // Generate expected values from the same process. + + TDoubleVec expected; + for (auto slope : {0.05, 0.04}) { + while (expected.size() < 2000 && value < 95.0) { + rng.generateNormalSamples(0.0, 2.0, 1, noise); + expected.push_back(value + noise[0]); + //actual << value + noise[0] << std::endl; + value += slope; + } + value = 5.0; + } + + // Test forecasting. 
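+        // Forecast the next 2000 buckets and compare against a continuation
+        // of the same saw tooth process: most values should fall within the
+        // 90% forecast bounds.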
+ + //std::ofstream file; + //file.open("forecast.m"); + //file << actual.str() << "];"; + //file << modelBounds.str() << "];"; + //file << "y = ["; + TDouble3VecVec forecast; + auto pushErrorBar = [&](const maths::SErrorBar& errorBar) { + forecast.push_back({errorBar.s_LowerBound, errorBar.s_Predicted, + errorBar.s_UpperBound}); + //file << errorBar.s_LowerBound << "," + // << errorBar.s_Predicted << "," + // << errorBar.s_UpperBound << std::endl; + }; + + std::string m; + model.forecast(time, time + 2000 * bucketLength, 90.0, {-1000.0}, + {1000.0}, pushErrorBar, m); + + //file << "];"; + + double outOfBounds{0.0}; + for (std::size_t i = 0u; i < forecast.size(); ++i) { + outOfBounds += static_cast(expected[i] < forecast[i][0] || + expected[i] > forecast[i][2]); + } + double percentageOutOfBounds{100.0 * outOfBounds / + static_cast(forecast.size())}; + LOG_DEBUG("% out-of-bounds = " << percentageOutOfBounds); + CPPUNIT_ASSERT(percentageOutOfBounds < 5.0); + } +} + +void CTimeSeriesModelTest::testLinearScaling() { + // We test that the predictions are good and the bounds do not + // blow up after we: + // 1) linearly scale down a periodic pattern, + // 2) linearly scale up the same periodic pattern. + + TDouble2VecWeightsAryVec weight{maths_t::CUnitWeights::unit(1)}; + auto updateModel = [&](core_t::TTime time, double value, + maths::CUnivariateTimeSeriesModel& model) { + maths_t::setWinsorisationWeight( + model.winsorisationWeight(0.0, time, {value}), weight[0]); + model.addSamples(addSampleParams(1.0, weight), + {core::make_triple(time, TDouble2Vec{value}, TAG)}); + }; + + //std::ostringstream actual, modelBounds; + //actual << "r = ["; + //modelBounds << "x = ["; + //auto updateTestDebug = [&](core_t::TTime time, double value, + // const maths::CUnivariateTimeSeriesModel &model) + // { + // actual << value << std::endl; + // auto x = model.confidenceInterval(time, 90.0, {maths_t::E_SampleCountWeight}, {{1.0}}); + // if (x.size() == 3) + // { + // modelBounds << x[0][0] << "," << x[1][0] << "," << x[2][0] << std::endl; + // } + // }; + + test::CRandomNumbers rng; + + double noiseVariance{3.0}; + + core_t::TTime bucketLength{600}; + maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; + auto controllers = decayRateControllers(1); + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, + univariateNormal(DECAY_RATE / 3.0), &controllers}; + + core_t::TTime time{0}; + TDoubleVec samples; + rng.generateNormalSamples(0.0, noiseVariance, 1000, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + + // Scale by 0.3 + + rng.generateNormalSamples(0.0, noiseVariance, 200, samples); + for (auto sample : samples) { + sample = 0.3 * (12.0 + 10.0 * smoothDaily(time) + sample); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + rng.generateNormalSamples(0.0, noiseVariance, 1500, samples); + for (auto sample : samples) { + sample = 0.3 * (12.0 + 10.0 * smoothDaily(time) + sample); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + auto x = model.confidenceInterval( + time, 90.0, maths_t::CUnitWeights::unit(1)); + CPPUNIT_ASSERT(::fabs(sample - x[1][0]) < 1.2 * std::sqrt(noiseVariance)); + CPPUNIT_ASSERT(::fabs(x[2][0] - x[0][0]) < 3.3 * std::sqrt(noiseVariance)); + time += bucketLength; + } + + // Scale by 2 / 0.3 + + 
rng.generateNormalSamples(0.0, noiseVariance, 200, samples); + for (auto sample : samples) { + sample = 2.0 * (12.0 + 10.0 * smoothDaily(time)) + sample; + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + rng.generateNormalSamples(0.0, noiseVariance, 400, samples); + for (auto sample : samples) { + sample = 2.0 * (12.0 + 10.0 * smoothDaily(time)) + sample; + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + auto x = model.confidenceInterval( + time, 90.0, maths_t::CUnitWeights::unit(1)); + CPPUNIT_ASSERT(std::fabs(sample - x[1][0]) < 3.1 * std::sqrt(noiseVariance)); + CPPUNIT_ASSERT(std::fabs(x[2][0] - x[0][0]) < 3.3 * std::sqrt(noiseVariance)); + time += bucketLength; + } + + //std::ofstream file; + //file.open("bounds.m"); + //file << actual.str() << "];"; + //file << modelBounds.str() << "];"; +} + +void CTimeSeriesModelTest::testDaylightSaving() { + TDouble2VecWeightsAryVec weight{maths_t::CUnitWeights::unit(1)}; + auto updateModel = [&](core_t::TTime time, double value, + maths::CUnivariateTimeSeriesModel& model) { + maths_t::setWinsorisationWeight( + model.winsorisationWeight(0.0, time, {value}), weight[0]); + model.addSamples(addSampleParams(1.0, weight), + {core::make_triple(time, TDouble2Vec{value}, TAG)}); + }; + + //std::ostringstream actual, modelBounds; + //actual << "r = ["; + //modelBounds << "x = ["; + //auto updateTestDebug = [&](core_t::TTime time, double value, + // const maths::CUnivariateTimeSeriesModel &model) + // { + // actual << value << std::endl; + // auto x = model.confidenceInterval(time, 90.0, {maths_t::E_SampleCountWeight}, {{1.0}}); + // if (x.size() == 3) + // { + // modelBounds << x[0][0] << "," << x[1][0] << "," << x[2][0] << std::endl; + // } + // }; + + test::CRandomNumbers rng; + + core_t::TTime hour{core::constants::HOUR}; + double noiseVariance{0.36}; + + core_t::TTime bucketLength{600}; + maths::CTimeSeriesDecomposition trend{24.0 * DECAY_RATE, bucketLength}; + auto controllers = decayRateControllers(1); + maths::CUnivariateTimeSeriesModel model{modelParams(bucketLength), 0, trend, + univariateNormal(DECAY_RATE / 3.0), &controllers}; + + core_t::TTime time{0}; + TDoubleVec samples; + rng.generateNormalSamples(0.0, noiseVariance, 1000, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + + // Shift by +1 hr. + + rng.generateNormalSamples(0.0, noiseVariance, 200, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time + hour); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + rng.generateNormalSamples(0.0, noiseVariance, 1500, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time + hour); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + CPPUNIT_ASSERT_EQUAL(hour, model.trendModel().timeShift()); + auto x = model.confidenceInterval( + time, 90.0, maths_t::CUnitWeights::unit(1)); + CPPUNIT_ASSERT(std::fabs(sample - x[1][0]) < 3.6 * std::sqrt(noiseVariance)); + CPPUNIT_ASSERT(std::fabs(x[2][0] - x[0][0]) < 3.7 * std::sqrt(noiseVariance)); + time += bucketLength; + } + + // Shift by -1 hr. 
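+    // Once the data revert to the original phase the model should drop the
+    // one hour time shift, i.e. trendModel().timeShift() returns to zero,
+    // and the prediction bounds should again bracket the actual values.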
+ + rng.generateNormalSamples(0.0, noiseVariance, 200, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + time += bucketLength; + } + rng.generateNormalSamples(0.0, noiseVariance, 400, samples); + for (auto sample : samples) { + sample += 12.0 + 10.0 * smoothDaily(time); + updateModel(time, sample, model); + //updateTestDebug(time, sample, model); + CPPUNIT_ASSERT_EQUAL(core_t::TTime(0), model.trendModel().timeShift()); + auto x = model.confidenceInterval( + time, 90.0, maths_t::CUnitWeights::unit(1)); + CPPUNIT_ASSERT(std::fabs(sample - x[1][0]) < 4.1 * std::sqrt(noiseVariance)); + CPPUNIT_ASSERT(std::fabs(x[2][0] - x[0][0]) < 3.9 * std::sqrt(noiseVariance)); + time += bucketLength; + } + + //std::ofstream file; + //file.open("bounds.m"); + //file << actual.str() << "];"; + //file << modelBounds.str() << "];"; +} + CppUnit::Test* CTimeSeriesModelTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CTimeSeriesModelTest"); @@ -2044,6 +2333,13 @@ CppUnit::Test* CTimeSeriesModelTest::suite() { &CTimeSeriesModelTest::testProbabilityWithCorrelations)); suiteOfTests->addTest(new CppUnit::TestCaller( "CTimeSeriesModelTest::testAnomalyModel", &CTimeSeriesModelTest::testAnomalyModel)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesModelTest::testStepChangeDiscontinuities", + &CTimeSeriesModelTest::testStepChangeDiscontinuities)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesModelTest::testLinearScaling", &CTimeSeriesModelTest::testLinearScaling)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTimeSeriesModelTest::testDaylightSaving", &CTimeSeriesModelTest::testDaylightSaving)); return suiteOfTests; } diff --git a/lib/maths/unittest/CTimeSeriesModelTest.h b/lib/maths/unittest/CTimeSeriesModelTest.h index 40b54cec71..0e2355a777 100644 --- a/lib/maths/unittest/CTimeSeriesModelTest.h +++ b/lib/maths/unittest/CTimeSeriesModelTest.h @@ -24,6 +24,9 @@ class CTimeSeriesModelTest : public CppUnit::TestFixture { void testAddSamplesWithCorrelations(); void testProbabilityWithCorrelations(); void testAnomalyModel(); + void testStepChangeDiscontinuities(); + void testLinearScaling(); + void testDaylightSaving(); static CppUnit::Test* suite(); }; diff --git a/lib/maths/unittest/CToolsTest.cc b/lib/maths/unittest/CToolsTest.cc index 0ea8796b95..6c3617b26a 100644 --- a/lib/maths/unittest/CToolsTest.cc +++ b/lib/maths/unittest/CToolsTest.cc @@ -1035,14 +1035,14 @@ void CToolsTest::testMixtureProbabilityOfLessLikelySample() { } } -void CToolsTest::testDeviation() { - // Test p = inverseDeviation(deviation(p)) +void CToolsTest::testAnomalyScore() { + // Test p = inverseAnomalyScore(anomalyScore(p)) double p = 0.04; for (std::size_t i = 0u; i < 305; ++i, p *= 0.1) { - double deviation = CTools::deviation(p); - LOG_DEBUG(<< "p = " << p << ", deviation = " << deviation); - CPPUNIT_ASSERT_DOUBLES_EQUAL(p, CTools::inverseDeviation(deviation), 1e-3 * p); + double anomalyScore = CTools::anomalyScore(p); + LOG_DEBUG(<< "p = " << p << ", anomalyScore = " << anomalyScore); + CPPUNIT_ASSERT_DOUBLES_EQUAL(p, CTools::inverseAnomalyScore(anomalyScore), 1e-3 * p); } } @@ -1180,7 +1180,7 @@ CppUnit::Test* CToolsTest::suite() { "CToolsTest::testMixtureProbabilityOfLessLikelySample", &CToolsTest::testMixtureProbabilityOfLessLikelySample)); suiteOfTests->addTest(new CppUnit::TestCaller( - "CToolsTest::testDeviation", &CToolsTest::testDeviation)); + 
"CToolsTest::testAnomalyScore", &CToolsTest::testAnomalyScore)); suiteOfTests->addTest(new CppUnit::TestCaller( "CToolsTest::testSpread", &CToolsTest::testSpread)); suiteOfTests->addTest(new CppUnit::TestCaller( diff --git a/lib/maths/unittest/CToolsTest.h b/lib/maths/unittest/CToolsTest.h index 4e9a5cfedf..8f01243a1a 100644 --- a/lib/maths/unittest/CToolsTest.h +++ b/lib/maths/unittest/CToolsTest.h @@ -14,7 +14,7 @@ class CToolsTest : public CppUnit::TestFixture { void testProbabilityOfLessLikelySample(); void testIntervalExpectation(); void testMixtureProbabilityOfLessLikelySample(); - void testDeviation(); + void testAnomalyScore(); void testSpread(); void testFastLog(); void testMiscellaneous(); diff --git a/lib/maths/unittest/CTrendComponentTest.cc b/lib/maths/unittest/CTrendComponentTest.cc index 8921279847..85130c1b86 100644 --- a/lib/maths/unittest/CTrendComponentTest.cc +++ b/lib/maths/unittest/CTrendComponentTest.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -304,7 +305,11 @@ void CTrendComponentTest::testForecast() { component.shiftOrigin(time); TDouble3VecVec forecast; - component.forecast(time, time + 1000 * bucketLength, 3600, 95.0, forecast); + component.forecast(time, time + 1000 * bucketLength, 3600, 95.0, + [](core_t::TTime) { return TDouble3Vec(3, 0.0); }, + [&forecast](core_t::TTime, const TDouble3Vec& value) { + forecast.push_back(value); + }); TMeanAccumulator meanError; TMeanAccumulator meanErrorAt95; @@ -398,10 +403,11 @@ void CTrendComponentTest::testPersist() { core::CRapidXmlParser parser; CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); core::CRapidXmlStateRestoreTraverser traverser(parser); + maths::SDistributionRestoreParams params{maths_t::E_ContinuousData, 0.1}; maths::CTrendComponent restoredComponent{0.1}; - traverser.traverseSubLevel(boost::bind( - &maths::CTrendComponent::acceptRestoreTraverser, &restoredComponent, _1)); + traverser.traverseSubLevel(boost::bind(&maths::CTrendComponent::acceptRestoreTraverser, + &restoredComponent, boost::cref(params), _1)); CPPUNIT_ASSERT_EQUAL(origComponent.checksum(), restoredComponent.checksum()); diff --git a/lib/maths/unittest/Main.cc b/lib/maths/unittest/Main.cc index 01ee86f3f1..e8bcc74fb2 100644 --- a/lib/maths/unittest/Main.cc +++ b/lib/maths/unittest/Main.cc @@ -45,6 +45,7 @@ #include "CMultivariateMultimodalPriorTest.h" #include "CMultivariateNormalConjugateTest.h" #include "CMultivariateOneOfNPriorTest.h" +#include "CNaiveBayesTest.h" #include "CNaturalBreaksClassifierTest.h" #include "CNormalMeanPrecConjugateTest.h" #include "COneOfNPriorTest.h" @@ -70,6 +71,7 @@ #include "CSolversTest.h" #include "CSplineTest.h" #include "CStatisticalTestsTest.h" +#include "CTimeSeriesChangeDetectorTest.h" #include "CTimeSeriesDecompositionTest.h" #include "CTimeSeriesModelTest.h" #include "CToolsTest.h" @@ -121,6 +123,7 @@ int main(int argc, const char** argv) { runner.addTest(CMultivariateMultimodalPriorTest::suite()); runner.addTest(CMultivariateNormalConjugateTest::suite()); runner.addTest(CMultivariateOneOfNPriorTest::suite()); + runner.addTest(CNaiveBayesTest::suite()); runner.addTest(CNaturalBreaksClassifierTest::suite()); runner.addTest(CNormalMeanPrecConjugateTest::suite()); runner.addTest(COneOfNPriorTest::suite()); @@ -146,6 +149,7 @@ int main(int argc, const char** argv) { runner.addTest(CSolversTest::suite()); runner.addTest(CSplineTest::suite()); runner.addTest(CStatisticalTestsTest::suite()); + runner.addTest(CTimeSeriesChangeDetectorTest::suite()); 
runner.addTest(CTimeSeriesDecompositionTest::suite()); runner.addTest(CTimeSeriesModelTest::suite()); runner.addTest(CToolsTest::suite()); diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index 9e7c9c8798..566dcad738 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -56,6 +56,7 @@ SRCS=\ CMultivariateMultimodalPriorTest.cc \ CMultivariateNormalConjugateTest.cc \ CMultivariateOneOfNPriorTest.cc \ + CNaiveBayesTest.cc \ CNaturalBreaksClassifierTest.cc \ CNormalMeanPrecConjugateTest.cc \ COneOfNPriorTest.cc \ @@ -81,6 +82,7 @@ SRCS=\ CSolversTest.cc \ CSplineTest.cc \ CStatisticalTestsTest.cc \ + CTimeSeriesChangeDetectorTest.cc \ CTimeSeriesDecompositionTest.cc \ CTimeSeriesModelTest.cc \ CToolsTest.cc \ diff --git a/lib/maths/unittest/TestUtils.cc b/lib/maths/unittest/TestUtils.cc index 62bbbc9a63..987936987f 100644 --- a/lib/maths/unittest/TestUtils.cc +++ b/lib/maths/unittest/TestUtils.cc @@ -142,7 +142,7 @@ bool CPriorTestInterface::anomalyScore(maths_t::EProbabilityCalculation calculat return false; } - result = CTools::deviation((lowerBound + upperBound) / 2.0); + result = CTools::anomalyScore((lowerBound + upperBound) / 2.0); return true; } diff --git a/lib/model/CAnomalyDetectorModel.cc b/lib/model/CAnomalyDetectorModel.cc index 8f60842896..9a269e62df 100644 --- a/lib/model/CAnomalyDetectorModel.cc +++ b/lib/model/CAnomalyDetectorModel.cc @@ -529,13 +529,9 @@ CAnomalyDetectorModel::SFeatureModels::SFeatureModels(model_t::EFeature feature, bool CAnomalyDetectorModel::SFeatureModels::acceptRestoreTraverser(const SModelParams& params_, core::CStateRestoreTraverser& traverser) { maths_t::EDataType dataType{s_NewModel->dataType()}; - maths::SModelRestoreParams params{ - s_NewModel->params(), - maths::STimeSeriesDecompositionRestoreParams{ - CAnomalyDetectorModelConfig::trendDecayRate(params_.s_DecayRate, - params_.s_BucketLength), - params_.s_BucketLength, params_.s_ComponentSize}, - params_.distributionRestoreParams(dataType)}; + maths::SModelRestoreParams params{s_NewModel->params(), + params_.decompositionRestoreParams(dataType), + params_.distributionRestoreParams(dataType)}; do { if (traverser.name() == MODEL_TAG) { TMathsModelPtr prior; diff --git a/lib/model/CAnomalyDetectorModelConfig.cc b/lib/model/CAnomalyDetectorModelConfig.cc index ef5971a864..a095bb0e61 100644 --- a/lib/model/CAnomalyDetectorModelConfig.cc +++ b/lib/model/CAnomalyDetectorModelConfig.cc @@ -35,9 +35,6 @@ namespace model { namespace { -using TSizeVec = std::vector; -using TTimeVec = std::vector; - const CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMap EMPTY_RULES_MAP; const CAnomalyDetectorModelConfig::TStrDetectionRulePrVec EMPTY_EVENTS; @@ -59,18 +56,26 @@ const std::size_t CAnomalyDetectorModelConfig::DEFAULT_SAMPLE_COUNT_FACTOR_NO_LA const std::size_t CAnomalyDetectorModelConfig::DEFAULT_SAMPLE_COUNT_FACTOR_WITH_LATENCY(10); const double CAnomalyDetectorModelConfig::DEFAULT_SAMPLE_QUEUE_GROWTH_FACTOR(0.1); const core_t::TTime CAnomalyDetectorModelConfig::STANDARD_BUCKET_LENGTH(1800); +const std::size_t CAnomalyDetectorModelConfig::DEFAULT_BUCKET_RESULTS_DELAY(0); const double CAnomalyDetectorModelConfig::DEFAULT_DECAY_RATE(0.0005); const double CAnomalyDetectorModelConfig::DEFAULT_INITIAL_DECAY_RATE_MULTIPLIER(4.0); const double CAnomalyDetectorModelConfig::DEFAULT_LEARN_RATE(1.0); const double CAnomalyDetectorModelConfig::DEFAULT_INDIVIDUAL_MINIMUM_MODE_FRACTION(0.05); const double 
CAnomalyDetectorModelConfig::DEFAULT_POPULATION_MINIMUM_MODE_FRACTION(0.05); const double CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT(12.0); -const double CAnomalyDetectorModelConfig::DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS(0.2); const double CAnomalyDetectorModelConfig::DEFAULT_CATEGORY_DELETE_FRACTION(0.8); +const double CAnomalyDetectorModelConfig::DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS(0.2); const std::size_t CAnomalyDetectorModelConfig::DEFAULT_COMPONENT_SIZE(36u); -const std::size_t CAnomalyDetectorModelConfig::DEFAULT_TOTAL_PROBABILITY_CALC_SAMPLING_SIZE(10u); +const core_t::TTime + CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE(12 * core::constants::HOUR); +const core_t::TTime + CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE(core::constants::DAY); const double CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_UPDATES_PER_BUCKET(1.0); const double CAnomalyDetectorModelConfig::DEFAULT_INFLUENCE_CUTOFF(0.5); +const double CAnomalyDetectorModelConfig::DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM(0.25); +const double CAnomalyDetectorModelConfig::DEFAULT_PRUNE_WINDOW_SCALE_MAXIMUM(4.0); +const double CAnomalyDetectorModelConfig::DEFAULT_CORRELATION_MODELS_OVERHEAD(3.0); +const double CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION(0.3); const double CAnomalyDetectorModelConfig::DEFAULT_AGGREGATION_STYLE_PARAMS[][model_t::NUMBER_AGGREGATION_PARAMS] = {{0.0, 1.0, 1.0, 1.0}, {0.5, 0.5, 1.0, 5.0}, {0.5, 0.5, 1.0, 1.0}}; // The default for maximumanomalousprobability now matches the default @@ -79,7 +84,6 @@ const double CAnomalyDetectorModelConfig::DEFAULT_AGGREGATION_STYLE_PARAMS[][mod const double CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_ANOMALOUS_PROBABILITY(0.035); const double CAnomalyDetectorModelConfig::DEFAULT_NOISE_PERCENTILE(50.0); const double CAnomalyDetectorModelConfig::DEFAULT_NOISE_MULTIPLIER(1.0); -const std::size_t CAnomalyDetectorModelConfig::DEFAULT_BUCKET_RESULTS_DELAY(0); const CAnomalyDetectorModelConfig::TDoubleDoublePr CAnomalyDetectorModelConfig::DEFAULT_NORMALIZED_SCORE_KNOT_POINTS[9] = { CAnomalyDetectorModelConfig::TDoubleDoublePr(0.0, 0.0), CAnomalyDetectorModelConfig::TDoubleDoublePr(70.0, 1.0), @@ -90,11 +94,6 @@ const CAnomalyDetectorModelConfig::TDoubleDoublePr CAnomalyDetectorModelConfig:: CAnomalyDetectorModelConfig::TDoubleDoublePr(99.0, 50.0), CAnomalyDetectorModelConfig::TDoubleDoublePr(99.9, 90.0), CAnomalyDetectorModelConfig::TDoubleDoublePr(100.0, 100.0)}; -const std::size_t CAnomalyDetectorModelConfig::DEFAULT_RESAMPLING_MAX_SAMPLES(40u); -const double CAnomalyDetectorModelConfig::DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM(0.25); -const double CAnomalyDetectorModelConfig::DEFAULT_PRUNE_WINDOW_SCALE_MAXIMUM(4.0); -const double CAnomalyDetectorModelConfig::DEFAULT_CORRELATION_MODELS_OVERHEAD(3.0); -const double CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION(0.3); CAnomalyDetectorModelConfig CAnomalyDetectorModelConfig::defaultConfig(core_t::TTime bucketLength, @@ -740,7 +739,6 @@ const std::string ONLINE_LEARN_RATE_PROPERTY("learnrate"); const std::string DECAY_RATE_PROPERTY("decayrate"); const std::string INITIAL_DECAY_RATE_MULTIPLIER_PROPERTY("initialdecayratemultiplier"); const std::string MAXIMUM_UPDATES_PER_BUCKET_PROPERTY("maximumupdatesperbucket"); -const std::string TOTAL_PROBABILITY_CALC_SAMPLING_SIZE_PROPERTY("totalprobabilitycalcsamplingsize"); const std::string INDIVIDUAL_MODE_FRACTION_PROPERTY("individualmodefraction"); const std::string 
POPULATION_MODE_FRACTION_PROPERTY("populationmodefraction"); const std::string PEERS_MODE_FRACTION_PROPERTY("peersmodefraction"); @@ -814,17 +812,6 @@ bool CAnomalyDetectorModelConfig::processStanza(const boost::property_tree::ptre for (auto& factory : m_Factories) { factory.second->maximumUpdatesPerBucket(maximumUpdatesPerBucket); } - } else if (propName == TOTAL_PROBABILITY_CALC_SAMPLING_SIZE_PROPERTY) { - int totalProbabilityCalcSamplingSize; - if (core::CStringUtils::stringToType(propValue, totalProbabilityCalcSamplingSize) == false || - totalProbabilityCalcSamplingSize <= 0) { - LOG_ERROR(<< "Invalid value for property " << propName << " : " << propValue); - result = false; - continue; - } - for (auto& factory : m_Factories) { - factory.second->totalProbabilityCalcSamplingSize(totalProbabilityCalcSamplingSize); - } } else if (propName == INDIVIDUAL_MODE_FRACTION_PROPERTY) { double fraction; if (core::CStringUtils::stringToType(propValue, fraction) == false || diff --git a/lib/model/CAnomalyScore.cc b/lib/model/CAnomalyScore.cc index 02f57fd604..11ef2cd6ea 100644 --- a/lib/model/CAnomalyScore.cc +++ b/lib/model/CAnomalyScore.cc @@ -56,12 +56,12 @@ std::size_t addProbabilities(const TDoubleVec& probabilities, AGGREGATOR& aggreg //! The function to convert probabilities to *raw* scores. double probabilityToScore(double probability) { - return maths::CTools::deviation(probability); + return maths::CTools::anomalyScore(probability); } //! The function to convert *raw* scores to probabilities. double scoreToProbability(double score) { - return maths::CTools::inverseDeviation(score); + return maths::CTools::inverseAnomalyScore(score); } // We use short field names to reduce the state size diff --git a/lib/model/CEventRatePopulationModel.cc b/lib/model/CEventRatePopulationModel.cc index e8fa194488..7eee44a897 100644 --- a/lib/model/CEventRatePopulationModel.cc +++ b/lib/model/CEventRatePopulationModel.cc @@ -756,14 +756,14 @@ uint64_t CEventRatePopulationModel::checksum(bool includeCurrentBucketStats) con } for (const auto& feature : m_FeatureCorrelatesModels) { - for (const auto& prior : feature.s_Models->correlatePriors()) { - std::size_t cids[]{prior.first.first, prior.first.second}; + for (const auto& model : feature.s_Models->correlationModels()) { + std::size_t cids[]{model.first.first, model.first.second}; if (gatherer.isAttributeActive(cids[0]) && gatherer.isAttributeActive(cids[1])) { uint64_t& hash = hashes[{boost::cref(gatherer.attributeName(cids[0])), boost::cref(gatherer.attributeName(cids[1]))}]; - hash = maths::CChecksum::calculate(hash, prior.second); + hash = maths::CChecksum::calculate(hash, model.second); } } } @@ -908,10 +908,12 @@ void CEventRatePopulationModel::updateRecycledModels() { CDataGatherer& gatherer = this->dataGatherer(); for (auto cid : gatherer.recycledAttributeIds()) { for (auto& feature : m_FeatureModels) { - feature.s_Models[cid].reset(feature.s_NewModel->clone(cid)); - for (const auto& correlates : m_FeatureCorrelatesModels) { - if (feature.s_Feature == correlates.s_Feature) { - feature.s_Models.back()->modelCorrelations(*correlates.s_Models); + if (cid < feature.s_Models.size()) { + feature.s_Models[cid].reset(feature.s_NewModel->clone(cid)); + for (const auto& correlates : m_FeatureCorrelatesModels) { + if (feature.s_Feature == correlates.s_Feature) { + feature.s_Models.back()->modelCorrelations(*correlates.s_Models); + } } } } @@ -939,7 +941,9 @@ void CEventRatePopulationModel::clearPrunedResources(const TSizeVec& /*people*/, const TSizeVec& 
                                                      attributes) {
     for (auto cid : attributes) {
         for (auto& feature : m_FeatureModels) {
-            feature.s_Models[cid].reset(this->tinyModel());
+            if (cid < feature.s_Models.size()) {
+                feature.s_Models[cid].reset(this->tinyModel());
+            }
         }
     }
 }
diff --git a/lib/model/CHierarchicalResultsAggregator.cc b/lib/model/CHierarchicalResultsAggregator.cc
index 465b3b2ebc..c49524f153 100644
--- a/lib/model/CHierarchicalResultsAggregator.cc
+++ b/lib/model/CHierarchicalResultsAggregator.cc
@@ -208,7 +208,7 @@ void CHierarchicalResultsAggregator::aggregateLeaf(const TNode& node) {
     node.s_AggregationStyle = style;
     node.s_SmallestChildProbability = probability;
     node.s_SmallestDescendantProbability = probability;
-    node.s_RawAnomalyScore = maths::CTools::deviation(probability);
+    node.s_RawAnomalyScore = maths::CTools::anomalyScore(probability);
 }
 void CHierarchicalResultsAggregator::aggregateNode(const TNode& node, bool pivot) {
diff --git a/lib/model/CHierarchicalResultsNormalizer.cc b/lib/model/CHierarchicalResultsNormalizer.cc
index dd226b358b..13cffb0e25 100644
--- a/lib/model/CHierarchicalResultsNormalizer.cc
+++ b/lib/model/CHierarchicalResultsNormalizer.cc
@@ -122,7 +122,7 @@ void CHierarchicalResultsNormalizer::visit(const CHierarchicalResults& /*results
     // scaled so that it sums to the bucket anomaly score.
     double score = node.probability() > m_ModelConfig.maximumAnomalousProbability()
                        ? 0.0
-                       : maths::CTools::deviation(node.probability());
+                       : maths::CTools::anomalyScore(node.probability());
     switch (m_Job) {
     case E_Update:
diff --git a/lib/model/CHierarchicalResultsProbabilityFinalizer.cc b/lib/model/CHierarchicalResultsProbabilityFinalizer.cc
index be4786ca15..77bf74f6ce 100644
--- a/lib/model/CHierarchicalResultsProbabilityFinalizer.cc
+++ b/lib/model/CHierarchicalResultsProbabilityFinalizer.cc
@@ -16,7 +16,7 @@ void CHierarchicalResultsProbabilityFinalizer::visit(const CHierarchicalResults&
                                                      bool /*pivot*/) {
     if (node.s_RawAnomalyScore > 0.0) {
         node.s_AnnotatedProbability.s_Probability =
-            maths::CTools::inverseDeviation(node.s_RawAnomalyScore);
+            maths::CTools::inverseAnomalyScore(node.s_RawAnomalyScore);
     }
 }
 }
diff --git a/lib/model/CIndividualModel.cc b/lib/model/CIndividualModel.cc
index 0eeaba0538..1582d38ab8 100644
--- a/lib/model/CIndividualModel.cc
+++ b/lib/model/CIndividualModel.cc
@@ -274,12 +274,12 @@ uint64_t CIndividualModel::checksum(bool includeCurrentBucketStats) const {
     TStrCRefStrCRefPrUInt64Map hashes2;
     for (const auto& feature : m_FeatureCorrelatesModels) {
-        for (const auto& prior : feature.s_Models->correlatePriors()) {
-            std::size_t pids[]{prior.first.first, prior.first.second};
+        for (const auto& model : feature.s_Models->correlationModels()) {
+            std::size_t pids[]{model.first.first, model.first.second};
             if (gatherer.isPersonActive(pids[0]) && gatherer.isPersonActive(pids[1])) {
                 uint64_t& hash = hashes2[{boost::cref(this->personName(pids[0])),
                                           boost::cref(this->personName(pids[1]))}];
-                hash = maths::CChecksum::calculate(hash, prior.second);
+                hash = maths::CChecksum::calculate(hash, model.second);
             }
         }
     }
@@ -510,7 +510,9 @@ void CIndividualModel::clearPrunedResources(const TSizeVec& people,
                                             const TSizeVec& /*attributes*/) {
     for (auto pid : people) {
         for (auto& feature : m_FeatureModels) {
-            feature.s_Models[pid].reset(this->tinyModel());
+            if (pid < feature.s_Models.size()) {
+                feature.s_Models[pid].reset(this->tinyModel());
+            }
         }
     }
 }
@@ -610,7 +612,7 @@ std::string CIndividualModel::printCurrentBucket() const {
 std::size_t CIndividualModel::numberCorrelations() const {
     std::size_t result = 0u;
     for (const auto& feature : m_FeatureCorrelatesModels) {
-        result += feature.s_Models->correlatePriors().size();
+        result += feature.s_Models->correlationModels().size();
     }
     return result;
 }
diff --git a/lib/model/CInterimBucketCorrector.cc b/lib/model/CInterimBucketCorrector.cc
index 4c400118bc..d4020e6b4f 100644
--- a/lib/model/CInterimBucketCorrector.cc
+++ b/lib/model/CInterimBucketCorrector.cc
@@ -6,14 +6,18 @@
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+
 namespace ml {
 namespace model {
 namespace {
@@ -21,13 +25,13 @@
 const std::size_t COMPONENT_SIZE(24);
 const std::string COUNT_TREND_TAG("a");
 const std::string COUNT_MEAN_TAG("b");
-double meanDecayRate(core_t::TTime bucketLength) {
+double decayRate(core_t::TTime bucketLength) {
     return CAnomalyDetectorModelConfig::DEFAULT_DECAY_RATE *
            CAnomalyDetectorModelConfig::bucketNormalizationFactor(bucketLength);
 }
 double trendDecayRate(core_t::TTime bucketLength) {
-    return CAnomalyDetectorModelConfig::trendDecayRate(meanDecayRate(bucketLength), bucketLength);
+    return CAnomalyDetectorModelConfig::trendDecayRate(decayRate(bucketLength), bucketLength);
 }
 }
@@ -50,26 +54,21 @@ void CInterimBucketCorrector::update(core_t::TTime time, std::size_t bucketCount
     m_CountTrend.addPoint(bucketMidPoint, static_cast(bucketCount));
-    double alpha = std::exp(-meanDecayRate(m_BucketLength));
+    double alpha = std::exp(-decayRate(m_BucketLength));
     m_CountMean.age(alpha);
     m_CountMean.add(bucketCount);
 }
 double CInterimBucketCorrector::estimateBucketCompleteness(core_t::TTime time,
                                                            std::size_t currentCount) const {
-    double baselineCount = 0.0;
     core_t::TTime bucketMidPoint = this->calcBucketMidPoint(time);
-    if (m_CountTrend.initialized()) {
-        baselineCount = maths::CBasicStatistics::mean(m_CountTrend.baseline(bucketMidPoint));
-    } else {
-        baselineCount = maths::CBasicStatistics::mean(m_CountMean);
-    }
-
-    if (baselineCount == 0.0) {
-        return 1.0;
-    }
-    return maths::CTools::truncate(static_cast(currentCount) / baselineCount,
-                                   0.0, 1.0);
+    double bucketCount = m_CountTrend.initialized()
                             ? maths::CBasicStatistics::mean(m_CountTrend.value(bucketMidPoint))
                             : maths::CBasicStatistics::mean(m_CountMean);
+    return bucketCount > 0.0
+               ?
 maths::CTools::truncate(
+                     static_cast(currentCount) / bucketCount, 0.0, 1.0)
+               : 1.0;
 }
 double CInterimBucketCorrector::corrections(core_t::TTime time,
@@ -115,15 +114,15 @@ bool CInterimBucketCorrector::acceptRestoreTraverser(core::CStateRestoreTraverse
     do {
         const std::string& name = traverser.name();
         if (name == COUNT_TREND_TAG) {
-            maths::CTimeSeriesDecomposition restored(
-                trendDecayRate(m_BucketLength), m_BucketLength, COMPONENT_SIZE, traverser);
+            maths::SDistributionRestoreParams changeModelParams{
+                maths_t::E_ContinuousData, decayRate(m_BucketLength)};
+            maths::STimeSeriesDecompositionRestoreParams params{
+                trendDecayRate(m_BucketLength), m_BucketLength, COMPONENT_SIZE, changeModelParams};
+            maths::CTimeSeriesDecomposition restored(params, traverser);
             m_CountTrend.swap(restored);
-        } else if (name == COUNT_MEAN_TAG) {
-            if (m_CountMean.fromDelimited(traverser.value()) == false) {
-                LOG_ERROR(<< "Invalid count mean in " << traverser.value());
-                return false;
-            }
+            continue;
         }
+        RESTORE(COUNT_MEAN_TAG, m_CountMean.fromDelimited(traverser.value()))
     } while (traverser.next());
     return true;
 }
diff --git a/lib/model/CMetricPopulationModel.cc b/lib/model/CMetricPopulationModel.cc
index c4281fcc87..ac232bf8f5 100644
--- a/lib/model/CMetricPopulationModel.cc
+++ b/lib/model/CMetricPopulationModel.cc
@@ -673,14 +673,14 @@ uint64_t CMetricPopulationModel::checksum(bool includeCurrentBucketStats) const
     }
     for (const auto& feature : m_FeatureCorrelatesModels) {
-        for (const auto& prior : feature.s_Models->correlatePriors()) {
-            std::size_t cids[]{prior.first.first, prior.first.second};
+        for (const auto& model : feature.s_Models->correlationModels()) {
+            std::size_t cids[]{model.first.first, model.first.second};
             if (gatherer.isAttributeActive(cids[0]) && gatherer.isAttributeActive(cids[1])) {
                 uint64_t& hash = hashes[{boost::cref(gatherer.attributeName(cids[0])),
                                          boost::cref(gatherer.attributeName(cids[1]))}];
-                hash = maths::CChecksum::calculate(hash, prior.second);
+                hash = maths::CChecksum::calculate(hash, model.second);
             }
         }
     }
@@ -817,10 +817,12 @@ void CMetricPopulationModel::updateRecycledModels() {
     CDataGatherer& gatherer = this->dataGatherer();
     for (auto cid : gatherer.recycledAttributeIds()) {
         for (auto& feature : m_FeatureModels) {
-            feature.s_Models[cid].reset(feature.s_NewModel->clone(cid));
-            for (const auto& correlates : m_FeatureCorrelatesModels) {
-                if (feature.s_Feature == correlates.s_Feature) {
-                    feature.s_Models.back()->modelCorrelations(*correlates.s_Models);
+            if (cid < feature.s_Models.size()) {
+                feature.s_Models[cid].reset(feature.s_NewModel->clone(cid));
+                for (const auto& correlates : m_FeatureCorrelatesModels) {
+                    if (feature.s_Feature == correlates.s_Feature) {
+                        feature.s_Models.back()->modelCorrelations(*correlates.s_Models);
+                    }
                 }
             }
         }
@@ -849,10 +851,12 @@ void CMetricPopulationModel::clearPrunedResources(const TSizeVec& /*people*/,
     CDataGatherer& gatherer = this->dataGatherer();
     for (auto cid : gatherer.recycledAttributeIds()) {
         for (auto& feature : m_FeatureModels) {
-            feature.s_Models[cid].reset(feature.s_NewModel->clone(cid));
-            for (const auto& correlates : m_FeatureCorrelatesModels) {
-                if (feature.s_Feature == correlates.s_Feature) {
-                    feature.s_Models.back()->modelCorrelations(*correlates.s_Models);
+            if (cid < feature.s_Models.size()) {
+                feature.s_Models[cid].reset(feature.s_NewModel->clone(cid));
+                for (const auto& correlates : m_FeatureCorrelatesModels) {
+                    if (feature.s_Feature == correlates.s_Feature) {
+                        feature.s_Models.back()->modelCorrelations(*correlates.s_Models);
+                    }
                 }
             }
         }
diff --git a/lib/model/CModelFactory.cc b/lib/model/CModelFactory.cc
index 664f1e0c97..9b399fb7e4 100644
--- a/lib/model/CModelFactory.cc
+++ b/lib/model/CModelFactory.cc
@@ -71,8 +71,12 @@ CModelFactory::defaultFeatureModel(model_t::EFeature feature,
     using TDecayRateController2Ary = boost::array;
-    maths::CModelParams params{bucketLength, m_ModelParams.s_LearnRate,
-                               m_ModelParams.s_DecayRate, minimumSeasonalVarianceScale};
+    maths::CModelParams params{bucketLength,
+                               m_ModelParams.s_LearnRate,
+                               m_ModelParams.s_DecayRate,
+                               minimumSeasonalVarianceScale,
+                               m_ModelParams.s_MinimumTimeToDetectChange,
+                               m_ModelParams.s_MaximumTimeToTestForChange};
     std::size_t dimension{model_t::dimension(feature)};
@@ -238,10 +242,6 @@ void CModelFactory::pruneWindowScaleMaximum(double factor) {
     m_ModelParams.s_PruneWindowScaleMaximum = factor;
 }
-void CModelFactory::totalProbabilityCalcSamplingSize(std::size_t samplingSize) {
-    m_ModelParams.s_TotalProbabilityCalcSamplingSize = samplingSize;
-}
-
 void CModelFactory::multivariateByFields(bool enabled) {
     m_ModelParams.s_MultivariateByFields = enabled;
 }
diff --git a/lib/model/CModelParams.cc b/lib/model/CModelParams.cc
index 1888d854cd..4928aae49a 100644
--- a/lib/model/CModelParams.cc
+++ b/lib/model/CModelParams.cc
@@ -35,11 +35,11 @@ SModelParams::SModelParams(core_t::TTime bucketLength)
       s_MinimumModeCount(CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT),
       s_CutoffToModelEmptyBuckets(CAnomalyDetectorModelConfig::DEFAULT_CUTOFF_TO_MODEL_EMPTY_BUCKETS),
       s_ComponentSize(CAnomalyDetectorModelConfig::DEFAULT_COMPONENT_SIZE),
+      s_MinimumTimeToDetectChange(CAnomalyDetectorModelConfig::DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE),
+      s_MaximumTimeToTestForChange(CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE),
       s_ExcludeFrequent(model_t::E_XF_None), s_ExcludePersonFrequency(0.1),
       s_ExcludeAttributeFrequency(0.1),
       s_MaximumUpdatesPerBucket(CAnomalyDetectorModelConfig::DEFAULT_MAXIMUM_UPDATES_PER_BUCKET),
-      s_TotalProbabilityCalcSamplingSize(
-          CAnomalyDetectorModelConfig::DEFAULT_TOTAL_PROBABILITY_CALC_SAMPLING_SIZE),
       s_InfluenceCutoff(CAnomalyDetectorModelConfig::DEFAULT_INFLUENCE_CUTOFF),
       s_LatencyBuckets(CAnomalyDetectorModelConfig::DEFAULT_LATENCY_BUCKETS),
       s_SampleCountFactor(CAnomalyDetectorModelConfig::DEFAULT_SAMPLE_COUNT_FACTOR_NO_LATENCY),
@@ -70,20 +70,33 @@ double SModelParams::minimumCategoryCount() const {
     return s_LearnRate * CAnomalyDetectorModelConfig::DEFAULT_CATEGORY_DELETE_FRACTION;
 }
+maths::STimeSeriesDecompositionRestoreParams
+SModelParams::decompositionRestoreParams(maths_t::EDataType dataType) const {
+    double decayRate{CAnomalyDetectorModelConfig::trendDecayRate(s_DecayRate, s_BucketLength)};
+    return {decayRate, s_BucketLength, s_ComponentSize,
+            this->distributionRestoreParams(dataType)};
+}
+
 maths::SDistributionRestoreParams
 SModelParams::distributionRestoreParams(maths_t::EDataType dataType) const {
-    return maths::SDistributionRestoreParams(dataType, s_DecayRate,
-                                             s_MinimumModeFraction, s_MinimumModeCount,
-                                             this->minimumCategoryCount());
+    return {dataType, s_DecayRate, s_MinimumModeFraction, s_MinimumModeCount,
+            this->minimumCategoryCount()};
 }
 uint64_t SModelParams::checksum(uint64_t seed) const {
     seed = maths::CChecksum::calculate(seed, s_LearnRate);
     seed = maths::CChecksum::calculate(seed, s_DecayRate);
     seed = maths::CChecksum::calculate(seed, s_InitialDecayRateMultiplier);
     seed = maths::CChecksum::calculate(seed,
                                        s_MinimumModeFraction);
+    seed = maths::CChecksum::calculate(seed, s_MinimumModeCount);
+    seed = maths::CChecksum::calculate(seed, s_CutoffToModelEmptyBuckets);
+    seed = maths::CChecksum::calculate(seed, s_ComponentSize);
+    seed = maths::CChecksum::calculate(seed, s_MinimumTimeToDetectChange);
+    seed = maths::CChecksum::calculate(seed, s_MaximumTimeToTestForChange);
     seed = maths::CChecksum::calculate(seed, s_ExcludeFrequent);
+    seed = maths::CChecksum::calculate(seed, s_ExcludePersonFrequency);
+    seed = maths::CChecksum::calculate(seed, s_ExcludeAttributeFrequency);
     seed = maths::CChecksum::calculate(seed, s_MaximumUpdatesPerBucket);
-    seed = maths::CChecksum::calculate(seed, s_TotalProbabilityCalcSamplingSize);
     seed = maths::CChecksum::calculate(seed, s_InfluenceCutoff);
     seed = maths::CChecksum::calculate(seed, s_LatencyBuckets);
     seed = maths::CChecksum::calculate(seed, s_SampleCountFactor);
@@ -93,7 +106,10 @@ uint64_t SModelParams::checksum(uint64_t seed) const {
     seed = maths::CChecksum::calculate(seed, s_CorrelationModelsOverhead);
     seed = maths::CChecksum::calculate(seed, s_MultivariateByFields);
     seed = maths::CChecksum::calculate(seed, s_MinimumSignificantCorrelation);
-    return maths::CChecksum::calculate(seed, s_MinimumToFuzzyDeduplicate);
+    //seed = maths::CChecksum::calculate(seed, s_DetectionRules);
+    //seed = maths::CChecksum::calculate(seed, s_ScheduledEvents);
+    seed = maths::CChecksum::calculate(seed, s_MinimumToFuzzyDeduplicate);
+    return maths::CChecksum::calculate(seed, s_SamplingAgeCutoff);
 }
 }
 }
diff --git a/lib/model/unittest/CEventRateModelTest.cc b/lib/model/unittest/CEventRateModelTest.cc
index e91ddcd5e0..c62d9dc941 100644
--- a/lib/model/unittest/CEventRateModelTest.cc
+++ b/lib/model/unittest/CEventRateModelTest.cc
@@ -92,18 +92,17 @@ void generateEvents(const core_t::TTime& startTime,
     // Generate an ordered collection of event arrival times.
     test::CRandomNumbers rng;
     double bucketStartTime = static_cast(startTime);
-    for (std::size_t i = 0u; i < eventCountsPerBucket.size(); ++i) {
+    for (auto count : eventCountsPerBucket) {
         double bucketEndTime = bucketStartTime + static_cast(bucketLength);
         TDoubleVec bucketEventTimes;
         rng.generateUniformSamples(bucketStartTime, bucketEndTime - 1.0,
-                                   static_cast(eventCountsPerBucket[i]),
-                                   bucketEventTimes);
+                                   static_cast(count), bucketEventTimes);
         std::sort(bucketEventTimes.begin(), bucketEventTimes.end());
-        for (std::size_t j = 0u; j < bucketEventTimes.size(); ++j) {
-            core_t::TTime time = static_cast(bucketEventTimes[j]);
+        for (auto time_ : bucketEventTimes) {
+            core_t::TTime time = static_cast(time_);
             time = std::min(static_cast(bucketEndTime - 1.0),
                             std::max(static_cast(bucketStartTime), time));
             eventArrivalTimes.push_back(time);
@@ -120,37 +119,28 @@ void generateSporadicEvents(const core_t::TTime& startTime,
     // Generate an ordered collection of event arrival times.
     test::CRandomNumbers rng;
     double bucketStartTime = static_cast(startTime);
-    for (std::size_t i = 0u; i < nonZeroEventCountsPerBucket.size(); ++i) {
+    for (auto count : nonZeroEventCountsPerBucket) {
         double bucketEndTime = bucketStartTime + static_cast(bucketLength);
         TDoubleVec bucketEventTimes;
-        rng.generateUniformSamples(
-            bucketStartTime, bucketEndTime - 1.0,
-            static_cast(nonZeroEventCountsPerBucket[i]), bucketEventTimes);
+        rng.generateUniformSamples(bucketStartTime, bucketEndTime - 1.0,
+                                   static_cast(count), bucketEventTimes);
         std::sort(bucketEventTimes.begin(), bucketEventTimes.end());
-        for (std::size_t j = 0u; j < bucketEventTimes.size(); ++j) {
-            core_t::TTime time = static_cast(bucketEventTimes[j]);
+        for (auto time_ : bucketEventTimes) {
+            core_t::TTime time = static_cast(time_);
             time = std::min(static_cast(bucketEndTime - 1.0),
                             std::max(static_cast(bucketStartTime), time));
             eventArrivalTimes.push_back(time);
         }
         TDoubleVec gap;
-        rng.generateUniformSamples(0.0, 10.0 * static_cast(bucketLength), 1u, gap);
-        bucketStartTime += static_cast(bucketLength) *
-                           std::ceil(gap[0] / static_cast(bucketLength));
+        rng.generateUniformSamples(0.0, 10.0, 1u, gap);
+        bucketStartTime += static_cast(bucketLength) * std::ceil(gap[0]);
     }
 }
-class CTimeLess {
-public:
-    bool operator()(const CEventData& lhs, const CEventData& rhs) const {
-        return lhs.time() < rhs.time();
-    }
-};
-
 std::size_t addPerson(const std::string& p,
                       const CModelFactory::TDataGathererPtr& gatherer,
                       CResourceMonitor& resourceMonitor) {
@@ -169,7 +159,7 @@ std::size_t addPersonWithInfluence(const std::string& p,
     std::string i("i");
     CDataGatherer::TStrCPtrVec person;
     person.push_back(&p);
-    for (std::size_t j = 0; j < numInfluencers; j++) {
+    for (std::size_t j = 0; j < numInfluencers; ++j) {
         person.push_back(&i);
     }
     if (value) {
@@ -546,7 +536,7 @@ void CEventRateModelTest::testOnlineRare() {
     LOG_TRACE(<< "origXml = " << origXml);
     LOG_DEBUG(<< "size = " << origXml.size());
-    CPPUNIT_ASSERT(origXml.size() < 21000);
+    CPPUNIT_ASSERT(origXml.size() < 22000);
     // Restore the XML into a new filter
     core::CRapidXmlParser parser;
@@ -1119,7 +1109,9 @@ void CEventRateModelTest::testPrune() {
             }
         }
     }
-    std::sort(events.begin(), events.end(), CTimeLess());
+    std::sort(events.begin(), events.end(), [](const CEventData& lhs, const CEventData& rhs) {
+        return lhs.time() < rhs.time();
+    });
     TEventDataVec expectedEvents;
     expectedEvents.reserve(events.size());
@@ -1303,11 +1295,8 @@ void CEventRateModelTest::testModelsWithValueFields() {
     strings.push_back("p1");
     strings.push_back("c1");
     strings.push_back("c2");
-    strings.push_back("trwh5jks9djadkn453hgfadadfjhadhfkdhakj4hkahdlagl4iuy"
-                      "galshkdjbvlaus4hliu4WHGFLIUSDHLKAJ");
-    strings.push_back("2H4G55HALFMN569DNIVJ55B3BSJXU;4VBQ-"
-                      "LKDFNUE9HNV904U5QGA;DDFLVJKF95NSD,MMVASD.,A.4,A."
-                      "SD4");
+    strings.push_back("trwh5jks9djadkn453hgfadadfjhadhfkdhakj4hkahdlagl4iuygalshkdjbvlaus4hliu4WHGFLIUSDHLKAJ");
+    strings.push_back("2H4G55HALFMN569DNIVJ55B3BSJXU;4VBQ-LKDFNUE9HNV904U5QGA;DDFLVJKF95NSD,MMVASD.,A.4,A.SD4");
     strings.push_back("a");
     strings.push_back("b");
@@ -2190,8 +2179,7 @@ void CEventRateModelTest::testSkipSampling() {
     addArrival(*gathererWithGap, m_ResourceMonitor, 200, "p1");
     addArrival(*gathererWithGap, m_ResourceMonitor, 200, "p2");
     modelWithGap->skipSampling(1000);
-    LOG_DEBUG(<< "Calling sample over skipped interval should do nothing "
-                 "except print some ERRORs");
+    LOG_DEBUG(<< "Calling sample over skipped interval should do nothing except print some ERRORs");
     modelWithGap->sample(200, 1000, m_ResourceMonitor);
     // Check prune does not remove people because last seen times are updated by adding gap duration
@@ -2207,20 +2195,20 @@ void CEventRateModelTest::testSkipSampling() {
     CPPUNIT_ASSERT_EQUAL(
         static_cast(
             modelWithGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum(),
         static_cast(
            modelNoGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum());
     CPPUNIT_ASSERT_EQUAL(
         static_cast(
            modelWithGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 1))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelNoGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 1))
-            ->prior()
+            ->residualModel()
            .checksum());
     // Confirm last seen times are only updated by gap duration by forcing p2 to be pruned
@@ -2318,20 +2306,20 @@ void CEventRateModelTest::testExplicitNulls() {
     CPPUNIT_ASSERT_EQUAL(
         static_cast(
            modelExNullGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelSkipGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum());
     CPPUNIT_ASSERT_EQUAL(
        static_cast(
            modelExNullGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 1))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelSkipGap->details()->model(model_t::E_IndividualCountByBucketAndPerson, 1))
-            ->prior()
+            ->residualModel()
            .checksum());
 }
@@ -2723,10 +2711,11 @@ void CEventRateModelTest::testDecayRateControl() {
         maths::CBasicStatistics::mean(meanPredictionError), 0.05);
     }
-    LOG_DEBUG(<< "*** Test step change ***");
+    LOG_DEBUG(<< "*** Test linear scaling ***");
     {
-        // Test a step change in a stable signal is detected and we get a
-        // significant reduction in the prediction error.
+        // This change point is amongst those we explicitly detect so
+        // check we get similar detection performance with and without
+        // decay rate control.
         params.s_ControlDecayRate = true;
         params.s_DecayRate = 0.001;
@@ -2779,8 +2768,9 @@ void CEventRateModelTest::testDecayRateControl() {
         LOG_DEBUG(<< "mean = " << maths::CBasicStatistics::mean(meanPredictionError));
         LOG_DEBUG(<< "reference = "
                   << maths::CBasicStatistics::mean(meanReferencePredictionError));
-        CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanPredictionError) <
-                       0.94 * maths::CBasicStatistics::mean(meanReferencePredictionError));
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(
+            maths::CBasicStatistics::mean(meanReferencePredictionError),
+            maths::CBasicStatistics::mean(meanPredictionError), 0.05);
     }
     LOG_DEBUG(<< "*** Test unmodelled cyclic component ***");
@@ -2788,7 +2778,8 @@
         // This modulates the event rate using a sine with period 10 weeks
         // effectively there are significant "manoeuvres" in the event rate
         // every 5 weeks at the function turning points. We check we get a
-        // significant reduction in the prediction error.
+        // significant reduction in the prediction error with decay rate
+        // control.
         params.s_ControlDecayRate = true;
         params.s_DecayRate = 0.001;
@@ -2944,12 +2935,12 @@ void CEventRateModelTest::testIgnoreSamplingGivenDetectionRules() {
     uint64_t withSkipChecksum =
         static_cast(
             modelWithSkipView->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum();
     uint64_t noSkipChecksum =
        static_cast(
            modelNoSkipView->model(model_t::E_IndividualCountByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum();
     CPPUNIT_ASSERT_EQUAL(withSkipChecksum, noSkipChecksum);
@@ -2959,7 +2950,7 @@ void CEventRateModelTest::testIgnoreSamplingGivenDetectionRules() {
         modelNoSkipView->model(model_t::E_IndividualCountByBucketAndPerson, 0));
     CPPUNIT_ASSERT(timeSeriesModel);
-    core_t::TTime time = timeSeriesModel->trend().lastValueTime();
+    core_t::TTime time = timeSeriesModel->trendModel().lastValueTime();
     CPPUNIT_ASSERT_EQUAL(model_t::sampleTime(model_t::E_IndividualCountByBucketAndPerson,
                                              startTime, bucketLength),
                          time);
@@ -2967,7 +2958,7 @@
     // The last times of model with a skip should be the same
     timeSeriesModel = dynamic_cast(
         modelWithSkipView->model(model_t::E_IndividualCountByBucketAndPerson, 0));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
 }
 CppUnit::Test* CEventRateModelTest::suite() {
diff --git a/lib/model/unittest/CEventRatePopulationModelTest.cc b/lib/model/unittest/CEventRatePopulationModelTest.cc
index ef3b9613df..0f0b2980a9 100644
--- a/lib/model/unittest/CEventRatePopulationModelTest.cc
+++ b/lib/model/unittest/CEventRatePopulationModelTest.cc
@@ -1137,8 +1137,7 @@ void CEventRatePopulationModelTest::testSkipSampling() {
     modelWithGap->sample(100, 200, m_ResourceMonitor);
     addArrival(SMessage(200, "p1", "a1"), gathererWithGap, m_ResourceMonitor);
     modelWithGap->skipSampling(1000);
-    LOG_DEBUG(<< "Calling sample over skipped interval should do nothing "
-                 "except print some ERRORs");
+    LOG_DEBUG(<< "Calling sample over skipped interval should do nothing except print some ERRORs");
     modelWithGap->sample(200, 1000, m_ResourceMonitor);
     // Check prune does not remove people because last seen times are updated by adding gap duration
@@ -1155,20 +1154,20 @@ void CEventRatePopulationModelTest::testSkipSampling() {
     CPPUNIT_ASSERT_EQUAL(
         static_cast(
             modelWithGap->details()->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 0))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelNoGap->details()->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 0))
-            ->prior()
+            ->residualModel()
            .checksum());
     CPPUNIT_ASSERT_EQUAL(
        static_cast(
            modelWithGap->details()->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 1))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelNoGap->details()->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 1))
-            ->prior()
+            ->residualModel()
            .checksum());
     // Confirm last seen times are only updated by gap duration by forcing p2 and a2 to be pruned
@@ -1451,7 +1450,7 @@ void CEventRatePopulationModelTest::testIgnoreSamplingGivenDetectionRules() {
         model_t::E_PopulationCountByBucketPersonAndAttribute, 0));
     CPPUNIT_ASSERT(timeSeriesModel);
-    core_t::TTime time = timeSeriesModel->trend().lastValueTime();
+    core_t::TTime time = timeSeriesModel->trendModel().lastValueTime();
     CPPUNIT_ASSERT_EQUAL(model_t::sampleTime(model_t::E_PopulationCountByBucketPersonAndAttribute,
                                              200, bucketLength),
                          time);
@@ -1459,23 +1458,23 @@
     // The last times of the underlying time series models should all be the same
     timeSeriesModel = dynamic_cast(
         modelNoSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 1));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
     timeSeriesModel = dynamic_cast(
         modelNoSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 2));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
     timeSeriesModel = dynamic_cast(
         modelWithSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 0));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
     timeSeriesModel = dynamic_cast(
         modelWithSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 1));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
     timeSeriesModel = dynamic_cast(
         modelWithSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 2));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
     timeSeriesModel = dynamic_cast(
         modelWithSkipView->model(model_t::E_PopulationCountByBucketPersonAndAttribute, 3));
-    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trend().lastValueTime());
+    CPPUNIT_ASSERT_EQUAL(time, timeSeriesModel->trendModel().lastValueTime());
 }
 CppUnit::Test* CEventRatePopulationModelTest::suite() {
diff --git a/lib/model/unittest/CHierarchicalResultsTest.cc b/lib/model/unittest/CHierarchicalResultsTest.cc
index c07237173a..2981588303 100644
--- a/lib/model/unittest/CHierarchicalResultsTest.cc
+++ b/lib/model/unittest/CHierarchicalResultsTest.cc
@@ -235,8 +235,8 @@ class CCheckScores : public model::CHierarchicalResultsVisitor {
                        const TNode& node,
                        bool /*pivot*/) {
         LOG_DEBUG(<< node.s_Spec.print() << " score = " << node.s_RawAnomalyScore << ", expected score = "
-                  << maths::CTools::deviation(node.probability()));
-        CPPUNIT_ASSERT_DOUBLES_EQUAL(maths::CTools::deviation(node.probability()),
+                  << maths::CTools::anomalyScore(node.probability()));
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(maths::CTools::anomalyScore(node.probability()),
                                      node.s_RawAnomalyScore, 1e-10);
     }
 };
@@ -1608,7 +1608,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         itr->second->updateQuantiles(score);
     }
     for (std::size_t j = 0u; j < extract.leafNodes().size(); ++j) {
@@ -1620,7 +1620,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         normalized.push_back(extract.leafNodes()[j]->s_NormalizedAnomalyScore);
         CPPUNIT_ASSERT(itr->second->normalize(score));
         expectedNormalized.push_back(score);
@@ -1649,7 +1649,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         itr->second->updateQuantiles(score);
     }
     for (std::size_t j = 0u; j < extract.personNodes().size(); ++j) {
@@ -1661,7 +1661,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         normalized.push_back(extract.personNodes()[j]->s_NormalizedAnomalyScore);
         CPPUNIT_ASSERT(itr->second->normalize(score));
         expectedNormalized.push_back(score);
@@ -1689,7 +1689,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         itr->second->updateQuantiles(score);
     }
     for (std::size_t j = 0u; j < extract.partitionNodes().size(); ++j) {
@@ -1700,7 +1700,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ? 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         normalized.push_back(extract.partitionNodes()[j]->s_NormalizedAnomalyScore);
         CPPUNIT_ASSERT(itr->second->normalize(score));
         expectedNormalized.push_back(score);
@@ -1717,7 +1717,7 @@ void CHierarchicalResultsTest::testNormalizer() {
         // This truncation condition needs to be kept the same as the one in CHierarchicalResultsNormalizer::visit()
         double score = probability > modelConfig.maximumAnomalousProbability()
                            ?
 0.0
-                           : maths::CTools::deviation(probability);
+                           : maths::CTools::anomalyScore(probability);
         expectedNormalizers.find(std::string("r"))->second->updateQuantiles(score);
         expectedNormalizers.find(std::string("r"))->second->normalize(score);
diff --git a/lib/model/unittest/CMetricAnomalyDetectorTest.cc b/lib/model/unittest/CMetricAnomalyDetectorTest.cc
index f52acf8135..4e35e57317 100644
--- a/lib/model/unittest/CMetricAnomalyDetectorTest.cc
+++ b/lib/model/unittest/CMetricAnomalyDetectorTest.cc
@@ -301,7 +301,7 @@ void CMetricAnomalyDetectorTest::testAnomalies() {
         double noise = std::accumulate(anomalyFactors.begin(), anomalyFactors.end(), 0.0);
         LOG_DEBUG(<< "S/N = " << (signal / noise));
-        CPPUNIT_ASSERT(signal / noise > 100.0);
+        CPPUNIT_ASSERT(signal / noise > 90.0);
     }
     // Find the high/low rate partition point.
@@ -414,7 +414,7 @@ void CMetricAnomalyDetectorTest::testExcludeFrequent() {
         // expect there to be 2 anomalies
         CPPUNIT_ASSERT_EQUAL(std::size_t(2), highAnomalyTimes.size());
-        CPPUNIT_ASSERT_DOUBLES_EQUAL(92.0, highAnomalyFactors[1], 0.5);
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(99.0, highAnomalyFactors[1], 0.5);
     }
     {
         model::CAnomalyDetectorModelConfig modelConfig =
@@ -441,7 +441,7 @@ void CMetricAnomalyDetectorTest::testExcludeFrequent() {
         // expect there to be 1 anomaly
         CPPUNIT_ASSERT_EQUAL(std::size_t(1), highAnomalyTimes.size());
-        CPPUNIT_ASSERT_DOUBLES_EQUAL(23.0, highAnomalyFactors[0], 0.4);
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(24.0, highAnomalyFactors[0], 0.5);
     }
 }
diff --git a/lib/model/unittest/CMetricModelTest.cc b/lib/model/unittest/CMetricModelTest.cc
index 0ca51abed6..814e6f07cb 100644
--- a/lib/model/unittest/CMetricModelTest.cc
+++ b/lib/model/unittest/CMetricModelTest.cc
@@ -289,9 +289,9 @@ void CMetricModelTest::testSample() {
                              TTimeDoublePr(61, 1.3), TTimeDoublePr(62, 1.6),
                              TTimeDoublePr(65, 1.7), TTimeDoublePr(66, 1.33),
                              TTimeDoublePr(68, 1.5), TTimeDoublePr(84, 1.58),
-                             TTimeDoublePr(87, 1.99), TTimeDoublePr(157, 1.6),
+                             TTimeDoublePr(87, 1.69), TTimeDoublePr(157, 1.6),
                              TTimeDoublePr(164, 1.66), TTimeDoublePr(199, 1.28),
-                             TTimeDoublePr(202, 1.0), TTimeDoublePr(204, 1.5)};
+                             TTimeDoublePr(202, 1.2), TTimeDoublePr(204, 1.5)};
     unsigned int sampleCounts[] = {2, 1};
     unsigned int expectedSampleCounts[] = {2, 1};
@@ -540,9 +540,9 @@ void CMetricModelTest::testMultivariateSample() {
     double data_[][3] = {{49, 1.5, 1.1}, {60, 1.3, 1.2}, {61, 1.3, 2.1},
                          {62, 1.6, 1.5}, {65, 1.7, 1.4}, {66, 1.33, 1.6},
-                         {68, 1.5, 1.37}, {84, 1.58, 1.42}, {87, 1.99, 2.2},
+                         {68, 1.5, 1.37}, {84, 1.58, 1.42}, {87, 1.6, 1.6},
                          {157, 1.6, 1.6}, {164, 1.66, 1.55}, {199, 1.28, 1.4},
-                         {202, 1.0, 0.7}, {204, 1.5, 1.8}};
+                         {202, 1.3, 1.1}, {204, 1.5, 1.8}};
     TTimeDouble2AryPrVec data;
     for (std::size_t i = 0u; i < boost::size(data_); ++i) {
         boost::array value = {{data_[i][1], data_[i][2]}};
@@ -633,13 +633,12 @@ void CMetricModelTest::testMultivariateSample() {
         const auto& prior =
             dynamic_cast(
                 model.details()->model(model_t::E_IndividualMeanLatLongByPerson, 0))
-                ->prior();
+                ->residualModel();
         LOG_DEBUG(<< "bucket count = " << core::CContainerPrinter::print(count));
-        LOG_DEBUG(<< "current = " << core::CContainerPrinter::print(bucketLatLong));
-        LOG_DEBUG(<< "expected baseline = "
-                  << maths::CBasicStatistics::mean(expectedBaselineLatLong));
-        LOG_DEBUG(<< "actual baseline = "
+        LOG_DEBUG(<< "current = " << core::CContainerPrinter::print(bucketLatLong)
+                  << "expected baseline = "
+                  << maths::CBasicStatistics::mean(expectedBaselineLatLong) << "actual baseline = "
+                  <<
 core::CContainerPrinter::print(baselineLatLong));
         CPPUNIT_ASSERT(count);
@@ -656,6 +655,7 @@ void CMetricModelTest::testMultivariateSample() {
                 TVector2(baselineLatLong) -
                 maths::CBasicStatistics::mean(expectedBaselineLatLong)));
         }
+        CPPUNIT_ASSERT_EQUAL(latLong, featureLatLong);
         CPPUNIT_ASSERT_EQUAL(expectedPrior->checksum(), prior.checksum());
@@ -1622,11 +1622,11 @@ void CMetricModelTest::testSkipSampling() {
     CPPUNIT_ASSERT_EQUAL(
         static_cast(
             modelNoGap.details()->model(model_t::E_IndividualSumByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelWithGap.details()->model(model_t::E_IndividualSumByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum());
 }
@@ -1717,11 +1717,11 @@ void CMetricModelTest::testExplicitNulls() {
     CPPUNIT_ASSERT_EQUAL(
        static_cast(
            modelSkipGap.details()->model(model_t::E_IndividualSumByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum(),
        static_cast(
            modelExNullGap.details()->model(model_t::E_IndividualSumByBucketAndPerson, 0))
-            ->prior()
+            ->residualModel()
            .checksum());
 }
@@ -2247,13 +2247,14 @@ void CMetricModelTest::testDecayRateControl() {
                   << maths::CBasicStatistics::mean(meanReferencePredictionError));
         CPPUNIT_ASSERT_DOUBLES_EQUAL(
             maths::CBasicStatistics::mean(meanReferencePredictionError),
-            maths::CBasicStatistics::mean(meanPredictionError), 0.06);
+            maths::CBasicStatistics::mean(meanPredictionError), 0.05);
     }
     LOG_DEBUG(<< "*** Test step change ***");
     {
-        // Test a step change in a stable signal is detected and we get a
-        // significant reduction in the prediction error.
+        // This change point is amongst those we explicitly detect so
+        // check we get similar detection performance with and without
+        // decay rate control.
         params.s_ControlDecayRate = true;
         params.s_DecayRate = 0.001;
@@ -2301,11 +2302,11 @@ void CMetricModelTest::testDecayRateControl() {
                 referenceModel->baselineBucketMean(feature, 0, 0, type, NO_CORRELATES,
                                                    t + bucketLength / 2)[0]));
         }
-        LOG_DEBUG(<< "mean = " << maths::CBasicStatistics::mean(meanPredictionError));
-        LOG_DEBUG(<< "reference = "
-                  << maths::CBasicStatistics::mean(meanReferencePredictionError));
-        CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanPredictionError) <
-                       0.94 * maths::CBasicStatistics::mean(meanReferencePredictionError));
+        LOG_DEBUG("mean = " << maths::CBasicStatistics::mean(meanPredictionError));
+        LOG_DEBUG("reference = " << maths::CBasicStatistics::mean(meanReferencePredictionError));
+        CPPUNIT_ASSERT_DOUBLES_EQUAL(
+            maths::CBasicStatistics::mean(meanReferencePredictionError),
+            maths::CBasicStatistics::mean(meanPredictionError), 0.05);
     }
     LOG_DEBUG(<< "*** Test unmodelled cyclic component ***");
@@ -2313,7 +2314,8 @@
     {
         // This modulates the event rate using a sine with period 10 weeks
         // effectively there are significant "manoeuvres" in the event rate
         // every 5 weeks at the function turning points. We check we get a
-        // significant reduction in the prediction error.
+        // significant reduction in the prediction error with decay rate
+        // control.
         params.s_ControlDecayRate = true;
         params.s_DecayRate = 0.001;
diff --git a/lib/model/unittest/CModelDetailsViewTest.cc b/lib/model/unittest/CModelDetailsViewTest.cc
index d60d6a0167..1714acc287 100644
--- a/lib/model/unittest/CModelDetailsViewTest.cc
+++ b/lib/model/unittest/CModelDetailsViewTest.cc
@@ -7,6 +7,7 @@
 #include "CModelDetailsViewTest.h"
 #include
+#include
 #include
 #include
@@ -62,7 +63,8 @@ void CModelDetailsViewTest::testModelPlot() {
     maths::CTimeSeriesDecomposition trend;
     maths::CNormalMeanPrecConjugate prior{
         maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)};
-    maths::CModelParams timeSeriesModelParams{bucketLength, 1.0, 0.001, 0.2};
+    maths::CModelParams timeSeriesModelParams{
+        bucketLength, 1.0, 0.001, 0.2, 6 * core::constants::HOUR, 24 * core::constants::HOUR};
     maths::CUnivariateTimeSeriesModel timeSeriesModel{timeSeriesModelParams, 0, trend, prior};
     model->mockTimeSeriesModels(
diff --git a/lib/model/unittest/CModelToolsTest.cc b/lib/model/unittest/CModelToolsTest.cc
index 4a441e494f..6a60edec0d 100644
--- a/lib/model/unittest/CModelToolsTest.cc
+++ b/lib/model/unittest/CModelToolsTest.cc
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -40,8 +41,12 @@ maths::CModelParams params(core_t::TTime bucketLength) {
     static TTimeDoubleMap learnRates;
     learnRates[bucketLength] = static_cast(bucketLength) / 1800.0;
     double minimumSeasonalVarianceScale{MINIMUM_SEASONAL_SCALE};
-    return maths::CModelParams{bucketLength, learnRates[bucketLength],
-                               DECAY_RATE, minimumSeasonalVarianceScale};
+    return maths::CModelParams{bucketLength,
+                               learnRates[bucketLength],
+                               DECAY_RATE,
+                               minimumSeasonalVarianceScale,
+                               6 * core::constants::HOUR,
+                               24 * core::constants::HOUR};
 }
 maths::CNormalMeanPrecConjugate normal() {
@@ -158,7 +163,6 @@ void CModelToolsTest::testProbabilityCache() {
     // Test the error introduced by caching the probability and that we
     // don't get any errors in the tailness we calculate for the value.
-    using TBool2Vec = core::CSmallVector;
     using TSize1Vec = core::CSmallVector;
     using TTime2Vec = core::CSmallVector;
     using TTime2Vec1Vec = core::CSmallVector;
@@ -224,7 +228,7 @@ void CModelToolsTest::testProbabilityCache() {
         maths::CModelProbabilityParams params;
         params.addCalculation(maths_t::E_TwoSided)
             .seasonalConfidenceInterval(0.0)
-            .addBucketEmpty(TBool2Vec{false})
+            .addBucketEmpty({false})
             .addWeights(weights[0]);
         double expectedProbability;
         TTail2Vec expectedTail;
@@ -266,7 +270,7 @@ void CModelToolsTest::testProbabilityCache() {
         maths::CModelProbabilityParams params;
         params.addCalculation(maths_t::E_TwoSided)
             .seasonalConfidenceInterval(0.0)
-            .addBucketEmpty(TBool2Vec{false})
+            .addBucketEmpty({false})
             .addWeights(weights[0]);
         double expectedProbability;
         TTail2Vec expectedTail;
diff --git a/lib/model/unittest/CProbabilityAndInfluenceCalculatorTest.cc b/lib/model/unittest/CProbabilityAndInfluenceCalculatorTest.cc
index f574a6dfac..b8ece3b15f 100644
--- a/lib/model/unittest/CProbabilityAndInfluenceCalculatorTest.cc
+++ b/lib/model/unittest/CProbabilityAndInfluenceCalculatorTest.cc
@@ -7,6 +7,7 @@
 #include "CProbabilityAndInfluenceCalculatorTest.h"
 #include
+#include
 #include
 #include
@@ -80,7 +81,12 @@ TDouble1VecDouble1VecPr make_pair(double first1, double first2, double second1,
 maths::CModelParams params(core_t::TTime bucketLength) {
     double learnRate{static_cast(bucketLength) / 1800.0};
     double minimumSeasonalVarianceScale{0.4};
-    return maths::CModelParams{bucketLength, learnRate, 0.0, minimumSeasonalVarianceScale};
+    return maths::CModelParams{bucketLength,
+                               learnRate,
+                               0.0,
+                               minimumSeasonalVarianceScale,
+                               6 * core::constants::HOUR,
+                               24 * core::constants::HOUR};
 }
 std::size_t dimension(double) {
diff --git a/lib/test/CTimeSeriesTestData.cc b/lib/test/CTimeSeriesTestData.cc
index dc09550c47..d4419e3d94 100644
--- a/lib/test/CTimeSeriesTestData.cc
+++ b/lib/test/CTimeSeriesTestData.cc
@@ -227,7 +227,7 @@ bool CTimeSeriesTestData::parseLine(const core::CRegex& tokenRegex,
     core::CRegex::TStrVec tokens;
     if (tokenRegex.tokenise(line, tokens) == false) {
-        LOG_ERROR(<< "Regex error '" << tokenRegex.str() << "' " << line);
+        LOG_ERROR(<< "Regex error '" << tokenRegex.str() << "' '" << line << "'");
         return false;
     }