Skip to content

Commit e16816e

Browse files
authored
[ML] Improve adaption of the modelling of cyclic components to very localised features (#134)
1 parent 4dd90fa commit e16816e

20 files changed

+658
-472
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Explicit change point detection and modelling ({pull}92[#92])
3737
Improve partition analysis memory usage ({pull}97[#97])
3838
Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100])
3939
Improve the accuracy of model memory control ({pull}122[#122])
40+
Improve adaption of the modelling of cyclic components to very localised features ({pull}134[#134])
4041

4142
Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing
4243
model state on disk ({pull}89[#89])

include/maths/CAdaptiveBucketing.h

Lines changed: 128 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,9 @@ namespace maths {
6161
//!
6262
//! For sufficiently smooth functions and a given number of buckets
6363
//! the objective is minimized by ensuring that "bucket width" x
64-
//! "function range" is approximately equal in all buckets.
64+
//! "function range" is equal in all buckets.
6565
//!
66-
//! The bucketing is aged by relaxing it back towards uniform and
67-
//! aging the counts of the mean value for each bucket as usual.
66+
//! The bucketing is aged by relaxing it back towards uniform.
6867
class MATHS_EXPORT CAdaptiveBucketing {
6968
public:
7069
using TDoubleVec = std::vector<double>;
@@ -73,26 +72,92 @@ class MATHS_EXPORT CAdaptiveBucketing {
7372
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;
7473

7574
public:
76-
//! Restore by traversing a state document
77-
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
75+
//! Refine the bucket end points to minimize the maximum averaging
76+
//! error in any bucket.
77+
//!
78+
//! \param[in] time The time at which to refine.
79+
void refine(core_t::TTime time);
7880

79-
//! Persist by passing information to the supplied inserter.
80-
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
81+
//! Check if the bucketing has been initialized.
82+
bool initialized() const;
83+
84+
//! Get the number of buckets.
85+
std::size_t size() const;
86+
87+
//! Set the rate at which the bucketing loses information.
88+
void decayRate(double value);
89+
90+
//! Get the rate at which the bucketing loses information.
91+
double decayRate() const;
92+
93+
//! Get the minimum permitted bucket length.
94+
double minimumBucketLength() const;
95+
96+
//! Get the bucket end points.
97+
const TFloatVec& endpoints() const;
98+
99+
//! Get the bucket value centres.
100+
const TFloatVec& centres() const;
101+
102+
//! Get the count of large errors in each bucket.
103+
const TFloatVec& largeErrorCounts() const;
104+
105+
//! Get a set of knot points and knot point values to use for
106+
//! interpolating the bucket values.
107+
//!
108+
//! \param[in] time The time at which to get the knot points.
109+
//! \param[in] boundary Controls the style of start and end knots.
110+
//! \param[out] knots Filled in with the knot points to interpolate.
111+
//! \param[out] values Filled in with the values at \p knots.
112+
//! \param[out] variances Filled in with the variances at \p knots.
113+
//! \return True if there are sufficient knot points to interpolate
114+
//! and false otherwise.
115+
bool knots(core_t::TTime time,
116+
CSplineTypes::EBoundaryCondition boundary,
117+
TDoubleVec& knots,
118+
TDoubleVec& values,
119+
TDoubleVec& variances) const;
120+
121+
//! \name Test Functions
122+
//@{
123+
//! Get the total count in the bucketing.
124+
double count() const;
125+
126+
//! Get the bucket regressions.
127+
TDoubleVec values(core_t::TTime time) const;
128+
129+
//! Get the bucket variances.
130+
TDoubleVec variances() const;
131+
//@}
132+
133+
protected:
134+
using TRestoreFunc = std::function<bool(core::CStateRestoreTraverser&)>;
135+
using TPersistFunc = std::function<void(core::CStatePersistInserter&)>;
136+
137+
protected:
138+
//! The minimum number of standard deviations for an error to be
139+
//! considered large.
140+
static const double LARGE_ERROR_STANDARD_DEVIATIONS;
81141

82142
protected:
83143
CAdaptiveBucketing(double decayRate, double minimumBucketLength);
84-
//! Construct by traversing a state document.
85-
CAdaptiveBucketing(double decayRate,
86-
double minimumBucketLength,
87-
core::CStateRestoreTraverser& traverser);
88144
virtual ~CAdaptiveBucketing() = default;
89145

146+
//! Get the restore function bound to this object.
147+
TRestoreFunc getAcceptRestoreTraverser();
148+
149+
//! Get the accept persist function bound to this object.
150+
TPersistFunc getAcceptPersistInserter() const;
151+
152+
//! Restore by traversing a state document
153+
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
154+
155+
//! Persist by passing information to the supplied inserter.
156+
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
157+
90158
//! Efficiently swap the contents of two bucketing objects.
91159
void swap(CAdaptiveBucketing& other);
92160

93-
//! Check if the bucketing has been initialized.
94-
bool initialized() const;
95-
96161
//! Create a new uniform bucketing with \p n buckets on the
97162
//! interval [\p a, \p b].
98163
//!
@@ -113,75 +178,32 @@ class MATHS_EXPORT CAdaptiveBucketing {
113178
core_t::TTime endTime,
114179
const TFloatMeanAccumulatorVec& values);
115180

116-
//! Get the number of buckets.
117-
std::size_t size() const;
118-
119181
//! Clear the contents of this bucketing and recover any
120182
//! allocated memory.
121183
void clear();
122184

123185
//! Add the function value at \p time.
124186
//!
125187
//! \param[in] bucket The index of the bucket of \p time.
126-
//! \param[in] time The time of \p value.
127-
//! \param[in] weight The weight of function point. The smaller
128-
//! this is the less influence it has on the bucket.
188+
//! \param[in] time The time of the value being added.
189+
//! \param[in] weight The weight of the value being added. The
190+
//! smaller this is the less influence it has on the bucket.
129191
void add(std::size_t bucket, core_t::TTime time, double weight);
130192

131-
//! Set the rate at which the bucketing loses information.
132-
void decayRate(double value);
133-
134-
//! Get the rate at which the bucketing loses information.
135-
double decayRate() const;
193+
//! Add a large error in \p bucket.
194+
void addLargeError(std::size_t bucket, core_t::TTime time);
136195

137196
//! Age the force moments.
138197
void age(double factor);
139198

140-
//! Get the minimum permitted bucket length.
141-
double minimumBucketLength() const;
142-
143-
//! Refine the bucket end points to minimize the maximum averaging
144-
//! error in any bucket.
145-
//!
146-
//! \param[in] time The time at which to refine.
147-
void refine(core_t::TTime time);
148-
149-
//! Get a set of knot points and knot point values to use for
150-
//! interpolating the bucket values.
151-
//!
152-
//! \param[in] time The time at which to get the knot points.
153-
//! \param[in] boundary Controls the style of start and end knots.
154-
//! \param[out] knots Filled in with the knot points to interpolate.
155-
//! \param[out] values Filled in with the values at \p knots.
156-
//! \param[out] variances Filled in with the variances at \p knots.
157-
//! \return True if there are sufficient knot points to interpolate
158-
//! and false otherwise.
159-
bool knots(core_t::TTime time,
160-
CSplineTypes::EBoundaryCondition boundary,
161-
TDoubleVec& knots,
162-
TDoubleVec& values,
163-
TDoubleVec& variances) const;
164-
165-
//! Get the bucket end points.
166-
const TFloatVec& endpoints() const;
167-
168-
//! Get the bucket end points.
169-
TFloatVec& endpoints();
170-
171-
//! Get the bucket value centres.
172-
const TFloatVec& centres() const;
173-
174199
//! Get the bucket value centres.
175200
TFloatVec& centres();
176201

177-
//! Get the total count of in the bucketing.
178-
double count() const;
179-
180-
//! Get the bucket regressions.
181-
TDoubleVec values(core_t::TTime time) const;
202+
//! Get the count of large errors in each bucket.
203+
TFloatVec& largeErrorCounts();
182204

183-
//! Get the bucket variances.
184-
TDoubleVec variances() const;
205+
//! Adjust \p weight for significant large error counts.
206+
double adjustedWeight(std::size_t bucket, double weight) const;
185207

186208
//! Compute the index of the bucket to which \p time belongs
187209
bool bucket(core_t::TTime time, std::size_t& result) const;
@@ -192,6 +214,10 @@ class MATHS_EXPORT CAdaptiveBucketing {
192214
//! Get the memory used by this component
193215
std::size_t memoryUsage() const;
194216

217+
private:
218+
using TFloatUInt32Pr = std::pair<CFloatStorage, std::uint32_t>;
219+
using TFloatUInt32PrMinAccumulator = CBasicStatistics::SMin<TFloatUInt32Pr, 2>::TAccumulator;
220+
195221
private:
196222
//! Compute the values corresponding to the change in end
197223
//! points from \p endpoints. The values are assigned based
@@ -208,15 +234,25 @@ class MATHS_EXPORT CAdaptiveBucketing {
208234
//! Get the offset w.r.t. the start of the bucketing of \p time.
209235
virtual double offset(core_t::TTime time) const = 0;
210236

211-
//! The count in \p bucket.
212-
virtual double count(std::size_t bucket) const = 0;
237+
//! Get the count in \p bucket.
238+
virtual double bucketCount(std::size_t bucket) const = 0;
213239

214-
//! Get the predicted value for the \p bucket at \p time.
240+
//! Get the predicted value for \p bucket at \p time.
215241
virtual double predict(std::size_t bucket, core_t::TTime time, double offset) const = 0;
216242

217243
//! Get the variance of \p bucket.
218244
virtual double variance(std::size_t bucket) const = 0;
219245

246+
//! Implements split of \p bucket for derived state.
247+
virtual void split(std::size_t bucket) = 0;
248+
249+
//! Check if there is evidence of systematically large errors in a
250+
//! bucket and split it if there is.
251+
void maybeSplitBucket();
252+
253+
//! Split \p bucket.
254+
void splitBucket(std::size_t bucket);
255+
220256
private:
221257
//! The rate at which information is aged out of the bucket values.
222258
double m_DecayRate;
@@ -225,12 +261,34 @@ class MATHS_EXPORT CAdaptiveBucketing {
225261
//! is ignored.
226262
double m_MinimumBucketLength;
227263

264+
//! The desired number of buckets. We can use more if we determine
265+
//! that we aren't capturing the periodic pattern effectively.
266+
//!
267+
//! \see maybeSplitBucket for details.
268+
std::size_t m_TargetSize = 0;
269+
270+
//! The bucket of the last large error added.
271+
std::size_t m_LastLargeErrorBucket = 0;
272+
273+
//! The period of the last large error added.
274+
core_t::TTime m_LastLargeErrorPeriod = 0;
275+
276+
//! The p-values of the most significant large error counts.
277+
TFloatUInt32PrMinAccumulator m_LargeErrorCountSignificances;
278+
279+
//! The mean weight of values added.
280+
TFloatMeanAccumulator m_MeanWeight;
281+
228282
//! The bucket end points.
229283
TFloatVec m_Endpoints;
230284

231-
//! The mean periodic time of each regression.
285+
//! The mean offset (relative to the start of the bucket) of samples
286+
//! in each bucket.
232287
TFloatVec m_Centres;
233288

289+
//! The count of large errors in each bucket.
290+
TFloatVec m_LargeErrorCounts;
291+
234292
//! An IIR low pass filter for the total desired end point displacement
235293
//! in refine.
236294
TFloatMeanAccumulator m_MeanDesiredDisplacement;

include/maths/CBasicStatistics.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,8 @@ class MATHS_EXPORT CBasicStatistics {
10321032
using const_iterator = typename CONTAINER::const_iterator;
10331033
using reverse_iterator = typename CONTAINER::reverse_iterator;
10341034
using const_reverse_iterator = typename CONTAINER::const_reverse_iterator;
1035+
using TToString = std::function<std::string(const T&)>;
1036+
using TFromString = std::function<bool(const std::string&, T&)>;
10351037

10361038
public:
10371039
COrderStatisticsImpl(const CONTAINER& statistics, const LESS& less)
@@ -1043,8 +1045,20 @@ class MATHS_EXPORT CBasicStatistics {
10431045
//! Initialize from a delimited string.
10441046
bool fromDelimited(const std::string& value);
10451047

1048+
//! Initialize from a delimited string using \p fromString to initialize
1049+
//! values of type T from a string.
1050+
//!
1051+
//! \warning This function must not use CBasicStatistics::INTERNAL_DELIMITER.
1052+
bool fromDelimited(const std::string& value, const TFromString& fromString);
1053+
10461054
//! Convert to a delimited string.
10471055
std::string toDelimited() const;
1056+
1057+
//! Convert to a delimited string using \p toString to convert individual
1058+
//! values of type T to a string.
1059+
//!
1060+
//! \warning This function must not use CBasicStatistics::INTERNAL_DELIMITER.
1061+
std::string toDelimited(const TToString& toString) const;
10481062
//@}
10491063

10501064
//! \name Update
@@ -1367,15 +1381,15 @@ class MATHS_EXPORT CBasicStatistics {
13671381
//! \name Accumulator Typedefs
13681382
//@{
13691383
//! Accumulator object to compute the sample maximum.
1370-
template<typename T>
1384+
template<typename T, std::size_t N = 1>
13711385
struct SMax {
1372-
using TAccumulator = COrderStatisticsStack<T, 1, std::greater<T>>;
1386+
using TAccumulator = COrderStatisticsStack<T, N, std::greater<T>>;
13731387
};
13741388

13751389
//! Accumulator object to compute the sample minimum.
1376-
template<typename T>
1390+
template<typename T, std::size_t N = 1>
13771391
struct SMin {
1378-
using TAccumulator = COrderStatisticsStack<T, 1>;
1392+
using TAccumulator = COrderStatisticsStack<T, N>;
13791393
};
13801394
//@}
13811395

0 commit comments

Comments
 (0)