Skip to content

Commit 548222b

Browse files
authored
[ML] Anomaly detection for multiple bucket features (#175)
1 parent 88095c1 commit 548222b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2685
-1413
lines changed

bin/autodetect/CCmdLineParser.cc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ bool CCmdLineParser::parse(int argc,
5252
bool& memoryUsage,
5353
std::size_t& bucketResultsDelay,
5454
bool& multivariateByFields,
55-
std::string& multipleBucketspans,
5655
bool& perPartitionNormalization,
5756
TStrVec& clauseTokens) {
5857
try {
@@ -117,8 +116,6 @@ bool CCmdLineParser::parse(int argc,
117116
"The numer of half buckets to store before choosing which overlapping bucket has the biggest anomaly")
118117
("multivariateByFields",
119118
"Optional flag to enable multi-variate analysis of correlated by fields")
120-
("multipleBucketspans", boost::program_options::value<std::string>(),
121-
"Optional comma-separated list of additional bucketspans - must be direct multiples of the main bucketspan")
122119
("perPartitionNormalization",
123120
"Optional flag to enable per partition normalization")
124121
;
@@ -234,9 +231,6 @@ bool CCmdLineParser::parse(int argc,
234231
if (vm.count("multivariateByFields") > 0) {
235232
multivariateByFields = true;
236233
}
237-
if (vm.count("multipleBucketspans") > 0) {
238-
multipleBucketspans = vm["multipleBucketspans"].as<std::string>();
239-
}
240234
if (vm.count("perPartitionNormalization") > 0) {
241235
perPartitionNormalization = true;
242236
}

bin/autodetect/CCmdLineParser.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ class CCmdLineParser {
6464
bool& memoryUsage,
6565
std::size_t& bucketResultsDelay,
6666
bool& multivariateByFields,
67-
std::string& multipleBucketspans,
6867
bool& perPartitionNormalization,
6968
TStrVec& clauseTokens);
7069

bin/autodetect/Main.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ int main(int argc, char** argv) {
8888
bool memoryUsage(false);
8989
std::size_t bucketResultsDelay(0);
9090
bool multivariateByFields(false);
91-
std::string multipleBucketspans;
9291
bool perPartitionNormalization(false);
9392
TStrVec clauseTokens;
9493
if (ml::autodetect::CCmdLineParser::parse(
@@ -97,10 +96,9 @@ int main(int argc, char** argv) {
9796
summaryCountFieldName, delimiter, lengthEncodedInput, timeField,
9897
timeFormat, quantilesStateFile, deleteStateFiles, persistInterval,
9998
maxQuantileInterval, inputFileName, isInputFileNamedPipe, outputFileName,
100-
isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe,
101-
persistFileName, isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage,
102-
bucketResultsDelay, multivariateByFields, multipleBucketspans,
103-
perPartitionNormalization, clauseTokens) == false) {
99+
isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, persistFileName,
100+
isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, bucketResultsDelay,
101+
multivariateByFields, perPartitionNormalization, clauseTokens) == false) {
104102
return EXIT_FAILURE;
105103
}
106104

@@ -147,7 +145,7 @@ int main(int argc, char** argv) {
147145
ml::model::CAnomalyDetectorModelConfig modelConfig =
148146
ml::model::CAnomalyDetectorModelConfig::defaultConfig(
149147
bucketSpan, summaryMode, summaryCountFieldName, latency,
150-
bucketResultsDelay, multivariateByFields, multipleBucketspans);
148+
bucketResultsDelay, multivariateByFields);
151149
modelConfig.perPartitionNormalization(perPartitionNormalization);
152150
modelConfig.detectionRules(ml::model::CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMapCRef(
153151
fieldConfig.detectionRules()));

docs/CHANGELOG.asciidoc

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,20 @@
2828
2929
//=== Regressions
3030
31-
//=== Known Issues
32-
////
31+
== {es} version 6.5.0
32+
33+
//=== Breaking Changes
34+
35+
//=== Deprecations
36+
37+
//=== New Features
38+
39+
=== Enhancements
40+
41+
Perform anomaly detection on features derived from multiple bucket values to improve robustness
42+
of detection with respect to misconfigured bucket lengths and improve detection of long-lasting
43+
anomalies. (See {pull}175[#175].)
44+
45+
//=== Bug Fixes
46+
47+
//=== Regressions

include/maths/CBasicStatistics.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ class MATHS_EXPORT CBasicStatistics {
149149
template<typename T, unsigned int ORDER>
150150
struct SSampleCentralMoments : public std::unary_function<T, void> {
151151
using TCoordinate = typename SCoordinate<T>::Type;
152+
using TValue = T;
152153

153154
//! See core::CMemory.
154155
static bool dynamicSizeAlwaysZero() {
@@ -1480,17 +1481,6 @@ class MATHS_EXPORT CBasicStatistics {
14801481
//! The set maximum.
14811482
COrderStatisticsStack<T, 1, GREATER> m_Max;
14821483
};
1483-
1484-
// Friends
1485-
template<typename T>
1486-
friend std::ostream&
1487-
operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 1u>&);
1488-
template<typename T>
1489-
friend std::ostream&
1490-
operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 2u>&);
1491-
template<typename T>
1492-
friend std::ostream&
1493-
operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 3u>&);
14941484
};
14951485

14961486
template<typename T>
@@ -1596,6 +1586,23 @@ template<typename U>
15961586
void CBasicStatistics::SSampleCentralMoments<T, ORDER>::add(const U& x, const TCoordinate& n) {
15971587
basic_statistics_detail::SCentralMomentsCustomAdd<U>::add(x, n, *this);
15981588
}
1589+
1590+
//! \brief Defines a promoted type for a SSampleCentralMoments.
1591+
//!
1592+
//! \see CTypeConversions.h for details.
1593+
template<typename T, unsigned int N>
1594+
struct SPromoted<CBasicStatistics::SSampleCentralMoments<T, N>> {
1595+
using Type = CBasicStatistics::SSampleCentralMoments<typename SPromoted<T>::Type, N>;
1596+
};
1597+
1598+
//! \brief Defines SSampleCentralMoments on a suitable floating point type.
1599+
//!
1600+
//! \see CTypeConversions.h for details.
1601+
template<typename T, unsigned int N, typename U>
1602+
struct SFloatingPoint<CBasicStatistics::SSampleCentralMoments<T, N>, U> {
1603+
using Type =
1604+
CBasicStatistics::SSampleCentralMoments<typename SFloatingPoint<T, U>::Type, N>;
1605+
};
15991606
}
16001607
}
16011608

include/maths/CBasicStatisticsPersist.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,16 @@ template<typename T, std::size_t N>
4848
bool stringToType(const std::string& str, CSymmetricMatrixNxN<T, N>& value) {
4949
return value.fromDelimited(str);
5050
}
51+
//! Function to do conversion from string to a vector.
52+
template<typename T>
53+
bool stringToType(const std::string& str, CVector<T>& value) {
54+
return value.fromDelimited(str);
55+
}
56+
//! Function to do conversion from string to a symmetric matrix.
57+
template<typename T>
58+
bool stringToType(const std::string& str, CSymmetricMatrix<T>& value) {
59+
return value.fromDelimited(str);
60+
}
5161

5262
//! Function to do conversion to a string.
5363
template<typename T>
@@ -72,6 +82,16 @@ template<typename T, std::size_t N>
7282
inline std::string typeToString(const CSymmetricMatrixNxN<T, N>& value) {
7383
return value.toDelimited();
7484
}
85+
//! Function to do conversion to a string from a vector.
86+
template<typename T>
87+
inline std::string typeToString(const CVector<T>& value) {
88+
return value.toDelimited();
89+
}
90+
//! Function to do conversion to a string from a symmetric matrix.
91+
template<typename T>
92+
inline std::string typeToString(const CSymmetricMatrix<T>& value) {
93+
return value.toDelimited();
94+
}
7595
}
7696

7797
template<typename T, unsigned int ORDER>

include/maths/CLinearAlgebra.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,6 @@ class CVectorNx1 : private boost::equality_comparable< CVectorNx1<T, N>,
928928

929929
public:
930930
using TArray = T[N];
931-
using TVec = std::vector<T>;
932931
using TBoostArray = boost::array<T, N>;
933932
using TConstIterator = typename TBoostArray::const_iterator;
934933

@@ -950,21 +949,24 @@ class CVectorNx1 : private boost::equality_comparable< CVectorNx1<T, N>,
950949
}
951950

952951
//! Construct from a boost array.
953-
explicit CVectorNx1(const boost::array<T, N>& a) {
952+
template<typename U>
953+
explicit CVectorNx1(const boost::array<U, N>& a) {
954954
for (std::size_t i = 0u; i < N; ++i) {
955955
TBase::m_X[i] = a[i];
956956
}
957957
}
958958

959959
//! Construct from a vector.
960-
explicit CVectorNx1(const TVec& v) {
960+
template<typename U>
961+
explicit CVectorNx1(const std::vector<U>& v) {
961962
for (std::size_t i = 0u; i < N; ++i) {
962963
TBase::m_X[i] = v[i];
963964
}
964965
}
965966

966967
//! Construct from a vector.
967-
explicit CVectorNx1(const core::CSmallVectorBase<T>& v) {
968+
template<typename U>
969+
explicit CVectorNx1(const core::CSmallVectorBase<U>& v) {
968970
for (std::size_t i = 0u; i < N; ++i) {
969971
TBase::m_X[i] = v[i];
970972
}
@@ -1244,10 +1246,14 @@ class CVector : private boost::equality_comparable< CVector<T>,
12441246
}
12451247

12461248
//! Construct from a vector.
1247-
explicit CVector(const TArray& v) { TBase::m_X = v; }
1249+
template<typename U>
1250+
explicit CVector(const std::vector<U>& v) {
1251+
TBase::m_X = v;
1252+
}
12481253

12491254
//! Construct from a small vector.
1250-
explicit CVector(const core::CSmallVectorBase<T>& v) {
1255+
template<typename U>
1256+
explicit CVector(const core::CSmallVectorBase<U>& v) {
12511257
TBase::m_X.assign(v.begin(), v.end());
12521258
}
12531259

include/maths/CModel.h

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,6 @@ class MATHS_EXPORT CModelAddSamplesParams {
104104
using TDouble2VecWeightsAryVec = std::vector<maths_t::TDouble2VecWeightsAry>;
105105

106106
public:
107-
CModelAddSamplesParams();
108-
109107
//! Set whether or not the data are integer valued.
110108
CModelAddSamplesParams& integer(bool integer);
111109
//! Get the data type.
@@ -133,15 +131,15 @@ class MATHS_EXPORT CModelAddSamplesParams {
133131

134132
private:
135133
//! The data type.
136-
maths_t::EDataType m_Type;
134+
maths_t::EDataType m_Type = maths_t::E_MixedData;
137135
//! True if the data are non-negative false otherwise.
138-
bool m_IsNonNegative;
136+
bool m_IsNonNegative = false;
139137
//! The propagation interval.
140-
double m_PropagationInterval;
138+
double m_PropagationInterval = 1.0;
141139
//! The trend sample weights.
142-
const TDouble2VecWeightsAryVec* m_TrendWeights;
140+
const TDouble2VecWeightsAryVec* m_TrendWeights = nullptr;
143141
//! The prior sample weights.
144-
const TDouble2VecWeightsAryVec* m_PriorWeights;
142+
const TDouble2VecWeightsAryVec* m_PriorWeights = nullptr;
145143
};
146144

147145
//! \brief The extra parameters needed by CModel::probability.
@@ -178,6 +176,8 @@ class MATHS_EXPORT CModelProbabilityParams {
178176

179177
//! Add whether a value's bucket is empty.
180178
CModelProbabilityParams& addBucketEmpty(const TBool2Vec& empty);
179+
//! Set whether or not the values' bucket is empty.
180+
CModelProbabilityParams& bucketEmpty(const TBool2Vec1Vec& empty);
181181
//! Get whether the values' bucket is empty.
182182
const TBool2Vec1Vec& bucketEmpty() const;
183183

@@ -200,14 +200,19 @@ class MATHS_EXPORT CModelProbabilityParams {
200200
//! Get the most anomalous correlate if there is one.
201201
TOptionalSize mostAnomalousCorrelate() const;
202202

203-
//! Set whether or not to update the anomaly model.
204-
CModelProbabilityParams& updateAnomalyModel(bool update);
205-
//! Get whether or not to update the anomaly model.
206-
bool updateAnomalyModel() const;
203+
//! Set whether or not to use multibucket features.
204+
CModelProbabilityParams& useMultibucketFeatures(bool use);
205+
//! Get whether or not to use multibucket features.
206+
bool useMultibucketFeatures() const;
207+
208+
//! Set whether or not to use the anomaly model.
209+
CModelProbabilityParams& useAnomalyModel(bool use);
210+
//! Get whether or not to use the anomaly model.
211+
bool useAnomalyModel() const;
207212

208213
private:
209214
//! The entity tag (if relevant otherwise 0).
210-
std::size_t m_Tag;
215+
std::size_t m_Tag = 0;
211216
//! The coordinates' probability calculations.
212217
TProbabilityCalculation2Vec m_Calculations;
213218
//! The confidence interval to use when detrending.
@@ -220,8 +225,41 @@ class MATHS_EXPORT CModelProbabilityParams {
220225
TSize2Vec m_Coordinates;
221226
//! The most anomalous coordinate (if there is one).
222227
TOptionalSize m_MostAnomalousCorrelate;
223-
//! Whether or not to update the anomaly model.
224-
bool m_UpdateAnomalyModel;
228+
//! Whether or not to use multibucket features.
229+
bool m_UseMultibucketFeatures = true;
230+
//! Whether or not to use the anomaly model.
231+
bool m_UseAnomalyModel = true;
232+
};
233+
234+
//! \brief Describes the result of the model probability calculation.
235+
struct MATHS_EXPORT SModelProbabilityResult {
236+
using TDouble4Vec = core::CSmallVector<double, 4>;
237+
using TSize1Vec = core::CSmallVector<std::size_t, 1>;
238+
using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;
239+
240+
//! \brief Wraps up a feature label and probability.
241+
struct MATHS_EXPORT SFeatureProbability {
242+
using TStrCRef = boost::reference_wrapper<const std::string>;
243+
SFeatureProbability();
244+
SFeatureProbability(const std::string& label, double probability);
245+
TStrCRef s_Label;
246+
double s_Probability = 1.0;
247+
};
248+
using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;
249+
250+
//! The overall result probability.
251+
double s_Probability = 1.0;
252+
//! True if the probability depends on the correlation between two
253+
//! time series and false otherwise.
254+
bool s_Conditional = false;
255+
//! The probabilities for each individual feature.
256+
TFeatureProbability4Vec s_FeatureProbabilities;
257+
//! The tail of the current bucket probability.
258+
TTail2Vec s_Tail;
259+
//! The identifier of the time series correlated with this one which
260+
//! has the smallest probability in the current bucket (if and only
261+
//! if the result depends on the correlation structure).
262+
TSize1Vec s_MostAnomalousCorrelate;
225263
};
226264

227265
//! \brief The model interface.
@@ -355,10 +393,7 @@ class MATHS_EXPORT CModel {
355393
virtual bool probability(const CModelProbabilityParams& params,
356394
const TTime2Vec1Vec& time,
357395
const TDouble2Vec1Vec& value,
358-
double& probability,
359-
TTail2Vec& tail,
360-
bool& conditional,
361-
TSize1Vec& mostAnomalousCorrelate) const = 0;
396+
SModelProbabilityResult& result) const = 0;
362397

363398
//! Get the Winsorisation weight to apply to \p value,
364399
//! if appropriate.
@@ -499,14 +534,11 @@ class MATHS_EXPORT CModelStub : public CModel {
499534
const TForecastPushDatapointFunc& forecastPushDataPointFunc,
500535
std::string& messageOut);
501536

502-
//! Returns 1.0.
537+
//! Returns true.
503538
virtual bool probability(const CModelProbabilityParams& params,
504539
const TTime2Vec1Vec& time,
505540
const TDouble2Vec1Vec& value,
506-
double& probability,
507-
TTail2Vec& tail,
508-
bool& conditional,
509-
TSize1Vec& mostAnomalousCorrelate) const;
541+
SModelProbabilityResult& result) const;
510542

511543
//! Returns empty.
512544
virtual TDouble2Vec

0 commit comments

Comments
 (0)