Skip to content

Commit 8d86ac6

Browse files
authored
[ML] Instrumentation for outlier detection (#1068) (#1073)
This PR implements the initial instrumentation of the outlier detection code. Instrumentation information is posted only once on completion of the outlier score computation. Backport of #1068.
1 parent 277d505 commit 8d86ac6

File tree

9 files changed

+282
-33
lines changed

9 files changed

+282
-33
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
5757
model training. (See {ml-pull}1034[#1034].)
5858
* Add instrumentation information for supervised learning data frame analytics jobs.
5959
(See {ml-pull}1031[#1031].)
60+
* Add instrumentation information for outlier detection data frame analytics jobs.
61+
(See {ml-pull}1068[#1068].)
6062

6163
=== Bug Fixes
6264

include/api/CDataFrameAnalysisInstrumentation.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,22 @@ class API_EXPORT CDataFrameOutliersInstrumentation final
116116
public:
117117
explicit CDataFrameOutliersInstrumentation(const std::string& jobId)
118118
: CDataFrameAnalysisInstrumentation(jobId) {}
119+
void parameters(const maths::COutliers::SComputeParameters& parameters) override;
120+
void elapsedTime(std::uint64_t time) override;
121+
void featureInfluenceThreshold(double featureInfluenceThreshold) override;
119122

120123
protected:
121124
counter_t::ECounterTypes memoryCounterType() override;
122125

123126
private:
124127
void writeAnalysisStats(std::int64_t timestamp) override;
128+
void writeTimingStats(rapidjson::Value& parentObject);
129+
void writeParameters(rapidjson::Value& parentObject);
130+
131+
private:
132+
maths::COutliers::SComputeParameters m_Parameters;
133+
//! The elapsed time last supplied via elapsedTime(). Zero-initialised so that
//! the timing stats never serialise an indeterminate value if elapsedTime()
//! has not been called before the stats are written.
std::uint64_t m_ElapsedTime = 0;
134+
double m_FeatureInfluenceThreshold = -1.0;
125135
};
126136

127137
//! \brief Instrumentation class for Supervised Learning jobs.

include/maths/CDataFrameAnalysisInstrumentationInterface.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#define INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h
99

1010
#include <maths/CBoostedTree.h>
11+
#include <maths/COutliers.h>
1112
#include <maths/ImportExport.h>
1213

1314
#include <cstdint>
@@ -30,7 +31,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
3031
virtual ~CDataFrameAnalysisInstrumentationInterface() = default;
3132
//! Adds \p delta to the memory usage statistics.
3233
virtual void updateMemoryUsage(std::int64_t delta) = 0;
33-
//! This adds \p fractionalProgess to the current progress.
34+
//! This adds \p fractionalProgress to the current progress.
3435
//!
3536
//! \note The caller should try to ensure that the sum of the values added
3637
//! at the end of the analysis is equal to one.
@@ -59,7 +60,12 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
5960
};
6061

6162
//! \brief Instrumentation interface for outlier detection jobs.
//!
//! Extends the common analysis instrumentation interface with setters for the
//! outlier specific values declared below.
class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface
62-
: virtual public CDataFrameAnalysisInstrumentationInterface {};
63+
: virtual public CDataFrameAnalysisInstrumentationInterface {
64+
public:
65+
//! Pass the outlier compute \p parameters to the instrumentation.
virtual void parameters(const maths::COutliers::SComputeParameters& parameters) = 0;
66+
//! Pass the elapsed \p time to the instrumentation.
//! NOTE(review): the unit of \p time is not stated here - confirm with callers.
virtual void elapsedTime(std::uint64_t time) = 0;
67+
//! Pass the \p featureInfluenceThreshold to the instrumentation.
virtual void featureInfluenceThreshold(double featureInfluenceThreshold) = 0;
68+
};
6369

6470
//! \brief Instrumentation interface for Supervised Learning jobs.
6571
//!
@@ -105,7 +111,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface
105111

106112
public:
107113
virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default;
108-
//! Supevised learning job \p type, can be E_Regression or E_Classification.
114+
//! Supervised learning job \p type, can be E_Regression or E_Classification.
109115
virtual void type(EStatsType type) = 0;
110116
//! Current \p iteration number.
111117
virtual void iteration(std::size_t iteration) = 0;
@@ -126,6 +132,9 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationStub
126132
void updateMemoryUsage(std::int64_t) override {}
127133
void updateProgress(double) override {}
128134
void nextStep(const std::string& /* phase */) override {}
135+
void parameters(const maths::COutliers::SComputeParameters& /* parameters */) override {}
136+
void elapsedTime(std::uint64_t /* time */) override {}
137+
void featureInfluenceThreshold(double /* featureInfluenceThreshold */) override {}
129138
};
130139

131140
//! \brief Dummies out all instrumentation for supervised learning.

include/maths/COutliers.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
#include <core/Concurrency.h>
1515

1616
#include <maths/CBasicStatistics.h>
17-
#include <maths/CDataFrameAnalysisInstrumentationInterface.h>
1817
#include <maths/CKdTree.h>
1918
#include <maths/CLinearAlgebraShims.h>
2019
#include <maths/COrthogonaliser.h>
@@ -34,6 +33,9 @@
3433

3534
namespace ml {
3635
namespace maths {
36+
37+
class CDataFrameOutliersInstrumentationInterface;
38+
3739
namespace outliers_detail {
3840
using TDoubleVec = std::vector<double>;
3941
using TDouble1Vec = core::CSmallVector<double, 1>;
@@ -657,6 +659,12 @@ class MATHS_EXPORT COutliers : private core::CNonInstantiatable {
657659
template<typename POINT>
658660
using TAnnotatedPoint = CAnnotatedVector<POINT, std::size_t>;
659661

662+
static const std::string LOF;
663+
static const std::string LDOF;
664+
static const std::string DISTANCE_KNN;
665+
static const std::string TOTAL_DISTANCE_KNN;
666+
static const std::string ENSEMBLE;
667+
660668
//! The outlier detection methods which are available.
661669
enum EMethod {
662670
E_Lof,
@@ -680,7 +688,7 @@ class MATHS_EXPORT COutliers : private core::CNonInstantiatable {
680688
std::size_t s_NumberNeighbours;
681689
//! If true also compute the feature influence.
682690
bool s_ComputeFeatureInfluence;
683-
//! The fraction of true outliers amoung the points.
691+
//! The fraction of true outliers among the points.
684692
double s_OutlierFraction;
685693
};
686694

@@ -710,6 +718,9 @@ class MATHS_EXPORT COutliers : private core::CNonInstantiatable {
710718
std::size_t partitionNumberPoints,
711719
std::size_t dimension);
712720

721+
//! Return string representation of the \p method.
722+
static std::string print(EMethod method);
723+
713724
//! \name Test Interface
714725
//@{
715726
//! Compute the normalized LOF scores for \p points.

lib/api/CDataFrameAnalysisInstrumentation.cc

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ const std::string ITERATION_TAG{"iteration"};
2424
const std::string JOB_ID_TAG{"job_id"};
2525
const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"};
2626
const std::string OUTLIER_DETECTION_STATS{"outlier_detection_stats"};
27+
const std::string PARAMETERS_TAG{"parameters"};
2728
const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"};
2829
const std::string PROGRESS_TAG{"progress"};
2930
const std::string REGRESSION_STATS_TAG{"regression_stats"};
@@ -59,7 +60,7 @@ const std::string REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG{"regularizatio
5960

6061
// Outlier detection parameters
6162
const std::string N_NEIGHBORS{"n_neighbors"};
62-
const std::string METHODS{"methods"};
63+
const std::string METHOD{"method"};
6364
const std::string COMPUTE_FEATURE_INFLUENCE{"compute_feature_influence"};
6465
const std::string FEATURE_INFLUENCE_THRESHOLD{"feature_influence_threshold"};
6566
const std::string OUTLIER_FRACTION{"outlier_fraction"};
@@ -114,7 +115,7 @@ void CDataFrameAnalysisInstrumentation::resetProgress() {
114115
}
115116

116117
void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) {
117-
// TODO reactivate once Java part is ready
118+
// TODO: reactivate once the Java side is ready.
118119
// this->writeState();
119120
}
120121

@@ -171,10 +172,69 @@ void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestam
171172
writer->String(this->jobId());
172173
writer->Key(TIMESTAMP_TAG);
173174
writer->Int64(timestamp);
175+
176+
rapidjson::Value parametersObject{writer->makeObject()};
177+
this->writeParameters(parametersObject);
178+
writer->Key(PARAMETERS_TAG);
179+
writer->write(parametersObject);
180+
181+
rapidjson::Value timingStatsObject{writer->makeObject()};
182+
this->writeTimingStats(timingStatsObject);
183+
writer->Key(TIMING_STATS_TAG);
184+
writer->write(timingStatsObject);
185+
174186
writer->EndObject();
175187
}
176188
}
177189

190+
// Record the outlier compute parameters by storing a copy of \p parameters
// in m_Parameters.
void CDataFrameOutliersInstrumentation::parameters(const maths::COutliers::SComputeParameters& parameters) {
191+
m_Parameters = parameters;
192+
}
193+
194+
// Record the elapsed \p time in m_ElapsedTime, overwriting any earlier value.
// NOTE(review): the unit of \p time is not visible here - confirm with the caller.
void CDataFrameOutliersInstrumentation::elapsedTime(std::uint64_t time) {
195+
m_ElapsedTime = time;
196+
}
197+
198+
// Record \p featureInfluenceThreshold in m_FeatureInfluenceThreshold,
// overwriting any earlier value.
void CDataFrameOutliersInstrumentation::featureInfluenceThreshold(double featureInfluenceThreshold) {
199+
m_FeatureInfluenceThreshold = featureInfluenceThreshold;
200+
}
201+
202+
// Add the recorded elapsed time to \p parentObject as a member keyed by
// TIMING_ELAPSED_TIME_TAG. Does nothing if no writer is available.
void CDataFrameOutliersInstrumentation::writeTimingStats(rapidjson::Value& parentObject) {
203+
auto* writer = this->writer();
204+
if (writer != nullptr) {
205+
writer->addMember(TIMING_ELAPSED_TIME_TAG,
206+
rapidjson::Value(m_ElapsedTime).Move(), parentObject);
207+
}
208+
}
209+
210+
// Add the recorded outlier detection parameters to \p parentObject, one
// member per parameter: N_NEIGHBORS, COMPUTE_FEATURE_INFLUENCE,
// OUTLIER_FRACTION, FEATURE_INFLUENCE_THRESHOLD, STANDARDIZATION_ENABLED and
// METHOD. Does nothing if no writer is available.
void CDataFrameOutliersInstrumentation::writeParameters(rapidjson::Value& parentObject) {
211+
auto* writer = this->writer();
212+
213+
if (writer != nullptr) {
214+
215+
// s_NumberNeighbours is a std::size_t; it is cast to std::uint64_t before
// being wrapped (presumably to select an unambiguous rapidjson::Value
// constructor on every platform - TODO confirm).
writer->addMember(
216+
N_NEIGHBORS,
217+
rapidjson::Value(static_cast<std::uint64_t>(this->m_Parameters.s_NumberNeighbours))
218+
.Move(),
219+
parentObject);
220+
writer->addMember(
221+
COMPUTE_FEATURE_INFLUENCE,
222+
rapidjson::Value(this->m_Parameters.s_ComputeFeatureInfluence).Move(),
223+
parentObject);
224+
writer->addMember(OUTLIER_FRACTION,
225+
rapidjson::Value(this->m_Parameters.s_OutlierFraction).Move(),
226+
parentObject);
227+
// The threshold is held separately from m_Parameters; it is supplied via
// featureInfluenceThreshold().
writer->addMember(FEATURE_INFLUENCE_THRESHOLD,
228+
rapidjson::Value(this->m_FeatureInfluenceThreshold).Move(),
229+
parentObject);
230+
writer->addMember(
231+
STANDARDIZATION_ENABLED,
232+
rapidjson::Value(this->m_Parameters.s_StandardizeColumns).Move(), parentObject);
233+
// The method is written as the string returned by COutliers::print.
writer->addMember(METHOD, maths::COutliers::print(this->m_Parameters.s_Method),
234+
parentObject);
235+
}
236+
}
237+
178238
void CDataFrameTrainBoostedTreeInstrumentation::type(EStatsType type) {
179239
m_Type = type;
180240
}

lib/api/CDataFrameOutliersRunner.cc

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,17 @@ namespace api {
2828
namespace {
2929
const CDataFrameAnalysisConfigReader& parameterReader() {
3030
static const CDataFrameAnalysisConfigReader PARAMETER_READER{[] {
31-
const std::string lof{"lof"};
32-
const std::string ldof{"ldof"};
33-
const std::string knn{"distance_kth_nn"};
34-
const std::string tnn{"distance_knn"};
3531
CDataFrameAnalysisConfigReader theReader;
3632
theReader.addParameter(CDataFrameOutliersRunner::STANDARDIZATION_ENABLED,
3733
CDataFrameAnalysisConfigReader::E_OptionalParameter);
3834
theReader.addParameter(CDataFrameOutliersRunner::N_NEIGHBORS,
3935
CDataFrameAnalysisConfigReader::E_OptionalParameter);
40-
theReader.addParameter(CDataFrameOutliersRunner::METHOD,
41-
CDataFrameAnalysisConfigReader::E_OptionalParameter,
42-
{{lof, int{maths::COutliers::E_Lof}},
43-
{ldof, int{maths::COutliers::E_Ldof}},
44-
{knn, int{maths::COutliers::E_DistancekNN}},
45-
{tnn, int{maths::COutliers::E_TotalDistancekNN}}});
36+
theReader.addParameter(
37+
CDataFrameOutliersRunner::METHOD, CDataFrameAnalysisConfigReader::E_OptionalParameter,
38+
{{maths::COutliers::LOF, int{maths::COutliers::E_Lof}},
39+
{maths::COutliers::LDOF, int{maths::COutliers::E_Ldof}},
40+
{maths::COutliers::DISTANCE_KNN, int{maths::COutliers::E_DistancekNN}},
41+
{maths::COutliers::TOTAL_DISTANCE_KNN, int{maths::COutliers::E_TotalDistancekNN}}});
4642
theReader.addParameter(CDataFrameOutliersRunner::COMPUTE_FEATURE_INFLUENCE,
4743
CDataFrameAnalysisConfigReader::E_OptionalParameter);
4844
theReader.addParameter(CDataFrameOutliersRunner::FEATURE_INFLUENCE_THRESHOLD,
@@ -69,6 +65,8 @@ CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpeci
6965
m_ComputeFeatureInfluence = parameters[COMPUTE_FEATURE_INFLUENCE].fallback(true);
7066
m_FeatureInfluenceThreshold = parameters[FEATURE_INFLUENCE_THRESHOLD].fallback(0.1);
7167
m_OutlierFraction = parameters[OUTLIER_FRACTION].fallback(0.05);
68+
69+
m_Instrumentation.featureInfluenceThreshold(m_FeatureInfluenceThreshold);
7270
}
7371

7472
CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec)

0 commit comments

Comments
 (0)