Skip to content

Commit 9ce5e7e

Browse files
author
David Roberts
authored
[ML] Add new categorization stats to model_size_stats (#989)
This change adds support for the following new model_size_stats fields: - categorized_doc_count - total_category_count - frequent_category_count - rare_category_count - dead_category_count - categorization_status Relates elastic/elasticsearch#50749
1 parent c47284e commit 9ce5e7e

24 files changed

+393
-199
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ progress, memory usage, etc. (See {ml-pull}906[#906].)
4646

4747
* Improve initialization of learn rate for better and more stable results in regression
4848
and classification. (See {ml-pull}948[#948].)
49+
* Add new model_size_stats fields to instrument categorization. (See {ml-pull}989[#989]
50+
and {pull}51879[#51879], issue: {issue}50749[#50749].)
4951

5052
=== Bug Fixes
5153

include/api/CAnomalyJob.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,14 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
120120

121121
struct SBackgroundPersistArgs {
122122
SBackgroundPersistArgs(core_t::TTime time,
123-
const model::CResourceMonitor::SResults& modelSizeStats,
123+
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
124124
const model::CInterimBucketCorrector& interimBucketCorrector,
125125
const model::CHierarchicalResultsAggregator& aggregator,
126126
core_t::TTime latestRecordTime,
127127
core_t::TTime lastResultsTime);
128128

129129
core_t::TTime s_Time;
130-
model::CResourceMonitor::SResults s_ModelSizeStats;
130+
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
131131
model::CInterimBucketCorrector s_InterimBucketCorrector;
132132
model::CHierarchicalResultsAggregator s_Aggregator;
133133
std::string s_NormalizerState;
@@ -258,7 +258,7 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
258258
bool persistCopiedState(const std::string& descriptionPrefix,
259259
core_t::TTime time,
260260
const TKeyCRefAnomalyDetectorPtrPrVec& detectors,
261-
const model::CResourceMonitor::SResults& modelSizeStats,
261+
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
262262
const model::CInterimBucketCorrector& interimBucketCorrector,
263263
const model::CHierarchicalResultsAggregator& aggregator,
264264
const std::string& normalizerState,

include/api/CJsonOutputWriter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {
232232

233233
//! Report the current levels of resource usage, as given to us
234234
//! from the CResourceMonitor via a callback
235-
void reportMemoryUsage(const model::CResourceMonitor::SResults& results);
235+
void reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& modelSizeStats);
236236

237237
//! Acknowledge a flush request by echoing back the flush ID
238238
void acknowledgeFlush(const std::string& flushId, core_t::TTime lastFinalizedBucketEnd);

include/api/CModelSizeStatsJsonWriter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class API_EXPORT CModelSizeStatsJsonWriter : private core::CNonInstantiatable {
2424
public:
2525
//! Writes the model size stats in the \p results in JSON format.
2626
static void write(const std::string& jobId,
27-
const model::CResourceMonitor::SResults& results,
27+
const model::CResourceMonitor::SModelSizeStats& results,
2828
core::CRapidJsonConcurrentLineWriter& writer);
2929
};
3030
}

include/api/CModelSnapshotJsonWriter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class API_EXPORT CModelSnapshotJsonWriter {
3333
std::string s_Description;
3434
std::string s_SnapshotId;
3535
size_t s_NumDocs;
36-
model::CResourceMonitor::SResults s_ModelSizeStats;
36+
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
3737
std::string s_NormalizerState;
3838
core_t::TTime s_LatestRecordTime;
3939
core_t::TTime s_LatestFinalResultTime;

include/model/CAnomalyDetector.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,9 @@ class MODEL_EXPORT CAnomalyDetector : public CMonitoredResource {
270270
//! Prune the model.
271271
void prune(std::size_t maximumAge) override;
272272

273-
//! Update the overall model memory stats results with stats from this
274-
//! anomaly detector.
275-
void updateMemoryResults(CResourceMonitor::SResults& results) const override;
273+
//! Update the overall model size stats with information from this anomaly
274+
//! detector.
275+
void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override;
276276

277277
//! Get end of the last complete bucket we've observed.
278278
const core_t::TTime& lastBucketEndTime() const;

include/model/CMonitoredResource.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,10 @@ class MODEL_EXPORT CMonitoredResource {
5858
//! discarding the least recently seen entities it knows about.
5959
virtual void prune(std::size_t maximumAge);
6060

61-
//! Update the overall model memory stats results with stats from this
61+
//! Update the overall model size stats with information from this
6262
//! monitored resource.
63-
virtual void updateMemoryResults(CResourceMonitor::SResults& results) const = 0;
63+
virtual void
64+
updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const = 0;
6465
};
6566
}
6667
}

include/model/CResourceMonitor.h

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,23 +38,30 @@ class CMonitoredResource;
3838
//! Assess memory used by models and decide on further memory allocations.
3939
class MODEL_EXPORT CResourceMonitor {
4040
public:
41-
struct MODEL_EXPORT SResults {
42-
std::size_t s_Usage;
43-
std::size_t s_AdjustedUsage;
44-
std::size_t s_ByFields;
45-
std::size_t s_PartitionFields;
46-
std::size_t s_OverFields;
47-
std::size_t s_AllocationFailures;
48-
model_t::EMemoryStatus s_MemoryStatus;
49-
core_t::TTime s_BucketStartTime;
50-
std::size_t s_BytesExceeded;
51-
std::size_t s_BytesMemoryLimit;
41+
struct MODEL_EXPORT SModelSizeStats {
42+
std::size_t s_Usage = 0;
43+
std::size_t s_AdjustedUsage = 0;
44+
std::size_t s_ByFields = 0;
45+
std::size_t s_PartitionFields = 0;
46+
std::size_t s_OverFields = 0;
47+
std::size_t s_AllocationFailures = 0;
48+
model_t::EMemoryStatus s_MemoryStatus = model_t::E_MemoryStatusOk;
49+
core_t::TTime s_BucketStartTime = 0;
50+
std::size_t s_BytesExceeded = 0;
51+
std::size_t s_BytesMemoryLimit = 0;
52+
std::size_t s_CategorizedMessages = 0;
53+
std::size_t s_TotalCategories = 0;
54+
std::size_t s_FrequentCategories = 0;
55+
std::size_t s_RareCategories = 0;
56+
std::size_t s_DeadCategories = 0;
57+
model_t::ECategorizationStatus s_CategorizationStatus = model_t::E_CategorizationStatusOk;
5258
};
5359

5460
public:
5561
using TMonitoredResourcePtrSizeUMap =
5662
boost::unordered_map<CMonitoredResource*, std::size_t>;
57-
using TMemoryUsageReporterFunc = std::function<void(const CResourceMonitor::SResults&)>;
63+
using TMemoryUsageReporterFunc =
64+
std::function<void(const CResourceMonitor::SModelSizeStats&)>;
5865
using TTimeSizeMap = std::map<core_t::TTime, std::size_t>;
5966

6067
//! The minimum time between prunes
@@ -109,7 +116,7 @@ class MODEL_EXPORT CResourceMonitor {
109116
void sendMemoryUsageReport(core_t::TTime bucketStartTime);
110117

111118
//! Create a memory usage report
112-
SResults createMemoryUsageReport(core_t::TTime bucketStartTime);
119+
SModelSizeStats createMemoryUsageReport(core_t::TTime bucketStartTime);
113120

114121
//! We are being told that a class has failed to allocate memory
115122
//! based on the resource limits, and we will report this to the

include/model/CTokenListCategory.h

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <model/ImportExport.h>
1212

13+
#include <algorithm>
1314
#include <map>
1415
#include <string>
1516
#include <utility>
@@ -94,16 +95,38 @@ class MODEL_EXPORT CTokenListCategory {
9495
//! this category's common unique tokens?
9596
std::size_t missingCommonTokenWeight(const TSizeSizeMap& uniqueTokenIds) const;
9697

97-
//! Is the weight of tokens in a given map that are missing from this
98-
//! category's common unique tokens equal to zero? It is possible to test:
98+
//! Is the weight of tokens in the provided container that are missing from
99+
//! this category's common unique tokens equal to zero? It is possible to
100+
//! test:
99101
//! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0)
100102
//! instead of calling this method. However, this method is much faster
101103
//! as it can return false as soon as a mismatch occurs.
102-
bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const;
104+
//! \param uniqueTokenIds A container of pairs where the first element is
105+
//! a token ID and the container is sorted into
106+
//! ascending token ID order.
107+
template<typename PAIR_CONTAINER>
108+
bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const {
109+
110+
auto testIter = uniqueTokenIds.begin();
111+
for (const auto& commonItem : m_CommonUniqueTokenIds) {
112+
testIter = std::find_if(testIter, uniqueTokenIds.end(),
113+
[&commonItem](const auto& testItem) {
114+
return testItem.first >= commonItem.first;
115+
});
116+
if (testIter == uniqueTokenIds.end() ||
117+
testIter->first != commonItem.first ||
118+
testIter->second != commonItem.second) {
119+
return false;
120+
}
121+
++testIter;
122+
}
123+
124+
return true;
125+
}
103126

104127
//! Does the supplied token vector contain all our common tokens in the
105128
//! same order as our base token vector?
106-
bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const;
129+
bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const;
107130

108131
//! \return Does the supplied token ID represent a common unique token?
109132
bool isTokenCommon(std::size_t tokenId) const;

include/model/CTokenListDataCategorizer.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,6 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
8181
//! Get the static size of this object - used for virtual hierarchies
8282
std::size_t staticSize() const override { return sizeof(*this); }
8383

84-
//! Currently the overall model memory stats do not contain any categorizer
85-
//! stats fields.
86-
void updateMemoryResults(CResourceMonitor::SResults& /*results*/) const override {
87-
// NO-OP
88-
}
89-
9084
protected:
9185
//! Split the string into a list of tokens. The result of the
9286
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and

0 commit comments

Comments
 (0)