Skip to content

Commit 9979b8a

Browse files
author
David Roberts
authored
[ML] Adding memory usage methods for categorization classes (#859)
This is a step towards adding memory instrumentation to categorization. On its own this PR has no externally visible effect, but by splitting out these boilerplate changes, the PR that eventually adds memory instrumentation to categorization will be smaller.
1 parent 3ff88cc commit 9979b8a

20 files changed

+387
-56
lines changed

include/core/CCsvLineParser.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#ifndef INCLUDED_ml_core_CCsvLineParser_h
77
#define INCLUDED_ml_core_CCsvLineParser_h
88

9+
#include <core/CMemoryUsage.h>
910
#include <core/ImportExport.h>
1011

1112
#include <boost/scoped_array.hpp>
@@ -60,6 +61,12 @@ class CORE_EXPORT CCsvLineParser {
6061
//! Are we at the end of the current line?
6162
bool atEnd() const;
6263

64+
//! Debug the memory used by this parser.
65+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
66+
67+
//! Get the memory used by this parser.
68+
std::size_t memoryUsage() const;
69+
6370
private:
6471
//! Attempt to parse the next token from the working record
6572
//! into the working field.

include/core/CMemory.h

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#ifndef INCLUDED_ml_core_CMemory_h
77
#define INCLUDED_ml_core_CMemory_h
88

9+
#include <core/BoostMultiIndex.h>
910
#include <core/CLogger.h>
1011
#include <core/CMemoryUsage.h>
1112
#include <core/CNonInstantiatable.h>
@@ -14,6 +15,7 @@
1415
#include <boost/any.hpp>
1516
#include <boost/circular_buffer_fwd.hpp>
1617
#include <boost/container/container_fwd.hpp>
18+
#include <boost/mpl/range_c.hpp>
1719
#include <boost/optional/optional_fwd.hpp>
1820
#include <boost/shared_array.hpp>
1921
#include <boost/type_traits/is_pointer.hpp>
@@ -491,6 +493,30 @@ class CORE_EXPORT CMemory : private CNonInstantiatable {
491493
return mem + pageVecEntries * sizeof(std::size_t) + numPages * pageSize;
492494
}
493495

496+
//! Overload for boost::multi_index::multi_index_container.
//!
//! Returns an estimate made of two parts: the summed dynamicSize() of
//! every element, plus, for each of the t.size() elements, sizeof(T) and
//! 2 * sizeof(std::size_t) for each index of the container.
//!
//! \param[in] t The container whose memory usage is estimated.
//! \return The estimated number of bytes.
497+
template<typename T, typename I, typename A>
498+
static std::size_t
499+
dynamicSize(const boost::multi_index::multi_index_container<T, I, A>& t) {
500+
// It's tricky to determine the container overhead of a multi-index
501+
// container. It can have an arbitrary number of indices, each of which
502+
// can be of a different type. To accurately determine the overhead
503+
// would require some serious template metaprogramming to interpret the
504+
// "typename I" template argument, and it's just not worth it given the
505+
// infrequent and relatively simple usage (generally just two indices
506+
// in our current codebase). Therefore there's an approximation here
507+
// that the overhead is 2 pointers per entry per index.
508+
using TMultiIndex = boost::multi_index::multi_index_container<T, I, A>;
// The number of indices is read at compile time from the container's
// index_type_list.
509+
constexpr std::size_t indexCount{
510+
boost::mpl::size<typename TMultiIndex::index_type_list>::value};
511+
std::size_t mem = 0;
// The per-element walk is skipped entirely when
// SDynamicSizeAlwaysZero<T>::value() is true.
512+
if (!memory_detail::SDynamicSizeAlwaysZero<T>::value()) {
513+
for (auto i = t.begin(); i != t.end(); ++i) {
514+
mem += dynamicSize(*i);
515+
}
516+
}
// Fixed cost per element: the element itself plus the approximated
// overhead, with each "pointer" counted as sizeof(std::size_t).
517+
return mem + t.size() * (sizeof(T) + 2 * indexCount * sizeof(std::size_t));
518+
}
519+
494520
//! Overload for boost::circular_buffer.
495521
template<typename T, typename A>
496522
static std::size_t dynamicSize(const boost::circular_buffer<T, A>& t) {
@@ -970,7 +996,7 @@ class CORE_EXPORT CMemoryDebug : private CNonInstantiatable {
970996
componentName += "_list";
971997

972998
std::size_t listSize = (memory_detail::EXTRA_NODES + t.size()) *
973-
(sizeof(T) + 4 * sizeof(std::size_t));
999+
(sizeof(T) + 2 * sizeof(std::size_t));
9741000

9751001
CMemoryUsage::SMemoryUsage usage(componentName, listSize);
9761002
CMemoryUsage::TMemoryUsagePtr ptr = mem->addChild();
@@ -1009,6 +1035,37 @@ class CORE_EXPORT CMemoryDebug : private CNonInstantiatable {
10091035
}
10101036
}
10111037

1038+
//! Overload for boost::multi_index::multi_index_container.
//!
//! Adds one child node to \p mem for the container, then reports every
//! element of \p t beneath that child.
//!
//! \param[in] name Component name; the child node is labelled
//! "<name>::<typeid(T).name()>" and elements are reported as "<name>_item".
//! \param[in] t The container whose memory usage is reported.
//! \param[in] mem The node that receives the new child.
1039+
template<typename T, typename I, typename A>
1040+
static void dynamicSize(const char* name,
1041+
const boost::multi_index::multi_index_container<T, I, A>& t,
1042+
CMemoryUsage::TMemoryUsagePtr mem) {
1043+
// It's tricky to determine the container overhead of a multi-index
1044+
// container. It can have an arbitrary number of indices, each of which
1045+
// can be of a different type. To accurately determine the overhead
1046+
// would require some serious template metaprogramming to interpret the
1047+
// "typename I" template argument, and it's just not worth it given the
1048+
// infrequent and relatively simple usage (generally just two indices
1049+
// in our current codebase). Therefore there's an approximation here
1050+
// that the overhead is 2 pointers per entry per index.
1051+
using TMultiIndex = boost::multi_index::multi_index_container<T, I, A>;
// The number of indices is read at compile time from the container's
// index_type_list.
1052+
constexpr std::size_t indexCount{
1053+
boost::mpl::size<typename TMultiIndex::index_type_list>::value};
1054+
std::string componentName(name);
1055+
1056+
std::size_t items = t.size();
// The size recorded on the container's own node covers only the fixed
// per-element cost: sizeof(T) plus the approximated index overhead, with
// each "pointer" counted as sizeof(std::size_t).
1057+
CMemoryUsage::SMemoryUsage usage(
1058+
componentName + "::" + typeid(T).name(),
1059+
items * (sizeof(T) + 2 * indexCount * sizeof(std::size_t)));
1060+
CMemoryUsage::TMemoryUsagePtr ptr = mem->addChild();
1061+
ptr->setName(usage);
1062+
1063+
// Each element is passed to dynamicSize() with the new child node as its
// parent.
componentName += "_item";
1064+
for (auto i = t.begin(); i != t.end(); ++i) {
1065+
dynamicSize(componentName.c_str(), *i, ptr);
1066+
}
1067+
}
1068+
10121069
//! Overload for boost::circular_buffer.
10131070
template<typename T, typename A>
10141071
static void dynamicSize(const char* name,

include/core/CStringSimilarityTester.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#define INCLUDED_ml_core_CStringSimilarityTester_h
88

99
#include <core/CLogger.h>
10+
#include <core/CMemoryUsage.h>
1011
#include <core/CNonCopyable.h>
1112
#include <core/CompressUtils.h>
1213
#include <core/ImportExport.h>
@@ -286,6 +287,12 @@ class CORE_EXPORT CStringSimilarityTester : private CNonCopyable {
286287
return currentCol[secondLen];
287288
}
288289

290+
//! Debug the memory used by this similarity tester.
291+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
292+
293+
//! Get the memory used by this similarity tester.
294+
std::size_t memoryUsage() const;
295+
289296
private:
290297
//! Calculate the Levenshtein distance using the naive method of
291298
//! calculating the entire distance matrix. This private method

include/core/CompressUtils.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#ifndef INCLUDED_ml_core_CCompressUtils_h
77
#define INCLUDED_ml_core_CCompressUtils_h
88

9+
#include <core/CMemoryUsage.h>
910
#include <core/CNonCopyable.h>
1011
#include <core/ImportExport.h>
1112

@@ -98,6 +99,12 @@ class CORE_EXPORT CCompressUtil : private CNonCopyable {
9899
//! error, it may be desirable to explicitly reset the state.
99100
void reset();
100101

102+
//! Debug the memory used by these compression utils.
103+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
104+
105+
//! Get the memory used by these compression utils.
106+
std::size_t memoryUsage() const;
107+
101108
protected:
102109
//! Get the underlying stream.
103110
z_stream& stream();
@@ -191,26 +198,26 @@ class CORE_EXPORT CCompressUtil : private CNonCopyable {
191198
class CORE_EXPORT CDeflator final : public CCompressUtil {
192199
public:
193200
CDeflator(bool lengthOnly, int level = Z_DEFAULT_COMPRESSION);
194-
~CDeflator();
201+
~CDeflator() override;
195202

196203
private:
197204
//! Process a chunk of state (optionally flushing).
198-
virtual int streamProcessChunk(int flush);
205+
int streamProcessChunk(int flush) override;
199206
//! Reset the underlying stream.
200-
virtual int resetStream();
207+
int resetStream() override;
201208
};
202209

203210
//! \brief Implementation of CompressUtil for inflating data.
204211
class CORE_EXPORT CInflator final : public CCompressUtil {
205212
public:
206213
CInflator(bool lengthOnly);
207-
~CInflator();
214+
~CInflator() override;
208215

209216
private:
210217
//! Process a chunk of state (optionally flushing).
211-
virtual int streamProcessChunk(int flush);
218+
int streamProcessChunk(int flush) override;
212219
//! Reset the underlying stream.
213-
virtual int resetStream();
220+
int resetStream() override;
214221
};
215222
}
216223
}

include/model/CBaseTokenListDataCategorizer.h

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -102,16 +102,16 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
102102
const std::string& fieldName);
103103

104104
//! Dump stats
105-
virtual void dumpStats() const;
105+
void dumpStats() const override;
106106

107107
//! Compute a category from a string. The raw string length may be longer
108108
//! than the length of the passed string, because the passed string may
109109
//! have the date stripped out of it. Field names/values are available
110110
//! to the category computation.
111-
virtual int computeCategory(bool dryRun,
112-
const TStrStrUMap& fields,
113-
const std::string& str,
114-
size_t rawStringLen);
111+
int computeCategory(bool dryRun,
112+
const TStrStrUMap& fields,
113+
const std::string& str,
114+
size_t rawStringLen) override;
115115

116116
// Bring the other overload of computeCategory() into scope
117117
using CDataCategorizer::computeCategory;
@@ -120,23 +120,29 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
120120
//! that are classified as the given category. Note that the reverse search
121121
//! is only approximate - it may select more records than have actually
122122
//! been classified as the returned category.
123-
virtual bool createReverseSearch(int categoryId,
124-
std::string& part1,
125-
std::string& part2,
126-
size_t& maxMatchingLength,
127-
bool& wasCached);
123+
bool createReverseSearch(int categoryId,
124+
std::string& part1,
125+
std::string& part2,
126+
size_t& maxMatchingLength,
127+
bool& wasCached) override;
128128

129129
//! Has the data categorizer's state changed?
130-
virtual bool hasChanged() const;
130+
bool hasChanged() const override;
131131

132132
//! Populate the object from part of a state document
133-
virtual bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
133+
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) override;
134134

135135
//! Persist state by passing information to the supplied inserter
136-
virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
136+
void acceptPersistInserter(core::CStatePersistInserter& inserter) const override;
137137

138138
//! Make a function that can be called later to persist state
139-
virtual TPersistFunc makePersistFunc() const;
139+
TPersistFunc makePersistFunc() const override;
140+
141+
//! Debug the memory used by this categorizer.
142+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const override;
143+
144+
//! Get the memory used by this categorizer.
145+
std::size_t memoryUsage() const override;
140146

141147
protected:
142148
//! Split the string into a list of tokens. The result of the
@@ -205,6 +211,12 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
205211
//! Increment the category count
206212
void incCategoryCount();
207213

214+
//! Debug the memory used by this item.
215+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
216+
217+
//! Get the memory used by this item.
218+
std::size_t memoryUsage() const;
219+
208220
private:
209221
//! String value of the token
210222
std::string m_Str;

include/model/CDataCategorizer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#ifndef INCLUDED_ml_model_CDataCategorizer_h
77
#define INCLUDED_ml_model_CDataCategorizer_h
88

9+
#include <core/CMemoryUsage.h>
910
#include <core/CoreTypes.h>
1011

1112
#include <model/ImportExport.h>
@@ -100,6 +101,12 @@ class MODEL_EXPORT CDataCategorizer {
100101
//! Set last persistence time
101102
void lastPersistTime(core_t::TTime lastPersistTime);
102103

104+
//! Debug the memory used by this categorizer.
105+
virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
106+
107+
//! Get the memory used by this categorizer.
108+
virtual std::size_t memoryUsage() const;
109+
103110
protected:
104111
//! Used if no fields are supplied to the computeCategory() method.
105112
static const TStrStrUMap EMPTY_FIELDS;

include/model/CResourceMonitor.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ namespace model {
3232

3333
class CAnomalyDetector;
3434
class CAnomalyDetectorModel;
35-
class CResourcePruner;
3635

3736
//! \brief Assess memory used by models and decide on further memory allocations.
3837
//!

include/model/CTokenListCategory.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#ifndef INCLUDED_ml_model_CTokenListCategory_h
77
#define INCLUDED_ml_model_CTokenListCategory_h
88

9+
#include <core/CMemoryUsage.h>
10+
911
#include <model/ImportExport.h>
1012

1113
#include <map>
@@ -113,6 +115,12 @@ class MODEL_EXPORT CTokenListCategory {
113115
//! Set the cached reverse search
114116
void cacheReverseSearch(const std::string& part1, const std::string& part2);
115117

118+
//! Debug the memory used by this category.
119+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
120+
121+
//! Get the memory used by this category.
122+
std::size_t memoryUsage() const;
123+
116124
private:
117125
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
118126

include/model/CTokenListDataCategorizer.h

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#define INCLUDED_ml_model_CTokenListDataCategorizer_h
88

99
#include <core/CLogger.h>
10+
#include <core/CMemory.h>
1011
#include <core/CStringSimilarityTester.h>
1112
#include <core/CTimeUtils.h>
1213
#include <core/CWordDictionary.h>
@@ -58,15 +59,30 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
5859
: CBaseTokenListDataCategorizer(reverseSearchCreator, threshold, fieldName),
5960
m_Dict(core::CWordDictionary::instance()) {}
6061

62+
//! Debug the memory used by this categorizer.
//!
//! Names \p mem "CTokenListDataCategorizer", reports the base class
//! beneath a newly added child node, and passes \p mem itself as the
//! parent when reporting m_SimilarityTester.
//!
//! \param[in] mem The memory usage node to populate.
63+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const override {
64+
mem->setName("CTokenListDataCategorizer");
// The base class gets its own child node rather than sharing this one.
65+
this->CBaseTokenListDataCategorizer::debugMemoryUsage(mem->addChild());
66+
core::CMemoryDebug::dynamicSize("m_SimilarityTester", m_SimilarityTester, mem);
67+
}
68+
69+
//! Get the memory used by this categorizer.
//!
//! \return The sum of CBaseTokenListDataCategorizer::memoryUsage() and
//! core::CMemory::dynamicSize() of m_SimilarityTester.
70+
std::size_t memoryUsage() const override {
71+
std::size_t mem = 0;
72+
mem += this->CBaseTokenListDataCategorizer::memoryUsage();
// NOTE(review): m_Dict is not added here - presumably because it refers
// to the shared core::CWordDictionary::instance(); confirm.
73+
mem += core::CMemory::dynamicSize(m_SimilarityTester);
74+
return mem;
75+
}
76+
6177
protected:
6278
//! Split the string into a list of tokens. The result of the
6379
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
6480
//! \p totalWeight. Any previous content of these variables is wiped.
65-
virtual void tokeniseString(const TStrStrUMap& fields,
66-
const std::string& str,
67-
TSizeSizePrVec& tokenIds,
68-
TSizeSizeMap& tokenUniqueIds,
69-
size_t& totalWeight) {
81+
void tokeniseString(const TStrStrUMap& fields,
82+
const std::string& str,
83+
TSizeSizePrVec& tokenIds,
84+
TSizeSizeMap& tokenUniqueIds,
85+
size_t& totalWeight) override {
7086
tokenIds.clear();
7187
tokenUniqueIds.clear();
7288
totalWeight = 0;
@@ -115,10 +131,10 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
115131

116132
//! Take a string token, convert it to a numeric ID and a weighting and
117133
//! add these to the provided data structures.
118-
virtual void tokenToIdAndWeight(const std::string& token,
119-
TSizeSizePrVec& tokenIds,
120-
TSizeSizeMap& tokenUniqueIds,
121-
size_t& totalWeight) {
134+
void tokenToIdAndWeight(const std::string& token,
135+
TSizeSizePrVec& tokenIds,
136+
TSizeSizeMap& tokenUniqueIds,
137+
size_t& totalWeight) override {
122138
TSizeSizePr idWithWeight(this->idForToken(token), 1);
123139

124140
if (token.length() >= MIN_DICTIONARY_LENGTH) {
@@ -131,10 +147,10 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
131147
}
132148

133149
//! Compute similarity between two vectors
134-
virtual double similarity(const TSizeSizePrVec& left,
135-
size_t leftWeight,
136-
const TSizeSizePrVec& right,
137-
size_t rightWeight) const {
150+
double similarity(const TSizeSizePrVec& left,
151+
size_t leftWeight,
152+
const TSizeSizePrVec& right,
153+
size_t rightWeight) const override {
138154
double similarity(1.0);
139155

140156
size_t maxWeight(std::max(leftWeight, rightWeight));

0 commit comments

Comments
 (0)