elastic
diff --git a/‎include/core/CCsvLineParser.h‎
Lines changed: 7 additions & 0 deletions b/‎include/core/CCsvLineParser.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎include/core/CMemory.h‎
Lines changed: 58 additions & 1 deletion b/‎include/core/CMemory.h‎
Lines changed: 58 additions & 1 deletion
diff --git a/‎include/core/CStringSimilarityTester.h‎
Lines changed: 7 additions & 0 deletions b/‎include/core/CStringSimilarityTester.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎include/core/CompressUtils.h‎
Lines changed: 13 additions & 6 deletions b/‎include/core/CompressUtils.h‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎include/model/CBaseTokenListDataCategorizer.h‎
Lines changed: 26 additions & 14 deletions b/‎include/model/CBaseTokenListDataCategorizer.h‎
Lines changed: 26 additions & 14 deletions
diff --git a/‎include/model/CDataCategorizer.h‎
Lines changed: 7 additions & 0 deletions b/‎include/model/CDataCategorizer.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎include/model/CResourceMonitor.h‎
Lines changed: 0 additions & 1 deletion b/‎include/model/CResourceMonitor.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎include/model/CTokenListCategory.h‎
Lines changed: 8 additions & 0 deletions b/‎include/model/CTokenListCategory.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎include/model/CTokenListDataCategorizer.h‎
Lines changed: 29 additions & 13 deletions b/‎include/model/CTokenListDataCategorizer.h‎
Lines changed: 29 additions & 13 deletions
@@ -6,6 +6,7 @@
 #ifndef INCLUDED_ml_core_CCsvLineParser_h
 #define INCLUDED_ml_core_CCsvLineParser_h
 
+#include <core/CMemoryUsage.h>
 #include <core/ImportExport.h>
 
 #include <boost/scoped_array.hpp>
@@ -60,6 +61,12 @@ class CORE_EXPORT CCsvLineParser {
     //! Are we at the end of the current line?
     bool atEnd() const;
 
+    //! Debug the memory used by this parser.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+    //! Get the memory used by this parser.
+    std::size_t memoryUsage() const;
+
 private:
     //! Attempt to parse the next token from the working record
     //! into the working field.
 
@@ -6,6 +6,7 @@
 #ifndef INCLUDED_ml_core_CMemory_h
 #define INCLUDED_ml_core_CMemory_h
 
+#include <core/BoostMultiIndex.h>
 #include <core/CLogger.h>
 #include <core/CMemoryUsage.h>
 #include <core/CNonInstantiatable.h>
@@ -14,6 +15,7 @@
 #include <boost/any.hpp>
 #include <boost/circular_buffer_fwd.hpp>
 #include <boost/container/container_fwd.hpp>
+#include <boost/mpl/range_c.hpp>
 #include <boost/optional/optional_fwd.hpp>
 #include <boost/shared_array.hpp>
 #include <boost/type_traits/is_pointer.hpp>
@@ -491,6 +493,30 @@ class CORE_EXPORT CMemory : private CNonInstantiatable {
         return mem + pageVecEntries * sizeof(std::size_t) + numPages * pageSize;
     }
 
+    //! Estimate the dynamic memory held by a boost::multi_index_container.
+    template<typename T, typename I, typename A>
+    static std::size_t
+    dynamicSize(const boost::multi_index::multi_index_container<T, I, A>& t) {
+        // The exact per-entry overhead depends on how many indices there are
+        // and their types, and decoding "typename I" to compute it is not
+        // worth the metaprogramming for our infrequent, simple usage.  So
+        // approximate the overhead as 2 pointers per entry per index.
+        using TContainer = boost::multi_index::multi_index_container<T, I, A>;
+        using TIndexTypeList = typename TContainer::index_type_list;
+        constexpr std::size_t numIndices{boost::mpl::size<TIndexTypeList>::value};
+        constexpr std::size_t perEntry{sizeof(T) +
+                                       2 * numIndices * sizeof(std::size_t)};
+        // Start from the approximate footprint of the entries themselves.
+        std::size_t total = t.size() * perEntry;
+        // Then add what each element owns, unless T never owns anything.
+        if (!memory_detail::SDynamicSizeAlwaysZero<T>::value()) {
+            for (const auto& element : t) {
+                total += dynamicSize(element);
+            }
+        }
+        return total;
+    }
+
     //! Overload for boost::circular_buffer.
     template<typename T, typename A>
     static std::size_t dynamicSize(const boost::circular_buffer<T, A>& t) {
@@ -970,7 +996,7 @@ class CORE_EXPORT CMemoryDebug : private CNonInstantiatable {
         componentName += "_list";
 
         std::size_t listSize = (memory_detail::EXTRA_NODES + t.size()) *
-                               (sizeof(T) + 4 * sizeof(std::size_t));
+                               (sizeof(T) + 2 * sizeof(std::size_t));
 
         CMemoryUsage::SMemoryUsage usage(componentName, listSize);
         CMemoryUsage::TMemoryUsagePtr ptr = mem->addChild();
@@ -1009,6 +1035,37 @@ class CORE_EXPORT CMemoryDebug : private CNonInstantiatable {
         }
     }
 
+    //! Record a debug breakdown of a boost::multi_index_container's memory.
+    //! \p name labels the container in the report, \p t is the container to
+    //! measure and \p mem is the parent node that gains one child for it,
+    //! with each element then reported beneath that child.
+    template<typename T, typename I, typename A>
+    static void dynamicSize(const char* name,
+                            const boost::multi_index::multi_index_container<T, I, A>& t,
+                            CMemoryUsage::TMemoryUsagePtr mem) {
+        // The exact per-entry overhead depends on how many indices there are
+        // and their types, and decoding "typename I" to compute it is not
+        // worth the metaprogramming for our infrequent, simple usage.  So
+        // approximate the overhead as 2 pointers per entry per index.
+        using TContainer = boost::multi_index::multi_index_container<T, I, A>;
+        using TIndexTypeList = typename TContainer::index_type_list;
+        constexpr std::size_t numIndices{boost::mpl::size<TIndexTypeList>::value};
+        constexpr std::size_t perEntry{sizeof(T) +
+                                       2 * numIndices * sizeof(std::size_t)};
+        // Describe the container's own approximate footprint first.
+        const std::string containerName(name);
+        CMemoryUsage::SMemoryUsage containerUsage(
+            containerName + "::" + typeid(T).name(), t.size() * perEntry);
+        CMemoryUsage::TMemoryUsagePtr child = mem->addChild();
+        child->setName(containerUsage);
+
+        // Each element is then reported under that child as "<name>_item".
+        const std::string itemName(containerName + "_item");
+        for (const auto& element : t) {
+            dynamicSize(itemName.c_str(), element, child);
+        }
+    }
+
     //! Overload for boost::circular_buffer.
     template<typename T, typename A>
     static void dynamicSize(const char* name,
 
@@ -7,6 +7,7 @@
 #define INCLUDED_ml_core_CStringSimilarityTester_h
 
 #include <core/CLogger.h>
+#include <core/CMemoryUsage.h>
 #include <core/CNonCopyable.h>
 #include <core/CompressUtils.h>
 #include <core/ImportExport.h>
@@ -286,6 +287,12 @@ class CORE_EXPORT CStringSimilarityTester : private CNonCopyable {
         return currentCol[secondLen];
     }
 
+    //! Debug the memory used by this similarity tester.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+    //! Get the memory used by this similarity tester.
+    std::size_t memoryUsage() const;
+
 private:
     //! Calculate the Levenshtein distance using the naive method of
     //! calculating the entire distance matrix.  This private method
 
@@ -6,6 +6,7 @@
 #ifndef INCLUDED_ml_core_CCompressUtils_h
 #define INCLUDED_ml_core_CCompressUtils_h
 
+#include <core/CMemoryUsage.h>
 #include <core/CNonCopyable.h>
 #include <core/ImportExport.h>
 
@@ -98,6 +99,12 @@ class CORE_EXPORT CCompressUtil : private CNonCopyable {
     //! error, it may be desirable to explicitly reset the state.
     void reset();
 
+    //! Debug the memory used by these compression utils.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+    //! Get the memory used by these compression utils.
+    std::size_t memoryUsage() const;
+
 protected:
     //! Get the underlying stream.
     z_stream& stream();
@@ -191,26 +198,26 @@ class CORE_EXPORT CCompressUtil : private CNonCopyable {
 class CORE_EXPORT CDeflator final : public CCompressUtil {
 public:
     CDeflator(bool lengthOnly, int level = Z_DEFAULT_COMPRESSION);
-    ~CDeflator();
+    ~CDeflator() override;
 
 private:
     //! Process a chunk of state (optionally flushing).
-    virtual int streamProcessChunk(int flush);
+    int streamProcessChunk(int flush) override;
     //! Reset the underlying stream.
-    virtual int resetStream();
+    int resetStream() override;
 };
 
 //! \brief Implementation of CompressUtil for inflating data.
 class CORE_EXPORT CInflator final : public CCompressUtil {
 public:
     CInflator(bool lengthOnly);
-    ~CInflator();
+    ~CInflator() override;
 
 private:
     //! Process a chunk of state (optionally flushing).
-    virtual int streamProcessChunk(int flush);
+    int streamProcessChunk(int flush) override;
     //! Reset the underlying stream.
-    virtual int resetStream();
+    int resetStream() override;
 };
 }
 }
 
@@ -102,16 +102,16 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
                                   const std::string& fieldName);
 
     //! Dump stats
-    virtual void dumpStats() const;
+    void dumpStats() const override;
 
     //! Compute a category from a string.  The raw string length may be longer
     //! than the length of the passed string, because the passed string may
     //! have the date stripped out of it.  Field names/values are available
     //! to the category computation.
-    virtual int computeCategory(bool dryRun,
-                                const TStrStrUMap& fields,
-                                const std::string& str,
-                                size_t rawStringLen);
+    int computeCategory(bool dryRun,
+                        const TStrStrUMap& fields,
+                        const std::string& str,
+                        size_t rawStringLen) override;
 
     // Bring the other overload of computeCategory() into scope
     using CDataCategorizer::computeCategory;
@@ -120,23 +120,29 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
     //! that are classified as the given category.  Note that the reverse search
     //! is only approximate - it may select more records than have actually
     //! been classified as the returned category.
-    virtual bool createReverseSearch(int categoryId,
-                                     std::string& part1,
-                                     std::string& part2,
-                                     size_t& maxMatchingLength,
-                                     bool& wasCached);
+    bool createReverseSearch(int categoryId,
+                             std::string& part1,
+                             std::string& part2,
+                             size_t& maxMatchingLength,
+                             bool& wasCached) override;
 
     //! Has the data categorizer's state changed?
-    virtual bool hasChanged() const;
+    bool hasChanged() const override;
 
     //! Populate the object from part of a state document
-    virtual bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
+    bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) override;
 
     //! Persist state by passing information to the supplied inserter
-    virtual void acceptPersistInserter(core::CStatePersistInserter& inserter) const;
+    void acceptPersistInserter(core::CStatePersistInserter& inserter) const override;
 
     //! Make a function that can be called later to persist state
-    virtual TPersistFunc makePersistFunc() const;
+    TPersistFunc makePersistFunc() const override;
+
+    //! Debug the memory used by this categorizer.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const override;
+
+    //! Get the memory used by this categorizer.
+    std::size_t memoryUsage() const override;
 
 protected:
     //! Split the string into a list of tokens.  The result of the
@@ -205,6 +211,12 @@ class MODEL_EXPORT CBaseTokenListDataCategorizer : public CDataCategorizer {
         //! Increment the category count
         void incCategoryCount();
 
+        //! Debug the memory used by this item.
+        void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+        //! Get the memory used by this item.
+        std::size_t memoryUsage() const;
+
     private:
         //! String value of the token
         std::string m_Str;
 
@@ -6,6 +6,7 @@
 #ifndef INCLUDED_ml_model_CDataCategorizer_h
 #define INCLUDED_ml_model_CDataCategorizer_h
 
+#include <core/CMemoryUsage.h>
 #include <core/CoreTypes.h>
 
 #include <model/ImportExport.h>
@@ -100,6 +101,12 @@ class MODEL_EXPORT CDataCategorizer {
     //! Set last persistence time
     void lastPersistTime(core_t::TTime lastPersistTime);
 
+    //! Debug the memory used by this categorizer.
+    virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+    //! Get the memory used by this categorizer.
+    virtual std::size_t memoryUsage() const;
+
 protected:
     //! Used if no fields are supplied to the computeCategory() method.
     static const TStrStrUMap EMPTY_FIELDS;
 
@@ -32,7 +32,6 @@ namespace model {
 
 class CAnomalyDetector;
 class CAnomalyDetectorModel;
-class CResourcePruner;
 
 //! \brief Assess memory used by models and decide on further memory allocations.
 //!
 
@@ -6,6 +6,8 @@
 #ifndef INCLUDED_ml_model_CTokenListCategory_h
 #define INCLUDED_ml_model_CTokenListCategory_h
 
+#include <core/CMemoryUsage.h>
+
 #include <model/ImportExport.h>
 
 #include <map>
@@ -113,6 +115,12 @@ class MODEL_EXPORT CTokenListCategory {
     //! Set the cached reverse search
     void cacheReverseSearch(const std::string& part1, const std::string& part2);
 
+    //! Debug the memory used by this category.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
+
+    //! Get the memory used by this category.
+    std::size_t memoryUsage() const;
+
 private:
     bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);
 
 
@@ -7,6 +7,7 @@
 #define INCLUDED_ml_model_CTokenListDataCategorizer_h
 
 #include <core/CLogger.h>
+#include <core/CMemory.h>
 #include <core/CStringSimilarityTester.h>
 #include <core/CTimeUtils.h>
 #include <core/CWordDictionary.h>
@@ -58,15 +59,30 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
         : CBaseTokenListDataCategorizer(reverseSearchCreator, threshold, fieldName),
           m_Dict(core::CWordDictionary::instance()) {}
 
+    //! Report memory usage: base class state as a child node, then the tester.
+    void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const override {
+        mem->setName("CTokenListDataCategorizer");
+        CBaseTokenListDataCategorizer::debugMemoryUsage(mem->addChild());
+        core::CMemoryDebug::dynamicSize("m_SimilarityTester", m_SimilarityTester, mem);
+    }
+
+    //! Total memory: the base categorizer's usage plus the similarity tester's.
+    std::size_t memoryUsage() const override {
+        // Evaluate the base class first, then the similarity tester member.
+        const std::size_t baseUsage{CBaseTokenListDataCategorizer::memoryUsage()};
+        const std::size_t testerUsage{core::CMemory::dynamicSize(m_SimilarityTester)};
+        return baseUsage + testerUsage;
+    }
+
 protected:
     //! Split the string into a list of tokens.  The result of the
     //! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
     //! \p totalWeight.  Any previous content of these variables is wiped.
-    virtual void tokeniseString(const TStrStrUMap& fields,
-                                const std::string& str,
-                                TSizeSizePrVec& tokenIds,
-                                TSizeSizeMap& tokenUniqueIds,
-                                size_t& totalWeight) {
+    void tokeniseString(const TStrStrUMap& fields,
+                        const std::string& str,
+                        TSizeSizePrVec& tokenIds,
+                        TSizeSizeMap& tokenUniqueIds,
+                        size_t& totalWeight) override {
         tokenIds.clear();
         tokenUniqueIds.clear();
         totalWeight = 0;
@@ -115,10 +131,10 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
 
     //! Take a string token, convert it to a numeric ID and a weighting and
     //! add these to the provided data structures.
-    virtual void tokenToIdAndWeight(const std::string& token,
-                                    TSizeSizePrVec& tokenIds,
-                                    TSizeSizeMap& tokenUniqueIds,
-                                    size_t& totalWeight) {
+    void tokenToIdAndWeight(const std::string& token,
+                            TSizeSizePrVec& tokenIds,
+                            TSizeSizeMap& tokenUniqueIds,
+                            size_t& totalWeight) override {
         TSizeSizePr idWithWeight(this->idForToken(token), 1);
 
         if (token.length() >= MIN_DICTIONARY_LENGTH) {
@@ -131,10 +147,10 @@ class CTokenListDataCategorizer : public CBaseTokenListDataCategorizer {
     }
 
     //! Compute similarity between two vectors
-    virtual double similarity(const TSizeSizePrVec& left,
-                              size_t leftWeight,
-                              const TSizeSizePrVec& right,
-                              size_t rightWeight) const {
+    double similarity(const TSizeSizePrVec& left,
+                      size_t leftWeight,
+                      const TSizeSizePrVec& right,
+                      size_t rightWeight) const override {
         double similarity(1.0);
 
         size_t maxWeight(std::max(leftWeight, rightWeight));