elastic · tveasey · Dec 11, 2018 · Dec 7, 2018 · Dec 10, 2018 · Dec 10, 2018
diff --git a/bin/data_frame_analyzer/Main.cc b/bin/data_frame_analyzer/Main.cc
@@ -103,13 +103,14 @@ int main(int argc, char** argv) {
         return EXIT_FAILURE;
     }
 
-    ml::api::CDataFrameAnalysisSpecification analysisSpecification{analysisSpecificationJson};
-    if (analysisSpecification.bad()) {
+    auto analysisSpecification =
+        std::make_unique<ml::api::CDataFrameAnalysisSpecification>(analysisSpecificationJson);
+    if (analysisSpecification->bad()) {
         LOG_FATAL("Failed to parse analysis specification");
         return EXIT_FAILURE;
     }
-    if (analysisSpecification.threads() > 1) {
-        ml::core::startDefaultAsyncExecutor(analysisSpecification.threads());
+    if (analysisSpecification->numberThreads() > 1) {
+        ml::core::startDefaultAsyncExecutor(analysisSpecification->numberThreads());
     }
 
     ml::api::CDataFrameAnalyzer dataFrameAnalyzer{

diff --git a/include/api/CDataFrameAnalysisRunner.h b/include/api/CDataFrameAnalysisRunner.h
@@ -23,6 +23,10 @@
 namespace ml {
 namespace core {
 class CDataFrame;
+class CRapidJsonConcurrentLineWriter;
+namespace data_frame_detail {
+class CRowRef;
+}
 }
 namespace api {
 class CDataFrameAnalysisSpecification;
@@ -53,6 +57,7 @@ class CDataFrameAnalysisSpecification;
 class API_EXPORT CDataFrameAnalysisRunner {
 public:
     using TStrVec = std::vector<std::string>;
+    using TRowRef = core::data_frame_detail::CRowRef;
 
 public:
     CDataFrameAnalysisRunner(const CDataFrameAnalysisSpecification& spec);
@@ -67,9 +72,25 @@ class API_EXPORT CDataFrameAnalysisRunner {
     //! into main memory during an analysis.
     virtual std::size_t numberOfPartitions() const = 0;
 
-    //! \return The number of columns this analysis requires. This includes
-    //! the columns of the input frame plus any that the analysis will append.
-    virtual std::size_t requiredFrameColumns() const = 0;
+    //! \return The number of columns this analysis appends.
+    virtual std::size_t numberExtraColumns() const = 0;
+
+    //! Write the extra columns of \p row added by the analysis to \p writer.
+    //!
+    //! This should create a new object of the form:
+    //! <pre>
+    //! {
+    //!   "name of column n":   "value of column n",
+    //!   "name of column n+1": "value of column n+1",
+    //!   ...
+    //! }
+    //! </pre>
+    //! with one named member for each column added.
+    //!
+    //! \param[in] row The row for which to write the columns added by this analysis.
+    //! \param[in,out] writer The stream to which to write the extra columns.
+    virtual void writeOneRow(TRowRef row,
+                             core::CRapidJsonConcurrentLineWriter& writer) const = 0;
 
     //! Checks whether the analysis is already running and if not launches it
     //! in the background.

diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h
@@ -74,20 +74,29 @@ class API_EXPORT CDataFrameAnalysisSpecification {
     CDataFrameAnalysisSpecification(TRunnerFactoryUPtrVec runnerFactories,
                                     const std::string& jsonSpecification);
 
+    CDataFrameAnalysisSpecification(const CDataFrameAnalysisSpecification&) = delete;
+    CDataFrameAnalysisSpecification& operator=(const CDataFrameAnalysisSpecification&) = delete;
+    CDataFrameAnalysisSpecification(CDataFrameAnalysisSpecification&&) = delete;
+    CDataFrameAnalysisSpecification& operator=(CDataFrameAnalysisSpecification&&) = delete;
+
     //! Check if the specification is bad.
     bool bad() const;
 
     //! \return The number of rows in the frame.
-    std::size_t rows() const;
+    std::size_t numberRows() const;
 
     //! \return The number of columns in the input frame.
-    std::size_t cols() const;
+    std::size_t numberColumns() const;
+
+    //! \return The number of columns the analysis configured to run will append
+    //! to the data frame.
+    std::size_t numberExtraColumns() const;
 
     //! \return The memory limit for the process.
     std::size_t memoryLimit() const;
 
     //! \return The number of threads the analysis can use.
-    std::size_t threads() const;
+    std::size_t numberThreads() const;
 
     //! Run the analysis in a background thread.
     //!
@@ -107,10 +116,10 @@ class API_EXPORT CDataFrameAnalysisSpecification {
 
 private:
     bool m_Bad = false;
-    std::size_t m_Rows = 0;
-    std::size_t m_Cols = 0;
+    std::size_t m_NumberRows = 0;
+    std::size_t m_NumberColumns = 0;
     std::size_t m_MemoryLimit = 0;
-    std::size_t m_Threads = 0;
+    std::size_t m_NumberThreads = 0;
     // TODO Sparse table support
     // double m_TableLoadFactor = 0.0;
     TRunnerFactoryUPtrVec m_RunnerFactories;

diff --git a/include/api/CDataFrameAnalyzer.h b/include/api/CDataFrameAnalyzer.h
@@ -3,12 +3,10 @@
  * or more contributor license agreements. Licensed under the Elastic License;
  * you may not use this file except in compliance with the Elastic License.
  */
+
 #ifndef INCLUDED_ml_api_CDataFrameAnalyzer_h
 #define INCLUDED_ml_api_CDataFrameAnalyzer_h
 
-#include <core/CDataFrame.h>
-
-#include <api/CDataFrameAnalysisSpecification.h>
 #include <api/ImportExport.h>
 
 #include <cinttypes>
@@ -19,9 +17,12 @@
 
 namespace ml {
 namespace core {
+class CDataFrame;
 class CJsonOutputStreamWrapper;
 }
 namespace api {
+class CDataFrameAnalysisRunner;
+class CDataFrameAnalysisSpecification;
 
 //! \brief Handles input to the data_frame_analyzer command.
 class API_EXPORT CDataFrameAnalyzer {
@@ -30,39 +31,49 @@ class API_EXPORT CDataFrameAnalyzer {
     using TJsonOutputStreamWrapperUPtr = std::unique_ptr<core::CJsonOutputStreamWrapper>;
     using TJsonOutputStreamWrapperUPtrSupplier =
         std::function<TJsonOutputStreamWrapperUPtr()>;
+    using TDataFrameAnalysisSpecificationUPtr = std::unique_ptr<CDataFrameAnalysisSpecification>;
 
 public:
-    explicit CDataFrameAnalyzer(CDataFrameAnalysisSpecification analysisSpecification,
-                                TJsonOutputStreamWrapperUPtrSupplier outStreamSupplier);
+    CDataFrameAnalyzer(TDataFrameAnalysisSpecificationUPtr analysisSpecification,
+                       TJsonOutputStreamWrapperUPtrSupplier outStreamSupplier);
+    ~CDataFrameAnalyzer();
 
     //! This is true if the analyzer is receiving control messages.
     bool usingControlMessages() const;
 
-    //! Handle adding a row of the data frame or a control message.
+    //! Handle receiving a row of the data frame or a control message.
     bool handleRecord(const TStrVec& fieldNames, const TStrVec& fieldValues);
 
+    //! Call when all rows have been received.
+    void receivedAllRows();
+
     //! Run the configured analysis.
     void run();
 
+private:
+    using TDataFrameUPtr = std::unique_ptr<core::CDataFrame>;
+
 private:
     static const std::ptrdiff_t CONTROL_FIELD_UNSET{-2};
     static const std::ptrdiff_t CONTROL_FIELD_MISSING{-1};
 
 private:
+    bool sufficientFieldValues(const TStrVec& fieldNames) const;
     bool readyToReceiveControlMessages() const;
     bool prepareToReceiveControlMessages(const TStrVec& fieldNames);
     bool isControlMessage(const TStrVec& fieldValues) const;
     bool handleControlMessage(const TStrVec& fieldValues);
     void addRowToDataFrame(const TStrVec& fieldValues);
+    void writeResultsOf(const CDataFrameAnalysisRunner& analysis) const;
 
 private:
     // This has values: -2 (unset), -1 (missing), >= 0 (control field index).
-    std::ptrdiff_t m_ControlFieldValue = CONTROL_FIELD_UNSET;
+    std::ptrdiff_t m_ControlFieldIndex = CONTROL_FIELD_UNSET;
     std::ptrdiff_t m_BeginDataFieldValues = -1;
     std::ptrdiff_t m_EndDataFieldValues = -1;
     std::uint64_t m_BadValueCount;
-    CDataFrameAnalysisSpecification m_AnalysisSpecification;
-    core::CDataFrame m_DataFrame;
+    TDataFrameAnalysisSpecificationUPtr m_AnalysisSpecification;
+    TDataFrameUPtr m_DataFrame;
     TJsonOutputStreamWrapperUPtrSupplier m_OutStreamSupplier;
 };
 }

diff --git a/include/api/CDataFrameOutliersRunner.h b/include/api/CDataFrameOutliersRunner.h
@@ -4,6 +4,9 @@
  * you may not use this file except in compliance with the Elastic License.
  */
 
+#ifndef INCLUDED_ml_api_CDataFrameOutliersRunner_h
+#define INCLUDED_ml_api_CDataFrameOutliersRunner_h
+
 #include <api/CDataFrameAnalysisRunner.h>
 
 #include <api/ImportExport.h>
@@ -26,8 +29,11 @@ class API_EXPORT CDataFrameOutliersRunner : public CDataFrameAnalysisRunner {
     //! \sa CDataFrameAnalysisRunner::run.
     virtual std::size_t numberOfPartitions() const;
 
-    //! \return The number of columns of the output frame.
-    virtual std::size_t requiredFrameColumns() const;
+    //! \return The number of columns this adds to the data frame.
+    virtual std::size_t numberExtraColumns() const;
+
+    //! Write the extra columns of \p row added by outlier analysis to \p writer.
+    virtual void writeOneRow(TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const;
 
 private:
     using TOptionalSize = boost::optional<std::size_t>;
@@ -62,3 +68,5 @@ class API_EXPORT CDataFrameOutliersRunnerFactory : public CDataFrameAnalysisRunn
 };
 }
 }
+
+#endif // INCLUDED_ml_api_CDataFrameOutliersRunner_h
diff --git a/include/core/CDataFrame.h b/include/core/CDataFrame.h
@@ -207,9 +207,10 @@ class CORE_EXPORT CDataFrame final {
                EReadWriteToStorage readAndWriteToStoreSyncStrategy,
                const TWriteSliceToStoreFunc& writeSliceToStore);
 
+    ~CDataFrame();
+
     CDataFrame(const CDataFrame&) = delete;
     CDataFrame& operator=(const CDataFrame&) = delete;
-
     CDataFrame(CDataFrame&&) = default;
     CDataFrame& operator=(CDataFrame&&) = default;
 
@@ -415,10 +416,11 @@ class CORE_EXPORT CDataFrame final {
 //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
 //! from slice storage are synchronous or asynchronous.
 CORE_EXPORT
-CDataFrame makeMainStorageDataFrame(std::size_t numberColumns,
-                                    boost::optional<std::size_t> sliceCapacity = boost::none,
-                                    CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
-                                        CDataFrame::EReadWriteToStorage::E_Sync);
+std::unique_ptr<CDataFrame>
+makeMainStorageDataFrame(std::size_t numberColumns,
+                         boost::optional<std::size_t> sliceCapacity = boost::none,
+                         CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
+                             CDataFrame::EReadWriteToStorage::E_Sync);
 
 //! Make a data frame which uses disk storage for its slices.
 //!
@@ -431,12 +433,13 @@ CDataFrame makeMainStorageDataFrame(std::size_t numberColumns,
 //! \param[in] readWriteToStoreSyncStrategy Controls whether reads and writes
 //! from slice storage are synchronous or asynchronous.
 CORE_EXPORT
-CDataFrame makeDiskStorageDataFrame(const std::string& rootDirectory,
-                                    std::size_t numberColumns,
-                                    std::size_t numberRows,
-                                    boost::optional<std::size_t> sliceCapacity = boost::none,
-                                    CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
-                                        CDataFrame::EReadWriteToStorage::E_Async);
+std::unique_ptr<CDataFrame>
+makeDiskStorageDataFrame(const std::string& rootDirectory,
+                         std::size_t numberColumns,
+                         std::size_t numberRows,
+                         boost::optional<std::size_t> sliceCapacity = boost::none,
+                         CDataFrame::EReadWriteToStorage readWriteToStoreSyncStrategy =
+                             CDataFrame::EReadWriteToStorage::E_Async);
 }
 }
 

diff --git a/include/core/CRapidJsonConcurrentLineWriter.h b/include/core/CRapidJsonConcurrentLineWriter.h
@@ -49,10 +49,11 @@ class CORE_EXPORT CRapidJsonConcurrentLineWriter
     std::size_t memoryUsage() const;
 
     //! Write JSON document to outputstream
-    //! Note this non-virtual overwrite is needed to avoid slicing of the writer
-    //! and hence ensure the correct EndObject is called
-    //! \p doc reference to rapidjson document value
-    void write(rapidjson::Value& doc) { doc.Accept(*this); }
+    //! \note This overwrite is needed because the members of rapidjson::Writer
+    //! are not virtual and we need to avoid "slicing" the writer to ensure
+    //! that the correct StartObject/EndObject functions are called when this is
+    //! passed to the Accept function of \p doc.
+    void write(const rapidjson::Value& doc) { doc.Accept(*this); }
 
 private:
     //! The stream object

diff --git a/include/core/CRapidJsonLineWriter.h b/include/core/CRapidJsonLineWriter.h
@@ -55,10 +55,11 @@ class CRapidJsonLineWriter
     }
 
     //! Write JSON document to outputstream
-    //! Note this non-virtual overwrite is needed to avoid slicing of the writer
-    //! and hence ensure the correct StartObject/EndObject functions are called
-    //! \p doc reference to rapidjson document value
-    void write(rapidjson::Value& doc) { doc.Accept(*this); }
+    //! \note This overwrite is needed because the members of rapidjson::Writer
+    //! are not virtual and we need to avoid "slicing" the writer to ensure
+    //! that the correct StartObject/EndObject functions are called when this is
+    //! passed to the Accept function of \p doc.
+    void write(const rapidjson::Value& doc) { doc.Accept(*this); }
 
 private:
     size_t m_ObjectCount = 0;

diff --git a/include/core/CRapidJsonWriterBase.h b/include/core/CRapidJsonWriterBase.h
@@ -218,7 +218,7 @@ class CRapidJsonWriterBase
 
     //! write the rapidjson value document to the output stream
     //! \p[in] doc rapidjson document value to write out
-    virtual void write(TValue& doc) { doc.Accept(*this); }
+    virtual void write(const TValue& doc) { doc.Accept(*this); }
 
     //! Return a new rapidjson document
     TDocument makeDoc() const {

diff --git a/include/core/CStaticThreadPool.h b/include/core/CStaticThreadPool.h
@@ -71,8 +71,9 @@ class CORE_EXPORT CStaticThreadPool {
     void worker(std::size_t id);
 
 private:
-    // This doesn't have to be atomic because it is always only set to true and
-    // always set straight before it is checked on each worker in the pool.
+    // This doesn't have to be atomic because it is always only set to true,
+    // always set straight before it is checked on each worker in the pool
+    // and tearing can't happen for single byte writes.
     bool m_Done = false;
     std::atomic_bool m_Busy;
     std::atomic<std::uint64_t> m_Cursor;