
Commit ea5c0e9

add the profiling_trace feature (#222)

* add the profiling_trace feature
* amend the profiling_trace.hpp
* amend the model.cpp
* add the reshape time
* fix an error
* fix the cpplint

Co-authored-by: Dong, Bo <[email protected]>
1 parent ae8476e commit ea5c0e9

File tree (5 files changed, +251 -7 lines):

  nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp
  nlp_toolkit/backends/neural_engine/executor/include/model.hpp
  nlp_toolkit/backends/neural_engine/executor/include/operator.hpp
  nlp_toolkit/backends/neural_engine/executor/include/profiling_trace.hpp
  nlp_toolkit/backends/neural_engine/executor/src/model.cpp

nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp

Lines changed: 12 additions & 0 deletions
@@ -256,6 +256,18 @@ class Dispatcher {
     kernel_handler_[execute_kernel_]->set_perf_ratio_id(perf_ratio_id);
   }
   inline const string& perf_ratio_id() { return kernel_handler_[execute_kernel_]->perf_ratio_id(); }
+  inline void set_it_shape(const vector<int64_t> input_shape) {
+    kernel_handler_[execute_kernel_]->set_it_shape(input_shape); }
+  inline void set_ot_shape(const vector<int64_t> output_shape) {
+    kernel_handler_[execute_kernel_]->set_ot_shape(output_shape); }
+  inline const vector<vector<int64_t>>& get_it_shape() {
+    return kernel_handler_[execute_kernel_]->get_it_shape(); }
+  inline const vector<vector<int64_t>>& get_ot_shape() {
+    return kernel_handler_[execute_kernel_]->get_ot_shape(); }
+  inline void set_reshape_time(const float reshape_time_) {
+    kernel_handler_[execute_kernel_]->set_reshape_time(reshape_time_); }
+  inline const vector<float>& get_reshape_time() {
+    return kernel_handler_[execute_kernel_]->get_reshape_time(); }

 protected:
   // get input_hash

nlp_toolkit/backends/neural_engine/executor/include/model.hpp

Lines changed: 3 additions & 1 deletion
@@ -25,7 +25,6 @@
 #include <utility>
 #include <vector>
 #include <unordered_map>
-
 #include "common.hpp"
 #include "glog/logging.h"
 #include "memory_allocator.hpp"
@@ -34,6 +33,7 @@
 #include "operator_registry.hpp"
 #include "tensor.hpp"
 #include "thread_pool.hpp"
+#include "profiling_trace.hpp"

 namespace executor {

@@ -110,6 +110,8 @@ class Model {
   void ProfilingSparseEstimate(FILE* fp, const shared_ptr<Dispatcher>& op,
                                const float average_latency = 0.);

+
+
 protected:
   string name_;
   string weight_root_;

nlp_toolkit/backends/neural_engine/executor/include/operator.hpp

Lines changed: 10 additions & 0 deletions
@@ -98,6 +98,13 @@ class Operator {
   inline const string& table_id() const { return table_id_; }
   inline void set_perf_ratio_id(const string& perf_ratio_id) { perf_ratio_id_ = perf_ratio_id; }
   inline const string& perf_ratio_id() const { return perf_ratio_id_; }
+  inline void set_it_shape(const vector<int64_t> input_shape) { input_tensor_shape.emplace_back(input_shape); }
+  inline void set_ot_shape(const vector<int64_t> output_shape) { output_tensor_shape.emplace_back(output_shape); }
+  inline const vector<vector<int64_t>>& get_it_shape() const { return input_tensor_shape; }
+  inline const vector<vector<int64_t>>& get_ot_shape() const { return output_tensor_shape; }
+  // record per-iteration reshape time so profiling can report kernel execution time plus reshape time
+  inline void set_reshape_time(const float reshape_time_) { reshape_time.emplace_back(reshape_time_); }
+  inline const vector<float>& get_reshape_time() const { return reshape_time; }

 protected:
   /** The conf that stores the operator configurations */
@@ -117,6 +124,9 @@ class Operator {
   vector<int64_t> weight_shape_;
   string table_id_;
   string perf_ratio_id_;
+  vector<vector<int64_t>> input_tensor_shape;
+  vector<vector<int64_t>> output_tensor_shape;
+  vector<float> reshape_time;
 };  // class Operator

 }  // namespace executor
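
The three new members are flat, append-only logs: each profiled Forward call appends one entry per input tensor to input_tensor_shape, one output shape per operator to output_tensor_shape, and one entry to reshape_time, so all records for iteration i sit at consecutive indices. That is the layout ProfilingTracer::TensorsShape (next file) depends on. A minimal sketch of the implied lookup; the helper name and the num_inputs parameter are illustrative, not part of the source:

// Sketch: recovering iteration i's shape records from Operator's flat logs,
// assuming the operator has num_inputs input tensors. Records for iteration i
// occupy indices [i * num_inputs, (i + 1) * num_inputs).
#include <cstdint>
#include <vector>

std::vector<int64_t> input_shape_at(
    const std::vector<std::vector<int64_t>>& it_shapes,
    int i, int num_inputs, int j) {
  return it_shapes[i * num_inputs + j];  // iteration i, input tensor j
}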
nlp_toolkit/backends/neural_engine/executor/include/profiling_trace.hpp

Lines changed: 186 additions & 0 deletions

@@ -0,0 +1,186 @@
+// Copyright (c) 2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
+#define ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
+
+#include <iostream>
+#include <fstream>
+#include <memory>
+#include <vector>
+#include <string>
+#include "operator.hpp"
+#include "dispatcher.hpp"
+#include "tensor.hpp"
+
+namespace executor {
+class ProfilingTracer {
+ public:
+  ProfilingTracer() : TotalTime(0), iterations_during() {}
+
+  void BeginTrace(const std::string& filepath = "result.json") {
+    OutputStream.open(filepath);
+    TracerHeader();
+  }
+
+  void EndTrace() {
+    TracerFooter();
+    OutputStream.close();
+  }
+
+  void WriteProfile(const vector<shared_ptr<Dispatcher>>& operators_, const vector<vector<Tensor*>>& input_tensors,
+                    const vector<vector<Tensor*>>& output_tensors) {
+    IterationTotalTime(operators_);
+    OutputStream << "{";
+    OutputStream << "\"cat\":\"inference\",";
+    OutputStream << "\"dur\":" << TotalTime * 1000 << ",";
+    OutputStream << "\"name\":\"" << "model_inference" << "\",";
+    OutputStream << "\"ph\":\"X\",";
+    OutputStream << "\"pid\": 0,";
+    OutputStream << "\"tid\": \"" << "inference" << "\",";
+    OutputStream << "\"ts\": " << 0;
+    OutputStream << "}";
+    float iter_start = 0;
+    for (int i = 0; i < operators_[1]->latency().size(); ++i) {
+      OutputStream << ",";
+      OutputStream << "{";
+      OutputStream << "\"cat\":\"" << "iteration" << "\",";
+      OutputStream << "\"dur\":" << iterations_during[i] * 1000 << ",";
+      OutputStream << "\"name\":\"" << "Iteration" << i << "\",";
+      OutputStream << "\"ph\":\"X\",";
+      OutputStream << "\"pid\": 0,";
+      OutputStream << "\"tid\": \"" << "Iteration" << "\",";
+      OutputStream << "\"ts\":" << iter_start * 1000;
+      OutputStream << "}";
+      float op_start = 0;
+      for (int j = 1; j < operators_.size() - 1; ++j) {
+        const shared_ptr<Dispatcher>& op = operators_[j];
+        vector<Tensor*> its = input_tensors[j];
+        vector<Tensor*> ots = output_tensors[j];
+        OutputStream << ",";
+        OutputStream << "{";
+        OutputStream << "\"cat\":\"" << op->type() << "\",";
+        OutputStream << "\"dur\":" << op->latency()[i] * 1000 + op->get_reshape_time()[i] * 1000 << ",";
+        OutputStream << "\"name\":\"" << op->name() << "\",";
+        OutputStream << "\"ph\":\"X\",";
+        OutputStream << "\"pid\": 0,";
+        OutputStream << "\"tid\": \"" << "Operator" << "\",";
+        OutputStream << "\"ts\":" << (op_start + iter_start) * 1000 << ",";
+        OutputStream << "\"args\": {";
+        if (!op->post_op().empty()) {
+          OutputStream << "\"post_op\" :\"" << op->post_op() << "\",";
+        }
+        OutputStream << "\"reshape_time\" :\"" << op->get_reshape_time()[i] << "ms" << "\",";
+        OutputStream << "\"forward_time\" :\"" << op->latency()[i] << "ms" << "\",";
+        OutputStream << "\"input_tensor_name\" :\"" << TensorsName(its) << "\",";
+        OutputStream << "\"input_type\" :\"" << TensorsType(its) << "\",";
+        OutputStream << "\"input_shape\" : " << TensorsShape(op->get_it_shape(), i, its.size()) << ",";
+        OutputStream << "\"output_tensor_name\" :\"" << TensorsName(ots) << "\",";
+        OutputStream << "\"output_type\" :\"" << TensorsType(ots) << "\",";
+        OutputStream << "\"output_shape\" : " << TensorsShape(op->get_ot_shape(), i, 1);
+        OutputStream << "}";
+        OutputStream << "}";
+        OutputStream.flush();
+        op_start += (op->latency()[i] + op->get_reshape_time()[i]);
+      }
+      iter_start += iterations_during[i];
+    }
+  }
+
+  void TracerHeader() {
+    OutputStream << "{\"otherData\": {}, \"traceEvents\": [";
+    OutputStream.flush();
+  }
+
+  void TracerFooter() {
+    OutputStream << "]}";
+    OutputStream.flush();
+  }
+
+  // get total time and per-iteration time
+  void IterationTotalTime(const vector<shared_ptr<Dispatcher>>& operators_) {
+    for (int i = 0; i < operators_[1]->latency().size(); ++i) {
+      float PerIterTime = 0;
+      for (int j = 1; j < operators_.size() - 1; ++j) {
+        PerIterTime += operators_[j]->get_reshape_time()[i];
+        PerIterTime += operators_[j]->latency()[i];
+      }
+      iterations_during.emplace_back(PerIterTime);
+      TotalTime += PerIterTime;
+    }
+  }
+
+  std::string TensorsName(const vector<Tensor*>& Tensors) {
+    std::string result = "";
+    for (int i = 0; i < Tensors.size(); ++i) {
+      if (i == Tensors.size() - 1) {
+        result += Tensors[i]->name();
+      } else {
+        result += Tensors[i]->name();
+        result += ",";
+      }
+    }
+    return result;
+  }
+
+  std::string TensorsType(const vector<Tensor*>& Tensors) {
+    std::string result = "";
+    for (int i = 0; i < Tensors.size(); ++i) {
+      if (i == Tensors.size() - 1) {
+        result += Tensors[i]->dtype();
+      } else {
+        result += Tensors[i]->dtype();
+        result += ",";
+      }
+    }
+    return result;
+  }
+
+  std::string TensorsShape(const vector<vector<int64_t>>& tensor_shape,
+                           int iteration_time, int tensor_size) {
+    std::string result = "\"";
+    for (int i = iteration_time * tensor_size; i < (iteration_time + 1) * tensor_size; ++i) {
+      if (i == (iteration_time + 1) * tensor_size - 1) {
+        for (int j = 0; j < tensor_shape[i].size(); ++j) {
+          if (j == tensor_shape[i].size() - 1) {
+            result += std::to_string(tensor_shape[i][j]);
+          } else {
+            result += std::to_string(tensor_shape[i][j]);
+            result += "*";
+          }
+        }
+      } else {
+        for (int j = 0; j < tensor_shape[i].size(); ++j) {
+          if (j == tensor_shape[i].size() - 1) {
+            result += std::to_string(tensor_shape[i][j]);
+            result += ",";
+          } else {
+            result += std::to_string(tensor_shape[i][j]);
+            result += "*";
+          }
+        }
+      }
+    }
+    result += "\"";
+    return result;
+  }
+
+ protected:
+  std::ofstream OutputStream;
+  float TotalTime;
+  vector<float> iterations_during;
+};
+
+}  // namespace executor
+
+#endif  // ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
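
The tracer's whole lifecycle is three calls, which is exactly how Model::Profiling drives it in the model.cpp diff below. A minimal standalone sketch of that sequence; the function name DumpTrace and the output path trace.json are illustrative, and the three containers are assumed to be Model's operators_/input_vecs_/output_vecs_ members after a profiled run:

// Sketch: the BeginTrace / WriteProfile / EndTrace sequence used by
// Model::Profiling. Each profiled Forward() must already have recorded
// latency, input/output shapes, and reshape time on every operator.
#include <memory>
#include <vector>
#include "profiling_trace.hpp"

void DumpTrace(const std::vector<std::shared_ptr<executor::Dispatcher>>& operators_,
               const std::vector<std::vector<executor::Tensor*>>& input_vecs_,
               const std::vector<std::vector<executor::Tensor*>>& output_vecs_) {
  executor::ProfilingTracer tracer;
  tracer.BeginTrace("trace.json");  // open the file and write the traceEvents header
  tracer.WriteProfile(operators_, input_vecs_, output_vecs_);  // one duration event per op per iteration
  tracer.EndTrace();  // close the JSON array and the stream
}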

nlp_toolkit/backends/neural_engine/executor/src/model.cpp

Lines changed: 40 additions & 6 deletions
@@ -249,6 +249,7 @@ void Model::SetDispatchKernel(const bool& reshape_model) {
 }

 vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
+  float It_start = Time("start");
   CHECK_EQ(input_data.size(), model_input_tensors_.size())
       << "input data size not equal with model input tensor size....";
   // if we want use dynamic input data shape at run time, we should check the
@@ -275,16 +276,29 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
     }
   }
   for (int i = 0; i < input_data.size(); ++i) {
-  // model_input_tesnor_[i]->free_data();
-  model_input_tensors_[i]->set_data(input_data[i].mutable_data());
-  model_input_tensors_[i]->set_shape(input_data[i].shape());
+    // model_input_tesnor_[i]->free_data();
+    model_input_tensors_[i]->set_data(input_data[i].mutable_data());
+    model_input_tensors_[i]->set_shape(input_data[i].shape());
   }

   SetDispatchKernel(reshape_model);

   if (!is_dispatcher_tuning_) {
-    if (reshape_model) {
-      for (int i = 0; i < operators_.size(); ++i) {
+    if (reshape_model && engine_profiling_) {
+      for (int i = 0; i < operators_.size(); ++i) {
+        LOG(INFO) << "operator " << operators_[i]->name() << " gonna reshape with type " << operators_[i]->type();
+        // get reshape time for profiling
+        float start = Time("start");
+        operators_[i]->Reshape(input_vecs_[i], output_vecs_[i]);
+        float end = Time("end");
+        operators_[i]->set_reshape_time(end - start);
+      }
+    } else if (!reshape_model && engine_profiling_) {
+      for (int i = 0; i < operators_.size(); ++i) {
+        operators_[i]->set_reshape_time(0);
+      }
+    } else if (reshape_model) {
+      for (int i = 0; i < operators_.size(); ++i) {
         LOG(INFO) << "operator " << operators_[i]->name() << " gonna reshape with type " << operators_[i]->type();
         operators_[i]->Reshape(input_vecs_[i], output_vecs_[i]);
       }
@@ -298,6 +312,12 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
         tp.commitTask(std::bind(&executor::Dispatcher::Forward, operators_[i], input_vecs_[i], output_vecs_[i]));
         float end = Time("end");
         operators_[i]->set_latency(end - start);
+        for (int j = 0; j < input_vecs_[i].size(); ++j) {
+          operators_[i]->set_it_shape(input_vecs_[i][j]->shape());
+        }
+        if (i != operators_.size() - 1) {
+          operators_[i]->set_ot_shape(output_vecs_[i][0]->shape());  // the last operator's output does not exist
+        }
         LOG(INFO) << "operator: " << operators_[i]->name() << ", latency: " << end - start << " ms";
         if (thread_count >= multi_stream_tasks_[i]) {
           tp.waitAllTaskRunOver();
@@ -308,7 +328,14 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
         float start = Time("start");
         operators_[i]->Forward(input_vecs_[i], output_vecs_[i]);
         float end = Time("end");
+        // for profiling
         operators_[i]->set_latency(end - start);
+        for (int j = 0; j < input_vecs_[i].size(); ++j) {
+          operators_[i]->set_it_shape(input_vecs_[i][j]->shape());
+        }
+        if (i != operators_.size() - 1) {
+          operators_[i]->set_ot_shape(output_vecs_[i][0]->shape());
+        }
         LOG(INFO) << "operator: " << operators_[i]->name() << ", latency: " << end - start << " ms";
       }
     }
@@ -328,12 +355,14 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
       }
     }
   }
+  float Iter_end = Time("end");
   return this->output_tensors();
 }

 void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
   // in multi instance case, dump profiling for each instance
   LOG(INFO) << "Neural engine profiling ...";
+  ipc::shared_memory_object::remove(space_name);
   ipc::managed_shared_memory shm(ipc::open_or_create, space_name, 1024);
   int* inst_count = shm.find_or_construct<int>(count_name)(0);
   std::string profiling_dir = "engine_profiling_";
@@ -384,6 +413,12 @@ void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
     // for sparse performance estimate
     ProfilingSparseEstimate(fp, op, average_latency);
   }
+  std::string tracer_file = profiling_dir + "/profiling_" + ch_curr_time \
+                            + "_" + std::to_string((*inst_count)++) + ".json";
+  ProfilingTracer Tracer = ProfilingTracer();
+  Tracer.BeginTrace(tracer_file);
+  Tracer.WriteProfile(operators_, input_vecs_, output_vecs_);
+  Tracer.EndTrace();
   // dense total latency
   fprintf(fp, ",,,,,,,,,,,%s,%.3f,",
           "total latency(ms)", total_latency);
@@ -432,7 +467,6 @@ void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
     ipc::shared_memory_object::remove(space_name);
   }
 }
-
 void Model::ProfilingSparse(FILE* fp) {
   // weight shape, perf ratio, others
   fprintf(fp, "%s,%s,%s,%s,%s\n", "weight shape", "90% 4x1 perf ratio",
