Skip to content

Commit c02d3f5

Browse files
authored
add sparselib 3d tuning function (#224)
* add sparselib 3d tuning function * fix cpplint * fix cpplint error * clean line space
1 parent 1bee4df commit c02d3f5

File tree

6 files changed

+238
-26
lines changed

6 files changed

+238
-26
lines changed

nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp

Lines changed: 58 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -76,12 +76,25 @@ class Dispatcher {
7676
void Prepare(const vector<Tensor*>& input, const vector<Tensor*>& output) {
7777
// (TODO) handle the case that different kernel with different output data type
7878
// Prepare will change some status on kernel, but should not on output
79+
for (int i = 0; i < kernel_handler_.size(); ++i) sparselib_available_.push_back(false);
80+
int idx = 0;
81+
// let default kernel prepare first
82+
kernel_handler_[type_]->Prepare(input, output);
7983
for (const auto& k_pair : kernel_handler_) {
84+
auto kernel_name = k_pair.first;
8085
auto kernel = k_pair.second;
8186
kernel->set_dispatch_from_type(type_);
82-
kernel->Prepare(input, output);
87+
if (kernel_name != type_) kernel->Prepare(input, output);
88+
sparselib_available_[idx++] = kernel->kernel_type() == SparseLib ? true : false;
89+
if (tune_dense_in_sparse_ && do_tuning_ && kernel->kernel_type() == SparseLib) {
90+
kernel->set_kernel_type(Dense);
91+
kernel->Prepare(input, output);
92+
kernel->set_kernel_type(SparseLib);
93+
}
94+
if ((kernel_handler_.size() < 2 || kernel->monopolize_dispatcher())
95+
&& !sparselib_available_[0]) no_tuning_space_ = true;
8396
if (kernel->monopolize_dispatcher()) {
84-
disable_dispatch_ = true;
97+
monopoly_kernel_ = kernel_name;
8598
break;
8699
}
87100
}
@@ -110,7 +123,7 @@ class Dispatcher {
110123
if (kernel_handler_.size() > 1) kernel_handler_[type_]->set_do_shape_infer(true);
111124
kernel_handler_[type_]->Reshape(input, output);
112125
}
113-
if (!disable_dispatch_ && has_dispatch_table_file) {
126+
if (!no_tuning_space_ && has_dispatch_table_file) {
114127
// generate hash key and find the best kernel if has dispatch table
115128
// only load once
116129
if (DispatchTable::Size() == 0) {
@@ -120,9 +133,16 @@ class Dispatcher {
120133
vector<string> kernel_config = DispatchTable::Find(type_, GetHash(input));
121134
if (!kernel_config.empty()) {
122135
string kernel_name = kernel_config[0];
123-
if (kernel_handler_.count(kernel_name) > 0) {
124-
execute_kernel_ = kernel_name;
125-
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
136+
// sparselib
137+
if (kernel_name == "SparseLib") {
138+
execute_kernel_ = type_;
139+
kernel_handler_[type_]->set_dispatch_config(kernel_config);
140+
} else {
141+
// dense
142+
if (kernel_handler_.count(kernel_name) > 0) {
143+
execute_kernel_ = kernel_name;
144+
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
145+
}
126146
}
127147
}
128148
}
@@ -136,41 +156,60 @@ class Dispatcher {
136156
size_t input_hash = GetHash(input);
137157
iter_cnt_ += 1;
138158
// consider warmup when tuning
139-
if (!disable_dispatch_ && kernel_handler_.size() > 1 && (iter_cnt_<= warmup_iter_ + 1 ||
140-
DispatchTable::Find(type_, input_hash).empty())) {
159+
if (!no_tuning_space_ && (iter_cnt_<= warmup_iter_ + 1 || DispatchTable::Find(type_, input_hash).empty())) {
141160
// keep kernel with the least time as first pair
142161
std::map<float, vector<string>, std::less<float>> timer;
143162
OpTuning op_tuning(type_);
144163
// increase input tensors' life when tune
145164
// default kernel does not count towards the extra life
165+
int idx = 0;
166+
string suffix;
146167
for (const auto& k_pair : kernel_handler_) {
147168
auto kernel_name = k_pair.first;
148169
auto kernel = k_pair.second;
149-
op_tuning.Start(kernel_name, kernel, input, output, reshape_model);
170+
suffix = sparselib_available_[idx++] ? "SparseLib" : kernel_name;
171+
if (tune_dense_in_sparse_ && suffix == "SparseLib") {
172+
kernel->set_kernel_type(Dense);
173+
op_tuning.Start(kernel_name, kernel, input, output, reshape_model);
174+
kernel->set_kernel_type(SparseLib);
175+
}
176+
op_tuning.Start(suffix, kernel, input, output, reshape_model);
177+
if (monopoly_kernel_ == kernel_name) break;
150178
}
151179
for (auto& tensor : input) tensor->disposable_extra_life(op_tuning.extra_tensor_life());
152180
op_tuning.reset_extra_tensor_life();
153181
// tune kernel
182+
idx = 0;
154183
for (const auto& k_pair : kernel_handler_) {
155184
auto kernel_name = k_pair.first;
156185
auto kernel = k_pair.second;
186+
suffix = sparselib_available_[idx++] == true ? "SparseLib" : kernel_name;
157187
try {
158-
op_tuning.Run(kernel_name, kernel, input, output, reshape_model);
188+
if (tune_dense_in_sparse_ && suffix == "SparseLib") {
189+
kernel->set_kernel_type(Dense);
190+
op_tuning.Run(kernel_name, kernel, input, output, reshape_model);
191+
kernel->set_kernel_type(SparseLib);
192+
}
193+
op_tuning.Run(suffix, kernel, input, output, reshape_model);
159194
timer[op_tuning.best_execute_time()] = op_tuning.kernel_config();
160195
// some kernels don't support specific dtype, fusion, etc.
161196
} catch (const std::exception& e) {
162197
LOG(WARNING) << kernel_name << " kernel tuning failure: " << e.what();
163198
}
199+
if (monopoly_kernel_ == kernel_name) break;
164200
}
165201
if (timer.size() > 0) {
166202
execute_kernel_ = timer.begin()->second[0];
167203
LOG(INFO) << "best kernel is " << execute_kernel_ << " with time " << timer.begin()->first << "ms";
168204
if (execute_kernel_ != type_) DispatchTable::Insert(type_, input_hash, timer.begin()->second);
169205
}
170206
} else {
171-
LOG(INFO) << "Skip tuning function due to existing input hash...";
172-
if (reshape_model) kernel_handler_[type_]->Reshape(input, output);
173-
kernel_handler_[type_]->Forward(input, output);
207+
LOG(INFO) << "Skip tuning function due to existing input hash or no tuning space...";
208+
vector<string> kernel_config = DispatchTable::Find(type_, input_hash);
209+
string kernel_name = (!kernel_config.empty() && kernel_config[0] != "SparseLib") ? kernel_config[0] : type_;
210+
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
211+
if (reshape_model || !kernel_config.empty()) kernel_handler_[kernel_name]->Reshape(input, output);
212+
kernel_handler_[kernel_name]->Forward(input, output);
174213
}
175214
}
176215
}
@@ -182,7 +221,7 @@ class Dispatcher {
182221
inline const string& type() const { return type_; }
183222
inline const OperatorConfig& operator_conf() const { return operator_conf_; }
184223
inline const string& execute_kernel() const { return execute_kernel_; }
185-
inline const bool& disable_dispatch() const { return disable_dispatch_; }
224+
inline const bool& no_tuning_space() const { return no_tuning_space_; }
186225
inline const void set_warmup_iter(const int& warmup_iter) { warmup_iter_ = warmup_iter; }
187226
// for profiling
188227
inline void set_post_op(const string& post_op) { kernel_handler_[execute_kernel_]->set_post_op(post_op); }
@@ -215,6 +254,7 @@ class Dispatcher {
215254
size_t input_hash = 0;
216255
for (const auto& tensor : input) combine_hash.push_back(tensor->get_hash());
217256
input_hash = get_array_hash(input_hash, combine_hash, combine_hash.size());
257+
input_hash = get_array_hash(input_hash, sparselib_available_, sparselib_available_.size());
218258
return input_hash;
219259
}
220260

@@ -225,9 +265,12 @@ class Dispatcher {
225265
KernelHandler kernel_handler_;
226266
string execute_kernel_;
227267
bool do_tuning_ = false;
228-
bool disable_dispatch_ = false;
268+
bool no_tuning_space_ = false;
229269
int64_t warmup_iter_ = 1;
230270
int64_t iter_cnt_ = 0;
271+
vector<bool> sparselib_available_;
272+
bool tune_dense_in_sparse_ = false;
273+
string monopoly_kernel_;
231274
};
232275
} // namespace executor
233276

nlp_toolkit/backends/neural_engine/executor/include/op_tuning.hpp

Lines changed: 86 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,13 @@ class OpTuning {
8888
// find the best N-W combination
8989
void IpToConvTune(std::shared_ptr<Operator> kernel, const vector<Tensor*>& input,
9090
const vector<Tensor*>& output, const bool& reshape_model) {
91+
// only for tuning fp32 and bf16 dtype
92+
if (input[1]->dtype() != "fp32" && input[1]->dtype() != "bf16") {
93+
LOG(WARNING) << "Only support fp32 or bf16 dtype when tuning kernel between InnerProduct and Convolution!";
94+
best_execute_time_ = std::numeric_limits<float>::max();
95+
kernel_config_.clear();
96+
return;
97+
}
9198
std::map<float, string, std::less<float>> input_shape_timer;
9299
vector<string> nw_comb;
93100
bool is_src0_transposed = input[0]->is_transposed();
@@ -144,6 +151,83 @@ class OpTuning {
144151
}
145152
}
146153

154+
// split the dimension from 2D to 3D when use sparselib gemm
155+
void IpToSparseLibTune(std::shared_ptr<Operator> kernel, const vector<Tensor*>& input,
156+
const vector<Tensor*>& output, const bool& reshape_model) {
157+
// only for tuning int8 dtype
158+
if (input[1]->dtype() != "u8") {
159+
LOG(WARNING) << "Only support int8 dtype when tuning InnerProduct kernel with SparseLib!";
160+
best_execute_time_ = std::numeric_limits<float>::max();
161+
kernel_config_.clear();
162+
return;
163+
}
164+
// sparselib search space
165+
vector<int64_t> bs_space = {64, 128, 196, 256};
166+
vector<string> mkn_blocks_space = {"1,1,1"};
167+
vector<string> tile_shape_space = {"4,4"};
168+
// sparselib dispatch kernel config is {"input_shape", "mkn_blocks", "tile_shape"}
169+
std::map<float, vector<string>, std::less<float>> bs_attr_timer;
170+
// M x k -> mic_bs x K x bs
171+
vector<string> micbs_bs_comb;
172+
// sparselib graph ir should switch position of src and weight
173+
vector<int64_t> src1_shape = input[1]->shape();
174+
int64_t m_dim = src1_shape[1];
175+
int64_t k_dim = src1_shape[0];
176+
bool oneKM_shape_filling = false;
177+
for (const auto& bs : bs_space) {
178+
if (bs == 0) continue;
179+
if (m_dim % bs > 0 && !oneKM_shape_filling) {
180+
micbs_bs_comb.push_back("1," + std::to_string(k_dim) + "," + std::to_string(m_dim));
181+
oneKM_shape_filling = true;
182+
}
183+
if (m_dim < bs) break;
184+
if (m_dim % bs == 0) {
185+
if (m_dim / bs == 1 && oneKM_shape_filling) continue;
186+
micbs_bs_comb.push_back(std::to_string(m_dim / bs) + "," + std::to_string(k_dim) + "," + std::to_string(bs));
187+
if (m_dim / bs == 1) oneKM_shape_filling = true;
188+
}
189+
}
190+
vector<vector<string>> bs_attr_comb(micbs_bs_comb.size() * mkn_blocks_space.size() * tile_shape_space.size());
191+
#pragma omp parallel for
192+
for (int i = 0; i < micbs_bs_comb.size(); ++i) {
193+
for (int j = 0; j < mkn_blocks_space.size(); ++j) {
194+
#pragma omp simd
195+
for (int k = 0; k < tile_shape_space.size(); ++k) {
196+
bs_attr_comb[i * mkn_blocks_space.size() * tile_shape_space.size() + j * tile_shape_space.size() + k] = \
197+
{micbs_bs_comb[i], mkn_blocks_space[j], tile_shape_space[k]};
198+
}
199+
}
200+
}
201+
// add tensor life
202+
if (stage_ == "start") {
203+
extra_tensor_life_ += bs_attr_comb.size();
204+
return;
205+
}
206+
vector<string> kernel_config_cpy = {kernel_config_[0], "", "", ""};
207+
for (const auto& comb : bs_attr_comb) {
208+
for (int i = 0; i < comb.size(); ++i) kernel_config_cpy[i + 1] = comb[i];
209+
kernel->set_dispatch_config(kernel_config_cpy);
210+
float start_time = 0;
211+
float reshape_time = 0;
212+
start_time = Time("start");
213+
kernel->Reshape(input, output);
214+
reshape_time = Time("end") - start_time;
215+
start_time = Time("start");
216+
kernel->Forward(input, output);
217+
float execute_time = Time("end") - start_time;
218+
if (reshape_model) execute_time += reshape_time;
219+
bs_attr_timer[execute_time] = kernel_config_cpy;
220+
LOG(INFO) << "IpToSparseLibTune forward time is " << execute_time << "ms, activation shape: " << comb[0]
221+
<< ", mkn_blocks: " << comb[1] << ", tile_shape: " << comb[2];
222+
}
223+
if (bs_attr_timer.size() > 0) {
224+
best_execute_time_ = bs_attr_timer.begin()->first;
225+
kernel_config_ = bs_attr_timer.begin()->second;
226+
} else {
227+
LOG(FATAL) << "InnerProduct tuning fails with kernel SparseLib...";
228+
}
229+
}
230+
147231
inline const float& best_execute_time() const { return best_execute_time_;}
148232
inline const vector<string>& kernel_config() const { return kernel_config_; }
149233
inline const int& extra_tensor_life() const { return extra_tensor_life_; }
@@ -170,7 +254,8 @@ class OpTuning {
170254

171255
std::unordered_map<string, OpTuning::TuneFunc> OpTuning::tune_func_map_ = {
172256
{"Base", &OpTuning::BaseTune},
173-
{"InnerProduct_to_Convolution", &OpTuning::IpToConvTune}
257+
{"InnerProduct_to_Convolution", &OpTuning::IpToConvTune},
258+
{"InnerProduct_to_SparseLib", &OpTuning::IpToSparseLibTune}
174259
};
175260
} // namespace executor
176261

nlp_toolkit/backends/neural_engine/executor/include/operator.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ class Operator {
7474
const OperatorConfig& operator_conf() const { return operator_conf_; }
7575
// dispatch kernel may need to do reshape and receive config, like InnerProduct to Convolution
7676
inline void set_dispatch_from_type(const string& type) { dispatch_from_ = type; }
77-
inline void set_dispatch_config(const vector<string>& config) { dispatch_config_ = config; }
77+
inline void set_dispatch_config(const vector<string>& config = {}) { dispatch_config_ = config; }
7878
inline void set_do_shape_infer(const bool& do_shape_infer) { do_shape_infer_ = do_shape_infer; }
7979
inline const bool& do_shape_infer() const { return do_shape_infer_; }
8080
inline const bool& monopolize_dispatcher() const { return monopolize_dispatcher_; }
@@ -86,6 +86,7 @@ class Operator {
8686
inline void set_enable_sparse(const bool enable_sparse) { enable_sparse_ = enable_sparse; }
8787
inline const float& enable_sparse() const { return enable_sparse_; }
8888
inline const KERNEL_TYPE& kernel_type() const { return kernel_type_; }
89+
inline void set_kernel_type(const KERNEL_TYPE& kernel_type) { kernel_type_ = kernel_type; }
8990
inline const float& weight_zero_ratio() const { return weight_zero_ratio_; }
9091
inline void set_weight_shape(const vector<int64_t>& weight_shape) { weight_shape_ = weight_shape; }
9192
inline const vector<int64_t>& weight_shape() const { return weight_shape_; }

nlp_toolkit/backends/neural_engine/executor/include/tensor.hpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
#include <numeric>
2020
#include <string>
2121
#include <vector>
22+
#include <unordered_map>
2223

2324
#include "common.hpp"
2425
#include "conf.hpp"
@@ -135,6 +136,35 @@ class Tensor {
135136
LOG(WARNING) << "please set tensor data to make adding extra tensor life work...";
136137
}
137138

139+
// reorder tensor
140+
// for example, use reorder in sparselib, 2D for transpose mode, 3D for tuning and dispatch
141+
// [a, b] -> [b, a], dst_perm: [1, 0]
142+
// [a, b, c] -> [b, a, c], dst_perm: [1, 0 ,2]
143+
void reorder(const vector<int64_t>& src_shape, const vector<int64_t>& dst_perm = {1, 0, 2}) {
144+
static unordered_map<string, dnnl::memory::data_type> type2mem{
145+
{"fp32", dnnl::memory::data_type::f32}, {"s32", dnnl::memory::data_type::s32},
146+
{"fp16", dnnl::memory::data_type::f16}, {"u8", dnnl::memory::data_type::u8},
147+
{"s8", dnnl::memory::data_type::s8}, {"bf16", dnnl::memory::data_type::bf16}};
148+
// execute reorder primitive
149+
vector<int64_t> src_stride = GetStrides(src_shape);
150+
vector<int64_t> dst_shape = GetShapes(src_shape, dst_perm);
151+
vector<int64_t> dst_stride = GetStrides(dst_shape, ReversePerm(dst_perm));
152+
dnnl::memory::desc src_md(src_shape, type2mem[this->dtype()], src_stride);
153+
dnnl::memory::desc dst_md(src_shape, type2mem[this->dtype()], dst_stride);
154+
dnnl::engine reorder_eng(dnnl::engine::kind::cpu, 0);
155+
dnnl::stream reorder_eng_stream(reorder_eng);
156+
dnnl::reorder::primitive_desc reorder_src_pd(reorder_eng, src_md, reorder_eng, dst_md);
157+
dnnl::memory src_m(src_md, reorder_eng);
158+
dnnl::memory dst_m(dst_md, reorder_eng);
159+
src_m.set_data_handle(const_cast<void*>(this->data()), reorder_eng_stream);
160+
dnnl::reorder(src_m, dst_m).execute(reorder_eng_stream, src_m, dst_m);
161+
reorder_eng_stream.wait();
162+
// inplace dst
163+
void* p = dst_m.get_data_handle();
164+
size_t data_size = this->size() * type2bytes[this->dtype()];
165+
memcpy(this->mutable_data(), p, data_size);
166+
}
167+
138168
inline size_t size() { return std::accumulate(shape_.begin(), shape_.end(), size_t(1), std::multiplies<size_t>()); }
139169

140170
void set_shm_handle(const ipc::managed_shared_memory::handle_t& h) { shm_handle_ = h; }

nlp_toolkit/backends/neural_engine/executor/src/common.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,9 @@ unordered_map<string, int> type2bytes = {{"fp32", sizeof(float)}, {"int8",
2121
{"u8", sizeof(unsigned char)}, {"s8", sizeof(char)}, {"s32", sizeof(int)},
2222
{"bf16", sizeof(uint16_t)}};
2323
unordered_map<string, vector<string>> dispatch_kernel_config = {{"InnerProduct_to_Convolution", {"input_shape"}},
24+
{"InnerProduct_to_SparseLib", {"input_shape",
25+
"mkn_blocks",
26+
"tile_shape"}},
2427
};
2528
const int CPU_COUNT = omp_get_max_threads();
2629
#if __AVX512F__

0 commit comments

Comments (0)