Skip to content

Commit c02d3f5

Browse files
authored
add sparselib 3d tuning function (#224)
* add sparselib 3d tuning function * fix cpplint * fix cpplint error * clean line space
1 parent 1bee4df commit c02d3f5

File tree

6 files changed

+238
-26
lines changed

6 files changed

+238
-26
lines changed

nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp

Lines changed: 58 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -76,12 +76,25 @@ class Dispatcher {
7676
void Prepare(const vector<Tensor*>& input, const vector<Tensor*>& output) {
7777
// (TODO) handle the case that different kernel with different output data type
7878
// Prepare will change some status on kernel, but should not on output
79+
for (int i = 0; i < kernel_handler_.size(); ++i) sparselib_available_.push_back(false);
80+
int idx = 0;
81+
// let default kernel prepare first
82+
kernel_handler_[type_]->Prepare(input, output);
7983
for (const auto& k_pair : kernel_handler_) {
84+
auto kernel_name = k_pair.first;
8085
auto kernel = k_pair.second;
8186
kernel->set_dispatch_from_type(type_);
82-
kernel->Prepare(input, output);
87+
if (kernel_name != type_) kernel->Prepare(input, output);
88+
sparselib_available_[idx++] = kernel->kernel_type() == SparseLib ? true : false;
89+
if (tune_dense_in_sparse_ && do_tuning_ && kernel->kernel_type() == SparseLib) {
90+
kernel->set_kernel_type(Dense);
91+
kernel->Prepare(input, output);
92+
kernel->set_kernel_type(SparseLib);
93+
}
94+
if ((kernel_handler_.size() < 2 || kernel->monopolize_dispatcher())
95+
&& !sparselib_available_[0]) no_tuning_space_ = true;
8396
if (kernel->monopolize_dispatcher()) {
84-
disable_dispatch_ = true;
97+
monopoly_kernel_ = kernel_name;
8598
break;
8699
}
87100
}
@@ -110,7 +123,7 @@ class Dispatcher {
110123
if (kernel_handler_.size() > 1) kernel_handler_[type_]->set_do_shape_infer(true);
111124
kernel_handler_[type_]->Reshape(input, output);
112125
}
113-
if (!disable_dispatch_ && has_dispatch_table_file) {
126+
if (!no_tuning_space_ && has_dispatch_table_file) {
114127
// generate hash key and find the best kernel if has dispatch table
115128
// only load once
116129
if (DispatchTable::Size() == 0) {
@@ -120,9 +133,16 @@ class Dispatcher {
120133
vector<string> kernel_config = DispatchTable::Find(type_, GetHash(input));
121134
if (!kernel_config.empty()) {
122135
string kernel_name = kernel_config[0];
123-
if (kernel_handler_.count(kernel_name) > 0) {
124-
execute_kernel_ = kernel_name;
125-
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
136+
// sparselib
137+
if (kernel_name == "SparseLib") {
138+
execute_kernel_ = type_;
139+
kernel_handler_[type_]->set_dispatch_config(kernel_config);
140+
} else {
141+
// dense
142+
if (kernel_handler_.count(kernel_name) > 0) {
143+
execute_kernel_ = kernel_name;
144+
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
145+
}
126146
}
127147
}
128148
}
@@ -136,41 +156,60 @@ class Dispatcher {
136156
size_t input_hash = GetHash(input);
137157
iter_cnt_ += 1;
138158
// consider warmup when tuning
139-
if (!disable_dispatch_ && kernel_handler_.size() > 1 && (iter_cnt_<= warmup_iter_ + 1 ||
140-
DispatchTable::Find(type_, input_hash).empty())) {
159+
if (!no_tuning_space_ && (iter_cnt_<= warmup_iter_ + 1 || DispatchTable::Find(type_, input_hash).empty())) {
141160
// keep kernel with the least time as first pair
142161
std::map<float, vector<string>, std::less<float>> timer;
143162
OpTuning op_tuning(type_);
144163
// increase input tensors' life when tune
145164
// default kernel does not count towards the extra life
165+
int idx = 0;
166+
string suffix;
146167
for (const auto& k_pair : kernel_handler_) {
147168
auto kernel_name = k_pair.first;
148169
auto kernel = k_pair.second;
149-
op_tuning.Start(kernel_name, kernel, input, output, reshape_model);
170+
suffix = sparselib_available_[idx++] ? "SparseLib" : kernel_name;
171+
if (tune_dense_in_sparse_ && suffix == "SparseLib") {
172+
kernel->set_kernel_type(Dense);
173+
op_tuning.Start(kernel_name, kernel, input, output, reshape_model);
174+
kernel->set_kernel_type(SparseLib);
175+
}
176+
op_tuning.Start(suffix, kernel, input, output, reshape_model);
177+
if (monopoly_kernel_ == kernel_name) break;
150178
}
151179
for (auto& tensor : input) tensor->disposable_extra_life(op_tuning.extra_tensor_life());
152180
op_tuning.reset_extra_tensor_life();
153181
// tune kernel
182+
idx = 0;
154183
for (const auto& k_pair : kernel_handler_) {
155184
auto kernel_name = k_pair.first;
156185
auto kernel = k_pair.second;
186+
suffix = sparselib_available_[idx++] == true ? "SparseLib" : kernel_name;
157187
try {
158-
op_tuning.Run(kernel_name, kernel, input, output, reshape_model);
188+
if (tune_dense_in_sparse_ && suffix == "SparseLib") {
189+
kernel->set_kernel_type(Dense);
190+
op_tuning.Run(kernel_name, kernel, input, output, reshape_model);
191+
kernel->set_kernel_type(SparseLib);
192+
}
193+
op_tuning.Run(suffix, kernel, input, output, reshape_model);
159194
timer[op_tuning.best_execute_time()] = op_tuning.kernel_config();
160195
// some kernels don't support specific dtype, fusion, etc.
161196
} catch (const std::exception& e) {
162197
LOG(WARNING) << kernel_name << " kernel tuning failure: " << e.what();
163198
}
199+
if (monopoly_kernel_ == kernel_name) break;
164200
}
165201
if (timer.size() > 0) {
166202
execute_kernel_ = timer.begin()->second[0];
167203
LOG(INFO) << "best kernel is " << execute_kernel_ << " with time " << timer.begin()->first << "ms";
168204
if (execute_kernel_ != type_) DispatchTable::Insert(type_, input_hash, timer.begin()->second);
169205
}
170206
} else {
171-
LOG(INFO) << "Skip tuning function due to existing input hash...";
172-
if (reshape_model) kernel_handler_[type_]->Reshape(input, output);
173-
kernel_handler_[type_]->Forward(input, output);
207+
LOG(INFO) << "Skip tuning function due to existing input hash or no tuning space...";
208+
vector<string> kernel_config = DispatchTable::Find(type_, input_hash);
209+
string kernel_name = (!kernel_config.empty() && kernel_config[0] != "SparseLib") ? kernel_config[0] : type_;
210+
kernel_handler_[kernel_name]->set_dispatch_config(kernel_config);
211+
if (reshape_model || !kernel_config.empty()) kernel_handler_[kernel_name]->Reshape(input, output);
212+
kernel_handler_[kernel_name]->Forward(input, output);
174213
}
175214
}
176215
}
@@ -182,7 +221,7 @@ class Dispatcher {
182221
inline const string& type() const { return type_; }
183222
inline const OperatorConfig& operator_conf() const { return operator_conf_; }
184223
inline const string& execute_kernel() const { return execute_kernel_; }
185-
inline const bool& disable_dispatch() const { return disable_dispatch_; }
224+
inline const bool& no_tuning_space() const { return no_tuning_space_; }
186225
inline const void set_warmup_iter(const int& warmup_iter) { warmup_iter_ = warmup_iter; }
187226
// for profiling
188227
inline void set_post_op(const string& post_op) { kernel_handler_[execute_kernel_]->set_post_op(post_op); }
@@ -215,6 +254,7 @@ class Dispatcher {
215254
size_t input_hash = 0;
216255
for (const auto& tensor : input) combine_hash.push_back(tensor->get_hash());
217256
input_hash = get_array_hash(input_hash, combine_hash, combine_hash.size());
257+
input_hash = get_array_hash(input_hash, sparselib_available_, sparselib_available_.size());
218258
return input_hash;
219259
}
220260

@@ -225,9 +265,12 @@ class Dispatcher {
225265
KernelHandler kernel_handler_;
226266
string execute_kernel_;
227267
bool do_tuning_ = false;
228-
bool disable_dispatch_ = false;
268+
bool no_tuning_space_ = false;
229269
int64_t warmup_iter_ = 1;
230270
int64_t iter_cnt_ = 0;
271+
vector<bool> sparselib_available_;
272+
bool tune_dense_in_sparse_ = false;
273+
string monopoly_kernel_;
231274
};
232275
} // namespace executor
233276

nlp_toolkit/backends/neural_engine/executor/include/op_tuning.hpp

Lines changed: 86 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,13 @@ class OpTuning {
8888
// find the best N-W combination
8989
void IpToConvTune(std::shared_ptr<Operator> kernel, const vector<Tensor*>& input,
9090
const vector<Tensor*>& output, const bool& reshape_model) {
91+
// only for tuning fp32 and bf16 dtype
92+
if (input[1]->dtype() != "fp32" && input[1]->dtype() != "bf16") {
93+
LOG(WARNING) << "Only support fp32 or bf16 dtype when tuning kernel between InnerProduct and Convolution!";
94+
best_execute_time_ = std::numeric_limits<float>::max();
95+
kernel_config_.clear();
96+
return;
97+
}
9198
std::map<float, string, std::less<float>> input_shape_timer;
9299
vector<string> nw_comb;
93100
bool is_src0_transposed = input[0]->is_transposed();
@@ -144,6 +151,83 @@ class OpTuning {
144151
}
145152
}
146153

154+
// split the dimension from 2D to 3D when use sparselib gemm
155+
void IpToSparseLibTune(std::shared_ptr<Operator> kernel, const vector<Tensor*>& input,
156+
const vector<Tensor*>& output, const bool& reshape_model) {
157+
// only for tuning int8 dtype
158+
if (input[1]->dtype() != "u8") {
159+
LOG(WARNING) << "Only support int8 dtype when tuning InnerProduct kernel with SparseLib!";
160+
best_execute_time_ = std::numeric_limits<float>::max();
161+
kernel_config_.clear();
162+
return;
163+
}
164+
// sparselib search space
165+
vector<int64_t> bs_space = {64, 128, 196, 256};
166+
vector<string> mkn_blocks_space = {"1,1,1"};
167+
vector<string> tile_shape_space = {"4,4"};
168+
// sparselib dispatch kernel config is {"input_shape", "mkn_blocks", "tile_shape"}
169+
std::map<float, vector<string>, std::less<float>> bs_attr_timer;
170+
// M x k -> mic_bs x K x bs
171+
vector<string> micbs_bs_comb;
172+
// sparselib graph ir should switch position of src and weight
173+
vector<int64_t> src1_shape = input[1]->shape();
174+
int64_t m_dim = src1_shape[1];
175+
int64_t k_dim = src1_shape[0];
176+
bool oneKM_shape_filling = false;
177+
for (const auto& bs : bs_space) {
178+
if (bs == 0) continue;
179+
if (m_dim % bs > 0 && !oneKM_shape_filling) {
180+
micbs_bs_comb.push_back("1," + std::to_string(k_dim) + "," + std::to_string(m_dim));
181+
oneKM_shape_filling = true;
182+
}
183+
if (m_dim < bs) break;
184+
if (m_dim % bs == 0) {
185+
if (m_dim / bs == 1 && oneKM_shape_filling) continue;
186+
micbs_bs_comb.push_back(std::to_string(m_dim / bs) + "," + std::to_string(k_dim) + "," + std::to_string(bs));
187+
if (m_dim / bs == 1) oneKM_shape_filling = true;
188+
}
189+
}
190+
vector<vector<string>> bs_attr_comb(micbs_bs_comb.size() * mkn_blocks_space.size() * tile_shape_space.size());
191+
#pragma omp parallel for
192+
for (int i = 0; i < micbs_bs_comb.size(); ++i) {
193+
for (int j = 0; j < mkn_blocks_space.size(); ++j) {
194+
#pragma omp simd
195+
for (int k = 0; k < tile_shape_space.size(); ++k) {
196+
bs_attr_comb[i * mkn_blocks_space.size() * tile_shape_space.size() + j * tile_shape_space.size() + k] = \
197+
{micbs_bs_comb[i], mkn_blocks_space[j], tile_shape_space[k]};
198+
}
199+
}
200+
}
201+
// add tensor life
202+
if (stage_ == "start") {
203+
extra_tensor_life_ += bs_attr_comb.size();
204+
return;
205+
}
206+
vector<string> kernel_config_cpy = {kernel_config_[0], "", "", ""};
207+
for (const auto& comb : bs_attr_comb) {
208+
for (int i = 0; i < comb.size(); ++i) kernel_config_cpy[i + 1] = comb[i];
209+
kernel->set_dispatch_config(kernel_config_cpy);
210+
float start_time = 0;
211+
float reshape_time = 0;
212+
start_time = Time("start");
213+
kernel->Reshape(input, output);
214+
reshape_time = Time("end") - start_time;
215+
start_time = Time("start");
216+
kernel->Forward(input, output);
217+
float execute_time = Time("end") - start_time;
218+
if (reshape_model) execute_time += reshape_time;
219+
bs_attr_timer[execute_time] = kernel_config_cpy;
220+
LOG(INFO) << "IpToSparseLibTune forward time is " << execute_time << "ms, activation shape: " << comb[0]
221+
<< ", mkn_blocks: " << comb[1] << ", tile_shape: " << comb[2];
222+
}
223+
if (bs_attr_timer.size() > 0) {
224+
best_execute_time_ = bs_attr_timer.begin()->first;
225+
kernel_config_ = bs_attr_timer.begin()->second;
226+
} else {
227+
LOG(FATAL) << "InnerProduct tuning fails with kernel SparseLib...";
228+
}
229+
}
230+
147231
inline const float& best_execute_time() const { return best_execute_time_;}
148232
inline const vector<string>& kernel_config() const { return kernel_config_; }
149233
inline const int& extra_tensor_life() const { return extra_tensor_life_; }
@@ -170,7 +254,8 @@ class OpTuning {
170254

171255
std::unordered_map<string, OpTuning::TuneFunc> OpTuning::tune_func_map_ = {
172256
{"Base", &OpTuning::BaseTune},
173-
{"InnerProduct_to_Convolution", &OpTuning::IpToConvTune}
257+
{"InnerProduct_to_Convolution", &OpTuning::IpToConvTune},
258+
{"InnerProduct_to_SparseLib", &OpTuning::IpToSparseLibTune}
174259
};
175260
} // namespace executor
176261

nlp_toolkit/backends/neural_engine/executor/include/operator.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ class Operator {
7474
const OperatorConfig& operator_conf() const { return operator_conf_; }
7575
// dispatch kernel may need to do reshape and receive config, like InnerProduct to Convolution
7676
inline void set_dispatch_from_type(const string& type) { dispatch_from_ = type; }
77-
inline void set_dispatch_config(const vector<string>& config) { dispatch_config_ = config; }
77+
inline void set_dispatch_config(const vector<string>& config = {}) { dispatch_config_ = config; }
7878
inline void set_do_shape_infer(const bool& do_shape_infer) { do_shape_infer_ = do_shape_infer; }
7979
inline const bool& do_shape_infer() const { return do_shape_infer_; }
8080
inline const bool& monopolize_dispatcher() const { return monopolize_dispatcher_; }
@@ -86,6 +86,7 @@ class Operator {
8686
inline void set_enable_sparse(const bool enable_sparse) { enable_sparse_ = enable_sparse; }
8787
inline const float& enable_sparse() const { return enable_sparse_; }
8888
inline const KERNEL_TYPE& kernel_type() const { return kernel_type_; }
89+
inline void set_kernel_type(const KERNEL_TYPE& kernel_type) { kernel_type_ = kernel_type; }
8990
inline const float& weight_zero_ratio() const { return weight_zero_ratio_; }
9091
inline void set_weight_shape(const vector<int64_t>& weight_shape) { weight_shape_ = weight_shape; }
9192
inline const vector<int64_t>& weight_shape() const { return weight_shape_; }

nlp_toolkit/backends/neural_engine/executor/include/tensor.hpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
#include <numeric>
2020
#include <string>
2121
#include <vector>
22+
#include <unordered_map>
2223

2324
#include "common.hpp"
2425
#include "conf.hpp"
@@ -135,6 +136,35 @@ class Tensor {
135136
LOG(WARNING) << "please set tensor data to make adding extra tensor life work...";
136137
}
137138

139+
// reorder tensor
140+
// for example, use reorder in sparselib, 2D for transpose mode, 3D for tuning and dispatch
141+
// [a, b] -> [b, a], dst_perm: [1, 0]
142+
// [a, b, c] -> [b, a, c], dst_perm: [1, 0 ,2]
143+
void reorder(const vector<int64_t>& src_shape, const vector<int64_t>& dst_perm = {1, 0, 2}) {
144+
static unordered_map<string, dnnl::memory::data_type> type2mem{
145+
{"fp32", dnnl::memory::data_type::f32}, {"s32", dnnl::memory::data_type::s32},
146+
{"fp16", dnnl::memory::data_type::f16}, {"u8", dnnl::memory::data_type::u8},
147+
{"s8", dnnl::memory::data_type::s8}, {"bf16", dnnl::memory::data_type::bf16}};
148+
// execute reorder primitive
149+
vector<int64_t> src_stride = GetStrides(src_shape);
150+
vector<int64_t> dst_shape = GetShapes(src_shape, dst_perm);
151+
vector<int64_t> dst_stride = GetStrides(dst_shape, ReversePerm(dst_perm));
152+
dnnl::memory::desc src_md(src_shape, type2mem[this->dtype()], src_stride);
153+
dnnl::memory::desc dst_md(src_shape, type2mem[this->dtype()], dst_stride);
154+
dnnl::engine reorder_eng(dnnl::engine::kind::cpu, 0);
155+
dnnl::stream reorder_eng_stream(reorder_eng);
156+
dnnl::reorder::primitive_desc reorder_src_pd(reorder_eng, src_md, reorder_eng, dst_md);
157+
dnnl::memory src_m(src_md, reorder_eng);
158+
dnnl::memory dst_m(dst_md, reorder_eng);
159+
src_m.set_data_handle(const_cast<void*>(this->data()), reorder_eng_stream);
160+
dnnl::reorder(src_m, dst_m).execute(reorder_eng_stream, src_m, dst_m);
161+
reorder_eng_stream.wait();
162+
// inplace dst
163+
void* p = dst_m.get_data_handle();
164+
size_t data_size = this->size() * type2bytes[this->dtype()];
165+
memcpy(this->mutable_data(), p, data_size);
166+
}
167+
138168
inline size_t size() { return std::accumulate(shape_.begin(), shape_.end(), size_t(1), std::multiplies<size_t>()); }
139169

140170
void set_shm_handle(const ipc::managed_shared_memory::handle_t& h) { shm_handle_ = h; }

nlp_toolkit/backends/neural_engine/executor/src/common.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,9 @@ unordered_map<string, int> type2bytes = {{"fp32", sizeof(float)}, {"int8",
2121
{"u8", sizeof(unsigned char)}, {"s8", sizeof(char)}, {"s32", sizeof(int)},
2222
{"bf16", sizeof(uint16_t)}};
2323
unordered_map<string, vector<string>> dispatch_kernel_config = {{"InnerProduct_to_Convolution", {"input_shape"}},
24+
{"InnerProduct_to_SparseLib", {"input_shape",
25+
"mkn_blocks",
26+
"tile_shape"}},
2427
};
2528
const int CPU_COUNT = omp_get_max_threads();
2629
#if __AVX512F__

0 commit comments

Comments (0)