
Commit ea5c0e9

add the profiling_trace feature (#222)

* add the profiling_trace feature
* amend the profiling_trace.hpp
* amend the model.cpp
* add the reshape time
* fix an error
* fix the cpplint

Co-authored-by: Dong, Bo <[email protected]>
1 parent ae8476e commit ea5c0e9

File tree (5 files changed, +251 -7 lines):

  nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp
  nlp_toolkit/backends/neural_engine/executor/include/model.hpp
  nlp_toolkit/backends/neural_engine/executor/include/operator.hpp
  nlp_toolkit/backends/neural_engine/executor/include/profiling_trace.hpp
  nlp_toolkit/backends/neural_engine/executor/src/model.cpp

nlp_toolkit/backends/neural_engine/executor/include/dispatcher.hpp

Lines changed: 12 additions & 0 deletions
@@ -256,6 +256,18 @@ class Dispatcher {
     kernel_handler_[execute_kernel_]->set_perf_ratio_id(perf_ratio_id);
   }
   inline const string& perf_ratio_id() { return kernel_handler_[execute_kernel_]->perf_ratio_id(); }
+  inline void set_it_shape(const vector<int64_t> input_shape) {
+    kernel_handler_[execute_kernel_]->set_it_shape(input_shape); }
+  inline void set_ot_shape(const vector<int64_t> output_shape) {
+    kernel_handler_[execute_kernel_]->set_ot_shape(output_shape); }
+  inline const vector<vector<int64_t>>& get_it_shape() {
+    return kernel_handler_[execute_kernel_]->get_it_shape(); }
+  inline const vector<vector<int64_t>>& get_ot_shape() {
+    return kernel_handler_[execute_kernel_]->get_ot_shape(); }
+  inline void set_reshape_time(const float reshape_time_) {
+    kernel_handler_[execute_kernel_]->set_reshape_time(reshape_time_); }
+  inline const vector<float>& get_reshape_time() {
+    return kernel_handler_[execute_kernel_]->get_reshape_time(); }

 protected:
   // get input_hash

nlp_toolkit/backends/neural_engine/executor/include/model.hpp

Lines changed: 3 additions & 1 deletion
@@ -25,7 +25,6 @@
 #include <utility>
 #include <vector>
 #include <unordered_map>
-
 #include "common.hpp"
 #include "glog/logging.h"
 #include "memory_allocator.hpp"
@@ -34,6 +33,7 @@
 #include "operator_registry.hpp"
 #include "tensor.hpp"
 #include "thread_pool.hpp"
+#include "profiling_trace.hpp"

 namespace executor {

@@ -110,6 +110,8 @@ class Model {
   void ProfilingSparseEstimate(FILE* fp, const shared_ptr<Dispatcher>& op,
                                const float average_latency = 0.);

+
+
 protected:
   string name_;
   string weight_root_;

nlp_toolkit/backends/neural_engine/executor/include/operator.hpp

Lines changed: 10 additions & 0 deletions
@@ -98,6 +98,13 @@ class Operator {
   inline const string& table_id() const { return table_id_; }
   inline void set_perf_ratio_id(const string& perf_ratio_id) { perf_ratio_id_ = perf_ratio_id; }
   inline const string& perf_ratio_id() const { return perf_ratio_id_; }
+  inline void set_it_shape(const vector<int64_t> input_shape) { input_tensor_shape.emplace_back(input_shape); }
+  inline void set_ot_shape(const vector<int64_t> output_shape) { output_tensor_shape.emplace_back(output_shape); }
+  inline const vector<vector<int64_t>>& get_it_shape() const { return input_tensor_shape; }
+  inline const vector<vector<int64_t>>& get_ot_shape() const { return output_tensor_shape; }
+  // record per-iteration reshape time so profiling can report kernel execution time plus reshape time
+  inline void set_reshape_time(const float reshape_time_) { reshape_time.emplace_back(reshape_time_); }
+  inline const vector<float>& get_reshape_time() const { return reshape_time; }

 protected:
   /** The conf that stores the operator configurations */
@@ -117,6 +124,9 @@ class Operator {
   vector<int64_t> weight_shape_;
   string table_id_;
   string perf_ratio_id_;
+  vector<vector<int64_t>> input_tensor_shape;
+  vector<vector<int64_t>> output_tensor_shape;
+  vector<float> reshape_time;
 };  // class Operator

 }  // namespace executor
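
The three new members are flat, append-only logs: each profiled Forward call appends one entry per input tensor to input_tensor_shape, one output shape per operator to output_tensor_shape, and one entry to reshape_time, so all records for iteration i sit at consecutive indices. That is the layout ProfilingTracer::TensorsShape (next file) depends on. A minimal sketch of the implied lookup; the helper name and the num_inputs parameter are illustrative, not part of the source:

// Sketch: recovering iteration i's shape records from Operator's flat logs,
// assuming the operator has num_inputs input tensors. Records for iteration i
// occupy indices [i * num_inputs, (i + 1) * num_inputs).
#include <cstdint>
#include <vector>

std::vector<int64_t> input_shape_at(
    const std::vector<std::vector<int64_t>>& it_shapes,
    int i, int num_inputs, int j) {
  return it_shapes[i * num_inputs + j];  // iteration i, input tensor j
}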
nlp_toolkit/backends/neural_engine/executor/include/profiling_trace.hpp

Lines changed: 186 additions & 0 deletions

@@ -0,0 +1,186 @@
+// Copyright (c) 2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
+#define ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
+
+#include <iostream>
+#include <fstream>
+#include <memory>
+#include <vector>
+#include <string>
+#include "operator.hpp"
+#include "dispatcher.hpp"
+#include "tensor.hpp"
+
+namespace executor {
+class ProfilingTracer {
+ public:
+  ProfilingTracer() : TotalTime(0), iterations_during() {}
+
+  void BeginTrace(const std::string& filepath = "result.json") {
+    OutputStream.open(filepath);
+    TracerHeader();
+  }
+
+  void EndTrace() {
+    TracerFooter();
+    OutputStream.close();
+  }
+
+  void WriteProfile(const vector<shared_ptr<Dispatcher>>& operators_, const vector<vector<Tensor*>>& input_tensors,
+                    const vector<vector<Tensor*>>& output_tensors) {
+    IterationTotalTime(operators_);
+    OutputStream << "{";
+    OutputStream << "\"cat\":\"inference\",";
+    OutputStream << "\"dur\":" << TotalTime * 1000 << ",";
+    OutputStream << "\"name\":\"" << "model_inference" << "\",";
+    OutputStream << "\"ph\":\"X\",";
+    OutputStream << "\"pid\": 0,";
+    OutputStream << "\"tid\": \"" << "inference" << "\",";
+    OutputStream << "\"ts\": " << 0;
+    OutputStream << "}";
+    float iter_start = 0;
+    for (int i = 0; i < operators_[1]->latency().size(); ++i) {
+      OutputStream << ",";
+      OutputStream << "{";
+      OutputStream << "\"cat\":\"" << "iteration" << "\",";
+      OutputStream << "\"dur\":" << iterations_during[i] * 1000 << ",";
+      OutputStream << "\"name\":\"" << "Iteration" << i << "\",";
+      OutputStream << "\"ph\":\"X\",";
+      OutputStream << "\"pid\": 0,";
+      OutputStream << "\"tid\": \"" << "Iteration" << "\",";
+      OutputStream << "\"ts\":" << iter_start * 1000;
+      OutputStream << "}";
+      float op_start = 0;
+      for (int j = 1; j < operators_.size() - 1; ++j) {
+        const shared_ptr<Dispatcher>& op = operators_[j];
+        vector<Tensor*> its = input_tensors[j];
+        vector<Tensor*> ots = output_tensors[j];
+        OutputStream << ",";
+        OutputStream << "{";
+        OutputStream << "\"cat\":\"" << op->type() << "\",";
+        OutputStream << "\"dur\":" << op->latency()[i] * 1000 + op->get_reshape_time()[i] * 1000 << ",";
+        OutputStream << "\"name\":\"" << op->name() << "\",";
+        OutputStream << "\"ph\":\"X\",";
+        OutputStream << "\"pid\": 0,";
+        OutputStream << "\"tid\": \"" << "Operator" << "\",";
+        OutputStream << "\"ts\":" << (op_start + iter_start) * 1000 << ",";
+        OutputStream << "\"args\": {";
+        if (!op->post_op().empty()) {
+          OutputStream << "\"post_op\" :\"" << op->post_op() << "\",";
+        }
+        OutputStream << "\"reshape_time\" :\"" << op->get_reshape_time()[i] << "ms" << "\",";
+        OutputStream << "\"forward_time\" :\"" << op->latency()[i] << "ms" << "\",";
+        OutputStream << "\"input_tensor_name\" :\"" << TensorsName(its) << "\",";
+        OutputStream << "\"input_type\" :\"" << TensorsType(its) << "\",";
+        OutputStream << "\"input_shape\" : " << TensorsShape(op->get_it_shape(), i, its.size()) << ",";
+        OutputStream << "\"output_tensor_name\" :\"" << TensorsName(ots) << "\",";
+        OutputStream << "\"output_type\" :\"" << TensorsType(ots) << "\",";
+        OutputStream << "\"output_shape\" : " << TensorsShape(op->get_ot_shape(), i, 1);
+        OutputStream << "}";
+        OutputStream << "}";
+        OutputStream.flush();
+        op_start += (op->latency()[i] + op->get_reshape_time()[i]);
+      }
+      iter_start += iterations_during[i];
+    }
+  }
+
+  void TracerHeader() {
+    OutputStream << "{\"otherData\": {}, \"traceEvents\": [";
+    OutputStream.flush();
+  }
+
+  void TracerFooter() {
+    OutputStream << "]}";
+    OutputStream.flush();
+  }
+
+  // get total time and per-iteration time
+  void IterationTotalTime(const vector<shared_ptr<Dispatcher>>& operators_) {
+    for (int i = 0; i < operators_[1]->latency().size(); ++i) {
+      float PerIterTime = 0;
+      for (int j = 1; j < operators_.size() - 1; ++j) {
+        PerIterTime += operators_[j]->get_reshape_time()[i];
+        PerIterTime += operators_[j]->latency()[i];
+      }
+      iterations_during.emplace_back(PerIterTime);
+      TotalTime += PerIterTime;
+    }
+  }
+
+  std::string TensorsName(const vector<Tensor*>& Tensors) {
+    std::string result = "";
+    for (int i = 0; i < Tensors.size(); ++i) {
+      if (i == Tensors.size() - 1) {
+        result += Tensors[i]->name();
+      } else {
+        result += Tensors[i]->name();
+        result += ",";
+      }
+    }
+    return result;
+  }
+
+  std::string TensorsType(const vector<Tensor*>& Tensors) {
+    std::string result = "";
+    for (int i = 0; i < Tensors.size(); ++i) {
+      if (i == Tensors.size() - 1) {
+        result += Tensors[i]->dtype();
+      } else {
+        result += Tensors[i]->dtype();
+        result += ",";
+      }
+    }
+    return result;
+  }
+
+  std::string TensorsShape(const vector<vector<int64_t>>& tensor_shape,
+                           int iteration_time, int tensor_size) {
+    std::string result = "\"";
+    for (int i = iteration_time * tensor_size; i < (iteration_time + 1) * tensor_size; ++i) {
+      if (i == (iteration_time + 1) * tensor_size - 1) {
+        for (int j = 0; j < tensor_shape[i].size(); ++j) {
+          if (j == tensor_shape[i].size() - 1) {
+            result += std::to_string(tensor_shape[i][j]);
+          } else {
+            result += std::to_string(tensor_shape[i][j]);
+            result += "*";
+          }
+        }
+      } else {
+        for (int j = 0; j < tensor_shape[i].size(); ++j) {
+          if (j == tensor_shape[i].size() - 1) {
+            result += std::to_string(tensor_shape[i][j]);
+            result += ",";
+          } else {
+            result += std::to_string(tensor_shape[i][j]);
+            result += "*";
+          }
+        }
+      }
+    }
+    result += "\"";
+    return result;
+  }
+
+ protected:
+  std::ofstream OutputStream;
+  float TotalTime;
+  vector<float> iterations_during;
+};
+
+}  // namespace executor
+
+#endif  // ENGINE_EXECUTOR_INCLUDE_PROFILING_TRACE_HPP_
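
The tracer's whole lifecycle is three calls, which is exactly how Model::Profiling drives it in the model.cpp diff below. A minimal standalone sketch of that sequence; the function name DumpTrace and the output path trace.json are illustrative, and the three containers are assumed to be Model's operators_/input_vecs_/output_vecs_ members after a profiled run:

// Sketch: the BeginTrace / WriteProfile / EndTrace sequence used by
// Model::Profiling. Each profiled Forward() must already have recorded
// latency, input/output shapes, and reshape time on every operator.
#include <memory>
#include <vector>
#include "profiling_trace.hpp"

void DumpTrace(const std::vector<std::shared_ptr<executor::Dispatcher>>& operators_,
               const std::vector<std::vector<executor::Tensor*>>& input_vecs_,
               const std::vector<std::vector<executor::Tensor*>>& output_vecs_) {
  executor::ProfilingTracer tracer;
  tracer.BeginTrace("trace.json");  // open the file and write the traceEvents header
  tracer.WriteProfile(operators_, input_vecs_, output_vecs_);  // one duration event per op per iteration
  tracer.EndTrace();  // close the JSON array and the stream
}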

nlp_toolkit/backends/neural_engine/executor/src/model.cpp

Lines changed: 40 additions & 6 deletions
@@ -249,6 +249,7 @@ void Model::SetDispatchKernel(const bool& reshape_model) {
 }

 vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
+  float It_start = Time("start");
   CHECK_EQ(input_data.size(), model_input_tensors_.size())
       << "input data size not equal with model input tensor size....";
   // if we want use dynamic input data shape at run time, we should check the
@@ -275,16 +276,29 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
     }
   }
   for (int i = 0; i < input_data.size(); ++i) {
-  // model_input_tesnor_[i]->free_data();
-  model_input_tensors_[i]->set_data(input_data[i].mutable_data());
-  model_input_tensors_[i]->set_shape(input_data[i].shape());
+    // model_input_tesnor_[i]->free_data();
+    model_input_tensors_[i]->set_data(input_data[i].mutable_data());
+    model_input_tensors_[i]->set_shape(input_data[i].shape());
   }

   SetDispatchKernel(reshape_model);

   if (!is_dispatcher_tuning_) {
-    if (reshape_model) {
-      for (int i = 0; i < operators_.size(); ++i) {
+    if (reshape_model && engine_profiling_) {
+      for (int i = 0; i < operators_.size(); ++i) {
+        LOG(INFO) << "operator " << operators_[i]->name() << " gonna reshape with type " << operators_[i]->type();
+        // get reshape time for profiling
+        float start = Time("start");
+        operators_[i]->Reshape(input_vecs_[i], output_vecs_[i]);
+        float end = Time("end");
+        operators_[i]->set_reshape_time(end - start);
+      }
+    } else if (!reshape_model && engine_profiling_) {
+      for (int i = 0; i < operators_.size(); ++i) {
+        operators_[i]->set_reshape_time(0);
+      }
+    } else if (reshape_model) {
+      for (int i = 0; i < operators_.size(); ++i) {
         LOG(INFO) << "operator " << operators_[i]->name() << " gonna reshape with type " << operators_[i]->type();
         operators_[i]->Reshape(input_vecs_[i], output_vecs_[i]);
       }
@@ -298,6 +312,12 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
         tp.commitTask(std::bind(&executor::Dispatcher::Forward, operators_[i], input_vecs_[i], output_vecs_[i]));
         float end = Time("end");
         operators_[i]->set_latency(end - start);
+        for (int j = 0; j < input_vecs_[i].size(); ++j) {
+          operators_[i]->set_it_shape(input_vecs_[i][j]->shape());
+        }
+        if (i != operators_.size() - 1) {
+          operators_[i]->set_ot_shape(output_vecs_[i][0]->shape());  // the last operator's output does not exist
+        }
         LOG(INFO) << "operator: " << operators_[i]->name() << ", latency: " << end - start << " ms";
         if (thread_count >= multi_stream_tasks_[i]) {
           tp.waitAllTaskRunOver();
@@ -308,7 +328,14 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
         float start = Time("start");
         operators_[i]->Forward(input_vecs_[i], output_vecs_[i]);
         float end = Time("end");
+        // for profiling
         operators_[i]->set_latency(end - start);
+        for (int j = 0; j < input_vecs_[i].size(); ++j) {
+          operators_[i]->set_it_shape(input_vecs_[i][j]->shape());
+        }
+        if (i != operators_.size() - 1) {
+          operators_[i]->set_ot_shape(output_vecs_[i][0]->shape());
+        }
         LOG(INFO) << "operator: " << operators_[i]->name() << ", latency: " << end - start << " ms";
       }
     }
@@ -328,12 +355,14 @@ vector<Tensor>& Model::Forward(vector<Tensor>& input_data) {
       }
     }
   }
+  float Iter_end = Time("end");
   return this->output_tensors();
 }

 void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
   // in multi instance case, dump profiling for each instance
   LOG(INFO) << "Neural engine profiling ...";
+  ipc::shared_memory_object::remove(space_name);
   ipc::managed_shared_memory shm(ipc::open_or_create, space_name, 1024);
   int* inst_count = shm.find_or_construct<int>(count_name)(0);
   std::string profiling_dir = "engine_profiling_";
@@ -384,6 +413,12 @@ void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
     // for sparse performance estimate
     ProfilingSparseEstimate(fp, op, average_latency);
   }
+  std::string tracer_file = profiling_dir + "/profiling_" + ch_curr_time \
+                            + "_" + std::to_string((*inst_count)++) + ".json";
+  ProfilingTracer Tracer = ProfilingTracer();
+  Tracer.BeginTrace(tracer_file);
+  Tracer.WriteProfile(operators_, input_vecs_, output_vecs_);
+  Tracer.EndTrace();
   // dense total latency
   fprintf(fp, ",,,,,,,,,,,%s,%.3f,",
           "total latency(ms)", total_latency);
@@ -432,7 +467,6 @@ void Model::Profiling(char* space_name, char* count_name, char* mtx_name, int warm_up) {
     ipc::shared_memory_object::remove(space_name);
   }
 }
-
 void Model::ProfilingSparse(FILE* fp) {
   // weight shape, perf ratio, others
   fprintf(fp, "%s,%s,%s,%s,%s\n", "weight shape", "90% 4x1 perf ratio",
