Skip to content

Commit b2ef228

Browse files
committed
cpp optimization does not work
1 parent a537d9f commit b2ef228

File tree

3 files changed

+136
-152
lines changed

3 files changed

+136
-152
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,9 @@ TRTEngine::TRTEngine(
237237
out_binding_names[pyt_idx] = binding_name;
238238
}
239239
num_io = std::make_pair(inputs_size, outputs);
240+
241+
this->current_device_id = at::cuda::current_device();
242+
this->stream = c10::cuda::getCurrentCUDAStream(this->current_device_id);
240243
}
241244

242245
#ifndef NDEBUG

core/runtime/TRTEngine.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,13 +169,14 @@ struct TRTEngine : torch::CustomClassHolder {
169169

170170
// CUDAGraph-Related Functionality
171171
at::cuda::CUDAGraph cudagraph = {};
172-
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
173-
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
172+
at::cuda::CUDAStream stream = c10::cuda::getDefaultCUDAStream();
173+
int64_t current_device_id = at::cuda::current_device();
174174
std::vector<at::Tensor> input_buffers = {};
175175
std::vector<at::Tensor> output_buffers = {};
176176
std::string shape_key = "None";
177177
bool use_pre_allocated_outputs = false;
178178
std::vector<at::Tensor> pre_allocated_outputs;
179+
std::vector<at::Tensor> allocated_outputs;
179180

180181
// Output Allocator-Related Functionality
181182
bool requires_output_allocator = false; // engine requires output allocator

core/runtime/execute_engine.cpp

Lines changed: 130 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ void setup_input_tensors(
104104
for (size_t i = 0; i < inputs.size(); i++) {
105105
std::string name = compiled_engine->in_binding_names[i];
106106

107-
TORCHTRT_CHECK(
108-
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
107+
// TORCHTRT_CHECK(
108+
// inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
109109

110110
auto expected_type =
111111
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
@@ -202,30 +202,30 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
202202

203203
std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
204204
auto run_standard_execution = [&]() {
205-
bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
206-
bool shape_changed = _validate_shapes(inputs, compiled_engine);
205+
bool cudagraphs_enabled = false; //(CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
206+
bool shape_changed = false; //_validate_shapes(inputs, compiled_engine);
207207

208208
// Whether cudagraphs needs to record the graph on this pass
209209
auto result = compiled_engine->runtime_states.set_runtime_states(
210210
cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
211211

212-
bool need_cudagraphs_record = std::get<0>(result);
212+
bool need_cudagraphs_record = false; //std::get<0>(result);
213213
bool can_use_pre_allocated_outputs = std::get<1>(result);
214214
bool need_cudagraphs_reset = std::get<2>(result);
215215

216-
if (need_cudagraphs_reset) {
217-
compiled_engine->cudagraph.reset();
218-
}
216+
// if (need_cudagraphs_reset) {
217+
// compiled_engine->cudagraph.reset();
218+
// }
219219

220-
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
220+
std::vector<at::Tensor> outputs;
221221

222222
// Initialize inputs and outputs to be available throughout the succeeding scopes
223223
{ // Input Setup
224-
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
225-
if (compiled_engine->profile_execution) {
226-
input_profiler_guard =
227-
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
228-
}
224+
// std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
225+
// if (compiled_engine->profile_execution) {
226+
// input_profiler_guard =
227+
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
228+
// }
229229

230230
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
231231
// Check if input shapes can be inferred.
@@ -240,72 +240,71 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
240240
}
241241

242242
{ // Output Setup
243-
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
244-
if (compiled_engine->profile_execution) {
245-
output_profiler_guard =
246-
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
247-
}
243+
bool new_outputs = false;
244+
// std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
245+
// if (compiled_engine->profile_execution) {
246+
// output_profiler_guard =
247+
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
248+
// }
248249
if (can_use_pre_allocated_outputs) {
249250
outputs = compiled_engine->pre_allocated_outputs;
250251
} else {
251-
outputs = create_output_tensors(compiled_engine);
252+
if (compiled_engine->allocated_outputs.size() == 0) {
253+
compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
254+
std::cout << "new_outputs" << std::endl;
255+
new_outputs = true;
256+
}
257+
outputs = compiled_engine->allocated_outputs;
252258
}
253259

254-
for (auto output_indices : compiled_engine->out_binding_map) {
255-
auto pyt_idx = output_indices.second;
256-
std::string name = compiled_engine->out_binding_names[pyt_idx];
257-
if (need_cudagraphs_record) {
258-
// If we are recording the cuda graph then we need to update the persistent output buffer
259-
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
260-
}
260+
if (new_outputs) {
261+
for (auto output_indices : compiled_engine->out_binding_map) {
262+
auto pyt_idx = output_indices.second;
263+
std::string name = compiled_engine->out_binding_names[pyt_idx];
264+
if (need_cudagraphs_record) {
265+
// If we are recording the cuda graph then we need to update the persistent output buffer
266+
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
267+
}
261268

262-
if (cudagraphs_enabled) {
263-
TORCHTRT_CHECK(
264-
compiled_engine->exec_ctx->setTensorAddress(
265-
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
266-
"Error while setting the output tensor address");
267-
} else {
268-
TORCHTRT_CHECK(
269-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
270-
"Error while setting the output tensor address");
269+
if (cudagraphs_enabled) {
270+
TORCHTRT_CHECK(
271+
compiled_engine->exec_ctx->setTensorAddress(
272+
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
273+
"Error while setting the output tensor address");
274+
} else {
275+
TORCHTRT_CHECK(
276+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
277+
"Error while setting the output tensor address");
278+
}
271279
}
272280
}
273281
}
274282

275-
auto current_device_id = -1;
276-
if (inputs.size() > 0) {
277-
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
278-
} else if (outputs.size() > 0) {
279-
current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
280-
}
281-
282-
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
283-
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
284-
// Create a new stream if the engine stream is the default stream
285-
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
286-
}
283+
// auto current_device_id = -1;
284+
// if (inputs.size() > 0) {
285+
// current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
286+
// if (current_device_id != compiled_engine->current_device_id) {
287+
// compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
288+
// }
289+
// }
287290

288291
{ // Engine Execution (execute on engine stream)
289-
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
290292

291-
std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
292-
if (compiled_engine->profile_execution) {
293-
enqueue_profiler_guard =
294-
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
295-
}
293+
// std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
294+
// if (compiled_engine->profile_execution) {
295+
// enqueue_profiler_guard =
296+
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
297+
// }
298+
296299

297-
// Block engine stream until results are available on caller stream
298-
at::cuda::CUDAEvent caller_exec_complete;
299-
caller_exec_complete.record(compiled_engine->caller_stream);
300-
caller_exec_complete.block(compiled_engine->engine_stream);
301300

302301
if (!cudagraphs_enabled) {
303302
// Direct execution uses the caller buffers directly
304-
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
303+
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
305304
} else {
306305
if (need_cudagraphs_record) {
307306
// If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
308-
c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
307+
c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
309308
compiled_engine->cudagraph.capture_begin();
310309
compiled_engine->exec_ctx->enqueueV3(recording_stream);
311310
compiled_engine->cudagraph.capture_end();
@@ -321,27 +320,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
321320
} // End engine execution (resets to caller stream)
322321

323322
// Create output buffer for next execution of graph or trt context.
324-
if (compiled_engine->use_pre_allocated_outputs) {
325-
compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
326-
}
327-
328-
// Block caller stream until engine execution is complete
329-
at::cuda::CUDAEvent trt_exec_complete;
330-
trt_exec_complete.record(compiled_engine->engine_stream);
331-
trt_exec_complete.block(compiled_engine->caller_stream);
332-
333-
if (cudagraphs_enabled) {
334-
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
335-
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
336-
outputs[o].copy_(compiled_engine->output_buffers[o], false);
337-
}
338-
}
339-
340-
if (compiled_engine->profile_execution) {
341-
LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
342-
dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
343-
compiled_engine->dump_engine_layer_info();
344-
}
323+
// if (compiled_engine->use_pre_allocated_outputs) {
324+
// compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
325+
// }
326+
327+
// if (cudagraphs_enabled) {
328+
// // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
329+
// for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
330+
// outputs[o].copy_(compiled_engine->output_buffers[o], false);
331+
// }
332+
// }
333+
334+
// if (compiled_engine->profile_execution) {
335+
// LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
336+
// dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
337+
// compiled_engine->dump_engine_layer_info();
338+
// }
345339

346340
return outputs;
347341
};
@@ -378,45 +372,31 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
378372
auto current_device_id = -1;
379373
if (inputs.size() > 0) {
380374
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
381-
} else {
382-
current_device_id = at::cuda::current_device();
383-
}
375+
if (current_device_id != compiled_engine->current_device_id) {
376+
compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
377+
378+
}
379+
}
384380

385-
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
386-
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
387-
// Create a new stream if the engine stream is the default stream
388-
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
389-
}
390381

391382
{ // Engine Execution (execute on engine stream)
392-
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
393-
394-
std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
395-
if (compiled_engine->profile_execution) {
396-
enqueue_profiler_guard =
397-
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
398-
}
399383

400-
// Block engine stream until results are available on caller stream
401-
at::cuda::CUDAEvent caller_exec_complete;
402-
caller_exec_complete.record(compiled_engine->caller_stream);
403-
caller_exec_complete.block(compiled_engine->engine_stream);
384+
// std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
385+
// if (compiled_engine->profile_execution) {
386+
// enqueue_profiler_guard =
387+
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
388+
// }
404389

405390
// Direct execution uses the caller buffers directly
406-
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
391+
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
407392

408393
} // End engine execution (resets to caller stream)
409394

410-
// Block caller stream until engine execution is complete
411-
at::cuda::CUDAEvent trt_exec_complete;
412-
trt_exec_complete.record(compiled_engine->engine_stream);
413-
trt_exec_complete.block(compiled_engine->caller_stream);
414-
415-
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
416-
if (compiled_engine->profile_execution) {
417-
output_profiler_guard =
418-
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
419-
}
395+
// std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
396+
// if (compiled_engine->profile_execution) {
397+
// output_profiler_guard =
398+
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
399+
// }
420400
std::vector<at::Tensor> outputs;
421401
for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
422402
auto name = compiled_engine->out_binding_names[i];
@@ -476,45 +456,45 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
476456
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
477457
}
478458

479-
RTDevice curr_device = get_current_device();
480-
LOG_DEBUG("Current Device: " << curr_device);
481-
482-
// Generic Target Device Prefix
483-
std::string target_device = "cuda:";
484-
485-
if (is_switch_required(curr_device, compiled_engine->device_info)) {
486-
// Scan through available CUDA devices and set the CUDA device context correctly
487-
RTDevice device =
488-
select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
489-
set_rt_device(device);
490-
491-
// Target device is new device
492-
target_device += std::to_string(device.id);
493-
494-
for (auto& in : inputs) {
495-
in = in.to(torch::Device(target_device));
496-
}
497-
} else {
498-
// Target device is current device
499-
target_device += std::to_string(curr_device.id);
500-
}
501-
502-
// For each input, ensure its current device is the desired target device
503-
for (size_t i = 0; i < inputs.size(); i++) {
504-
at::Tensor* in = &inputs[i];
505-
std::string current_tensor_device = in->device().str();
506-
507-
// If current device string does not match target device, display warning and move tensor accordingly
508-
if (current_tensor_device != target_device) {
509-
LOG_WARNING(
510-
"Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
511-
<< " but should be on " << target_device << ". This tensor is being moved by the runtime but "
512-
<< "for performance considerations, ensure your inputs are all on GPU "
513-
<< "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
514-
<< "warning persists.");
515-
*in = in->to(torch::Device(target_device));
516-
}
517-
}
459+
// RTDevice curr_device = get_current_device();
460+
// LOG_DEBUG("Current Device: " << curr_device);
461+
462+
// // Generic Target Device Prefix
463+
// std::string target_device = "cuda:";
464+
465+
// if (is_switch_required(curr_device, compiled_engine->device_info)) {
466+
// // Scan through available CUDA devices and set the CUDA device context correctly
467+
// RTDevice device =
468+
// select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
469+
// set_rt_device(device);
470+
471+
// // Target device is new device
472+
// target_device += std::to_string(device.id);
473+
474+
// for (auto& in : inputs) {
475+
// in = in.to(torch::Device(target_device));
476+
// }
477+
// } else {
478+
// // Target device is current device
479+
// target_device += std::to_string(curr_device.id);
480+
// }
481+
482+
// // For each input, ensure its current device is the desired target device
483+
// for (size_t i = 0; i < inputs.size(); i++) {
484+
// at::Tensor* in = &inputs[i];
485+
// std::string current_tensor_device = in->device().str();
486+
487+
// // If current device string does not match target device, display warning and move tensor accordingly
488+
// if (current_tensor_device != target_device) {
489+
// LOG_WARNING(
490+
// "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
491+
// << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
492+
// << "for performance considerations, ensure your inputs are all on GPU "
493+
// << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
494+
// << "warning persists.");
495+
// *in = in->to(torch::Device(target_device));
496+
// }
497+
// }
518498
}
519499

520500
if (compiled_engine->requires_output_allocator) { // engine requires OA

0 commit comments

Comments
 (0)