@@ -104,8 +104,8 @@ void setup_input_tensors(
   for (size_t i = 0; i < inputs.size(); i++) {
     std::string name = compiled_engine->in_binding_names[i];
 
-    TORCHTRT_CHECK(
-        inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+    // TORCHTRT_CHECK(
+    //     inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
 
     auto expected_type =
         util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
@@ -202,30 +202,30 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
   auto run_standard_execution = [&]() {
-    bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
-    bool shape_changed = _validate_shapes(inputs, compiled_engine);
+    bool cudagraphs_enabled = false; // (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
+    bool shape_changed = false; // _validate_shapes(inputs, compiled_engine);
 
     // Whether cudagraphs needs to record the graph on this pass
     auto result = compiled_engine->runtime_states.set_runtime_states(
         cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
 
-    bool need_cudagraphs_record = std::get<0>(result);
+    bool need_cudagraphs_record = false; // std::get<0>(result);
     bool can_use_pre_allocated_outputs = std::get<1>(result);
     bool need_cudagraphs_reset = std::get<2>(result);
 
-    if (need_cudagraphs_reset) {
-      compiled_engine->cudagraph.reset();
-    }
+    // if (need_cudagraphs_reset) {
+    //   compiled_engine->cudagraph.reset();
+    // }
 
-    std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+    std::vector<at::Tensor> outputs;
 
     // Initialize inputs and outputs to be available throughout the succeeding scopes
     { // Input Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        input_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   input_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
+      // }
 
       setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
       // Check if input shapes can be inferred.
@@ -240,72 +240,71 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }
 
     { // Output Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        output_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-      }
+      bool new_outputs = false;
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   output_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+      // }
       if (can_use_pre_allocated_outputs) {
         outputs = compiled_engine->pre_allocated_outputs;
       } else {
-        outputs = create_output_tensors(compiled_engine);
+        if (compiled_engine->allocated_outputs.size() == 0) {
+          compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+          std::cout << "new_outputs" << std::endl;
+          new_outputs = true;
+        }
+        outputs = compiled_engine->allocated_outputs;
       }
 
-      for (auto output_indices : compiled_engine->out_binding_map) {
-        auto pyt_idx = output_indices.second;
-        std::string name = compiled_engine->out_binding_names[pyt_idx];
-        if (need_cudagraphs_record) {
-          // If we are recording the cuda graph then we need to update the persistent output buffer
-          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-        }
+      if (new_outputs) {
+        for (auto output_indices : compiled_engine->out_binding_map) {
+          auto pyt_idx = output_indices.second;
+          std::string name = compiled_engine->out_binding_names[pyt_idx];
+          if (need_cudagraphs_record) {
+            // If we are recording the cuda graph then we need to update the persistent output buffer
+            compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+          }
 
-        if (cudagraphs_enabled) {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(
-                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
-        } else {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
+          if (cudagraphs_enabled) {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(
+                    name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          } else {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          }
         }
       }
     }
 
-    auto current_device_id = -1;
-    if (inputs.size() > 0) {
-      current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else if (outputs.size() > 0) {
-      current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
-    }
-
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }
+    // auto current_device_id = -1;
+    // if (inputs.size() > 0) {
+    //   current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
+    //   if (current_device_id != compiled_engine->current_device_id) {
+    //     compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+    //   }
+    // }
 
     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
 
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }
+
 
-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);
 
       if (!cudagraphs_enabled) {
         // Direct execution uses the caller buffers directly
-        compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+        compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
       } else {
         if (need_cudagraphs_record) {
           // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
-          c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
+          c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
           compiled_engine->cudagraph.capture_begin();
           compiled_engine->exec_ctx->enqueueV3(recording_stream);
           compiled_engine->cudagraph.capture_end();
@@ -321,27 +320,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     } // End engine execution (resets to caller stream)
 
     // Create output buffer for next execution of graph or trt context.
-    if (compiled_engine->use_pre_allocated_outputs) {
-      compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
-    }
-
-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    if (cudagraphs_enabled) {
-      // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
-      for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
-        outputs[o].copy_(compiled_engine->output_buffers[o], false);
-      }
-    }
-
-    if (compiled_engine->profile_execution) {
-      LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
-      dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
-      compiled_engine->dump_engine_layer_info();
-    }
+    // if (compiled_engine->use_pre_allocated_outputs) {
+    //   compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+    // }
+
+    // if (cudagraphs_enabled) {
+    //   // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
+    //   for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
+    //     outputs[o].copy_(compiled_engine->output_buffers[o], false);
+    //   }
+    // }
+
+    // if (compiled_engine->profile_execution) {
+    //   LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
+    //   dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
+    //   compiled_engine->dump_engine_layer_info();
+    // }
 
     return outputs;
   };
@@ -378,45 +372,31 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     auto current_device_id = -1;
     if (inputs.size() > 0) {
       current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else {
-      current_device_id = at::cuda::current_device();
-    }
+      if (current_device_id != compiled_engine->current_device_id) {
+        compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+
+      }
+    }
 
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }
 
     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
-
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }
 
-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }
 
       // Direct execution uses the caller buffers directly
-      compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+      compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
 
     } // End engine execution (resets to caller stream)
 
-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-    if (compiled_engine->profile_execution) {
-      output_profiler_guard =
-          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-    }
+    // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+    // if (compiled_engine->profile_execution) {
+    //   output_profiler_guard =
+    //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+    // }
     std::vector<at::Tensor> outputs;
     for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
       auto name = compiled_engine->out_binding_names[i];
@@ -476,45 +456,45 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
     }
 
-    RTDevice curr_device = get_current_device();
-    LOG_DEBUG("Current Device: " << curr_device);
-
-    // Generic Target Device Prefix
-    std::string target_device = "cuda:";
-
-    if (is_switch_required(curr_device, compiled_engine->device_info)) {
-      // Scan through available CUDA devices and set the CUDA device context correctly
-      RTDevice device =
-          select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
-      set_rt_device(device);
-
-      // Target device is new device
-      target_device += std::to_string(device.id);
-
-      for (auto& in : inputs) {
-        in = in.to(torch::Device(target_device));
-      }
-    } else {
-      // Target device is current device
-      target_device += std::to_string(curr_device.id);
-    }
-
-    // For each input, ensure its current device is the desired target device
-    for (size_t i = 0; i < inputs.size(); i++) {
-      at::Tensor* in = &inputs[i];
-      std::string current_tensor_device = in->device().str();
-
-      // If current device string does not match target device, display warning and move tensor accordingly
-      if (current_tensor_device != target_device) {
-        LOG_WARNING(
-            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
-                     << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
-                     << "for performance considerations, ensure your inputs are all on GPU "
-                     << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
-                     << "warning persists.");
-        *in = in->to(torch::Device(target_device));
-      }
-    }
+    // RTDevice curr_device = get_current_device();
+    // LOG_DEBUG("Current Device: " << curr_device);
+
+    // // Generic Target Device Prefix
+    // std::string target_device = "cuda:";
+
+    // if (is_switch_required(curr_device, compiled_engine->device_info)) {
+    //   // Scan through available CUDA devices and set the CUDA device context correctly
+    //   RTDevice device =
+    //       select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
+    //   set_rt_device(device);
+
+    //   // Target device is new device
+    //   target_device += std::to_string(device.id);
+
+    //   for (auto& in : inputs) {
+    //     in = in.to(torch::Device(target_device));
+    //   }
+    // } else {
+    //   // Target device is current device
+    //   target_device += std::to_string(curr_device.id);
+    // }
+
+    // // For each input, ensure its current device is the desired target device
+    // for (size_t i = 0; i < inputs.size(); i++) {
+    //   at::Tensor* in = &inputs[i];
+    //   std::string current_tensor_device = in->device().str();
+
+    //   // If current device string does not match target device, display warning and move tensor accordingly
+    //   if (current_tensor_device != target_device) {
+    //     LOG_WARNING(
+    //         "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+    //                  << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
+    //                  << "for performance considerations, ensure your inputs are all on GPU "
+    //                  << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+    //                  << "warning persists.");
+    //     *in = in->to(torch::Device(target_device));
+    //   }
+    // }
   }
 
   if (compiled_engine->requires_output_allocator) { // engine requires OA
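
Note: the pattern these hunks converge on is to query the current CUDA stream only when the inputs move to a new device, and to allocate output tensors once and reuse them across calls. The sketch below shows that pattern in isolation, assuming a CUDA-enabled libtorch build; `CachedRuntimeState` and `refresh_stream_if_moved` are hypothetical names (in the diff this state lives on `TRTEngine` as `stream`, `current_device_id`, and `allocated_outputs`), and the sketch updates the cached device id after refreshing the stream, a step that is not visible in the hunks shown here.

```cpp
// Minimal sketch of the stream/output caching pattern above.
// Hypothetical names; requires a CUDA-enabled libtorch build.
#include <ATen/ATen.h>
#include <c10/cuda/CUDAStream.h>

#include <vector>

struct CachedRuntimeState {
  int current_device_id = -1; // -1 means "no stream cached yet"
  // Stream reused across calls instead of being re-queried per execution.
  c10::cuda::CUDAStream stream = c10::cuda::getDefaultCUDAStream();
  // Output tensors allocated once and reused, mirroring allocated_outputs.
  std::vector<at::Tensor> allocated_outputs;
};

// Re-query the current stream only when the inputs have moved to a different
// device; on the steady-state path this is a single integer comparison
// instead of a cudart call on every execution.
void refresh_stream_if_moved(CachedRuntimeState& state, const std::vector<at::Tensor>& inputs) {
  if (!inputs.empty()) {
    int device_id = inputs[0].device().index(); // avoids a call to cudart
    if (device_id != state.current_device_id) {
      state.stream = c10::cuda::getCurrentCUDAStream(device_id);
      state.current_device_id = device_id; // assumption: the real code updates this elsewhere
    }
  }
}
```

The trade-off, visible in the removed lines, is that the explicit caller/engine stream handoff via `at::cuda::CUDAEvent` is gone, so this sketch assumes the engine always runs on the caller's current stream.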
0 commit comments