@@ -284,261 +284,3 @@ def __del__(self):
284284 """Close profiler's stream."""
285285 if self .output_file :
286286 self .output_file .close ()
287-
288-
289- class PyTorchProfiler (BaseProfiler ):
290-
291- PROFILED_FUNCTIONS = ("training_step_and_backward" , "validation_step" , "test_step" )
292- AVAILABLE_SORT_KEYS = (
293- "cpu_time" ,
294- "cuda_time" ,
295- "cpu_time_total" ,
296- "cuda_time_total" ,
297- "cpu_memory_usage" ,
298- "cuda_memory_usage" ,
299- "self_cpu_memory_usage" ,
300- "self_cuda_memory_usage" ,
301- "count" ,
302- )
303-
304- def __init__ (
305- self ,
306- output_filename : Optional [str ] = None ,
307- enabled : bool = True ,
308- use_cuda : bool = False ,
309- record_shapes : bool = False ,
310- profile_memory : bool = False ,
311- group_by_input_shapes : bool = False ,
312- with_stack : bool = False ,
313- use_kineto : bool = False ,
314- use_cpu : bool = True ,
315- emit_nvtx : bool = False ,
316- export_to_chrome : bool = False ,
317- path_to_export_trace : str = None ,
318- row_limit : int = 20 ,
319- sort_by_key : Optional [str ] = None ,
320- profiled_functions : Optional [List ] = None ,
321- local_rank : Optional [int ] = None ,
322- ):
323- """
324- This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
325- different operators inside your model - both on the CPU and GPU
326-
327- Args:
328-
329- output_filename: optionally save profile results to file instead of printing
330- to std out when training is finished. When using ``ddp``,
331- each rank will stream the profiled operation to their own file
332- with the extension ``_{rank}.txt``
333-
334- enabled: Setting this to False makes this context manager a no-op.
335-
336- use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
337- Adds approximately 4us of overhead to each tensor operation.
338-
339- record_shapes: If shapes recording is set, information about input dimensions will be collected.
340-
341- profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0)
342-
343- group_by_input_shapes: Include operator input shapes and group calls by shape.
344-
345- with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0)
346-
347- use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0)
348-
349- use_cpu: use_kineto=True and can be used to lower the overhead
350- for GPU-only profiling (Introduced in PyTorch 1.8.0)
351-
352- emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
353- Run::
354-
355- nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
356-
357- To visualize, you can either use::
358-
359- nvvp trace_name.prof
360- torch.autograd.profiler.load_nvprof(path)
361-
362- export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
363- It will generate a ``.json`` file which can be read by Chrome.
364-
365- path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``.
366- By default, it will be save where the file being is being run.
367-
368- row_limit: Limit the number of rows in a table, `0` is a special value that
369- removes the limit completely.
370-
371- sort_by_key: Keys to sort out profiled table
372-
373- profiled_functions: list of profiled functions which will create a context manager on.
374- Any other will be pass through.
375-
376- local_rank: When running in distributed setting, local_rank is used for each process
377- to write to their own file if `output_fname` is provided.
378- """
379-
380- self .profiled_actions = {}
381- self .enabled = enabled
382- self .profiled_functions = profiled_functions or self .PROFILED_FUNCTIONS
383- self .use_cuda = use_cuda
384- self .record_shapes = record_shapes
385- self .profile_memory = profile_memory
386- self .sort_by_key = sort_by_key or ("cuda_time_total" if self .use_cuda else "cpu_time_total" )
387- self .with_stack = with_stack
388- self .group_by_input_shapes = group_by_input_shapes and record_shapes
389- self .use_kineto = use_kineto
390- self .use_cpu = use_cpu
391- self .row_limit = row_limit
392- self .emit_nvtx = emit_nvtx
393- self .export_to_chrome = export_to_chrome
394- self .path_to_export_trace = path_to_export_trace
395-
396- if export_to_chrome and path_to_export_trace is None :
397- rank_zero_warn (
398- "The exported trace would be save locally as `path_to_export_trace` is empty."
399- " Note: Each functions will generate its own traced file."
400- )
401-
402- if self .sort_by_key not in self .AVAILABLE_SORT_KEYS :
403- raise MisconfigurationException (
404- f"Found sort_by_key: { sort_by_key } . Should be within { self .AVAILABLE_SORT_KEYS } . "
405- )
406-
407- self .profiled_actions = {}
408- self .context_names = {}
409- self .running_stack = []
410- self .profiler = None
411-
412- self .output_fname = output_filename
413- self .output_file = None
414- if local_rank is not None :
415- self .on_train_start (local_rank = local_rank )
416- self .on_train_start = super ().on_train_start
417-
418- def on_train_start (self , local_rank : Optional [str ] = None ):
419- self .local_rank = local_rank
420-
421- # when logging to `log.info`, only perform profiling on rank 0
422- if local_rank != 0 and self .output_fname is None :
423- self .wrap_functions_into_rank_zero_only ()
424-
425- if self .output_fname :
426- if local_rank is not None :
427- if '.txt' not in self .output_fname :
428- raise MisconfigurationException ("Log file should be .txt file." )
429-
430- self .output_fname = self .output_fname .replace (".txt" , f"_{ self .local_rank } .txt" )
431-
432- fs = get_filesystem (self .output_fname )
433- self .output_file = fs .open (self .output_fname , "w" )
434-
435- streaming_out = [self .output_file .write ] if self .output_file else [log .info ]
436- super ().__init__ (output_streams = streaming_out )
437-
438- def wrap_functions_into_rank_zero_only (self ):
439- self .start = rank_zero_only (self .start )
440- self .stop = rank_zero_only (self .stop )
441- self .summary = rank_zero_only (self .summary )
442- self .describe = rank_zero_only (self .describe )
443-
444- def start (self , action_name : str ) -> None :
445- if action_name not in self .profiled_functions :
446- return
447-
448- if len (self .running_stack ) > 0 :
449- self ._stop (self .running_stack [- 1 ])
450- self .running_stack .append (action_name )
451-
452- self .context_names [action_name ] = "/" .join (self .running_stack )
453-
454- self ._start (action_name )
455-
456- def _start (self , action_name : str ) -> None :
457- if self .emit_nvtx :
458- self ._create_profiler (action_name , torch .cuda .profiler .profile , enter = False )
459- self ._create_profiler (action_name , torch .autograd .profiler .emit_nvtx )
460- else :
461- self ._create_profiler (action_name , torch .autograd .profiler .profile )
462-
463- def _create_profiler (self , action_name , profiler , enter = True ):
464- init_args = inspect .signature (profiler .__init__ ).parameters
465- profiler_args = {k : v for k , v in vars (self ).items () if k in init_args }
466- pr = profiler (** profiler_args )
467- if enter :
468- pr = pr .__enter__ ()
469- self .profiler = pr
470-
471- def _stop (self , action_name : str ) -> None :
472- if self .profiler is None :
473- return
474-
475- self .profiler .__exit__ (exc_type = None , exc_val = None , exc_tb = None )
476-
477- function_events = self .profiler .function_events
478- self .profiler = None
479- for name in self .running_stack :
480- if name not in self .profiled_actions :
481- self .profiled_actions [name ] = function_events
482- else :
483- self .profiled_actions [name ] += function_events
484-
485- def stop (self , action_name : str ) -> None :
486- if action_name not in self .profiled_functions :
487- return
488-
489- if len (self .running_stack ) == 0 or self .running_stack [- 1 ] != action_name :
490- raise ValueError ( # pragma: no-cover
491- f"Attempting to stop recording an action ({ action_name } ) which was never started."
492- )
493- self ._stop (action_name )
494- self .running_stack .pop ()
495- # restore running profiler
496- if len (self .running_stack ) > 0 :
497- self ._start (self .running_stack [- 1 ])
498-
499- def summary (self ) -> str :
500- recorded_stats = {}
501- output_string = ''
502- local_rank = '0' if self .local_rank is None else self .local_rank
503-
504- if not self .enabled :
505- return output_string
506-
507- for action_name , function_events in self .profiled_actions .items ():
508-
509- # next line is a workaround for a pytorch issue (fixed on master, still present
510- # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
511- # parent event for detach`
512- function_events .populate_cpu_children = lambda : None
513-
514- if self .export_to_chrome :
515- filename = f"{ action_name } _{ local_rank } _trace.json"
516- path_to_trace = filename if self .path_to_export_trace is None \
517- else os .path .join (self .path_to_export_trace , filename )
518- function_events .export_chrome_trace (path_to_trace )
519-
520- if self .emit_nvtx :
521- return output_string
522-
523- else :
524- data = function_events .key_averages (group_by_input_shapes = self .group_by_input_shapes )
525- table = data .table (sort_by = self .sort_by_key , row_limit = self .row_limit )
526- recorded_stats [action_name ] = table
527-
528- # log to standard out
529- output_string = f"{ os .linesep } Profiler Report{ os .linesep } "
530- for action , stats in recorded_stats .items ():
531- output_string += (f"{ os .linesep } Profile stats for: { action } rank: { local_rank } { os .linesep } { stats } " )
532-
533- return output_string
534-
535- def describe (self ):
536- """Logs a profile report after the conclusion of the training run."""
537- super ().describe ()
538- if self .output_file :
539- self .output_file .flush ()
540-
541- def __del__ (self ):
542- """Close profiler's stream."""
543- if self .output_file :
544- self .output_file .close ()
0 commit comments