diff --git a/.jenkins/build.sh b/.jenkins/build.sh index d9a5b4b0d50..e398c5ab866 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -26,7 +26,7 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext # For Tensorboard. Until 1.14 moves to the release channel. pip install tb-nightly diff --git a/_static/img/perf_viz.png b/_static/img/perf_viz.png new file mode 100644 index 00000000000..85608557bcb Binary files /dev/null and b/_static/img/perf_viz.png differ diff --git a/_static/img/trace_img.png b/_static/img/trace_img.png index 172aeb1bef0..8c540ceb519 100644 Binary files a/_static/img/trace_img.png and b/_static/img/trace_img.png differ diff --git a/beginner_source/audio_preprocessing_tutorial.py b/beginner_source/audio_preprocessing_tutorial.py index 8b29b07bb42..d094ddf48af 100644 --- a/beginner_source/audio_preprocessing_tutorial.py +++ b/beginner_source/audio_preprocessing_tutorial.py @@ -26,7 +26,7 @@ ###################################################################### # Preparing data and utility functions (skip this section) # -------------------------------------------------------- -# +# #@title Prepare data and utility functions. {display-mode: "form"} #@markdown @@ -52,6 +52,8 @@ import requests import matplotlib import matplotlib.pyplot as plt +import pandas as pd +import time from IPython.display import Audio, display [width, height] = matplotlib.rcParams['figure.figsize'] @@ -117,7 +119,10 @@ def _get_sample(path, resample=None): ["remix", "1"] ] if resample: - effects.append(["rate", f'{resample}']) + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) return torchaudio.sox_effects.apply_effects_file(path, effects=effects) def get_speech_sample(*, resample=None): @@ -138,18 +143,6 @@ def get_rir_sample(*, resample=None, processed=False): def get_noise_sample(*, resample=None): return _get_sample(SAMPLE_NOISE_PATH, resample=resample) -def print_metadata(metadata, src=None): - if src: - print("-" * 10) - print("Source:", src) - print("-" * 10) - print(" - sample_rate:", metadata.sample_rate) - print(" - num_channels:", metadata.num_channels) - print(" - num_frames:", metadata.num_frames) - print(" - bits_per_sample:", metadata.bits_per_sample) - print(" - encoding:", metadata.encoding) - print() - def print_stats(waveform, sample_rate=None, src=None): if src: print("-" * 10) @@ -222,7 +215,7 @@ def inspect_file(path): print("Source:", path) print("-" * 10) print(f" - File size: {os.path.getsize(path)} bytes") - print_metadata(torchaudio.info(path)) + print(f" - {torchaudio.info(path)}") def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): fig, axs = plt.subplots(1, 1) @@ -300,38 +293,137 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): axis.legend(lns, labels, loc=0) plt.show(block=False) +DEFAULT_OFFSET = 201 +SWEEP_MAX_SAMPLE_RATE = 48000 +DEFAULT_LOWPASS_FILTER_WIDTH = 6 +DEFAULT_ROLLOFF = 0.99 +DEFAULT_RESAMPLING_METHOD = 'sinc_interpolation' + +def _get_log_freq(sample_rate, max_sweep_rate, offset): + """Get freqs evenly spaced out in log-scale, between [0, 
max_sweep_rate // 2] + + offset is used to avoid negative infinity `log(offset + x)`. + + """ + half = sample_rate // 2 + start, stop = math.log(offset), math.log(offset + max_sweep_rate // 2) + return torch.exp(torch.linspace(start, stop, sample_rate, dtype=torch.double)) - offset + +def _get_inverse_log_freq(freq, sample_rate, offset): + """Find the time where the given frequency is given by _get_log_freq""" + half = sample_rate // 2 + return sample_rate * (math.log(1 + freq / offset) / math.log(1 + half / offset)) + +def _get_freq_ticks(sample_rate, offset, f_max): + # Given the original sample rate used for generating the sweep, + # find the x-axis value where the log-scale major frequency values fall in + time, freq = [], [] + for exp in range(2, 5): + for v in range(1, 10): + f = v * 10 ** exp + if f < sample_rate // 2: + t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate + time.append(t) + freq.append(f) + t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate + time.append(t_max) + freq.append(f_max) + return time, freq + +def plot_sweep(waveform, sample_rate, title, max_sweep_rate=SWEEP_MAX_SAMPLE_RATE, offset=DEFAULT_OFFSET): + x_ticks = [100, 500, 1000, 5000, 10000, 20000, max_sweep_rate // 2] + y_ticks = [1000, 5000, 10000, 20000, sample_rate//2] + + time, freq = _get_freq_ticks(max_sweep_rate, offset, sample_rate // 2) + freq_x = [f if f in x_ticks and f <= max_sweep_rate // 2 else None for f in freq] + freq_y = [f for f in freq if f >= 1000 and f in y_ticks and f <= sample_rate // 2] + + figure, axis = plt.subplots(1, 1) + axis.specgram(waveform[0].numpy(), Fs=sample_rate) + plt.xticks(time, freq_x) + plt.yticks(freq_y, freq_y) + axis.set_xlabel('Original Signal Frequency (Hz, log scale)') + axis.set_ylabel('Waveform Frequency (Hz)') + axis.xaxis.grid(True, alpha=0.67) + axis.yaxis.grid(True, alpha=0.67) + figure.suptitle(f'{title} (sample rate: {sample_rate} Hz)') + plt.show(block=True) + +def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET): + max_sweep_rate = sample_rate + freq = _get_log_freq(sample_rate, max_sweep_rate, offset) + delta = 2 * math.pi * freq / sample_rate + cummulative = torch.cumsum(delta, dim=0) + signal = torch.sin(cummulative).unsqueeze(dim=0) + return signal + +def benchmark_resample( + method, + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH, + rolloff=DEFAULT_ROLLOFF, + resampling_method=DEFAULT_RESAMPLING_METHOD, + beta=None, + librosa_type=None, + iters=5 +): + if method == "functional": + begin = time.time() + for _ in range(iters): + F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width, + rolloff=rolloff, resampling_method=resampling_method) + elapsed = time.time() - begin + return elapsed / iters + elif method == "transforms": + resampler = T.Resample(sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width, + rolloff=rolloff, resampling_method=resampling_method, dtype=waveform.dtype) + begin = time.time() + for _ in range(iters): + resampler(waveform) + elapsed = time.time() - begin + return elapsed / iters + elif method == "librosa": + waveform_np = waveform.squeeze().numpy() + begin = time.time() + for _ in range(iters): + librosa.resample(waveform_np, sample_rate, resample_rate, res_type=librosa_type) + elapsed = time.time() - begin + return elapsed / iters + ###################################################################### # Audio I/O # ========= -# +# # torchaudio integrates ``libsox`` and provides a rich 
set of audio I/O. -# +# ###################################################################### # Quering audio metadata # ---------------------- -# +# # ``torchaudio.info`` function fetches metadata of audio. You can provide # a path-like object or file-like object. -# +# metadata = torchaudio.info(SAMPLE_WAV_PATH) -print_metadata(metadata, src=SAMPLE_WAV_PATH) +print(metadata) ###################################################################### # Where -# +# # - ``sample_rate`` is the sampling rate of the audio # - ``num_channels`` is the number of channels # - ``num_frames`` is the number of frames per channel # - ``bits_per_sample`` is bit depth # - ``encoding`` is the sample coding format -# +# # The values ``encoding`` can take are one of the following -# +# # - ``"PCM_S"``: Signed integer linear PCM # - ``"PCM_U"``: Unsigned integer linear PCM # - ``"PCM_F"``: Floating point linear PCM @@ -351,34 +443,35 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): # - ``"GSM"``: GSM-FR # [`wikipedia `__] # - ``"UNKNOWN"`` None of avobe -# +# ###################################################################### # **Note** -# +# # - ``bits_per_sample`` can be ``0`` for formats with compression and/or # variable bit rate. (such as mp3) # - ``num_frames`` can be ``0`` for GSM-FR format. -# +# metadata = torchaudio.info(SAMPLE_MP3_PATH) -print_metadata(metadata, src=SAMPLE_MP3_PATH) +print(metadata) metadata = torchaudio.info(SAMPLE_GSM_PATH) -print_metadata(metadata, src=SAMPLE_GSM_PATH) +print(metadata) ###################################################################### # Querying file-like object # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # ``info`` function works on file-like object as well. -# +# +print("Source:", SAMPLE_WAV_URL) with requests.get(SAMPLE_WAV_URL, stream=True) as response: metadata = torchaudio.info(response.raw) -print_metadata(metadata, src=SAMPLE_WAV_URL) +print(metadata) ###################################################################### @@ -387,35 +480,36 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): # Therefore, depending on the audio format, it cannot get the correct # metadata, including the format itself. The following example illustrates # this. -# +# # - Use ``format`` argument to tell what audio format it is. # - The returned metadata has ``num_frames = 0`` -# +# +print("Source:", SAMPLE_MP3_URL) with requests.get(SAMPLE_MP3_URL, stream=True) as response: metadata = torchaudio.info(response.raw, format="mp3") print(f"Fetched {response.raw.tell()} bytes.") -print_metadata(metadata, src=SAMPLE_MP3_URL) +print(metadata) ###################################################################### # Loading audio data into Tensor # ------------------------------ -# +# # To load audio data, you can use ``torchaudio.load``. -# +# # This function accepts path-like object and file-like object. -# +# # The returned value is a tuple of waveform (``Tensor``) and sample rate # (``int``). -# +# # By default, the resulting tensor object has ``dtype=torch.float32`` and # its value range is normalized within ``[-1.0, 1.0]``. -# +# # For the list of supported format, please refer to `the torchaudio # documentation `__. 
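# As a hedged aside before the basic example below (this snippet is not part of
# the original tutorial code): for WAV files, ``torchaudio.load`` also accepts a
# ``normalize`` argument. Passing ``normalize=False`` skips the conversion to
# ``float32`` in ``[-1.0, 1.0]`` and returns the raw integer PCM samples instead.

unnormalized_waveform, unnormalized_sr = torchaudio.load(SAMPLE_WAV_PATH, normalize=False)
print(unnormalized_waveform.dtype)  # e.g. torch.int16 for 16-bit signed PCM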
-# +# waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH) @@ -429,11 +523,11 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Loading from file-like object # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # ``torchaudio``\ ’s I/O functions now support file-like object. This # allows to fetch audio data and decode at the same time from the location # other than local file system. The following examples illustrates this. -# +# # Load audio data as HTTP request with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: @@ -457,22 +551,22 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Tips on slicing # ~~~~~~~~~~~~~~~ -# +# # Providing ``num_frames`` and ``frame_offset`` arguments will slice the # resulting Tensor object while decoding. -# +# # The same result can be achieved using the regular Tensor slicing, # (i.e. ``waveform[:, frame_offset:frame_offset+num_frames]``) however, # providing ``num_frames`` and ``frame_offset`` arguments is more # efficient. -# +# # This is because the function will stop data acquisition and decoding # once it finishes decoding the requested frames. This is advantageous # when the audio data are transfered via network as the data transfer will # stop as soon as the necessary amount of data is fetched. -# +# # The following example illustrates this; -# +# # Illustration of two different decoding methods. # The first one will fetch all the data and decode them, while @@ -502,26 +596,26 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Saving audio to file # -------------------- -# +# # To save audio data in the formats intepretable by common applications, # you can use ``torchaudio.save``. -# +# # This function accepts path-like object and file-like object. -# +# # When passing file-like object, you also need to provide ``format`` # argument so that the function knows which format it should be using. In # case of path-like object, the function will detemine the format based on # the extension. If you are saving to a file without extension, you need # to provide ``format`` argument. -# +# # When saving as WAV format, the default encoding for ``float32`` Tensor # is 32-bit floating-point PCM. You can provide ``encoding`` and # ``bits_per_sample`` argument to change this. For example, to save data # in 16 bit signed integer PCM, you can do the following. -# +# # **Note** Saving data in encodings with lower bit depth reduces the # resulting file size but loses precision. -# +# waveform, sample_rate = get_sample() print_stats(waveform, sample_rate=sample_rate) @@ -545,9 +639,9 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # ``torchaudio.save`` can also handle other formats. To name a few; -# +# -waveform, sample_rate = get_sample() +waveform, sample_rate = get_sample(resample=8000) formats = [ "mp3", @@ -569,11 +663,11 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Saving to file-like object # ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Similar to the other I/O functions, you can save audio into file-like # object. When saving to file-like object, ``format`` argument is # required. 
-# +# waveform, sample_rate = get_sample() @@ -585,48 +679,315 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): print(buffer_.read(16)) +###################################################################### +# Resampling +# ========== +# +# To resample an audio waveform from one freqeuncy to another, you can use +# ``transforms.Resample`` or ``functional.resample``. +# ``transforms.Resample`` precomputes and caches the kernel used for +# resampling, while ``functional.resample`` computes it on the fly, so +# using ``transforms.Resample`` will result in a speedup if resampling +# multiple waveforms using the same parameters (see Benchmarking section). +# +# Both resampling methods use `bandlimited sinc +# interpolation `__ to compute +# signal values at arbitrary time steps. The implementation involves +# convolution, so we can take advantage of GPU / multithreading for +# performance improvements. When using resampling in multiple +# subprocesses, such as data loading with multiple worker processes, your +# application might create more threads than your system can handle +# efficiently. Setting ``torch.set_num_threads(1)`` might help in this +# case. +# +# Because a finite number of samples can only represent a finite number of +# frequencies, resampling does not produce perfect results, and a variety +# of parameters can be used to control for its quality and computational +# speed. We demonstrate these properties through resampling a logarithmic +# sine sweep, which is a sine wave that increases exponentially in +# frequency over time. +# +# The spectrograms below show the frequency representation of the signal, +# where the x-axis labels correspond to the frequency of the original +# waveform (in log scale), the y-axis corresponds to the frequency of the +# plotted waveform, and the color intensity refers to amplitude. +# + +sample_rate = 48000 +resample_rate = 32000 + +waveform = get_sine_sweep(sample_rate) +plot_sweep(waveform, sample_rate, title="Original Waveform") +play_audio(waveform, sample_rate) + +resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype) +resampled_waveform = resampler(waveform) +plot_sweep(resampled_waveform, resample_rate, title="Resampled Waveform") +play_audio(waveform, sample_rate) + + +###################################################################### +# Controling resampling quality with parameters +# --------------------------------------------- +# +# Lowpass filter width +# ~~~~~~~~~~~~~~~~~~~~ +# +# Because the filter used for interpolation extends infinitely, the +# ``lowpass_filter_width`` parameter is used to control for the width of +# the filter to use to window the interpolation. It is also referred to as +# the number of zero crossings, since the interpolation passes through +# zero at every time unit. Using a larger ``lowpass_filter_width`` +# provides a sharper, more precise filter, but is more computationally +# expensive. 
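# To make the "zero crossings" intuition concrete before looking at the
# resampled spectrograms below, here is a small illustrative sketch (not part of
# the original tutorial; the 2048-point length is an arbitrary choice):
# ``torch.sinc`` crosses zero at every non-zero integer, so keeping the range
# ``[-width, width]`` keeps ``width`` zero crossings on each side of the kernel
# center before the window is applied.

def plot_windowed_sinc(width, num_points=2048):
    t = torch.linspace(-width, width, num_points)
    # windowed sinc: the ideal interpolation filter truncated by a Hann window
    kernel = torch.sinc(t) * torch.hann_window(num_points, periodic=False)
    plt.figure()
    plt.plot(t.numpy(), kernel.numpy())
    plt.title(f"Windowed sinc, lowpass_filter_width={width}")
    plt.show(block=False)

plot_windowed_sinc(6)
plot_windowed_sinc(128)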
+# + +sample_rate = 48000 +resample_rate = 32000 + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=6) +plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=6") + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=128) +plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=128") + + +###################################################################### +# Rolloff +# ~~~~~~~ +# +# The ``rolloff`` parameter is represented as a fraction of the Nyquist +# frequency, which is the maximal frequency representable by a given +# finite sample rate. ``rolloff`` determines the lowpass filter cutoff and +# controls the degree of aliasing, which takes place when frequencies +# higher than the Nyquist are mapped to lower frequencies. A lower rolloff +# will therefore reduce the amount of aliasing, but it will also reduce +# some of the higher frequencies. +# + +sample_rate = 48000 +resample_rate = 32000 + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.99) +plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.99") + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.8) +plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8") + + +###################################################################### +# Window function +# ~~~~~~~~~~~~~~~ +# +# By default, torchaudio’s resample uses the Hann window filter, which is +# a weighted cosine function. It additionally supports the Kaiser window, +# which is a near optimal window function that contains an additional +# ``beta`` parameter that allows for the design of the smoothness of the +# filter and width of impulse. This can be controlled using the +# ``resampling_method`` parameter. 
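# In addition to selecting the window via ``resampling_method`` (compared in the
# next cell), the Kaiser window's ``beta`` parameter can be passed explicitly.
# The value below is arbitrary and for illustration only; this snippet is an
# aside, not part of the original tutorial.

resampled_waveform = F.resample(
    waveform, sample_rate, resample_rate,
    resampling_method="kaiser_window", beta=8.0)
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window (beta=8.0)")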
+# + +sample_rate = 48000 +resample_rate = 32000 + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation") +plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default") + +resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window") +plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default") + + +###################################################################### +# Comparison against librosa +# -------------------------- +# +# torchaudio’s resample function can be used to produce results similar to +# that of librosa (resampy)’s kaiser window resampling, with some noise +# + +sample_rate = 48000 +resample_rate = 32000 + +### kaiser_best +resampled_waveform = F.resample( + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492 +) +plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)") + +librosa_resampled_waveform = torch.from_numpy( + librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_best')).unsqueeze(0) +plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Best (librosa)") + +mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item() +print("torchaudio and librosa kaiser best MSE:", mse) + +### kaiser_fast +resampled_waveform = F.resample( + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386 +) +plot_specgram(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)") + +librosa_resampled_waveform = torch.from_numpy( + librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_fast')).unsqueeze(0) +plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Fast (librosa)") + +mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item() +print("torchaudio and librosa kaiser fast MSE:", mse) + + +###################################################################### +# Performance Benchmarking +# ------------------------ +# +# Below are benchmarks for downsampling and upsampling waveforms between +# two pairs of sampling rates. We demonstrate the performance implications +# that the ``lowpass_filter_wdith``, window type, and sample rates can +# have. Additionally, we provide a comparison against ``librosa``\ ’s +# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters +# in ``torchaudio``. +# +# To elaborate on the results: +# - a larger ``lowpass_filter_width`` results in a larger resampling kernel, +# and therefore increases computation time for both the kernel computation +# and convolution +# - using ``kaiser_window`` results in longer computation times than the default +# ``sinc_interpolation`` because it is more complex to compute the intermediate +# window values - a large GCD between the sample and resample rate will result +# in a simplification that allows for a smaller kernel and faster kernel computation. 
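# A quick illustration of the GCD point above (an aside, not part of the
# original benchmark code): the resampling kernel effectively works on the rate
# ratio reduced by the greatest common divisor, so rate pairs that share a large
# divisor need far fewer distinct filter phases. ``math.gcd`` is from the
# standard library.

for orig_freq, new_freq in [(48000, 44100), (16000, 8000), (44100, 48000), (8000, 16000)]:
    g = math.gcd(orig_freq, new_freq)
    print(f"{orig_freq} -> {new_freq}: gcd={g}, reduced ratio {orig_freq // g}:{new_freq // g}")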
+# + +configs = { + "downsample (48 -> 44.1 kHz)": [48000, 44100], + "downsample (16 -> 8 kHz)": [16000, 8000], + "upsample (44.1 -> 48 kHz)": [44100, 48000], + "upsample (8 -> 16 kHz)": [8000, 1600], +} + +for label in configs: + times, rows = [], [] + sample_rate = configs[label][0] + resample_rate = configs[label][1] + waveform = get_sine_sweep(sample_rate) + + # sinc 64 zero-crossings + f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64) + t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64) + times.append([None, 1000 * f_time, 1000 * t_time]) + rows.append(f"sinc (width 64)") + + # sinc 6 zero-crossings + f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16) + t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16) + times.append([None, 1000 * f_time, 1000 * t_time]) + rows.append(f"sinc (width 16)") + + # kaiser best + lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best") + f_time = benchmark_resample( + "functional", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492) + t_time = benchmark_resample( + "transforms", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492) + times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) + rows.append(f"kaiser_best") + + # kaiser fast + lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast") + f_time = benchmark_resample( + "functional", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386) + t_time = benchmark_resample( + "transforms", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386) + times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) + rows.append(f"kaiser_fast") + + df = pd.DataFrame(times, + columns=["librosa", "functional", "transforms"], + index=rows) + df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"],df.columns]) + display(df.round(2)) + + ###################################################################### # Data Augmentation # ================= -# +# # ``torchaudio`` provides a variety of ways to augment audio data. -# +# ###################################################################### # Applying effects and filtering # ------------------------------ -# +# # ``torchaudio.sox_effects`` module provides ways to apply filiters like # ``sox`` command on Tensor objects and file-object audio sources # directly. -# +# # There are two functions for this; -# +# # - ``torchaudio.sox_effects.apply_effects_tensor`` for applying effects # on Tensor # - ``torchaudio.sox_effects.apply_effects_file`` for applying effects on # other audio source -# +# # Both function takes effects in the form of ``List[List[str]]``. This # mostly corresponds to how ``sox`` command works, but one caveat is that # ``sox`` command adds some effects automatically, but torchaudio’s # implementation does not do that. -# +# # For the list of available effects, please refer to `the sox # documentation `__. 
-# +# # **Tip** If you need to load and resample your audio data on-the-fly, # then you can use ``torchaudio.sox_effects.apply_effects_file`` with # ``"rate"`` effect. -# +# # **Note** ``apply_effects_file`` accepts file-like object or path-like # object. Similar to ``torchaudio.load``, when the audio format cannot be # detected from either file extension or header, you can provide # ``format`` argument to tell what format the audio source is. -# +# # **Note** This process is not differentiable. -# +# # Load the data waveform1, sample_rate1 = get_sample(resample=16000) @@ -635,7 +996,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): effects = [ ["lowpass", "-1", "300"], # apply single-pole lowpass filter ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to + # This only changes sample rate, so it is necessary to # add `rate` effect with original sample rate after this. ["rate", f"{sample_rate1}"], ["reverb", "-w"], # Reverbration gives some dramatic feeling @@ -656,7 +1017,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): # Note that the number of frames and number of channels are different from # the original after the effects. Let’s listen to the audio. Doesn’t it # sound more dramatic? -# +# plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) play_audio(waveform1, sample_rate1) @@ -667,19 +1028,19 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Simulating room reverbration # ---------------------------- -# +# # `Convolution # reverb `__ is a # technique used to make a clean audio data sound like in a different # environment. -# +# # Using Room Impulse Response (RIR), we can make a clean speech sound like # uttered in a conference room. -# +# # For this process, we need RIR data. The following data are from VOiCES # dataset, but you can record one by your self. Just turn on microphone # and clap you hands. -# +# sample_rate = 8000 @@ -693,7 +1054,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # First, we need to clean up the RIR. We extract the main impulse, # normalize the signal power, then flip the time axis. -# +# rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] rir = rir / torch.norm(rir, p=2) @@ -706,7 +1067,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Then we convolve the speech signal with the RIR filter. -# +# speech, _ = get_speech_sample(resample=sample_rate) @@ -726,22 +1087,22 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Adding background noise # ----------------------- -# +# # To add background noise to audio data, you can simply add audio Tensor # and noise Tensor. A commonly way to adjust the intensity of noise is to # change Signal-to-Noise Ratio (SNR). # [`wikipedia `__] -# +# # .. math:: -# -# +# +# # \mathrm{SNR} = \frac{P_\mathrm{signal}}{P_\mathrm{noise}} -# +# # .. 
math:: -# -# +# +# # {\mathrm {SNR_{{dB}}}}=10\log _{{10}}\left({\mathrm {SNR}}\right) -# +# sample_rate = 8000 speech, _ = get_speech_sample(resample=sample_rate) @@ -769,11 +1130,11 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Applying codec to Tensor object # ------------------------------- -# +# # ``torchaudio.functional.apply_codec`` can apply codecs to Tensor object. -# +# # **Note** This process is not differentiable. -# +# waveform, sample_rate = get_speech_sample(resample=8000) @@ -796,11 +1157,11 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Simulating a phone recoding # --------------------------- -# +# # Combining the previous techniques, we can simulate audio that sounds # like a person talking over a phone in a echoey room with people talking # in the background. -# +# sample_rate = 16000 speech, _ = get_speech_sample(resample=sample_rate) @@ -817,7 +1178,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): play_audio(speech, sample_rate) # Add background noise -# Because the noise is recorded in the actual environment, we consider that +# Because the noise is recorded in the actual environment, we consider that # the noise contains the acoustic feature of the environment. Therefore, we add # the noise after RIR application. noise, _ = get_noise_sample(resample=sample_rate) @@ -855,34 +1216,34 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Feature Extractions # =================== -# +# # ``torchaudio`` implements feature extractions commonly used in audio # domain. They are available in ``torchaudio.functional`` and # ``torchaudio.transforms``. -# +# # ``functional`` module implements features as a stand alone functions. # They are stateless. -# +# # ``transforms`` module implements features in object-oriented manner, # using implementations from ``functional`` and ``torch.nn.Module``. -# +# # Because all the transforms are subclass of ``torch.nn.Module``, they can # be serialized using TorchScript. -# +# # For the complete list of available features, please refer to the # documentation. In this tutorial, we will look into conversion between # time domain and frequency domain (``Spectrogram``, ``GriffinLim``, # ``MelSpectrogram``) and augmentation technique called SpecAugment. -# +# ###################################################################### # Spectrogram # ----------- -# +# # To get the frequency representation of audio signal, you can use # ``Spectrogram`` transform. -# +# waveform, sample_rate = get_speech_sample() @@ -910,9 +1271,9 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # GriffinLim # ---------- -# +# # To recover a waveform from spectrogram, you can use ``GriffinLim``. -# +# torch.random.manual_seed(0) waveform, sample_rate = get_speech_sample() @@ -944,13 +1305,13 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Mel Filter Bank # --------------- -# +# # ``torchaudio.functional.create_fb_matrix`` can generate the filter bank # to convert frequency bins to Mel-scale bins. -# +# # Since this function does not require input audio/features, there is no # equivalent transform in ``torchaudio.transforms``. 
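# To make concrete what the filter bank below is used for (an illustrative
# sketch, not part of the original tutorial; all shapes here are arbitrary):
# converting a linear-frequency spectrogram to the mel scale is a matrix
# multiplication over the frequency axis with the generated filter bank.

demo_fb = F.create_fb_matrix(n_freqs=129, f_min=0., f_max=4000., n_mels=64, sample_rate=8000)
demo_spec = torch.rand(129, 100)  # (freq, time), e.g. from a Spectrogram transform
demo_melspec = torch.matmul(demo_spec.transpose(-1, -2), demo_fb).transpose(-1, -2)
print(demo_melspec.shape)  # torch.Size([64, 100]) -> (mel, time)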
-# +# n_fft = 256 n_mels = 64 @@ -971,13 +1332,10 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Comparison against librosa # ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # As a comparison, here is the equivalent way to get the mel filter bank # with ``librosa``. -# -# **Note** Currently, the result matches only when ``htk=True``. -# ``torchaudio`` does not support the equivalent of ``htk=False`` option. -# +# mel_filters_librosa = librosa.filters.mel( sample_rate, @@ -998,11 +1356,11 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # MelSpectrogram # -------------- -# +# # Mel-scale spectrogram is a combination of Spectrogram and mel scale # conversion. In ``torchaudio``, there is a transform ``MelSpectrogram`` # which is composed of ``Spectrogram`` and ``MelScale``. -# +# waveform, sample_rate = get_speech_sample() @@ -1022,6 +1380,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): norm='slaney', onesided=True, n_mels=n_mels, + mel_scale="htk", ) melspec = mel_spectrogram(waveform) @@ -1033,13 +1392,10 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Comparison against librosa # ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # As a comparison, here is the equivalent way to get Mel-scale spectrogram # with ``librosa``. -# -# **Note** Currently, the result matches only when ``htk=True``. -# ``torchaudio`` does not support the equivalent of ``htk=False`` option. -# +# melspec_librosa = librosa.feature.melspectrogram( waveform.numpy()[0], @@ -1064,7 +1420,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # MFCC # ---- -# +# waveform, sample_rate = get_speech_sample() @@ -1076,7 +1432,14 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): mfcc_transform = T.MFCC( sample_rate=sample_rate, - n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length}) + n_mfcc=n_mfcc, + melkwargs={ + 'n_fft': n_fft, + 'n_mels': n_mels, + 'hop_length': hop_length, + 'mel_scale': 'htk', + } +) mfcc = mfcc_transform(waveform) @@ -1087,7 +1450,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Comparing against librosa # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# melspec = librosa.feature.melspectrogram( y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft, @@ -1107,7 +1470,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Pitch # ----- -# +# waveform, sample_rate = get_speech_sample() @@ -1119,22 +1482,22 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Kaldi Pitch (beta) # ------------------ -# +# # Kaldi Pitch feature [1] is pitch detection mechanism tuned for ASR # application. This is a beta feature in torchaudio, and only # ``functional`` form is available. -# +# # 1. A pitch extraction algorithm tuned for automatic speech recognition -# +# # Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. # Khudanpur -# +# # 2014 IEEE International Conference on Acoustics, Speech and Signal # Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi: # 10.1109/ICASSP.2014.6854049. 
# [`abstract `__], # [`paper `__] -# +# waveform, sample_rate = get_speech_sample(resample=16000) @@ -1148,25 +1511,25 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Feature Augmentation # ==================== -# +# ###################################################################### # SpecAugment # ----------- -# +# # `SpecAugment `__ # is a popular augmentation technique applied on spectrogram. -# +# # ``torchaudio`` implements ``TimeStrech``, ``TimeMasking`` and # ``FrequencyMasking``. -# +# ###################################################################### # TimeStrech # ~~~~~~~~~~ -# +# spec = get_spectrogram(power=None) strech = T.TimeStretch() @@ -1185,7 +1548,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # TimeMasking # ~~~~~~~~~~~ -# +# torch.random.manual_seed(4) @@ -1201,7 +1564,7 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # FrequencyMasking # ~~~~~~~~~~~~~~~~ -# +# torch.random.manual_seed(4) @@ -1217,13 +1580,13 @@ def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): ###################################################################### # Datasets # ======== -# +# # ``torchaudio`` provides easy access to common, publicly accessible # datasets. Please checkout the official documentation for the list of # available datasets. -# +# # Here, we take ``YESNO`` dataset and look into how to use it. -# +# YESNO_DOWNLOAD_PROCESS.join() diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py index 2dd88929821..a556192bab0 100644 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ b/beginner_source/text_sentiment_ngrams_tutorial.py @@ -49,32 +49,35 @@ # # We have revisited the very basic components of the torchtext library, including vocab, word vectors, tokenizer. Those are the basic data processing building blocks for raw text string. # -# Here is an example for typical NLP data processing with tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. Users can have a customized vocab by setting up arguments in the constructor of the Vocab class. For example, the minimum frequency ``min_freq`` for the tokens to be included. +# Here is an example for typical NLP data processing with tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. Here we use built in +# factory function `build_vocab_from_iterator` which accepts iterator that yield list or iterator of tokens. Users can also pass any special symbols to be added to the +# vocabulary. from torchtext.data.utils import get_tokenizer -from collections import Counter -from torchtext.vocab import Vocab +from torchtext.vocab import build_vocab_from_iterator tokenizer = get_tokenizer('basic_english') train_iter = AG_NEWS(split='train') -counter = Counter() -for (label, line) in train_iter: - counter.update(tokenizer(line)) -vocab = Vocab(counter, min_freq=1) +def yield_tokens(data_iter): + for _, text in data_iter: + yield tokenizer(text) + +vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=[""]) +vocab.set_default_index(vocab[""]) ###################################################################### # The vocabulary block converts a list of tokens into integers. 
# # :: # -# [vocab[token] for token in ['here', 'is', 'an', 'example']] -# >>> [476, 22, 31, 5298] +# vocab(['here', 'is', 'an', 'example']) +# >>> [475, 21, 30, 5286] # # Prepare the text processing pipeline with the tokenizer and vocabulary. The text and label pipelines will be used to process the raw data strings from the dataset iterators. -text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)] +text_pipeline = lambda x: vocab(tokenizer(x)) label_pipeline = lambda x: int(x) - 1 @@ -246,6 +249,7 @@ def evaluate(dataloader): from torch.utils.data.dataset import random_split +from torchtext.data.functional import to_map_style_dataset # Hyperparameters EPOCHS = 10 # epoch LR = 5 # learning rate @@ -256,8 +260,8 @@ def evaluate(dataloader): scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) total_accu = None train_iter, test_iter = AG_NEWS() -train_dataset = list(train_iter) -test_dataset = list(test_iter) +train_dataset = to_map_style_dataset(train_iter) +test_dataset = to_map_style_dataset(test_iter) num_train = int(len(train_dataset) * 0.95) split_train_, split_valid_ = \ random_split(train_dataset, [num_train, len(train_dataset) - num_train]) @@ -285,72 +289,6 @@ def evaluate(dataloader): print('-' * 59) -###################################################################### -# Running the model on GPU with the following printout: -# -# :: -# -# | epoch 1 | 500/ 1782 batches | accuracy 0.684 -# | epoch 1 | 1000/ 1782 batches | accuracy 0.852 -# | epoch 1 | 1500/ 1782 batches | accuracy 0.877 -# ----------------------------------------------------------- -# | end of epoch 1 | time: 8.33s | valid accuracy 0.867 -# ----------------------------------------------------------- -# | epoch 2 | 500/ 1782 batches | accuracy 0.895 -# | epoch 2 | 1000/ 1782 batches | accuracy 0.900 -# | epoch 2 | 1500/ 1782 batches | accuracy 0.903 -# ----------------------------------------------------------- -# | end of epoch 2 | time: 8.18s | valid accuracy 0.890 -# ----------------------------------------------------------- -# | epoch 3 | 500/ 1782 batches | accuracy 0.914 -# | epoch 3 | 1000/ 1782 batches | accuracy 0.914 -# | epoch 3 | 1500/ 1782 batches | accuracy 0.916 -# ----------------------------------------------------------- -# | end of epoch 3 | time: 8.20s | valid accuracy 0.897 -# ----------------------------------------------------------- -# | epoch 4 | 500/ 1782 batches | accuracy 0.926 -# | epoch 4 | 1000/ 1782 batches | accuracy 0.924 -# | epoch 4 | 1500/ 1782 batches | accuracy 0.921 -# ----------------------------------------------------------- -# | end of epoch 4 | time: 8.18s | valid accuracy 0.895 -# ----------------------------------------------------------- -# | epoch 5 | 500/ 1782 batches | accuracy 0.938 -# | epoch 5 | 1000/ 1782 batches | accuracy 0.935 -# | epoch 5 | 1500/ 1782 batches | accuracy 0.937 -# ----------------------------------------------------------- -# | end of epoch 5 | time: 8.16s | valid accuracy 0.902 -# ----------------------------------------------------------- -# | epoch 6 | 500/ 1782 batches | accuracy 0.939 -# | epoch 6 | 1000/ 1782 batches | accuracy 0.939 -# | epoch 6 | 1500/ 1782 batches | accuracy 0.938 -# ----------------------------------------------------------- -# | end of epoch 6 | time: 8.16s | valid accuracy 0.906 -# ----------------------------------------------------------- -# | epoch 7 | 500/ 1782 batches | accuracy 0.941 -# | epoch 7 | 1000/ 1782 batches | accuracy 0.939 -# | epoch 7 | 1500/ 1782 batches | 
accuracy 0.939 -# ----------------------------------------------------------- -# | end of epoch 7 | time: 8.19s | valid accuracy 0.903 -# ----------------------------------------------------------- -# | epoch 8 | 500/ 1782 batches | accuracy 0.942 -# | epoch 8 | 1000/ 1782 batches | accuracy 0.941 -# | epoch 8 | 1500/ 1782 batches | accuracy 0.942 -# ----------------------------------------------------------- -# | end of epoch 8 | time: 8.16s | valid accuracy 0.904 -# ----------------------------------------------------------- -# | epoch 9 | 500/ 1782 batches | accuracy 0.942 -# | epoch 9 | 1000/ 1782 batches | accuracy 0.941 -# | epoch 9 | 1500/ 1782 batches | accuracy 0.942 -# ----------------------------------------------------------- -# end of epoch 9 | time: 8.16s | valid accuracy 0.904 -# ----------------------------------------------------------- -# | epoch 10 | 500/ 1782 batches | accuracy 0.940 -# | epoch 10 | 1000/ 1782 batches | accuracy 0.942 -# | epoch 10 | 1500/ 1782 batches | accuracy 0.942 -# ----------------------------------------------------------- -# | end of epoch 10 | time: 8.15s | valid accuracy 0.904 -# ----------------------------------------------------------- - ###################################################################### # Evaluate the model with test dataset @@ -366,12 +304,7 @@ def evaluate(dataloader): accu_test = evaluate(test_dataloader) print('test accuracy {:8.3f}'.format(accu_test)) -################################################ -# -# :: -# -# test accuracy 0.906 -# + ###################################################################### @@ -409,10 +342,3 @@ def predict(text, text_pipeline): print("This is a %s news" %ag_news_label[predict(ex_text_str, text_pipeline)]) - -################################################ -# -# :: -# -# This is a Sports news -# diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py index 680e9dc4b62..81a25c9b5c9 100644 --- a/beginner_source/transformer_tutorial.py +++ b/beginner_source/transformer_tutorial.py @@ -1,10 +1,10 @@ """ -Sequence-to-Sequence Modeling with nn.Transformer and TorchText +Language Modeling with nn.Transformer and TorchText =============================================================== This is a tutorial on how to train a sequence-to-sequence model that uses the -`nn.Transformer `__ module. +`nn.Transformer `__ module. PyTorch 1.2 release includes a standard transformer module based on the paper `Attention is All You @@ -12,9 +12,9 @@ has been proved to be superior in quality for many sequence-to-sequence problems while being more parallelizable. The ``nn.Transformer`` module relies entirely on an attention mechanism (another module recently -implemented as `nn.MultiheadAttention `__) to draw global dependencies +implemented as `nn.MultiheadAttention `__) to draw global dependencies between input and output. The ``nn.Transformer`` module is now highly -modularized such that a single component (like `nn.TransformerEncoder `__ +modularized such that a single component (like `nn.TransformerEncoder `__ in this tutorial) can be easily adapted/composed. .. image:: ../_static/img/transformer_architecture.jpg @@ -35,7 +35,7 @@ # layer first, followed by a positional encoding layer to account for the order # of the word (see the next paragraph for more details). The # ``nn.TransformerEncoder`` consists of multiple layers of -# `nn.TransformerEncoderLayer `__. Along with the input sequence, a square +# `nn.TransformerEncoderLayer `__. 
Along with the input sequence, a square # attention mask is required because the self-attention layers in # ``nn.TransformerEncoder`` are only allowed to attend the earlier positions in # the sequence. For the language modeling task, any tokens on the future @@ -144,23 +144,18 @@ def forward(self, x): # efficient batch processing. # -import io import torch from torchtext.datasets import WikiText2 from torchtext.data.utils import get_tokenizer -from collections import Counter -from torchtext.vocab import Vocab +from torchtext.vocab import build_vocab_from_iterator train_iter = WikiText2(split='train') tokenizer = get_tokenizer('basic_english') -counter = Counter() -for line in train_iter: - counter.update(tokenizer(line)) -vocab = Vocab(counter) +vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=[""]) +vocab.set_default_index(vocab[""]) def data_process(raw_text_iter): - data = [torch.tensor([vocab[token] for token in tokenizer(item)], - dtype=torch.long) for item in raw_text_iter] + data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) train_iter, val_iter, test_iter = WikiText2() @@ -225,7 +220,7 @@ def get_batch(source, i): # equal to the length of the vocab object. # -ntokens = len(vocab.stoi) # the size of vocabulary +ntokens = len(vocab) # the size of vocabulary emsize = 200 # embedding dimension nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 49b37b1f564..bb5ac9339c5 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -148,27 +148,24 @@ def forward(self, x): # efficient batch processing. # -import io import torch -from torchtext.utils import download_from_url, extract_archive +from torchtext.datasets import WikiText2 from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator -url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip' -test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url)) +train_iter = WikiText2(split='train') tokenizer = get_tokenizer('basic_english') -vocab = build_vocab_from_iterator(map(tokenizer, - iter(io.open(train_filepath, - encoding="utf8")))) +vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=[""]) +vocab.set_default_index(vocab[""]) def data_process(raw_text_iter): - data = [torch.tensor([vocab[token] for token in tokenizer(item)], - dtype=torch.long) for item in raw_text_iter] + data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) -train_data = data_process(iter(io.open(train_filepath, encoding="utf8"))) -val_data = data_process(iter(io.open(valid_filepath, encoding="utf8"))) -test_data = data_process(iter(io.open(test_filepath, encoding="utf8"))) +train_iter, val_iter, test_iter = WikiText2() +train_data = data_process(train_iter) +val_data = data_process(val_iter) +test_data = data_process(test_iter) device = torch.device("cuda") @@ -244,7 +241,7 @@ def get_batch(source, i): # allows the Pipe to work with only two partitions and avoid any # cross-partition overheads. 
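# A short aside on the vocabulary API change applied just below (illustrative,
# not part of the tutorial itself): the vocabulary returned by
# ``build_vocab_from_iterator`` is sized with ``len(vocab)`` and is called
# directly on a list of tokens, replacing the older ``vocab.stoi`` /
# ``vocab[token]`` pattern, for example:
#
#     len(vocab)                  # vocabulary size (was len(vocab.stoi))
#     vocab(tokenizer("a line"))  # list of token ids (was [vocab[t] for t in ...])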
-ntokens = len(vocab.stoi) # the size of vocabulary +ntokens = len(vocab) # the size of vocabulary emsize = 4096 # embedding dimension nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder nlayers = 12 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder @@ -330,7 +327,7 @@ def train(): model.train() # Turn on the train mode total_loss = 0. start_time = time.time() - ntokens = len(vocab.stoi) + ntokens = len(vocab) # Train only for 50 batches to keep script execution time low. nbatches = min(50 * bptt, train_data.size(0) - 1) @@ -366,7 +363,7 @@ def train(): def evaluate(eval_model, data_source): eval_model.eval() # Turn on the evaluation mode total_loss = 0. - ntokens = len(vocab.stoi) + ntokens = len(vocab) # Evaluate only for 50 batches to keep script execution time low. nbatches = min(50 * bptt, data_source.size(0) - 1) with torch.no_grad(): @@ -418,39 +415,3 @@ def evaluate(eval_model, data_source): print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( test_loss, math.exp(test_loss))) print('=' * 89) - - -###################################################################### -# Output -# ------ -# - - -###################################################################### -#.. code-block:: py -# -# Total parameters in model: 1,847,087,215 -# | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 2387.45 | loss 42.16 | ppl 2036775646369743616.00 -# | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 2150.93 | loss 48.24 | ppl 891334049215401558016.00 -# | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 2155.23 | loss 34.66 | ppl 1125676483188404.62 -# | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 2158.42 | loss 38.87 | ppl 76287208340888368.00 -# ----------------------------------------------------------------------------------------- -# | end of epoch 1 | time: 119.65s | valid loss 2.95 | valid ppl 19.15 -# ----------------------------------------------------------------------------------------- -# | epoch 2 | 10/ 50 batches | lr 4.51 | ms/batch 2376.16 | loss 34.92 | ppl 1458001430957104.00 -# | epoch 2 | 20/ 50 batches | lr 4.51 | ms/batch 2160.96 | loss 34.75 | ppl 1232463826541886.50 -# | epoch 2 | 30/ 50 batches | lr 4.51 | ms/batch 2160.66 | loss 28.10 | ppl 1599598251136.51 -# | epoch 2 | 40/ 50 batches | lr 4.51 | ms/batch 2160.07 | loss 20.25 | ppl 621174306.77 -# ----------------------------------------------------------------------------------------- -# | end of epoch 2 | time: 119.76s | valid loss 0.87 | valid ppl 2.38 -# ----------------------------------------------------------------------------------------- -# | epoch 3 | 10/ 50 batches | lr 4.29 | ms/batch 2376.49 | loss 13.20 | ppl 537727.23 -# | epoch 3 | 20/ 50 batches | lr 4.29 | ms/batch 2160.12 | loss 10.98 | ppl 58548.58 -# | epoch 3 | 30/ 50 batches | lr 4.29 | ms/batch 2160.05 | loss 12.01 | ppl 164152.79 -# | epoch 3 | 40/ 50 batches | lr 4.29 | ms/batch 2160.03 | loss 10.63 | ppl 41348.00 -# ----------------------------------------------------------------------------------------- -# | end of epoch 3 | time: 119.76s | valid loss 0.78 | valid ppl 2.17 -# ----------------------------------------------------------------------------------------- -# ========================================================================================= -# | End of training | test loss 0.69 | test ppl 1.99 -# ========================================================================================= diff --git a/prototype_source/numeric_suite_tutorial.py 
b/prototype_source/numeric_suite_tutorial.py index df386f4efd2..35052f4b2f4 100644 --- a/prototype_source/numeric_suite_tutorial.py +++ b/prototype_source/numeric_suite_tutorial.py @@ -168,7 +168,7 @@ def forward(self, x): # And then we can pass this logger into above APIs such as: data = img_data[0][0] -act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data, Logger=MyOutputLogger) +act_compare_dict = ns.compare_model_outputs(float_model, qmodel, data, logger_cls=MyOutputLogger) ############################################################################## # or: @@ -260,7 +260,7 @@ def forward(self, x, y): # And then we can pass this logger into above APIs such as: data = img_data[0][0] -ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data, Logger=MyShadowLogger) +ob_dict = ns.compare_model_stub(float_model, qmodel, module_swap_list, data, logger_cls=MyShadowLogger) ############################################################################## # or: diff --git a/recipes_source/recipes/profiler_recipe.py b/recipes_source/recipes/profiler_recipe.py index 4c400107633..d399c90ca7e 100644 --- a/recipes_source/recipes/profiler_recipe.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -30,9 +30,12 @@ # # 1. Import all necessary libraries # 2. Instantiate a simple Resnet model -# 3. Use profiler to analyze execution time -# 4. Use profiler to analyze memory consumption +# 3. Using profiler to analyze execution time +# 4. Using profiler to analyze memory consumption # 5. Using tracing functionality +# 6. Examining stack traces +# 7. Visualizing data as a flamegraph +# 8. Using profiler to analyze long-running jobs # # 1. Import all necessary libraries # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -43,7 +46,7 @@ import torch import torchvision.models as models -import torch.autograd.profiler as profiler +from torch.profiler import profile, record_function, ProfilerActivity ###################################################################### @@ -58,27 +61,36 @@ inputs = torch.randn(5, 3, 224, 224) ###################################################################### -# 3. Use profiler to analyze execution time +# 3. Using profiler to analyze execution time # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # PyTorch profiler is enabled through the context manager and accepts # a number of parameters, some of the most useful are: # +# - ``activities`` - a list of activities to profile: +# - ``ProfilerActivity.CPU`` - PyTorch operators, TorchScript functions and +# user-defined code labels (see ``record_function`` below); +# - ``ProfilerActivity.CUDA`` - on-device CUDA kernels; # - ``record_shapes`` - whether to record shapes of the operator inputs; # - ``profile_memory`` - whether to report amount of memory consumed by # model's Tensors; # - ``use_cuda`` - whether to measure execution time of CUDA kernels. # +# Note: when using CUDA, profiler also shows the runtime CUDA events +# occuring on the host. 
+ +###################################################################### # Let's see how we can use profiler to analyze the execution time: -with profiler.profile(record_shapes=True) as prof: - with profiler.record_function("model_inference"): +with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: + with record_function("model_inference"): model(inputs) ###################################################################### # Note that we can use ``record_function`` context manager to label # arbitrary code ranges with user provided names # (``model_inference`` is used as a label in the example above). +# # Profiler allows one to check which operators were called during the # execution of a code range wrapped with a profiler context manager. # If multiple profiler ranges are active at the same time (e.g. in @@ -95,50 +107,96 @@ ###################################################################### # The output will look like (omitting some columns): -# ------------------------- -------------- ---------- ------------ --------- -# Name Self CPU total CPU total CPU time avg # Calls -# ------------------------- -------------- ---------- ------------ --------- -# model_inference 3.541ms 69.571ms 69.571ms 1 -# conv2d 69.122us 40.556ms 2.028ms 20 -# convolution 79.100us 40.487ms 2.024ms 20 -# _convolution 349.533us 40.408ms 2.020ms 20 -# mkldnn_convolution 39.822ms 39.988ms 1.999ms 20 -# batch_norm 105.559us 15.523ms 776.134us 20 -# _batch_norm_impl_index 103.697us 15.417ms 770.856us 20 -# native_batch_norm 9.387ms 15.249ms 762.471us 20 -# max_pool2d 29.400us 7.200ms 7.200ms 1 -# max_pool2d_with_indices 7.154ms 7.170ms 7.170ms 1 -# ------------------------- -------------- ---------- ------------ --------- +# --------------------------------- ------------ ------------ ------------ ------------ +# Name Self CPU CPU total CPU time avg # of Calls +# --------------------------------- ------------ ------------ ------------ ------------ +# model_inference 5.509ms 57.503ms 57.503ms 1 +# aten::conv2d 231.000us 31.931ms 1.597ms 20 +# aten::convolution 250.000us 31.700ms 1.585ms 20 +# aten::_convolution 336.000us 31.450ms 1.573ms 20 +# aten::mkldnn_convolution 30.838ms 31.114ms 1.556ms 20 +# aten::batch_norm 211.000us 14.693ms 734.650us 20 +# aten::_batch_norm_impl_index 319.000us 14.482ms 724.100us 20 +# aten::native_batch_norm 9.229ms 14.109ms 705.450us 20 +# aten::mean 332.000us 2.631ms 125.286us 21 +# aten::select 1.668ms 2.292ms 8.988us 255 +# --------------------------------- ------------ ------------ ------------ ------------ +# Self CPU time total: 57.549ms ###################################################################### # Here we see that, as expected, most of the time is spent in convolution (and specifically in ``mkldnn_convolution`` # for PyTorch compiled with MKL-DNN support). # Note the difference between self cpu time and cpu time - operators can call other operators, self cpu time exludes time -# spent in children operator calls, while total cpu time includes it. +# spent in children operator calls, while total cpu time includes it. You can choose to sort by the self cpu time by passing +# ``sort_by="self_cpu_time_total"`` into the ``table`` call. 
 #
-# To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True``:
+# To get a finer granularity of results and include operator input shapes, pass ``group_by_input_shape=True``
+# (note: this requires running the profiler with ``record_shapes=True``):

print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

# (omitting some columns)
-# -------------------------  -----------  --------  -------------------------------------
-# Name                       CPU total    # Calls   Input Shapes
-# -------------------------  -----------  --------  -------------------------------------
-# model_inference            69.571ms     1         []
-# conv2d                     9.019ms      4         [[5, 64, 56, 56], [64, 64, 3, 3], []]
-# convolution                9.006ms      4         [[5, 64, 56, 56], [64, 64, 3, 3], []]
-# _convolution               8.982ms      4         [[5, 64, 56, 56], [64, 64, 3, 3], []]
-# mkldnn_convolution         8.894ms      4         [[5, 64, 56, 56], [64, 64, 3, 3], []]
-# max_pool2d                 7.200ms      1         [[5, 64, 112, 112]]
-# conv2d                     7.189ms      3         [[5, 512, 7, 7], [512, 512, 3, 3], []]
-# convolution                7.180ms      3         [[5, 512, 7, 7], [512, 512, 3, 3], []]
-# _convolution               7.171ms      3         [[5, 512, 7, 7], [512, 512, 3, 3], []]
-# max_pool2d_with_indices    7.170ms      1         [[5, 64, 112, 112]]
-# -------------------------  -----------  --------  --------------------------------------
-
-
-######################################################################
-# 4. Use profiler to analyze memory consumption
+# ---------------------------------  ------------  -------------------------------------------
+# Name                               CPU total     Input Shapes
+# ---------------------------------  ------------  -------------------------------------------
+# model_inference                    57.503ms      []
+# aten::conv2d                       8.008ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
+# aten::convolution                  7.956ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
+# aten::_convolution                 7.909ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
+# aten::mkldnn_convolution           7.834ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
+# aten::conv2d                       6.332ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
+# aten::convolution                  6.303ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
+# aten::_convolution                 6.273ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
+# aten::mkldnn_convolution           6.233ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
+# aten::conv2d                       4.751ms       [[5,256,14,14], [256,256,3,3], [], ..., []]
+# ---------------------------------  ------------  -------------------------------------------
+# Self CPU time total: 57.549ms
+
+######################################################################
+# Note the occurrence of ``aten::convolution`` twice with different input shapes.
+
+######################################################################
+# Profiler can also be used to analyze performance of models executed on GPUs:
+
+model = models.resnet18().cuda()
+inputs = torch.randn(5, 3, 224, 224).cuda()
+
+with profile(activities=[
+        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
+    with record_function("model_inference"):
+        model(inputs)
+
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+######################################################################
+# (Note: the first use of CUDA profiling may bring an extra overhead.)
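If that first-use overhead matters for a measurement, one common mitigation (a sketch, not part of the patch) is a warm-up pass executed before the profiled region::

    model(inputs)              # run once so one-time CUDA setup is not measured
    torch.cuda.synchronize()   # wait for the warm-up work to finish before profiling

The table shown next is the output of the patch's GPU example above.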
+
+######################################################################
+# The resulting table output:
+
+# (omitting some columns)
+# -------------------------------------------------------  ------------  ------------
+# Name                                                      Self CUDA     CUDA total
+# -------------------------------------------------------  ------------  ------------
+# model_inference                                           0.000us       11.666ms
+# aten::conv2d                                              0.000us       10.484ms
+# aten::convolution                                         0.000us       10.484ms
+# aten::_convolution                                        0.000us       10.484ms
+# aten::_convolution_nogroup                                0.000us       10.484ms
+# aten::thnn_conv2d                                         0.000us       10.484ms
+# aten::thnn_conv2d_forward                                 10.484ms      10.484ms
+# void at::native::im2col_kernel(long, float co...          3.844ms       3.844ms
+# sgemm_32x32x32_NN                                         3.206ms       3.206ms
+# sgemm_32x32x32_NN_vec                                     3.093ms       3.093ms
+# -------------------------------------------------------  ------------  ------------
+# Self CPU time total: 23.015ms
+# Self CUDA time total: 11.666ms
+
+######################################################################
+# Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN``).
+
+######################################################################
+# 4. Using profiler to analyze memory consumption
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # PyTorch profiler can also show the amount of memory (used by the model's tensors)
@@ -147,44 +205,50 @@
 # by the operator, excluding the children calls to the other operators.
 # To enable memory profiling functionality pass ``profile_memory=True``.

-with profiler.profile(profile_memory=True, record_shapes=True) as prof:
+model = models.resnet18()
+inputs = torch.randn(5, 3, 224, 224)
+
+with profile(activities=[ProfilerActivity.CPU],
+        profile_memory=True, record_shapes=True) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

# (omitting some columns)
-# ---------------------------  ---------------  ---------------  ---------------
-# Name                         CPU Mem          Self CPU Mem     Number of Calls
-# ---------------------------  ---------------  ---------------  ---------------
-# empty                        94.79 Mb         94.79 Mb         123
-# resize_                      11.48 Mb         11.48 Mb         2
-# addmm                        19.53 Kb         19.53 Kb         1
-# empty_strided                4 b              4 b              1
-# conv2d                       47.37 Mb         0 b              20
-# ---------------------------  ---------------  ---------------  ---------------
+# ---------------------------------  ------------  ------------  ------------
+# Name                               CPU Mem       Self CPU Mem  # of Calls
+# ---------------------------------  ------------  ------------  ------------
+# aten::empty                        94.79 Mb      94.79 Mb      121
+# aten::max_pool2d_with_indices      11.48 Mb      11.48 Mb      1
+# aten::addmm                        19.53 Kb      19.53 Kb      1
+# aten::empty_strided                572 b         572 b         25
+# aten::resize_                      240 b         240 b         6
+# aten::abs                          480 b         240 b         4
+# aten::add                          160 b         160 b         20
+# aten::masked_select                120 b         112 b         1
+# aten::ne                           122 b         53 b          6
+# aten::eq                           60 b          30 b          2
+# ---------------------------------  ------------  ------------  ------------
+# Self CPU time total: 53.064ms

print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

# (omitting some columns)
-# ---------------------------  ---------------  ---------------  ---------------
-# Name                         CPU Mem          Self CPU Mem     Number of Calls
-# ---------------------------  ---------------  ---------------  ---------------
-# empty                        94.79 Mb         94.79 Mb         123
-# batch_norm                   47.41 Mb         0 b              20
-# _batch_norm_impl_index       47.41 Mb         0 b              20
-# native_batch_norm            47.41 Mb         0 b              20
-# conv2d                       47.37 Mb         0 b              20
-# convolution                  47.37 Mb         0 b              20
-# _convolution                 47.37 Mb         0 b              20
-# mkldnn_convolution           47.37 Mb         0 b              20
-# empty_like                   47.37 Mb         0 b              20
-# max_pool2d                   11.48 Mb         0
b 1 -# max_pool2d_with_indices 11.48 Mb 0 b 1 -# resize_ 11.48 Mb 11.48 Mb 2 -# addmm 19.53 Kb 19.53 Kb 1 -# adaptive_avg_pool2d 10.00 Kb 0 b 1 -# mean 10.00 Kb 0 b 1 -# --------------------------- --------------- --------------- --------------- +# --------------------------------- ------------ ------------ ------------ +# Name CPU Mem Self CPU Mem # of Calls +# --------------------------------- ------------ ------------ ------------ +# aten::empty 94.79 Mb 94.79 Mb 121 +# aten::batch_norm 47.41 Mb 0 b 20 +# aten::_batch_norm_impl_index 47.41 Mb 0 b 20 +# aten::native_batch_norm 47.41 Mb 0 b 20 +# aten::conv2d 47.37 Mb 0 b 20 +# aten::convolution 47.37 Mb 0 b 20 +# aten::_convolution 47.37 Mb 0 b 20 +# aten::mkldnn_convolution 47.37 Mb 0 b 20 +# aten::max_pool2d 11.48 Mb 0 b 1 +# aten::max_pool2d_with_indices 11.48 Mb 11.48 Mb 1 +# --------------------------------- ------------ ------------ ------------ +# Self CPU time total: 53.064ms ###################################################################### # 5. Using tracing functionality @@ -192,19 +256,164 @@ # # Profiling results can be outputted as a .json trace file: -with profiler.profile() as prof: - with profiler.record_function("model_inference"): - model(inputs) +model = models.resnet18().cuda() +inputs = torch.randn(5, 3, 224, 224).cuda() + +with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model(inputs) prof.export_chrome_trace("trace.json") ###################################################################### -# User can examine the sequence of profiled operators after loading the trace file -# in Chrome (``chrome://tracing``): +# You can examine the sequence of profiled operators and CUDA kernels +# in Chrome trace viewer (``chrome://tracing``): # # .. image:: ../../_static/img/trace_img.png # :scale: 25 % +###################################################################### +# 6. Examining stack traces +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Profiler can be used to analyze Python and TorchScript stack traces: + +with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + with_stack=True, +) as prof: + model(inputs) + +# Print aggregated stats +print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2)) + +# (omitting some columns) +# ------------------------- ----------------------------------------------------------- +# Name Source Location +# ------------------------- ----------------------------------------------------------- +# aten::thnn_conv2d_forward .../torch/nn/modules/conv.py(439): _conv_forward +# .../torch/nn/modules/conv.py(443): forward +# .../torch/nn/modules/module.py(1051): _call_impl +# .../site-packages/torchvision/models/resnet.py(63): forward +# .../torch/nn/modules/module.py(1051): _call_impl +# +# aten::thnn_conv2d_forward .../torch/nn/modules/conv.py(439): _conv_forward +# .../torch/nn/modules/conv.py(443): forward +# .../torch/nn/modules/module.py(1051): _call_impl +# .../site-packages/torchvision/models/resnet.py(59): forward +# .../torch/nn/modules/module.py(1051): _call_impl +# +# ------------------------- ----------------------------------------------------------- +# Self CPU time total: 34.016ms +# Self CUDA time total: 11.659ms + +###################################################################### +# Note the two convolutions and the two callsites in ``torchvision/models/resnet.py`` script. +# +# (Warning: stack tracing adds an extra profiling overhead.) 
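Because of that overhead, it can be convenient to keep stack collection behind a flag; the same analysis also works on a CPU-only machine by sorting on the CPU metric. A minimal sketch, not part of the patch, reusing only the imports and model from the earlier steps::

    collect_stacks = True   # flip to False for low-overhead profiling runs

    model = models.resnet18()
    inputs = torch.randn(5, 3, 224, 224)

    with profile(
        activities=[ProfilerActivity.CPU],
        with_stack=collect_stacks,
    ) as prof:
        model(inputs)

    # Group by the top five stack frames; no GPU metric is needed here.
    print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=2))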
+ + +###################################################################### +# 7. Visualizing data as a flamegraph +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Execution time (``self_cpu_time_total`` and ``self_cuda_time_total`` metrics) and stack traces +# can also be visualized as a flame graph. To do this, first export the raw data using ``export_stacks`` (requires ``with_stack=True``): + +prof.export_stacks("/tmp/profiler_stacks.txt", "self_cuda_time_total") + +###################################################################### +# We recommend using e.g. `Flamegraph tool `_ to generate an +# interactive SVG: + +# git clone https://github.com/brendangregg/FlameGraph +# cd FlameGraph +# ./flamegraph.pl --title "CUDA time" --countname "us." /tmp/profiler_stacks.txt > perf_viz.svg + +###################################################################### +# +# .. image:: ../../_static/img/perf_viz.png +# :scale: 25 % + + +###################################################################### +# 8. Using profiler to analyze long-running jobs +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# PyTorch profiler offers an additional API to handle long-running jobs +# (such as training loops). Tracing all of the execution can be +# slow and result in very large trace files. To avoid this, use optional +# arguments: +# +# - ``schedule`` - specifies a function that takes an integer argument (step number) +# as an input and returns an action for the profiler, the best way to use this parameter +# is to use ``torch.profiler.schedule`` helper function that can generate a schedule for you; +# - ``on_trace_ready`` - specifies a function that takes a reference to the profiler as +# an input and is called by the profiler each time the new trace is ready. +# +# To illustrate how the API works, let's first consider the following example with +# ``torch.profiler.schedule`` helper function: + +from torch.profiler import schedule + +my_schedule = schedule( + skip_first=10, + wait=5, + warmup=1, + active=3, + repeat=2) + +###################################################################### +# Profiler assumes that the long-running job is composed of steps, numbered +# starting from zero. The example above defines the following sequence of actions +# for the profiler: +# +# 1. Parameter ``skip_first`` tells profiler that it should ignore the first 10 steps +# (default value of ``skip_first`` is zero); +# 2. After the first ``skip_first`` steps, profiler starts executing profiler cycles; +# 3. Each cycle consists of three phases: +# +# - idling (``wait=5`` steps), during this phase profiler is not active; +# - warming up (``warmup=1`` steps), during this phase profiler starts tracing, but +# the results are discarded; this phase is used to discard the samples obtained by +# the profiler at the beginning of the trace since they are usually skewed by an extra +# overhead; +# - active tracing (``active=3`` steps), during this phase profiler traces and records data; +# 4. An optional ``repeat`` parameter specifies an upper bound on the number of cycles. +# By default (zero value), profiler will execute cycles as long as the job runs. + +###################################################################### +# Thus, in the example above, profiler will skip the first 15 steps, spend the next step on the warm up, +# actively record the next 3 steps, skip another 5 steps, spend the next step on the warm up, actively +# record another 3 steps. 
Since the ``repeat=2`` parameter value is specified, the profiler will stop +# the recording after the first two cycles. +# +# At the end of each cycle profiler calls the specified ``on_trace_ready`` function and passes itself as +# an argument. This function is used to process the new trace - either by obtaining the table output or +# by saving the output on disk as a trace file. +# +# To send the signal to the profiler that the next step has started, call ``prof.step()`` function. +# The current profiler step is stored in ``prof.step_num``. +# +# The following example shows how to use all of the concepts above: + +def trace_handler(p): + output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) + print(output) + p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") + +with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=1, + warmup=1, + active=2), + on_trace_ready=trace_handler +) as p: + for idx in range(8): + model(inputs) + p.step() + + ###################################################################### # Learn More # ---------- @@ -212,5 +421,6 @@ # Take a look at the following recipes/tutorials to continue your learning: # # - `PyTorch Benchmark `_ +# - `PyTorch Profiler with TensorBoard `_ tutorial # - `Visualizing models, data, and training with TensorBoard `_ tutorial #