From b4ce8d8f86ba574b31f1e7c7e329007fb224e2ca Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 14:16:00 -0400
Subject: [PATCH 01/11] Use bibtex

---
 docs/requirements.txt                         |  1 +
 docs/source/conf.py                           |  3 +
 docs/source/models.rst                        | 23 ++++--
 docs/source/refs.bib                          | 58 ++++++++++++++
 torchaudio/functional/filtering.py            | 80 ++++++++++---------
 torchaudio/functional/functional.py           | 11 +--
 torchaudio/models/conv_tasnet.py              | 26 ++----
 torchaudio/models/wav2vec2/model.py           | 35 ++------
 .../models/wav2vec2/utils/import_fairseq.py   |  4 +
 .../wav2vec2/utils/import_huggingface.py      |  2 +
 torchaudio/transforms.py                      | 21 +----
 11 files changed, 143 insertions(+), 121 deletions(-)
 create mode 100644 docs/source/refs.bib

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 99a7811de6..fa5ea2b7e6 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,5 @@
 sphinx==2.4.4
 -e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 sphinxcontrib.katex
+sphinxcontrib.bibtex
 matplotlib
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f78858e381..1757ba1f52 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -41,6 +41,7 @@
     'sphinx.ext.napoleon',
     'sphinx.ext.viewcode',
     'sphinxcontrib.katex',
+    'sphinxcontrib.bibtex',
 ]

 # katex options
@@ -55,6 +56,8 @@
 ]
 '''

+bibtex_bibfiles = ['refs.bib']
+
 napoleon_use_ivar = True
 napoleon_numpy_docstring = False
 napoleon_google_docstring = True
diff --git a/docs/source/models.rst b/docs/source/models.rst
index 3d7d08cc2b..f3bbfd55e3 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -37,8 +37,8 @@ The models subpackage contains definitions of models for addressing common audio
 :hidden:`Wav2Vec2.0`
 ~~~~~~~~~~~~~~~~~~~~

-Model
------
+Wav2Vec2Model
+-------------

 .. autoclass:: Wav2Vec2Model

   .. automethod:: forward

-Factory Functions
------------------
+wav2vec2_base
+-------------

 .. autofunction:: wav2vec2_base

+wav2vec2_large
+--------------
+
 .. autofunction:: wav2vec2_large

+wav2vec2_large_lv60k
+--------------------
+
 .. autofunction:: wav2vec2_large_lv60k

 .. currentmodule:: torchaudio.models.wav2vec2.utils

-Utility Functions
------------------
-
+import_huggingface_model
+------------------------
+
 .. autofunction:: import_huggingface_model

+import_fairseq_model
+--------------------
+
 .. autofunction:: import_fairseq_model

 .. currentmodule:: torchaudio.models
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
new file mode 100644
index 0000000000..5446232070
--- /dev/null
+++ b/docs/source/refs.bib
@@ -0,0 +1,58 @@
+@misc{baevski2020wav2vec,
+  title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
+  author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
+  year={2020},
+  eprint={2006.11477},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@article{Luo_2019,
+  title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
+  volume={27},
+  ISSN={2329-9304},
+  url={http://dx.doi.org/10.1109/TASLP.2019.2915167},
+  DOI={10.1109/taslp.2019.2915167},
+  number={8},
+  journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+  publisher={Institute of Electrical and Electronics Engineers (IEEE)},
+  author={Luo, Yi and Mesgarani, Nima},
+  year={2019},
+  month={Aug},
+  pages={1256–1266}
+}
+@InProceedings{ brian_mcfee-proc-scipy-2015,
+  author    = { {B}rian {M}c{F}ee and {C}olin {R}affel and {D}awen {L}iang and {D}aniel {P}.{W}. {E}llis and {M}att {M}c{V}icar and {E}ric {B}attenberg and {O}riol {N}ieto },
+  title     = { librosa: {A}udio and {M}usic {S}ignal {A}nalysis in {P}ython },
+  booktitle = { {P}roceedings of the 14th {P}ython in {S}cience {C}onference },
+  pages     = { 18 - 24 },
+  year      = { 2015 },
+  editor    = { {K}athryn {H}uff and {J}ames {B}ergstra },
+  doi       = { 10.25080/Majora-7b98e3ed-003 }
+}
+@INPROCEEDINGS{6701851,
+  author={Perraudin, Nathanaël and Balazs, Peter and Søndergaard, Peter L.},
+  booktitle={2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
+  title={A fast Griffin-Lim algorithm},
+  year={2013},
+  volume={},
+  number={},
+  pages={1-4},
+  doi={10.1109/WASPAA.2013.6701851}}
+@INPROCEEDINGS{1172092,
+  author={Griffin, D. and Jae Lim},
+  booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
+  title={Signal estimation from modified short-time Fourier transform},
+  year={1983},
+  volume={8},
+  number={},
+  pages={804-807},
+  doi={10.1109/ICASSP.1983.1172092}}
+@INPROCEEDINGS{6854049,
+  author={Ghahremani, Pegah and BabaAli, Bagher and Povey, Daniel and Riedhammer, Korbinian and Trmal, Jan and Khudanpur, Sanjeev},
+  booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  title={A pitch extraction algorithm tuned for automatic speech recognition},
+  year={2014},
+  volume={},
+  number={},
+  pages={2494-2498},
+  doi={10.1109/ICASSP.2014.6854049}}
diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py
index 85abe81339..68269d9b34 100644
--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -80,9 +80,9 @@ def allpass_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -123,9 +123,9 @@ def band_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -171,9 +171,9 @@ def bandpass_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -207,9 +207,9 @@ def bandreject_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -247,9 +247,9 @@ def bass_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -325,8 +325,8 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html
     """

     if not 0 <= enhancement_amount <= 100:
@@ -358,8 +358,8 @@ def dcshift(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html
     """
     output_waveform = waveform
     limiter_threshold = 0.0
@@ -405,9 +405,9 @@ def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """

     if sample_rate == 44100:
@@ -680,10 +680,12 @@ def flanger(
     Returns:
         Tensor: Waveform of dimension of `(..., channel, time)`

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html

-        Scott Lehman, Effects Explained,
+       - Scott Lehman, `Effects Explained`_,
+
+    .. _Effects Explained:
         https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
     """
@@ -1027,8 +1029,8 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html
     """
     actual_shape = waveform.shape
     device, dtype = waveform.device, waveform.dtype
@@ -1096,9 +1098,11 @@ def phaser(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        Scott Lehman, Effects Explained,
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - Scott Lehman, `Effects Explained`_.
+
+    .. _Effects Explained:
         https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
     """
     actual_shape = waveform.shape
@@ -1166,9 +1170,9 @@ def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """

     if sample_rate == 44100:
@@ -1234,9 +1238,9 @@ def treble_biquad(
     Returns:
         Tensor: Waveform of dimension of `(..., time)`

-    References:
-        http://sox.sourceforge.net/sox.html
-        https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Reference:
+       - http://sox.sourceforge.net/sox.html
+       - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
     """
     dtype = waveform.dtype
     device = waveform.device
@@ -1420,8 +1424,8 @@ def vad(
     Returns:
         Tensor: Tensor of audio of dimension (..., time).

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html
     """

     if waveform.ndim > 2:
diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index 4ece68b4f3..e585afc4ca 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -1215,7 +1215,7 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in [1].
+    """Extract pitch based on method described in [:footcite:`6854049`].

     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
@@ -1275,14 +1275,7 @@ def compute_kaldi_pitch(
         Tensor: Pitch feature. Shape: ``(batch, frames 2)`` where the last dimension
         corresponds to pitch and NCCF.

-    Reference:
-        - A pitch extraction algorithm tuned for automatic speech recognition
-
-          P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. Khudanpur
-
-          2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
-
-          Florence, 2014, pp. 2494-2498, doi: 10.1109/ICASSP.2014.6854049.
+    .. footbibliography::
     """
     shape = waveform.shape
     waveform = waveform.reshape(-1, shape[-1])
diff --git a/torchaudio/models/conv_tasnet.py b/torchaudio/models/conv_tasnet.py
index eb5f121fae..bddb189c7e 100644
--- a/torchaudio/models/conv_tasnet.py
+++ b/torchaudio/models/conv_tasnet.py
@@ -9,7 +9,7 @@

 class ConvBlock(torch.nn.Module):
-    """1D Convolutional block.
+    """1D Convolutional block used in [:footcite:`Luo_2019`].

     Args:
         io_channels (int): The number of input/output channels,
@@ -22,12 +22,7 @@ class ConvBlock(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.

-    Reference:
-        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
-
-          Luo, Yi and Mesgarani, Nima
-
-          https://arxiv.org/abs/1809.07454
+    .. footbibliography::
     """
     def __init__(
@@ -83,7 +78,7 @@ def forward(

 class MaskGenerator(torch.nn.Module):
-    """TCN (Temporal Convolution Network) Separation Module
+    """TCN (Temporal Convolution Network) Separation Module used in [:footcite:`Luo_2019`]

     Generates masks for separation.

@@ -99,10 +94,8 @@ class MaskGenerator(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.

-    References:
-        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
-          Luo, Yi and Mesgarani, Nima
-          https://arxiv.org/abs/1809.07454
+    .. footbibliography::
+
     """

     def __init__(
@@ -176,7 +169,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:

 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network [:footcite:`Luo_2019`].

     Args:
         num_sources (int): The number of sources to split.
@@ -191,12 +184,7 @@ class ConvTasNet(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.

-    Reference:
-        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
-
-          Luo, Yi and Mesgarani, Nima
-
-          https://arxiv.org/abs/1809.07454
+    .. footbibliography::
     """
     def __init__(
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
index 16ba3d0d8b..a643651842 100644
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -7,7 +7,7 @@

 class Wav2Vec2Model(Module):
-    """Model used in wav2vec2.0 paper. [1]
+    """Encoder model used in [:footcite:`baevski2020wav2vec`].

     Note:
         To build the model, please use one of the factory functions.
@@ -20,12 +20,8 @@
         encoder (torch.nn.Module):
             Encoder that converts the audio features into the sequence of probability
             distribution (in negative log-likelihood) over labels.

-    Reference:
-        - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
+    .. footbibliography::

-          Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
-
-          https://arxiv.org/abs/2006.11477
     """
     def __init__(
         self,
@@ -129,7 +125,7 @@ def _get_model(

 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with **Base** configuration. [1]
+    """Build wav2vec2.0 model with "Base" configuration.

     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
-
-    Reference:
-        - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
-
-          Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
-
-          https://arxiv.org/abs/2006.11477
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -166,7 +155,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with **Large** configuration. [1]
+    """Build wav2vec2.0 model with "Large" configuration.

     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
-
-    Reference:
-        - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
-
-          Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
-
-          https://arxiv.org/abs/2006.11477
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -203,7 +185,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with **Large LV-60k** configuration. [1]
+    """Build wav2vec2.0 model with "Large LV-60k" configuration.

     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
-
-    Reference:
-        - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
-
-          Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli
-
-          https://arxiv.org/abs/2006.11477
     """
     return _get_model(
         extractor_mode="layer_norm",
diff --git a/torchaudio/models/wav2vec2/utils/import_fairseq.py b/torchaudio/models/wav2vec2/utils/import_fairseq.py
index f871957f58..b7dbe9cbb6 100644
--- a/torchaudio/models/wav2vec2/utils/import_fairseq.py
+++ b/torchaudio/models/wav2vec2/utils/import_fairseq.py
@@ -141,6 +141,8 @@ def import_fairseq_model(
         Wav2Vec2Model: Imported model.

     Example - Loading pretrain-only model
+        >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
+        >>>
         >>> # Load model using fairseq
         >>> model_file = 'wav2vec_small.pt'
         >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
@@ -156,6 +158,8 @@ def import_fairseq_model(
         >>> torch.testing.assert_allclose(features, reference)

     Example - Fine-tuned model
+        >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
+        >>>
        >>> # Load model using fairseq
         >>> model_file = 'wav2vec_small_960h.pt'
         >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
diff --git a/torchaudio/models/wav2vec2/utils/import_huggingface.py b/torchaudio/models/wav2vec2/utils/import_huggingface.py
index 6072d0ff83..0983cd1ad8 100644
--- a/torchaudio/models/wav2vec2/utils/import_huggingface.py
+++ b/torchaudio/models/wav2vec2/utils/import_huggingface.py
@@ -50,6 +50,8 @@ def import_huggingface_model(original: Module) -> Wav2Vec2Model:
         Wav2Vec2Model: Imported model.

     Example
+        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+        >>>
         >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
         >>> model = import_huggingface_model(original)
         >>>
diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
index 5dacd583a7..796ea6fe96 100644
--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -126,7 +126,7 @@ def forward(self, waveform: Tensor) -> Tensor:

 class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
-    Implementation ported from ``librosa`` [1]_, [2]_, [3]_.
+    Implementation ported from [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`], [:footcite:`1172092`].

     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
         n_iter (int, optional): Number of iteration for phase recovery process. (Default: ``32``)
         win_length (int or None, optional): Window size. (Default: ``n_fft``)
         hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
         power (float, optional): Exponent for the magnitude spectrogram,
             (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
         momentum (float, optional): The momentum parameter for fast Griffin-Lim.
             Setting this to 0 recovers the original Griffin-Lim method.
             Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``)
         length (int, optional): Array length of the expected output. (Default: ``None``)
         rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``)

-    References:
-        .. [1]
-           | McFee, Brian, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg,
-             and Oriol Nieto.
-           | "librosa: Audio and music signal analysis in python."
-           | In Proceedings of the 14th python in science conference, pp. 18-25. 2015.
-
-        .. [2]
-           | Perraudin, N., Balazs, P., & Søndergaard, P. L.
-           | "A fast Griffin-Lim algorithm,"
-           | IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
-           | Oct. 2013.
-
-        .. [3]
-           | D. W. Griffin and J. S. Lim,
-           | "Signal estimation from modified short-time Fourier transform,"
-           | IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
+    .. footbibliography::
+
     """
     __constants__ = ['n_fft', 'n_iter', 'win_length', 'hop_length', 'power', 'length', 'momentum', 'rand_init']

From c86e2d37af22f1fb100fff77030c3a409dbbb082 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 14:22:30 -0400
Subject: [PATCH 02/11] Fix more

---
 docs/source/refs.bib                |  8 ++++++++
 torchaudio/models/deepspeech.py     |  6 +++---
 torchaudio/models/wav2vec2/model.py | 12 +++++++++---
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index 5446232070..84c8a99a60 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -6,6 +6,14 @@ @misc{baevski2020wav2vec
   archivePrefix={arXiv},
   primaryClass={cs.CL}
 }
+@misc{hannun2014deep,
+  title={Deep Speech: Scaling up end-to-end speech recognition},
+  author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
+  year={2014},
+  eprint={1412.5567},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
 @article{Luo_2019,
   title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
   volume={27},
diff --git a/torchaudio/models/deepspeech.py b/torchaudio/models/deepspeech.py
index 477993e411..639d018927 100644
--- a/torchaudio/models/deepspeech.py
+++ b/torchaudio/models/deepspeech.py
@@ -31,14 +31,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from
-    `"Deep Speech: Scaling up end-to-end speech recognition"`
-    paper.
+    DeepSpeech model architecture from [:footcite:`hannun2014deep`].

     Args:
         n_feature: Number of input features
         n_hidden: Internal hidden unit size.
         n_class: Number of output classes
+
+    .. footbibliography::
     """

     def __init__(
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
index a643651842..3ce32d8218 100644
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -125,7 +125,7 @@ def _get_model(

 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration.
+    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
+
+    .. footbibliography::
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -155,7 +157,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration.
+    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].

     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
+
+    .. footbibliography::
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -185,7 +189,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration.
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].

     Args:
         num_out: int
             The number of output labels.

     Returns:
         Wav2Vec2Model: The resulting model.
+
+    .. footbibliography::
     """
     return _get_model(
         extractor_mode="layer_norm",

From bcc5e901e5d58f61ac7dd23fa31cbecb128b6f70 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 14:32:21 -0400
Subject: [PATCH 03/11] Update rnnt loss

---
 docs/source/refs.bib                    |  8 ++++++++
 torchaudio/backend/soundfile_backend.py | 27 +++++++++++++++----------
 torchaudio/prototype/rnnt_loss.py       |  8 ++++++--
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index 84c8a99a60..0835910569 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -14,6 +14,14 @@ @misc{hannun2014deep
   archivePrefix={arXiv},
   primaryClass={cs.CL}
 }
+@misc{graves2012sequence,
+  title={Sequence Transduction with Recurrent Neural Networks},
+  author={Alex Graves},
+  year={2012},
+  eprint={1211.3711},
+  archivePrefix={arXiv},
+  primaryClass={cs.NE}
+}
 @article{Luo_2019,
   title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
   volume={27},
diff --git a/torchaudio/backend/soundfile_backend.py b/torchaudio/backend/soundfile_backend.py
index 8ad28a6fcf..afce945026 100644
--- a/torchaudio/backend/soundfile_backend.py
+++ b/torchaudio/backend/soundfile_backend.py
@@ -85,18 +85,20 @@ def _get_encoding(format: str, subtype: str):
 def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
     """Get signal information of an audio file.

+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
     Args:
         filepath (path-like object or file-like object):
             Source of audio data.
-
-            Note:
-                * This argument is intentionally annotated as ``str`` only,
-                  for the consistency with "sox_io" backend, which has a restriction
-                  on type annotation due to TorchScript compiler compatiblity.
         format (str, optional):
             Not used. PySoundFile does not accept format hint.

     Returns:
         AudioMetaData: meta data of the given audio.
+
     """
     sinfo = soundfile.info(filepath)
     return AudioMetaData(
@@ -159,13 +161,14 @@ def load(
     For these formats, this function always returns ``float32`` Tensor with values normalized to ``[-1.0, 1.0]``.
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
     Args:
         filepath (path-like object or file-like object):
             Source of audio data.
-
-            Note:
-                * This argument is intentionally annotated as ``str`` only,
-                  for the consistency with "sox_io" backend, which has a restriction
-                  on type annotation due to TorchScript compiler compatiblity.
         frame_offset (int):
             Number of frames to skip before start reading data.
         num_frames (int):
@@ -324,11 +327,13 @@ def save(
        * OGG/VORBIS
        * SPHERE

+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
     Args:
         filepath (str or pathlib.Path): Path to audio file.
-            This functionalso handles ``pathlib.Path`` objects, but is annotated as ``str``
-            for the consistency with "sox_io" backend, which has a restriction on type annotation
-            for TorchScript compiler compatiblity.
         src (torch.Tensor): Audio data to save. must be 2D tensor.
         sample_rate (int): sampling rate
         channels_first (bool): If ``True``, the given tensor is interpreted as ``[channel, time]``,
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py
index 2bce60835b..5c398193a3 100644
--- a/torchaudio/prototype/rnnt_loss.py
+++ b/torchaudio/prototype/rnnt_loss.py
@@ -20,7 +20,7 @@ def rnnt_loss(
     """
     Compute the RNN Transducer Loss.

-    The RNN Transducer loss (`Graves 2012 `__) extends the CTC loss by defining
+    The RNN Transducer loss [:footcite:`graves2012sequence`] extends the CTC loss by defining
     a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
     dependencies.

@@ -34,6 +34,8 @@ def rnnt_loss(
         runtime_check (bool): whether to do sanity check during runtime. (Default: ``False``)
         fused_log_softmax (bool): set to False if calling log_softmax outside loss (Default: ``True``)
         reuse_logits_for_grads (bool): whether to save memory by reusing logits memory for grads (Default: ``True``)
+
+    .. footbibliography::
     """
     if not fused_log_softmax:
         logits = torch.nn.functional.log_softmax(logits, dim=-1)
@@ -61,7 +63,7 @@ class RNNTLoss(torch.nn.Module):
     """
     Compute the RNN Transducer Loss.

-    The RNN Transducer loss (`Graves 2012 `__) extends the CTC loss by defining
+    The RNN Transducer loss [:footcite:`graves2012sequence`] extends the CTC loss by defining
     a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
     dependencies.

@@ -70,6 +72,8 @@ class RNNTLoss(torch.nn.Module):
         clamp (float): clamp for gradients (Default: ``-1``)
         fused_log_softmax (bool): set to False if calling log_softmax outside loss (Default: ``True``)
         reuse_logits_for_grads (bool): whether to save memory by reusing logits memory for grads (Default: ``True``)
+
+    .. footbibliography::
     """

     def __init__(

From a98fcce696d868ff087bdb3b4453782b09186bcd Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 15:17:38 -0400
Subject: [PATCH 04/11] fix

---
 docs/source/rnnt_loss.rst           |  6 ++---
 torchaudio/functional/functional.py | 17 ++++----------
 torchaudio/models/conv_tasnet.py    | 10 +++-----
 torchaudio/models/wav2vec2/model.py | 36 +++++++++++++++++++++++++++++
 torchaudio/prototype/rnnt_loss.py   | 17 ++++++--------
 torchaudio/transforms.py            |  3 ++-
 6 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/docs/source/rnnt_loss.rst b/docs/source/rnnt_loss.rst
index 0c3b075d65..f3b07e6c28 100644
--- a/docs/source/rnnt_loss.rst
+++ b/docs/source/rnnt_loss.rst
@@ -2,7 +2,7 @@
   :class: hidden-section

 torchaudio.prototype.rnnt_loss
-===============================
+==============================

 .. currentmodule:: torchaudio.prototype.rnnt_loss

@@ -15,8 +15,8 @@ rnnt_loss

 .. autofunction:: rnnt_loss

-:hidden:`RNNTLoss`
-~~~~~~~~~~~~~~~~~~
+RNNTLoss
+--------

 .. autoclass:: RNNTLoss

diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index e585afc4ca..ecec626fd2 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -156,18 +156,9 @@ def griffinlim(
     rand_init: bool
 ) -> Tensor:
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
-    Implementation ported from `librosa`.
-
-    * [1] McFee, Brian, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg, and Oriol Nieto.
-      "librosa: Audio and music signal analysis in python."
-      In Proceedings of the 14th python in science conference, pp. 18-25. 2015.
-    * [2] Perraudin, N., Balazs, P., & Søndergaard, P. L.
-      "A fast Griffin-Lim algorithm,"
-      IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
-      Oct. 2013.
-    * [3] D. W. Griffin and J. S. Lim,
-      "Signal estimation from modified short-time Fourier transform,"
-      IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
+
+    Implementation ported from
+    [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`] and [:footcite:`1172092`].

     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)
@@ -188,6 +179,8 @@ def griffinlim(

     Returns:
         torch.Tensor: waveform of (..., time), where time equals the ``length`` parameter if given.
+
+    .. footbibliography::
     """
     assert momentum < 1, 'momentum={} > 1 can be unstable'.format(momentum)
     assert momentum >= 0, 'momentum={} < 0'.format(momentum)
diff --git a/torchaudio/models/conv_tasnet.py b/torchaudio/models/conv_tasnet.py
index bddb189c7e..4f9701f915 100644
--- a/torchaudio/models/conv_tasnet.py
+++ b/torchaudio/models/conv_tasnet.py
@@ -9,7 +9,7 @@

 class ConvBlock(torch.nn.Module):
-    """1D Convolutional block used in [:footcite:`Luo_2019`].
+    """1D Convolutional block.

     Args:
         io_channels (int): The number of input/output channels,
@@ -21,8 +21,6 @@ class ConvBlock(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.
-
-    .. footbibliography::
     """

     def __init__(
@@ -76,7 +76,7 @@ def forward(

 class MaskGenerator(torch.nn.Module):
-    """TCN (Temporal Convolution Network) Separation Module used in [:footcite:`Luo_2019`]
+    """TCN (Temporal Convolution Network) Separation Module

     Generates masks for separation.

@@ -93,9 +91,6 @@ class MaskGenerator(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.
-
-    .. footbibliography::
-
     """

     def __init__(
@@ -185,6 +180,7 @@ class ConvTasNet(torch.nn.Module):
         This implementation corresponds to the "non-causal" setting in the paper.

     .. footbibliography::
+
     """

     def __init__(
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
index 3ce32d8218..05ed0e13fb 100644
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -134,6 +134,18 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
     Returns:
         Wav2Vec2Model: The resulting model.

+    Example - Reload fine-tuned model from Hugging Face:
+        >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
+        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+        >>>
+        >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> model = import_huggingface_model(original)
+        >>> torch.save(model.state_dict(), "wav2vec2-base-960h.pt")
+        >>>
+        >>> # Session 2 - Load model and the parameters
+        >>> model = wav2vec2_base(num_out=32)
+        >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
+
     .. footbibliography::
     """
     return _get_model(
@@ -166,6 +178,18 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
     Returns:
         Wav2Vec2Model: The resulting model.

+    Example - Reload fine-tuned model from Hugging Face:
+        >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
+        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+        >>>
+        >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+        >>> model = import_huggingface_model(original)
+        >>> torch.save(model.state_dict(), "wav2vec2-base-960h.pt")
+        >>>
+        >>> # Session 2 - Load model and the parameters
+        >>> model = wav2vec2_large(num_out=32)
+        >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
+
     .. footbibliography::
     """
     return _get_model(
@@ -198,6 +222,18 @@ def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
     Returns:
         Wav2Vec2Model: The resulting model.

+    Example - Reload fine-tuned model from Hugging Face:
+        >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters.
+        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+        >>>
+        >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
+        >>> model = import_huggingface_model(original)
+        >>> torch.save(model.state_dict(), "wav2vec2-base-960h.pt")
+        >>>
+        >>> # Session 2 - Load model and the parameters
+        >>> model = wav2vec2_large_lv60k(num_out=32)
+        >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
+
     .. footbibliography::
     """
     return _get_model(
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py
index 5c398193a3..e7158d18ea 100644
--- a/torchaudio/prototype/rnnt_loss.py
+++ b/torchaudio/prototype/rnnt_loss.py
@@ -17,11 +17,10 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """
-    Compute the RNN Transducer Loss.
+    """Compute the RNN Transducer loss. [:footcite:`graves2012sequence`]

-    The RNN Transducer loss [:footcite:`graves2012sequence`] extends the CTC loss by defining
-    a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
+    The RNN Transducer loss extends the CTC loss by defining a distribution over output
+    sequences of all lengths, and by jointly modelling both input-output and output-output
     dependencies.
     Args:
         logits (Tensor): Tensor of dimension (batch, time, target, class) containing output from joiner
@@ -34,6 +33,7 @@ def rnnt_loss(
         reuse_logits_for_grads (bool): whether to save memory by reusing logits memory for grads (Default: ``True``)

     .. footbibliography::
+
     """
     if not fused_log_softmax:
         logits = torch.nn.functional.log_softmax(logits, dim=-1)
@@ -60,11 +60,10 @@ def rnnt_loss(

 class RNNTLoss(torch.nn.Module):
-    """
-    Compute the RNN Transducer Loss.
+    """Compute the RNN Transducer Loss. [:footcite:`graves2012sequence`]

-    The RNN Transducer loss [:footcite:`graves2012sequence`] extends the CTC loss by defining
-    a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output
+    The RNN Transducer loss extends the CTC loss by defining a distribution over output
+    sequences of all lengths, and by jointly modelling both input-output and output-output
     dependencies.

     Args:
@@ -70,8 +69,6 @@ class RNNTLoss(torch.nn.Module):
         clamp (float): clamp for gradients (Default: ``-1``)
         fused_log_softmax (bool): set to False if calling log_softmax outside loss (Default: ``True``)
         reuse_logits_for_grads (bool): whether to save memory by reusing logits memory for grads (Default: ``True``)
-
-    .. footbibliography::
     """

     def __init__(
diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
index 796ea6fe96..8b4add73be 100644
--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -126,7 +126,8 @@

 class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
-    Implementation ported from [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`], [:footcite:`1172092`].
+    Implementation ported from
+    [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`] and [:footcite:`1172092`].

     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
From f09c8f631e84da670067f5f8227ca72a6c433cfa Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 16:00:11 -0400
Subject: [PATCH 05/11] fix

---
 docs/source/_static/css/override.css      |   4 +
 docs/source/_static/css/pytorch_theme.css | 118 ----------------------
 docs/source/conf.py                       |   3 +-
 torchaudio/prototype/rnnt_loss.py         |   2 +-
 4 files changed, 7 insertions(+), 120 deletions(-)
 create mode 100644 docs/source/_static/css/override.css
 delete mode 100644 docs/source/_static/css/pytorch_theme.css

diff --git a/docs/source/_static/css/override.css b/docs/source/_static/css/override.css
new file mode 100644
index 0000000000..93a0383463
--- /dev/null
+++ b/docs/source/_static/css/override.css
@@ -0,0 +1,4 @@
+/* Fix for bibtex back reference */
+dl.footnote.brackets > dt.label > span.fn-backref > a {
+  position: inherit
+}
diff --git a/docs/source/_static/css/pytorch_theme.css b/docs/source/_static/css/pytorch_theme.css
deleted file mode 100644
index 0e54497643..0000000000
--- a/docs/source/_static/css/pytorch_theme.css
+++ /dev/null
@@ -1,118 +0,0 @@
-body {
-    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
-}
-
-/* Default header fonts are ugly */
-h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
-    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
-}
-
-/* Use white for docs background */
-.wy-side-nav-search {
-    background-color: #fff;
-}
-
-.wy-nav-content-wrap, .wy-menu li.current > a {
-    background-color: #fff;
-}
-
-@media screen and (min-width: 1400px) {
-    .wy-nav-content-wrap {
-        background-color: rgba(0, 0, 0, 0.0470588);
-    }
-
-    .wy-nav-content {
-        background-color: #fff;
-    }
-}
-
-/* Fixes for mobile */
-.wy-nav-top {
-    background-color: #fff;
-    background-image: url('../img/pytorch-logo-dark.svg');
-    background-repeat: no-repeat;
-    background-position: center;
-    padding: 0;
-    margin: 0.4045em 0.809em;
-    color: #333;
-}
-
-.wy-nav-top > a {
-    display: none;
-}
-
-@media screen and (max-width: 768px) {
-    .wy-side-nav-search>a img.logo {
-        height: 60px;
-    }
-}
-
-/* This is needed to ensure that logo above search scales properly */
-.wy-side-nav-search a {
-    display: block;
-}
-
-/* This ensures that multiple constructors will remain in separate lines. */
-.rst-content dl:not(.docutils) dt {
-    display: table;
-}
-
-/* Use our red for literals (it's very similar to the original color) */
-.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
-    color: #F05732;
-}
-
-.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
-.rst-content code.xref, a .rst-content tt, a .rst-content code {
-    color: #404040;
-}
-
-/* Change link colors (except for the menu) */
-
-a {
-    color: #F05732;
-}
-
-a:hover {
-    color: #F05732;
-}
-
-
-a:visited {
-    color: #D44D2C;
-}
-
-.wy-menu a {
-    color: #b3b3b3;
-}
-
-.wy-menu a:hover {
-    color: #b3b3b3;
-}
-
-/* Default footer text is quite big */
-footer {
-    font-size: 80%;
-}
-
-footer .rst-footer-buttons {
-    font-size: 125%; /* revert footer settings - 1/80% = 125% */
-}
-
-footer p {
-    font-size: 100%;
-}
-
-/* For hidden headers that appear in TOC tree */
-/* see http://stackoverflow.com/a/32363545/3343043 */
-.rst-content .hidden-section {
-    display: none;
-}
-
-nav .hidden-section {
-    display: inherit;
-}
-
-.wy-side-nav-search>div.version {
-    color: #000;
-}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1757ba1f52..af56bd1e05 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -57,6 +57,7 @@
 '''

 bibtex_bibfiles = ['refs.bib']
+bibtex_default_style = 'unsrtalpha'

 napoleon_use_ivar = True
 napoleon_numpy_docstring = False
 napoleon_google_docstring = True
@@ -136,7 +137,7 @@
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
-
+html_css_files = ['css/override.css']

 def setup(app):
     # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py
index e7158d18ea..d909b7561d 100644
--- a/torchaudio/prototype/rnnt_loss.py
+++ b/torchaudio/prototype/rnnt_loss.py
@@ -19,7 +19,7 @@ def rnnt_loss(
 ):
     """Compute the RNN Transducer loss. [:footcite:`graves2012sequence`]

-    The RNN Transducer loss extends the CTC loss by defining a distribution over output 
+    The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
     dependencies.
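
Taken together, the first five patches settle the citation pattern used throughout the library: a docstring cites an entry from docs/source/refs.bib with the ``:footcite:`` role, and a trailing ``.. footbibliography::`` directive renders the matching reference list. A minimal sketch of wiring up a new citation under this scheme (the function and the key ``doe2021example`` are hypothetical, shown only for illustration):

    def my_filter(waveform):
        """Apply the filter described in :footcite:`doe2021example`.

        Args:
            waveform (Tensor): Input of dimension `(..., time)`

        .. footbibliography::
        """
        return waveform

Sphinx resolves the role against the files listed in ``bibtex_bibfiles`` in conf.py, so the key must exist in refs.bib or the build emits a warning.
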
From 14cca2dff206bb048259151212b018977fec8978 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 16:04:49 -0400
Subject: [PATCH 06/11] Fix lint

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index af56bd1e05..a2aed4d306 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -139,6 +139,7 @@
 html_static_path = ['_static']
 html_css_files = ['css/override.css']

+
 def setup(app):
     # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
     # and can be moved outside of this function (and the setup(app) function

From 2cae773ad276e3f216bb5bace81308d970eeefd9 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 16:20:57 -0400
Subject: [PATCH 07/11] further fix

---
 docs/source/_static/css/override.css |  4 ++++
 docs/source/conf.py                  | 21 ++++-----------------
 docs/source/refs.bib                 | 16 ++++++++++++++++
 torchaudio/models/wav2letter.py      |  6 +++---
 torchaudio/models/wavernn.py         |  9 ++++-----
 5 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/docs/source/_static/css/override.css b/docs/source/_static/css/override.css
index 93a0383463..a97d8c91ae 100644
--- a/docs/source/_static/css/override.css
+++ b/docs/source/_static/css/override.css
@@ -1,3 +1,7 @@
+/* Fix for bibtex reference */
+dl.footnote.brackets > dt.label > span.brackets > a.fn-backref {
+  position: inherit
+}
 /* Fix for bibtex back reference */
 dl.footnote.brackets > dt.label > span.fn-backref > a {
   position: inherit
 }
diff --git a/docs/source/conf.py b/docs/source/conf.py
index a2aed4d306..9c0a8abafd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -137,23 +137,10 @@
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
-html_css_files = ['css/override.css']
-
-
-def setup(app):
-    # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value
-    # and can be moved outside of this function (and the setup(app) function
-    # can be deleted).
-    html_css_files = [
-        'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css'
-    ]
-
-    # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is
-    # `add_stylesheet` (deprecated in 1.8).
-    add_css = getattr(app, 'add_css_file', app.add_stylesheet)
-    for css_file in html_css_files:
-        add_css(css_file)
+html_css_files = [
+    'css/override.css',
+    'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css'
+]

 # -- Options for HTMLHelp output ------------------------------------------
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index 0835910569..c6ddb3b61a 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -22,6 +22,22 @@ @misc{graves2012sequence
   archivePrefix={arXiv},
   primaryClass={cs.NE}
 }
+@misc{collobert2016wav2letter,
+  title={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System},
+  author={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve},
+  year={2016},
+  eprint={1609.03193},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{kalchbrenner2018efficient,
+  title={Efficient Neural Audio Synthesis},
+  author={Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
+  year={2018},
+  eprint={1802.08435},
+  archivePrefix={arXiv},
+  primaryClass={cs.SD}
+}
 @article{Luo_2019,
   title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
   volume={27},
diff --git a/torchaudio/models/wav2letter.py b/torchaudio/models/wav2letter.py
index 20a665a784..c5ef120de2 100644
--- a/torchaudio/models/wav2letter.py
+++ b/torchaudio/models/wav2letter.py
@@ -7,9 +7,7 @@

 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from the `Wav2Letter an End-to-End ConvNet-based Speech Recognition System`_.
-
-    .. _Wav2Letter an End-to-End ConvNet-based Speech Recognition System: https://arxiv.org/abs/1609.03193
+    r"""Wav2Letter model architecture from the [:footcite:`collobert2016wav2letter`].

     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`

@@ -18,6 +16,8 @@ class Wav2Letter(nn.Module):
         input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
             or ``mfcc`` (Default: ``waveform``).
         num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
+
+    .. footbibliography::
     """
     def __init__(self, num_classes: int = 40,
diff --git a/torchaudio/models/wavernn.py b/torchaudio/models/wavernn.py
index eedbd3c589..2db253fd9a 100644
--- a/torchaudio/models/wavernn.py
+++ b/torchaudio/models/wavernn.py
@@ -14,9 +14,7 @@

 class ResBlock(nn.Module):
-    r"""ResNet block based on "Deep Residual Learning for Image Recognition"
-
-    The paper link is https://arxiv.org/pdf/1512.03385.pdf.
+    r"""ResNet block based on [:footcite:`kalchbrenner2018efficient`].

     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)
@@ -204,8 +202,7 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:

 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord `_.

-    The original implementation was introduced in
-    `"Efficient Neural Audio Synthesis" `_.
+    The original implementation was introduced in [:footcite:`kalchbrenner2018efficient`].
     The input channels of waveform and spectrogram have to be 1. The product of
     `upsample_scales` must equal `hop_length`.

@@ -228,6 +225,8 @@ class WaveRNN(nn.Module):
         >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
         >>> output = wavernn(waveform, specgram)
         >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
+
+    .. footbibliography::
     """

     def __init__(self,

From dd735886b484ad3da906ca74fc97aff802143b6e Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 16:22:59 -0400
Subject: [PATCH 08/11] Fix

---
 torchaudio/transforms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
index 8b4add73be..ab7a7789eb 100644
--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -1071,8 +1071,8 @@ class Vad(torch.nn.Module):
         lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used
             in the detector algorithm. (Default: 2000.0)

-    References:
-        http://sox.sourceforge.net/sox.html
+    Reference:
+       - http://sox.sourceforge.net/sox.html
     """

     def __init__(self,

From 131d2340f76fe926314e68c727ece0bb72768a3f Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 16:36:56 -0400
Subject: [PATCH 09/11] tweak

---
 docs/source/models.rst | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index f3bbfd55e3..844def08f1 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -2,7 +2,7 @@
   :class: hidden-section

 torchaudio.models
-======================
+=================

 .. currentmodule:: torchaudio.models

@@ -10,7 +10,7 @@ The models subpackage contains definitions of models for addressing common audio

 :hidden:`ConvTasNet`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: ConvTasNet

@@ -18,7 +18,7 @@

 :hidden:`DeepSpeech`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: DeepSpeech

@@ -26,7 +26,7 @@

 :hidden:`Wav2Letter`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: Wav2Letter

 Wav2Vec2Model
 -------------

 .. autoclass:: Wav2Vec2Model

   .. automethod:: forward

+Factory Functions
+-----------------
+
 wav2vec2_base
 -------------

@@ -63,6 +66,9 @@ wav2vec2_large_lv60k

 .. currentmodule:: torchaudio.models.wav2vec2.utils

+Utility Functions
+-----------------
+
 import_huggingface_model
 ------------------------

 .. currentmodule:: torchaudio.models

 :hidden:`WaveRNN`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~

 .. autoclass:: WaveRNN
From 1bcbf848fad2657d70d2bee8114ac54cadc48a44 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Wed, 2 Jun 2021 17:27:18 -0400
Subject: [PATCH 10/11] Place references per page

---
 docs/source/conf.py                 |  1 -
 docs/source/functional.rst          |  5 +++++
 docs/source/models.rst              | 26 +++++++++++++---------
 docs/source/rnnt_loss.rst           |  9 ++++++--
 docs/source/transforms.rst          | 34 +++++++++++++++++------------
 torchaudio/functional/functional.py |  8 ++-----
 torchaudio/models/conv_tasnet.py    |  5 +----
 torchaudio/models/deepspeech.py     |  4 +---
 torchaudio/models/wav2letter.py     |  4 +---
 torchaudio/models/wav2vec2/model.py |  9 --------
 torchaudio/models/wavernn.py        |  6 ++---
 torchaudio/prototype/rnnt_loss.py   |  7 ++----
 torchaudio/transforms.py            |  5 +----
 13 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9c0a8abafd..a878157610 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -57,7 +57,6 @@
 '''

 bibtex_bibfiles = ['refs.bib']
-bibtex_default_style = 'unsrtalpha'

 napoleon_use_ivar = True
 napoleon_numpy_docstring = False
 napoleon_google_docstring = True
diff --git a/docs/source/functional.rst b/docs/source/functional.rst
index 4e318725c7..37593da3fc 100644
--- a/docs/source/functional.rst
+++ b/docs/source/functional.rst
@@ -235,3 +235,8 @@ vad
 ---------------------------

 .. autofunction:: spectral_centroid
+
+References
+~~~~~~~~~~
+
+.. footbibliography::
diff --git a/docs/source/models.rst b/docs/source/models.rst
index 844def08f1..39e162baa0 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -2,7 +2,7 @@
   :class: hidden-section

 torchaudio.models
-======================
+=================

 .. currentmodule:: torchaudio.models

@@ -9,24 +9,24 @@
 The models subpackage contains definitions of models for addressing common audio tasks.

-:hidden:`ConvTasNet`
-~~~~~~~~~~~~~~~~~~~~
+ConvTasNet
+~~~~~~~~~~

 .. autoclass:: ConvTasNet

   .. automethod:: forward

-:hidden:`DeepSpeech`
-~~~~~~~~~~~~~~~~~~~~
+DeepSpeech
+~~~~~~~~~~

 .. autoclass:: DeepSpeech

   .. automethod:: forward

-:hidden:`Wav2Letter`
-~~~~~~~~~~~~~~~~~~~~
+Wav2Letter
+~~~~~~~~~~

 .. autoclass:: Wav2Letter

@@ -34,8 +34,8 @@

-:hidden:`Wav2Vec2.0`
-~~~~~~~~~~~~~~~~~~~~
+Wav2Vec2.0
+~~~~~~~~~~

 Wav2Vec2Model
 -------------

@@ -81,9 +81,15 @@ import_fairseq_model

 .. currentmodule:: torchaudio.models

-:hidden:`WaveRNN`
-~~~~~~~~~~~~~~~~~
+WaveRNN
+~~~~~~~

 .. autoclass:: WaveRNN

   .. automethod:: forward
+
+References
+~~~~~~~~~~
+
+.. footbibliography::
+
diff --git a/docs/source/rnnt_loss.rst b/docs/source/rnnt_loss.rst
index f3b07e6c28..6e8ac81909 100644
--- a/docs/source/rnnt_loss.rst
+++ b/docs/source/rnnt_loss.rst
@@ -11,13 +11,18 @@
 The RNN transducer loss is a prototype feature, see `here `_ to learn more about the nomenclature. It is only available within the nightlies, and also needs to be imported explicitly using: :code:`from torchaudio.prototype.rnnt_loss import rnnt_loss, RNNTLoss`.

 rnnt_loss
----------
+~~~~~~~~~

 .. autofunction:: rnnt_loss

 RNNTLoss
---------
+~~~~~~~~

 .. autoclass:: RNNTLoss

   .. automethod:: forward
+
+References
+~~~~~~~~~~
+
+.. footbibliography::
diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 8269193e22..787673f8df 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -53,99 +53,105 @@ Transforms are common audio transforms. They can be chained together using :clas
   .. automethod:: forward

 :hidden:`MFCC`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~

 .. autoclass:: MFCC

   .. automethod:: forward
 :hidden:`MuLawEncoding`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: MuLawEncoding

   .. automethod:: forward

 :hidden:`MuLawDecoding`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: MuLawDecoding

   .. automethod:: forward

 :hidden:`Resample`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~

 .. autoclass:: Resample

   .. automethod:: forward

 :hidden:`ComplexNorm`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: ComplexNorm

   .. automethod:: forward

 :hidden:`ComputeDeltas`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: ComputeDeltas

   .. automethod:: forward

 :hidden:`TimeStretch`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: TimeStretch

   .. automethod:: forward

 :hidden:`Fade`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~

 .. autoclass:: Fade

   .. automethod:: forward

 :hidden:`FrequencyMasking`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: FrequencyMasking

   .. automethod:: forward

 :hidden:`TimeMasking`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: TimeMasking

   .. automethod:: forward

 :hidden:`Vol`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~

 .. autoclass:: Vol

   .. automethod:: forward

 :hidden:`SlidingWindowCmn`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: SlidingWindowCmn

   .. automethod:: forward

 :hidden:`SpectralCentroid`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: SpectralCentroid

   .. automethod:: forward

 :hidden:`Vad`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~

 .. autoclass:: Vad

   .. automethod:: forward
+
+
+References
+~~~~~~~~~~
+
+.. footbibliography::
diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index ecec626fd2..cb35c2dfe6 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -158,7 +158,7 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.

     Implementation ported from
-    [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`] and [:footcite:`1172092`].
+    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.

     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)
@@ -179,8 +179,6 @@ def griffinlim(

     Returns:
         torch.Tensor: waveform of (..., time), where time equals the ``length`` parameter if given.
-
-    .. footbibliography::
     """
     assert momentum < 1, 'momentum={} > 1 can be unstable'.format(momentum)
     assert momentum >= 0, 'momentum={} < 0'.format(momentum)
@@ -1206,7 +1204,7 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in [:footcite:`6854049`].
+    """Extract pitch based on method described in :footcite:`6854049`.

     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.

@@ -1265,8 +1263,6 @@ def compute_kaldi_pitch(
     Returns:
         Tensor: Pitch feature. Shape: ``(batch, frames 2)`` where the last dimension
         corresponds to pitch and NCCF.
-
-    .. footbibliography::
     """
     shape = waveform.shape
     waveform = waveform.reshape(-1, shape[-1])
diff --git a/torchaudio/models/conv_tasnet.py b/torchaudio/models/conv_tasnet.py
index 4f9701f915..c9a88191ae 100644
--- a/torchaudio/models/conv_tasnet.py
+++ b/torchaudio/models/conv_tasnet.py
@@ -164,7 +164,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:

 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network [:footcite:`Luo_2019`].
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.

     Args:
         num_sources (int): The number of sources to split.
@@ -178,9 +178,6 @@ class ConvTasNet(torch.nn.Module):
     Note:
         This implementation corresponds to the "non-causal" setting in the paper.
-
-    .. footbibliography::
-
     """

     def __init__(
diff --git a/torchaudio/models/deepspeech.py b/torchaudio/models/deepspeech.py
index 639d018927..e325275278 100644
--- a/torchaudio/models/deepspeech.py
+++ b/torchaudio/models/deepspeech.py
@@ -31,14 +31,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from [:footcite:`hannun2014deep`].
+    DeepSpeech model architecture from :footcite:`hannun2014deep`.

     Args:
         n_feature: Number of input features
         n_hidden: Internal hidden unit size.
         n_class: Number of output classes
-
-    .. footbibliography::
     """

     def __init__(
diff --git a/torchaudio/models/wav2letter.py b/torchaudio/models/wav2letter.py
index c5ef120de2..f0d83005fa 100644
--- a/torchaudio/models/wav2letter.py
+++ b/torchaudio/models/wav2letter.py
@@ -7,7 +7,7 @@

 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from the [:footcite:`collobert2016wav2letter`].
+    r"""Wav2Letter model architecture from the :footcite:`collobert2016wav2letter`.

     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`

@@ -16,8 +16,6 @@ class Wav2Letter(nn.Module):
         input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
             or ``mfcc`` (Default: ``waveform``).
         num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
-
-    .. footbibliography::
     """
     def __init__(self, num_classes: int = 40,
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
index 05ed0e13fb..9c6bb06599 100644
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -19,9 +19,6 @@ class Wav2Vec2Model(Module):
         encoder (torch.nn.Module):
             Encoder that converts the audio features into the sequence of probability
             distribution (in negative log-likelihood) over labels.
-
-    .. footbibliography::
-
     """
     def __init__(
         self,
@@ -145,8 +142,6 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
         >>> # Session 2 - Load model and the parameters
         >>> model = wav2vec2_base(num_out=32)
         >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
-
-    .. footbibliography::
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -189,8 +184,6 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
         >>> # Session 2 - Load model and the parameters
         >>> model = wav2vec2_large(num_out=32)
         >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
-
-    .. footbibliography::
     """
     return _get_model(
         extractor_mode="group_norm",
@@ -233,8 +226,6 @@ def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
         >>> # Session 2 - Load model and the parameters
         >>> model = wav2vec2_large_lv60k(num_out=32)
         >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt"))
-
-    .. footbibliography::
diff --git a/torchaudio/models/wavernn.py b/torchaudio/models/wavernn.py
index 2db253fd9a..3763821ad6 100644
--- a/torchaudio/models/wavernn.py
+++ b/torchaudio/models/wavernn.py
@@ -14,7 +14,7 @@
 
 
 class ResBlock(nn.Module):
-    r"""ResNet block based on [:footcite:`kalchbrenner2018efficient`].
+    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
 
     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)
@@ -202,7 +202,7 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
 
 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord `_.
 
-    The original implementation was introduced in [:footcite:`kalchbrenner2018efficient`].
+    The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
     The input channels of waveform and spectrogram have to be 1. The product of
     `upsample_scales` must equal `hop_length`.
@@ -225,8 +225,6 @@
         >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
-
-    .. footbibliography::
     """
 
     def __init__(self,
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py
index d909b7561d..0765ea2dcd 100644
--- a/torchaudio/prototype/rnnt_loss.py
+++ b/torchaudio/prototype/rnnt_loss.py
@@ -17,7 +17,7 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss. [:footcite:`graves2012sequence`]
+    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
@@ -33,9 +33,6 @@
         runtime_check (bool): whether to do sanity check during runtime. (Default: ``False``)
         fused_log_softmax (bool): set to False if calling log_softmax outside loss (Default: ``True``)
         reuse_logits_for_grads (bool): whether to save memory by reusing logits memory for grads (Default: ``True``)
-
-    .. footbibliography::
-
     """
     if not fused_log_softmax:
         logits = torch.nn.functional.log_softmax(logits, dim=-1)
@@ -60,7 +57,7 @@
 
 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer Loss. [:footcite:`graves2012sequence`]
+    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
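
The RNN-T loss above consumes a four-dimensional lattice of joint-network outputs. A usage sketch follows, assuming the conventional RNN-T tensor layout and a positional argument order of ``(logits, targets, logit_lengths, target_lengths)`` — only the keyword flags are documented in this patch, so that interface, the constructor defaults, and the reduction behavior are all assumptions::

    import torch
    from torchaudio.prototype.rnnt_loss import RNNTLoss

    batch, max_time, max_target, num_classes = 2, 50, 10, 29
    # Joint-network output: one score vector per (time frame, emitted label)
    # cell; the label axis has max_target + 1 slots for the start state.
    logits = torch.randn(batch, max_time, max_target + 1, num_classes,
                         requires_grad=True)
    # Keep label ids away from the extreme classes so they cannot collide
    # with whatever blank index the loss defaults to (an assumption).
    targets = torch.randint(1, num_classes - 1, (batch, max_target),
                            dtype=torch.int32)
    logit_lengths = torch.tensor([50, 40], dtype=torch.int32)   # valid frames
    target_lengths = torch.tensor([10, 8], dtype=torch.int32)   # valid labels

    loss = RNNTLoss()(logits, targets, logit_lengths, target_lengths)
    loss.mean().backward()  # .mean() in case the loss is returned per sequence
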
diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
index ab7a7789eb..4d4c2d3f6c 100644
--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -127,7 +127,7 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    [:footcite:`brian_mcfee-proc-scipy-2015`], [:footcite:`6701851`] and [:footcite:`1172092`].
+    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
 
     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
@@ -144,9 +144,6 @@
             Values near 1 can lead to faster convergence, but above 1 may not converge.
             (Default: ``0.99``)
         length (int, optional): Array length of the expected output. (Default: ``None``)
         rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``)
-
-    .. footbibliography::
-
     """
     __constants__ = ['n_fft', 'n_iter', 'win_length', 'hop_length', 'power',
                      'length', 'momentum', 'rand_init']

From 4fdb1deab22d78d3f4843c241c4657d3ab3e5655 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Thu, 3 Jun 2021 09:36:41 -0400
Subject: [PATCH 11/11] Address review comment

---
 torchaudio/models/wav2letter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/models/wav2letter.py b/torchaudio/models/wav2letter.py
index f0d83005fa..c47c6f4aed 100644
--- a/torchaudio/models/wav2letter.py
+++ b/torchaudio/models/wav2letter.py
@@ -7,7 +7,7 @@
 
 
 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from the :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
 
     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
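
Finally, the ``GriffinLim`` transform whose docstring is touched in the ``torchaudio/transforms.py`` hunk above is normally exercised as a round trip with ``Spectrogram``. A short sketch (the 16 kHz dummy signal and the shape comments are illustrative)::

    import torch
    import torchaudio.transforms as T

    waveform = torch.randn(1, 16000)                   # 1 s of dummy mono audio
    spectrogram = T.Spectrogram(n_fft=400, power=2.0)  # magnitude-only analysis
    griffin_lim = T.GriffinLim(n_fft=400, power=2.0, momentum=0.99)

    specgram = spectrogram(waveform)  # (channel, n_fft // 2 + 1, frames)
    restored = griffin_lim(specgram)  # (channel, time), phase estimated iteratively

Matching ``power`` between the two transforms matters: ``GriffinLim`` interprets its input according to the same exponent that ``Spectrogram`` applied.
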