diff --git a/docs/requirements.txt b/docs/requirements.txt index 99a7811de6..fa5ea2b7e6 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ sphinx==2.4.4 -e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinxcontrib.katex +sphinxcontrib.bibtex matplotlib diff --git a/docs/source/_static/css/override.css b/docs/source/_static/css/override.css new file mode 100644 index 0000000000..a97d8c91ae --- /dev/null +++ b/docs/source/_static/css/override.css @@ -0,0 +1,8 @@ +/* Fix for bibtex reference */ +dl.footnote.brackets > dt.label > span.brackets > a.fn-backref { + position: inherit +} +/* Fix for bibtex back reference */ +dl.footnote.brackets > dt.label > span.fn-backref > a { + position: inherit +} diff --git a/docs/source/_static/css/pytorch_theme.css b/docs/source/_static/css/pytorch_theme.css deleted file mode 100644 index 0e54497643..0000000000 --- a/docs/source/_static/css/pytorch_theme.css +++ /dev/null @@ -1,118 +0,0 @@ -body { - font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; -} - -/* Default header fonts are ugly */ -h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption { - font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; -} - -/* Use white for docs background */ -.wy-side-nav-search { - background-color: #fff; -} - -.wy-nav-content-wrap, .wy-menu li.current > a { - background-color: #fff; -} - -@media screen and (min-width: 1400px) { - .wy-nav-content-wrap { - background-color: rgba(0, 0, 0, 0.0470588); - } - - .wy-nav-content { - background-color: #fff; - } -} - -/* Fixes for mobile */ -.wy-nav-top { - background-color: #fff; - background-image: url('../img/pytorch-logo-dark.svg'); - background-repeat: no-repeat; - background-position: center; - padding: 0; - margin: 0.4045em 0.809em; - color: #333; -} - -.wy-nav-top > a { - display: none; -} - -@media screen and (max-width: 768px) { - .wy-side-nav-search>a img.logo { - height: 60px; - } -} - -/* This is needed to ensure that logo above search scales properly */ -.wy-side-nav-search a { - display: block; -} - -/* This ensures that multiple constructors will remain in separate lines. 
*/ -.rst-content dl:not(.docutils) dt { - display: table; -} - -/* Use our red for literals (it's very similar to the original color) */ -.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { - color: #F05732; -} - -.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref, -.rst-content code.xref, a .rst-content tt, a .rst-content code { - color: #404040; -} - -/* Change link colors (except for the menu) */ - -a { - color: #F05732; -} - -a:hover { - color: #F05732; -} - - -a:visited { - color: #D44D2C; -} - -.wy-menu a { - color: #b3b3b3; -} - -.wy-menu a:hover { - color: #b3b3b3; -} - -/* Default footer text is quite big */ -footer { - font-size: 80%; -} - -footer .rst-footer-buttons { - font-size: 125%; /* revert footer settings - 1/80% = 125% */ -} - -footer p { - font-size: 100%; -} - -/* For hidden headers that appear in TOC tree */ -/* see http://stackoverflow.com/a/32363545/3343043 */ -.rst-content .hidden-section { - display: none; -} - -nav .hidden-section { - display: inherit; -} - -.wy-side-nav-search>div.version { - color: #000; -} diff --git a/docs/source/conf.py b/docs/source/conf.py index f78858e381..a878157610 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -41,6 +41,7 @@ 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinxcontrib.katex', + 'sphinxcontrib.bibtex', ] # katex options @@ -55,6 +56,8 @@ ] ''' +bibtex_bibfiles = ['refs.bib'] + napoleon_use_ivar = True napoleon_numpy_docstring = False napoleon_google_docstring = True @@ -133,22 +136,10 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] - - -def setup(app): - # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value - # and can be moved outside of this function (and the setup(app) function - # can be deleted). - html_css_files = [ - 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css' - ] - - # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is - # `add_stylesheet` (deprecated in 1.8). - add_css = getattr(app, 'add_css_file', app.add_stylesheet) - for css_file in html_css_files: - add_css(css_file) - +html_css_files = [ + 'css/override.css', + 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css' +] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/docs/source/functional.rst b/docs/source/functional.rst index 4e318725c7..37593da3fc 100644 --- a/docs/source/functional.rst +++ b/docs/source/functional.rst @@ -235,3 +235,8 @@ vad --------------------------- .. autofunction:: spectral_centroid + +References +~~~~~~~~~~ + +.. footbibliography:: diff --git a/docs/source/models.rst b/docs/source/models.rst index 3d7d08cc2b..39e162baa0 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -2,31 +2,31 @@ :class: hidden-section torchaudio.models -====================== +================= .. currentmodule:: torchaudio.models The models subpackage contains definitions of models for addressing common audio tasks. -:hidden:`ConvTasNet` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +ConvTasNet +~~~~~~~~~~ .. autoclass:: ConvTasNet .. automethod:: forward -:hidden:`DeepSpeech` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +DeepSpeech +~~~~~~~~~~ .. autoclass:: DeepSpeech .. automethod:: forward -:hidden:`Wav2Letter` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Wav2Letter +~~~~~~~~~~ .. 
autoclass:: Wav2Letter @@ -34,11 +34,11 @@ The models subpackage contains definitions of models for addressing common audio -:hidden:`Wav2Vec2.0` -~~~~~~~~~~~~~~~~~~~~ +Wav2Vec2.0 +~~~~~~~~~~ -Model ------ +Wav2Vec2Model +------------- .. autoclass:: Wav2Vec2Model @@ -49,10 +49,19 @@ Model Factory Functions ----------------- +wav2vec2_base +------------- + .. autofunction:: wav2vec2_base +wav2vec2_large +-------------- + .. autofunction:: wav2vec2_large +wav2vec2_large_lv60k +-------------------- + .. autofunction:: wav2vec2_large_lv60k .. currentmodule:: torchaudio.models.wav2vec2.utils @@ -60,15 +69,27 @@ Factory Functions Utility Functions ----------------- +import_huggingface_model +------------------------ + .. autofunction:: import_huggingface_model +import_fairseq_model +-------------------- + .. autofunction:: import_fairseq_model .. currentmodule:: torchaudio.models -:hidden:`WaveRNN` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +WaveRNN +~~~~~~~ .. autoclass:: WaveRNN .. automethod:: forward + +References +~~~~~~~~~~ + +.. footbibliography:: + diff --git a/docs/source/refs.bib b/docs/source/refs.bib new file mode 100644 index 0000000000..c6ddb3b61a --- /dev/null +++ b/docs/source/refs.bib @@ -0,0 +1,90 @@ +@misc{baevski2020wav2vec, + title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations}, + author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli}, + year={2020}, + eprint={2006.11477}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{hannun2014deep, + title={Deep Speech: Scaling up end-to-end speech recognition}, + author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng}, + year={2014}, + eprint={1412.5567}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{graves2012sequence, + title={Sequence Transduction with Recurrent Neural Networks}, + author={Alex Graves}, + year={2012}, + eprint={1211.3711}, + archivePrefix={arXiv}, + primaryClass={cs.NE} +} +@misc{collobert2016wav2letter, + title={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System}, + author={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve}, + year={2016}, + eprint={1609.03193}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} +@misc{kalchbrenner2018efficient, + title={Efficient Neural Audio Synthesis}, + author={Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu}, + year={2018}, + eprint={1802.08435}, + archivePrefix={arXiv}, + primaryClass={cs.SD} +} +@article{Luo_2019, + title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation}, + volume={27}, + ISSN={2329-9304}, + url={http://dx.doi.org/10.1109/TASLP.2019.2915167}, + DOI={10.1109/taslp.2019.2915167}, + number={8}, + journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Luo, Yi and Mesgarani, Nima}, + year={2019}, + month={Aug}, + pages={1256–1266} +} +@InProceedings{ brian_mcfee-proc-scipy-2015, + author = { {B}rian {M}c{F}ee and {C}olin {R}affel and {D}awen {L}iang and {D}aniel {P}.{W}. 
{E}llis and {M}att {M}c{V}icar and {E}ric {B}attenberg and {O}riol {N}ieto }, + title = { librosa: {A}udio and {M}usic {S}ignal {A}nalysis in {P}ython }, + booktitle = { {P}roceedings of the 14th {P}ython in {S}cience {C}onference }, + pages = { 18 - 24 }, + year = { 2015 }, + editor = { {K}athryn {H}uff and {J}ames {B}ergstra }, + doi = { 10.25080/Majora-7b98e3ed-003 } +} +@INPROCEEDINGS{6701851, + author={Perraudin, Nathanaël and Balazs, Peter and Søndergaard, Peter L.}, + booktitle={2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics}, + title={A fast Griffin-Lim algorithm}, + year={2013}, + volume={}, + number={}, + pages={1-4}, + doi={10.1109/WASPAA.2013.6701851}} +@INPROCEEDINGS{1172092, + author={Griffin, D. and Jae Lim}, + booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing}, + title={Signal estimation from modified short-time Fourier transform}, + year={1983}, + volume={8}, + number={}, + pages={804-807}, + doi={10.1109/ICASSP.1983.1172092}} +@INPROCEEDINGS{6854049, + author={Ghahremani, Pegah and BabaAli, Bagher and Povey, Daniel and Riedhammer, Korbinian and Trmal, Jan and Khudanpur, Sanjeev}, + booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + title={A pitch extraction algorithm tuned for automatic speech recognition}, + year={2014}, + volume={}, + number={}, + pages={2494-2498}, + doi={10.1109/ICASSP.2014.6854049}} diff --git a/docs/source/rnnt_loss.rst b/docs/source/rnnt_loss.rst index 0c3b075d65..6e8ac81909 100644 --- a/docs/source/rnnt_loss.rst +++ b/docs/source/rnnt_loss.rst @@ -2,7 +2,7 @@ :class: hidden-section torchaudio.prototype.rnnt_loss -=============================== +============================== .. currentmodule:: torchaudio.prototype.rnnt_loss @@ -11,13 +11,18 @@ torchaudio.prototype.rnnt_loss The RNN transducer loss is a prototype feature, see `here `_ to learn more about the nomenclature. It is only available within the nightlies, and also needs to be imported explicitly using: :code:`from torchaudio.prototype.rnnt_loss import rnnt_loss, RNNTLoss`. rnnt_loss ---------- +~~~~~~~~~ .. autofunction:: rnnt_loss -:hidden:`RNNTLoss` -~~~~~~~~~~~~~~~~~~ +RNNTLoss +~~~~~~~~ .. autoclass:: RNNTLoss .. automethod:: forward + +References +~~~~~~~~~~ + +.. footbibliography:: diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 8269193e22..787673f8df 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -53,99 +53,105 @@ Transforms are common audio transforms. They can be chained together using :clas .. automethod:: forward :hidden:`MFCC` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~ .. autoclass:: MFCC .. automethod:: forward :hidden:`MuLawEncoding` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: MuLawEncoding .. automethod:: forward :hidden:`MuLawDecoding` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: MuLawDecoding .. automethod:: forward :hidden:`Resample` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~ .. autoclass:: Resample .. automethod:: forward :hidden:`ComplexNorm` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ComplexNorm .. automethod:: forward :hidden:`ComputeDeltas` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ComputeDeltas .. automethod:: forward :hidden:`TimeStretch` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: TimeStretch .. 
automethod:: forward :hidden:`Fade` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~ .. autoclass:: Fade .. automethod:: forward :hidden:`FrequencyMasking` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: FrequencyMasking .. automethod:: forward :hidden:`TimeMasking` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: TimeMasking .. automethod:: forward :hidden:`Vol` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~ .. autoclass:: Vol .. automethod:: forward :hidden:`SlidingWindowCmn` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: SlidingWindowCmn .. automethod:: forward :hidden:`SpectralCentroid` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: SpectralCentroid .. automethod:: forward :hidden:`Vad` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~ .. autoclass:: Vad .. automethod:: forward + + +References +~~~~~~~~~~ + +.. footbibliography:: diff --git a/torchaudio/backend/soundfile_backend.py b/torchaudio/backend/soundfile_backend.py index 8ad28a6fcf..afce945026 100644 --- a/torchaudio/backend/soundfile_backend.py +++ b/torchaudio/backend/soundfile_backend.py @@ -85,18 +85,20 @@ def _get_encoding(format: str, subtype: str): def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: """Get signal information of an audio file. + Note: + The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatibility. + Args: filepath (path-like object or file-like object): Source of audio data. - Note: - * This argument is intentionally annotated as ``str`` only, - for the consistency with "sox_io" backend, which has a restriction - on type annotation due to TorchScript compiler compatiblity. format (str, optional): Not used. PySoundFile does not accept format hint. Returns: AudioMetaData: meta data of the given audio. + """ sinfo = soundfile.info(filepath) return AudioMetaData( @@ -159,13 +161,14 @@ def load( For these formats, this function always returns ``float32`` Tensor with values normalized to ``[-1.0, 1.0]``. + Note: + The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatibility. + Args: filepath (path-like object or file-like object): Source of audio data. - Note: - * This argument is intentionally annotated as ``str`` only, - for the consistency with "sox_io" backend, which has a restriction - on type annotation due to TorchScript compiler compatiblity. frame_offset (int): Number of frames to skip before start reading data. num_frames (int): @@ -324,11 +327,13 @@ def save( * OGG/VORBIS * SPHERE + Note: + The ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + a ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatibility. + Args: filepath (str or pathlib.Path): Path to audio file. - This functionalso handles ``pathlib.Path`` objects, but is annotated as ``str`` - for the consistency with "sox_io" backend, which has a restriction on type annotation - for TorchScript compiler compatiblity. src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate channels_first (bool): If ``True``, the given tensor is interpreted as ``[channel, time]``, diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 85abe81339..68269d9b34 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -80,9 +80,9 @@ def allpass_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -123,9 +123,9 @@ def band_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -171,9 +171,9 @@ def bandpass_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -207,9 +207,9 @@ def bandreject_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -247,9 +247,9 @@ def bass_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -325,8 +325,8 @@ def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html """ if not 0 <= enhancement_amount <= 100: @@ -358,8 +358,8 @@ def dcshift( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html """ output_waveform = waveform limiter_threshold = 0.0 @@ -405,9 +405,9 @@ def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor: Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ if sample_rate == 44100: @@ -680,10 +680,12 @@ def flanger( Returns: Tensor: Waveform of dimension of `(..., channel, time)` - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html - Scott Lehman, Effects Explained, + - Scott Lehman, `Effects Explained`_, + + .. 
_Effects Explained: https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html """ @@ -1027,8 +1029,8 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor: Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html """ actual_shape = waveform.shape device, dtype = waveform.device, waveform.dtype @@ -1096,9 +1098,11 @@ def phaser( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - Scott Lehman, Effects Explained, + Reference: + - http://sox.sourceforge.net/sox.html + - Scott Lehman, `Effects Explained`_. + + .. _Effects Explained: https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html """ actual_shape = waveform.shape @@ -1166,9 +1170,9 @@ def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor: Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ if sample_rate == 44100: @@ -1234,9 +1238,9 @@ def treble_biquad( Returns: Tensor: Waveform of dimension of `(..., time)` - References: - http://sox.sourceforge.net/sox.html - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF """ dtype = waveform.dtype device = waveform.device @@ -1420,8 +1424,8 @@ def vad( Returns: Tensor: Tensor of audio of dimension (..., time). - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html """ if waveform.ndim > 2: diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py index 4ece68b4f3..cb35c2dfe6 100644 --- a/torchaudio/functional/functional.py +++ b/torchaudio/functional/functional.py @@ -156,18 +156,9 @@ def griffinlim( rand_init: bool ) -> Tensor: r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. - Implementation ported from `librosa`. - - * [1] McFee, Brian, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg, and Oriol Nieto. - "librosa: Audio and music signal analysis in python." - In Proceedings of the 14th python in science conference, pp. 18-25. 2015. - * [2] Perraudin, N., Balazs, P., & Søndergaard, P. L. - "A fast Griffin-Lim algorithm," - IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4), - Oct. 2013. - * [3] D. W. Griffin and J. S. Lim, - "Signal estimation from modified short-time Fourier transform," - IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. + + Implementation ported from + :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`. Args: specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames) @@ -1215,7 +1206,7 @@ def compute_kaldi_pitch( recompute_frame: int = 500, snip_edges: bool = True, ) -> torch.Tensor: - """Extract pitch based on method described in [1]. + """Extract pitch based on method described in :footcite:`6854049`. This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi. @@ -1274,15 +1265,6 @@ def compute_kaldi_pitch( Returns: Tensor: Pitch feature. 
Shape: ``(batch, frames, 2)`` where the last dimension corresponds to pitch and NCCF. - - Reference: - - A pitch extraction algorithm tuned for automatic speech recognition - - P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. Khudanpur - - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), - - Florence, 2014, pp. 2494-2498, doi: 10.1109/ICASSP.2014.6854049. """ shape = waveform.shape waveform = waveform.reshape(-1, shape[-1]) diff --git a/torchaudio/models/conv_tasnet.py b/torchaudio/models/conv_tasnet.py index eb5f121fae..c9a88191ae 100644 --- a/torchaudio/models/conv_tasnet.py +++ b/torchaudio/models/conv_tasnet.py @@ -21,13 +21,6 @@ class ConvBlock(torch.nn.Module): Note: This implementation corresponds to the "non-causal" setting in the paper. - - Reference: - - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation - - Luo, Yi and Mesgarani, Nima - - https://arxiv.org/abs/1809.07454 """ def __init__( @@ -98,11 +91,6 @@ class MaskGenerator(torch.nn.Module): Note: This implementation corresponds to the "non-causal" setting in the paper. - - References: - - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation - Luo, Yi and Mesgarani, Nima - https://arxiv.org/abs/1809.07454 """ def __init__( @@ -176,7 +164,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: class ConvTasNet(torch.nn.Module): - """Conv-TasNet: a fully-convolutional time-domain audio separation network + """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`. Args: num_sources (int): The number of sources to split. @@ -190,13 +178,6 @@ class ConvTasNet(torch.nn.Module): Note: This implementation corresponds to the "non-causal" setting in the paper. - - Reference: - - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation - - Luo, Yi and Mesgarani, Nima - - https://arxiv.org/abs/1809.07454 """ def __init__( diff --git a/torchaudio/models/deepspeech.py b/torchaudio/models/deepspeech.py index 477993e411..e325275278 100644 --- a/torchaudio/models/deepspeech.py +++ b/torchaudio/models/deepspeech.py @@ -31,9 +31,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class DeepSpeech(torch.nn.Module): """ - DeepSpeech model architecture from - `"Deep Speech: Scaling up end-to-end speech recognition"` - paper. + DeepSpeech model architecture from :footcite:`hannun2014deep`. Args: n_feature: Number of input features diff --git a/torchaudio/models/wav2letter.py b/torchaudio/models/wav2letter.py index 20a665a784..c47c6f4aed 100644 --- a/torchaudio/models/wav2letter.py +++ b/torchaudio/models/wav2letter.py @@ -7,9 +7,7 @@ class Wav2Letter(nn.Module): - r"""Wav2Letter model architecture from the `Wav2Letter an End-to-End ConvNet-based Speech Recognition System`_. - - .. _Wav2Letter an End-to-End ConvNet-based Speech Recognition System: https://arxiv.org/abs/1609.03193 + r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`. :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}` diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py index 16ba3d0d8b..9c6bb06599 100644 --- a/torchaudio/models/wav2vec2/model.py +++ b/torchaudio/models/wav2vec2/model.py @@ -7,7 +7,7 @@ class Wav2Vec2Model(Module): - """Model used in wav2vec2.0 paper. [1] + """Encoder model used in [:footcite:`baevski2020wav2vec`]. Note: To build the model, please use one of the factory functions.
@@ -19,13 +19,6 @@ class Wav2Vec2Model(Module): encoder (torch.nn.Module): Encoder that converts the audio features into the sequence of probability distribution (in negative log-likelihood) over labels. - - Reference: - - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations - - Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli - - https://arxiv.org/abs/2006.11477 """ def __init__( self, @@ -129,7 +122,7 @@ def _get_model( def wav2vec2_base(num_out: int) -> Wav2Vec2Model: - """Build wav2vec2.0 model with **Base** configuration. [1] + """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`]. Args: num_out: int The number of output labels. @@ -138,12 +131,17 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model: Returns: Wav2Vec2Model: The resulting model. - Reference: - - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations - - Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli - - https://arxiv.org/abs/2006.11477 + Example - Reload fine-tuned model from Hugging Face: + >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters. + >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model + >>> + >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + >>> model = import_huggingface_model(original) + >>> torch.save(model.state_dict(), "wav2vec2-base-960h.pt") + >>> + >>> # Session 2 - Load model and the parameters + >>> model = wav2vec2_base(num_out=32) + >>> model.load_state_dict(torch.load("wav2vec2-base-960h.pt")) """ return _get_model( extractor_mode="group_norm", @@ -166,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model: def wav2vec2_large(num_out: int) -> Wav2Vec2Model: - """Build wav2vec2.0 model with **Large** configuration. [1] + """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`]. Args: num_out: int The number of output labels. @@ -175,12 +173,17 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model: Returns: Wav2Vec2Model: The resulting model. - Reference: - - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations - - Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli - - https://arxiv.org/abs/2006.11477 + Example - Reload fine-tuned model from Hugging Face: + >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters. + >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model + >>> + >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") + >>> model = import_huggingface_model(original) + >>> torch.save(model.state_dict(), "wav2vec2-large-960h.pt") + >>> + >>> # Session 2 - Load model and the parameters + >>> model = wav2vec2_large(num_out=32) + >>> model.load_state_dict(torch.load("wav2vec2-large-960h.pt")) """ return _get_model( extractor_mode="group_norm", @@ -203,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model: def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model: - """Build wav2vec2.0 model with **Large LV-60k** configuration. [1] + """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`]. Args: num_out: int The number of output labels. @@ -212,12 +215,17 @@ def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model: Returns: Wav2Vec2Model: The resulting model.
- Reference: - - wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations - - Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli - - https://arxiv.org/abs/2006.11477 + Example - Reload fine-tuned model from Hugging Face: + >>> # Session 1 - Convert pretrained model from Hugging Face and save the parameters. + >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model + >>> + >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self") + >>> model = import_huggingface_model(original) + >>> torch.save(model.state_dict(), "wav2vec2-large-960h-lv60-self.pt") + >>> + >>> # Session 2 - Load model and the parameters + >>> model = wav2vec2_large_lv60k(num_out=32) + >>> model.load_state_dict(torch.load("wav2vec2-large-960h-lv60-self.pt")) """ return _get_model( extractor_mode="layer_norm", diff --git a/torchaudio/models/wav2vec2/utils/import_fairseq.py b/torchaudio/models/wav2vec2/utils/import_fairseq.py index f871957f58..b7dbe9cbb6 100644 --- a/torchaudio/models/wav2vec2/utils/import_fairseq.py +++ b/torchaudio/models/wav2vec2/utils/import_fairseq.py @@ -141,6 +141,8 @@ def import_fairseq_model( Wav2Vec2Model: Imported model. Example - Loading pretrain-only model + >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model + >>> >>> # Load model using fairseq >>> model_file = 'wav2vec_small.pt' >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file]) @@ -156,6 +158,8 @@ def import_fairseq_model( >>> torch.testing.assert_allclose(features, reference) Example - Fine-tuned model + >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model + >>> >>> # Load model using fairseq >>> model_file = 'wav2vec_small_960h.pt' >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file]) diff --git a/torchaudio/models/wav2vec2/utils/import_huggingface.py b/torchaudio/models/wav2vec2/utils/import_huggingface.py index 6072d0ff83..0983cd1ad8 100644 --- a/torchaudio/models/wav2vec2/utils/import_huggingface.py +++ b/torchaudio/models/wav2vec2/utils/import_huggingface.py @@ -50,6 +50,8 @@ def import_huggingface_model(original: Module) -> Wav2Vec2Model: Wav2Vec2Model: Imported model. Example + >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model + >>> >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") >>> model = import_huggingface_model(original) >>> diff --git a/torchaudio/models/wavernn.py b/torchaudio/models/wavernn.py index eedbd3c589..3763821ad6 100644 --- a/torchaudio/models/wavernn.py +++ b/torchaudio/models/wavernn.py @@ -14,9 +14,7 @@ class ResBlock(nn.Module): - r"""ResNet block based on "Deep Residual Learning for Image Recognition" - - The paper link is https://arxiv.org/pdf/1512.03385.pdf. + r"""ResNet block based on :footcite:`kalchbrenner2018efficient`. Args: n_freq: the number of bins in a spectrogram. (Default: ``128``) @@ -204,8 +202,7 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]: class WaveRNN(nn.Module): r"""WaveRNN model based on the implementation from `fatchord `_. - The original implementation was introduced in - `"Efficient Neural Audio Synthesis" `_. + The original implementation was introduced in :footcite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1. The product of `upsample_scales` must equal `hop_length`.
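The docstring edits in this patch all follow one citation pattern: an entry lives in ``docs/source/refs.bib``, a docstring or ``.rst`` page cites it with the ``:footcite:`` role, and the page that pulls that docstring in ends with a ``References`` section containing ``.. footbibliography::``, which renders every entry cited on that page. A minimal sketch of a page wired this way is below; ``torchaudio.example`` and ``my_vocoder`` are hypothetical placeholders used only to illustrate the pattern, while ``kalchbrenner2018efficient`` is one of the keys actually defined in ``refs.bib`` above.

    .. Hypothetical page, e.g. docs/source/example.rst; the module and function
       named here are illustrative placeholders, not part of this patch.

    torchaudio.example
    ==================

    .. currentmodule:: torchaudio.example

    .. The docstring pulled in below would carry a citation such as
       :footcite:`kalchbrenner2018efficient`.

    .. autofunction:: my_vocoder

    References
    ~~~~~~~~~~

    .. footbibliography::

This is why ``functional.rst``, ``models.rst``, ``rnnt_loss.rst``, and ``transforms.rst`` each gain a trailing ``References`` / ``.. footbibliography::`` block: without it, entries cited via ``:footcite:`` on that page would have no footnote list to resolve against.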
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py index 2bce60835b..0765ea2dcd 100644 --- a/torchaudio/prototype/rnnt_loss.py +++ b/torchaudio/prototype/rnnt_loss.py @@ -17,11 +17,10 @@ def rnnt_loss( fused_log_softmax: bool = True, reuse_logits_for_grads: bool = True, ): - """ - Compute the RNN Transducer Loss. + """Compute the RNN Transducer loss from :footcite:`graves2012sequence`. - The RNN Transducer loss (`Graves 2012 `__) extends the CTC loss by defining - a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output dependencies. Args: @@ -58,11 +57,10 @@ def rnnt_loss( class RNNTLoss(torch.nn.Module): - """ - Compute the RNN Transducer Loss. + """Compute the RNN Transducer loss from :footcite:`graves2012sequence`. - The RNN Transducer loss (`Graves 2012 `__) extends the CTC loss by defining - a distribution over output sequences of all lengths, and by jointly modelling both input-output and output-output + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output dependencies. Args: diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py index 5dacd583a7..4d4c2d3f6c 100644 --- a/torchaudio/transforms.py +++ b/torchaudio/transforms.py @@ -126,7 +126,8 @@ def forward(self, waveform: Tensor) -> Tensor: class GriffinLim(torch.nn.Module): r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. - Implementation ported from ``librosa`` [1]_, [2]_, [3]_. + Implementation ported from + :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`. Args: n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) @@ -143,24 +144,6 @@ class GriffinLim(torch.nn.Module): Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``) length (int, optional): Array length of the expected output. (Default: ``None``) rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``) - - References: - .. [1] - | McFee, Brian, Colin Raffel, Dawen Liang, Daniel PW Ellis, Matt McVicar, Eric Battenberg, - and Oriol Nieto. - | "librosa: Audio and music signal analysis in python." - | In Proceedings of the 14th python in science conference, pp. 18-25. 2015. - - .. [2] - | Perraudin, N., Balazs, P., & Søndergaard, P. L. - | "A fast Griffin-Lim algorithm," - | IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4), - | Oct. 2013. - - .. [3] - | D. W. Griffin and J. S. Lim, - | "Signal estimation from modified short-time Fourier transform," - | IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. """ __constants__ = ['n_fft', 'n_iter', 'win_length', 'hop_length', 'power', 'length', 'momentum', 'rand_init'] @@ -1085,8 +1068,8 @@ class Vad(torch.nn.Module): lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used in the detector algorithm. (Default: 2000.0) - References: - http://sox.sourceforge.net/sox.html + Reference: + - http://sox.sourceforge.net/sox.html """ def __init__(self,