import torchaudio
import matplotlib.pyplot as plt

######################################################################
# Opening a file
# -----------------
#
# ``torchaudio`` also supports loading sound files in the wav and mp3 formats. We
# call the resulting raw audio signal a waveform.
#
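
# The example sound file itself is elided here; as a placeholder sketch, load
# any wav file you have on disk (the path below is hypothetical):
filename = "example.wav"  # placeholder; substitute your own audio file
waveform, sample_rate = torchaudio.load(filename)

print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))
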
plt.figure()
plt.plot(waveform.t().numpy())

######################################################################
# When you load a file in ``torchaudio``, you can optionally specify the backend to use, either
# `SoX <https://pypi.org/project/sox/>`_ or `SoundFile <https://pypi.org/project/SoundFile/>`_,
# via ``torchaudio.set_audio_backend``. These backends are loaded lazily when needed.
#
# ``torchaudio`` also makes JIT compilation optional for functions, and uses ``nn.Module`` where possible.
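#
# For example, a minimal sketch (assuming the soundfile package is installed):

torchaudio.set_audio_backend("soundfile")
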
######################################################################
# Transformations
# ---------------
#
# ``torchaudio`` supports a growing list of
# `transformations <https://pytorch.org/audio/transforms.html>`_.
#
# - **Resample**: Resample waveform to a different sample rate.
# - **Spectrogram**: Create a spectrogram from a waveform.
# - **GriffinLim**: Compute waveform from a linear scale magnitude spectrogram using
#   the Griffin-Lim transformation.
# - **ComputeDeltas**: Compute delta coefficients of a tensor, usually a spectrogram.
# - **ComplexNorm**: Compute the norm of a complex tensor.
# - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
#   using a conversion matrix.
# - **AmplitudeToDB**: This turns a spectrogram from the
#   power/amplitude scale to the decibel scale.
# - **MelSpectrogram**: Create MEL Spectrograms from a waveform using the
#   STFT function in PyTorch.
# - **MuLawEncoding**: Encode waveform based on mu-law companding.
# - **MuLawDecoding**: Decode mu-law encoded waveform.
# - **TimeStretch**: Stretch a spectrogram in time without modifying pitch for a given rate.
# - **FrequencyMasking**: Apply masking to a spectrogram in the frequency domain.
# - **TimeMasking**: Apply masking to a spectrogram in the time domain.
#
# Each transform supports batching: you can perform a transform on a single raw
# audio signal or spectrogram, or many of the same shape.
#
# Since all transforms are ``nn.Modules`` or ``jit.ScriptModules``, they can be
# used as part of a neural network at any point.
#

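######################################################################
# The full walkthrough of these transforms is elided here. As a minimal sketch
# of the pattern (reusing names the later sections expect), compute a
# spectrogram and round-trip the waveform through mu-law encoding:

specgram = torchaudio.transforms.Spectrogram()(waveform)
print("Shape of spectrogram: {}".format(specgram.size()))

transformed = torchaudio.transforms.MuLawEncoding()(waveform)
reconstructed = torchaudio.transforms.MuLawDecoding()(transformed)

# Median relative difference between the original and the reconstructed signal
err = ((waveform - reconstructed).abs() / waveform.abs()).median()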
print("Median relative difference between original and MuLaw reconstructed signals: {:.2%}".format(err))

######################################################################
# Functional
# ---------------
#
# The transformations seen above rely on lower level stateless functions for their computations.
# These functions are available under ``torchaudio.functional``. The complete list is available
# `here <https://pytorch.org/audio/functional.html>`_ and includes:
#
# - **istft**: Inverse short time Fourier Transform.
# - **gain**: Applies amplification or attenuation to the whole waveform.
# - **dither**: Increases the perceived dynamic range of audio stored at a
#   particular bit-depth.
# - **compute_deltas**: Compute delta coefficients of a tensor.
# - **equalizer_biquad**: Design biquad peaking equalizer filter and perform filtering.
# - **lowpass_biquad**: Design biquad lowpass filter and perform filtering.
# - **highpass_biquad**: Design biquad highpass filter and perform filtering.
#
# For example, let's try the ``mu_law_encoding`` functional:

mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(waveform, quantization_channels=256)

print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.size()))

plt.figure()
plt.plot(mu_law_encoding_waveform[0, :].numpy())

######################################################################
# You can see how the output from ``torchaudio.functional.mu_law_encoding`` is the same as
# the output from ``torchaudio.transforms.MuLawEncoding``.
#
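# As a quick sanity check (this assumes ``transformed`` from the mu-law sketch
# above is still in scope):

print("Outputs match: {}".format((transformed == mu_law_encoding_waveform).all().item()))

######################################################################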
# Now let's experiment with a few of the other functionals and visualize their output. Taking our
# spectrogram, we can compute its deltas:

computed = torchaudio.functional.compute_deltas(specgram, win_length=3)
print("Shape of computed deltas: {}".format(computed.shape))

plt.figure()
plt.imshow(computed.log2()[0, :, :].detach().numpy(), cmap='gray')

######################################################################
# We can take the original waveform and apply different effects to it.
#

gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)
print("Min of gain_waveform: {}\nMax of gain_waveform: {}\nMean of gain_waveform: {}".format(gain_waveform.min(), gain_waveform.max(), gain_waveform.mean()))

dither_waveform = torchaudio.functional.dither(waveform)
print("Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(dither_waveform.min(), dither_waveform.max(), dither_waveform.mean()))

######################################################################
# Another example of the capabilities of ``torchaudio.functional`` is applying filters to our
# waveform. Applying the lowpass biquad filter to our waveform will output a new waveform with
# the frequency content above the cutoff attenuated.

lowpass_waveform = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=3000)

print("Min of lowpass_waveform: {}\nMax of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(lowpass_waveform.min(), lowpass_waveform.max(), lowpass_waveform.mean()))

plt.figure()
plt.plot(lowpass_waveform.t().numpy())

######################################################################
# We can also visualize a waveform with the highpass biquad filter.
#

highpass_waveform = torchaudio.functional.highpass_biquad(waveform, sample_rate, cutoff_freq=2000)

print("Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(highpass_waveform.min(), highpass_waveform.max(), highpass_waveform.mean()))

plt.figure()
plt.plot(highpass_waveform.t().numpy())

######################################################################
# Migrating to torchaudio from Kaldi
# ----------------------------------
#
# Users may be familiar with
# `Kaldi <http://github.com/kaldi-asr/kaldi>`_, a toolkit for speech
# recognition. ``torchaudio`` offers compatibility with it in
# ``torchaudio.kaldi_io``. It can indeed read from Kaldi scp and ark files
# or streams with:
#
# - read_vec_int_ark
# - read_vec_flt_scp
# - read_vec_flt_ark
# - read_mat_scp
# - read_mat_ark
#
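# A hypothetical sketch (assuming the ``kaldi_io`` package is installed and
# ``feats.scp`` is a placeholder path): each reader yields ``(key, tensor)``
# pairs lazily, so large archives stream one entry at a time.

import os

if os.path.exists("feats.scp"):  # guard so the sketch is a no-op without data
    for key, mat in torchaudio.kaldi_io.read_mat_scp("feats.scp"):
        print("Key: {}, feature matrix size: {}".format(key, mat.size()))

######################################################################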
# ``torchaudio`` provides Kaldi-compatible transforms for ``spectrogram``,
# ``fbank``, ``mfcc``, and ``resample_waveform`` with the benefit of GPU support; see
# `here <compliance.kaldi.html>`__ for more information.
#

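######################################################################
# The parameter setup and the spectrogram computation are elided here; below is
# a minimal sketch (the parameter values are illustrative, not prescriptive)
# so that the filterbank and MFCC plots run:

params = {
    "channel": 0,
    "dither": 0.0,
    "window_type": "hanning",
    "frame_length": 25.0,
    "frame_shift": 10.0,
    "sample_frequency": float(sample_rate),
}

fbank = torchaudio.compliance.kaldi.fbank(waveform, **params)

print("Shape of fbank: {}".format(fbank.size()))

plt.figure()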
plt.imshow(fbank.t().numpy(), cmap='gray')

######################################################################
# You can create mel frequency cepstral coefficients from a raw audio signal.
# This matches the input/output of Kaldi's compute-mfcc-feats.
#

mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params)

print("Shape of mfcc: {}".format(mfcc.size()))

plt.figure()
plt.imshow(mfcc.t().numpy(), cmap='gray')

######################################################################
# Available Datasets
# ------------------
#
# If you do not want to create your own dataset to train your model, ``torchaudio`` offers a
# unified dataset interface. This interface supports lazy-loading of files to memory, download
# and extract functions, and datasets to build models.
#
# The datasets ``torchaudio`` currently supports are:
#
# - **VCTK**: Speech data uttered by 109 native speakers of English with various accents
#   (`Read more here <https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html>`_).
# - **Yesno**: Sixty recordings of one individual saying yes or no in Hebrew; each
#   recording is eight words long (`Read more here <https://www.openslr.org/1/>`_).
# - **Common Voice**: An open source, multi-language dataset of voices that anyone can use
#   to train speech-enabled applications (`Read more here <https://voice.mozilla.org/en/datasets>`_).
# - **LibriSpeech**: Large-scale (1000 hours) corpus of read English speech (`Read more here <http://www.openslr.org/12>`_).
#

yesno_data = torchaudio.datasets.YESNO('./', download=True)

# A data point in Yesno is a tuple (waveform, sample_rate, labels) where labels is a list of integers with 1 for yes and 0 for no.

# Pick data point number 3 to see an example of the yesno_data:
n = 3
waveform, sample_rate, labels = yesno_data[n]

print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels))

plt.figure()
plt.plot(waveform.t().numpy())

######################################################################
# Now, whenever you ask for a sound file from the dataset, it is loaded into
# memory only at that point: the dataset loads and keeps in memory only the
# items you actually use, saving memory.
#
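# As a quick sketch of how this plugs into the standard PyTorch pipeline,
# wrap the dataset in a ``DataLoader`` (``batch_size=1`` since the recordings
# have different lengths):

import torch

data_loader = torch.utils.data.DataLoader(yesno_data, batch_size=1, shuffle=True)
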
######################################################################
# Conclusion
# ----------
#
# We used an example raw audio signal, or waveform, to illustrate how to
# open an audio file using ``torchaudio``, and how to pre-process,
# transform, and apply functions to such a waveform. We also demonstrated how
# to use familiar Kaldi functions, as well as how to utilize built-in datasets to
# construct our models. Given that ``torchaudio`` is built on PyTorch,
# these techniques can be used as building blocks for more advanced audio
# applications, such as speech recognition, while leveraging GPUs.
#